I prefer to collect all my imports at the top of the file in one place rather than distribute them throughout, and document the versions of the key packages:
import sqlite3
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from pyproj import Transformer
import pyproj
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import statsmodels.api as sm
from sklearn.inspection import permutation_importance
import random
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from pickle import dump
import sklearn
import xgboost
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.transform import linear_cmap
import bokeh.palettes as bp
from bokeh.models import Scatter
from bokeh.models import ColorBar
from bokeh.models import HoverTool
from bokeh.io import show
from bokeh.io import output_notebook
# Render Bokeh output inline in the notebook (suppress the loading banner)
output_notebook(hide_banner=True)
# Print name==version for every imported module object that exposes __version__
print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))
pandas==2.0.3 numpy==1.26.0 pyproj==3.6.1 statsmodels.api==0.14.0 sklearn==1.3.1 xgboost==1.7.3
The hashed version of the database is for the blog - it hashes the property id's and hides the addresses for privacy:
database_path = 'databases/forsale2_hashed.db'
The methodology adopted with the database store -> extract -> load -> transform is such that the raw data is stored as strings as the numerical data can take unexpected non-numerical values which are unknown in advance. We want the separate data store process to be simple, high reliability and not prone to failing due to unexpected data types over which we have no control. The various features added are explained in the comments:
# df is our main feature / target dataframe (from the property addresses table)
# to which we will add various additional features.
# Here we are loading all the data in the database tables.
# If it gets too large we can limit to last X entries using DESC LIMIT X
con = sqlite3.connect(database_path)
df = pd.read_sql("SELECT * FROM addresses NATURAL JOIN dates", con, index_col='pid')
prices_df = pd.read_sql("SELECT * FROM prices NATURAL JOIN dates", con, index_col='pid')
status_df = pd.read_sql("SELECT * FROM status NATURAL JOIN dates", con, index_col='pid')
con.close()
# Impute 1 bath where baths is missing ('None' in database)
df['baths'] = np.where(df['baths'] == 'None', 1, df['baths'])
# Convert to numerical (raw store keeps everything as strings - see note above)
df['beds'] = df['beds'].astype('int')
df['baths'] = df['baths'].astype('int')
df['latitude'] = df['latitude'].astype('float')
df['longitude'] = df['longitude'].astype('float')
# Impute 'Fixed(NP)' where the price type wasn't entered ('' in database)
prices_df['price_type'] = np.where(prices_df['price_type'] == '', 'Fixed(NP)', prices_df['price_type'])
# Get the last price and price type and add to the main df
# (groupby().tail(1) keeps the original 'pid' index, so the assignment aligns by pid)
df['current_price_type'] = prices_df.groupby('pid').tail(1)['price_type']
df['current_price'] = prices_df.groupby('pid').tail(1)['price']
# Drop any prices not convertible to numerical (they are of no use to us anyway), and convert.
# .copy() makes df an independent frame so the assignment on the next line cannot
# raise a SettingWithCopyWarning (we were assigning into a boolean-indexed slice).
print('Number of properties before removing invalid prices:', len(df))
df = df[df['current_price'].str.isdigit()].copy()
df['current_price'] = df['current_price'].astype('int')
print('Number of properties after removing invalid prices:', len(df))
# Replace term 'Active' with 'For Sale' - Active has a broader context here and includes 'Under Offer' etc
status_df['status'] = np.where(status_df['status'] == 'Active', 'For Sale', status_df['status'])
# Get last status and add to main df
df['current_status'] = status_df.groupby('pid').tail(1)['status']
# Database stores longitude / latitude property location
# We need Web Mercator since the purpose is to plot them on an opensource map.
# Standard spherical Web Mercator (EPSG:3857) forward projection, earth radius 6378137 m.
origin_shift = np.pi * 6378137
df['eastings'] = df['longitude'] * origin_shift / 180.0
df['northings'] = np.log(np.tan((90 + df['latitude']) * np.pi / 360.0)) * origin_shift / np.pi
Number of properties before removing invalid prices: 7670 Number of properties after removing invalid prices: 7650
The following features are the initial candidates for the ML model(s):
# Add a higher level house / flat categorical feature - dwelling_cat
# Human labelling here as many terms are used in the listings
flats_str = 'Flat|Apartment|Duplex|Ground Flat|Maisonette|Studio|Penthouse|Ground Maisonette|Triplex'
# Built with implicit string concatenation: the previous backslash-continued single
# string embedded the continuation-line indentation inside the regex, so alternates
# such as 'Barn Conversion' had leading spaces and could never match ('Other' fallthrough).
houses_str = ('Town House|Mews|Terraced|Semi-Detached|End of Terrace|Detached|House|'
              'Semi-Detached Bungalow|Detached Villa|Detached Bungalow|Villa|Bungalow|'
              'Terraced Bungalow|Cottage|Link Detached House|Semi-detached Villa|'
              'Barn Conversion|Lodge|Character Property')
# If it's not a Flat, test if it's a House and if not, it's an 'Other'
df['dwelling_cat'] = np.where(df['dwelling_subtype'].str.contains(flats_str), 'Flat',
                              np.where(df['dwelling_subtype'].str.contains(houses_str), 'House', 'Other'))
# Add some one-hot features around building category / sub-type
df['is_house'] = np.where(df['dwelling_cat'] == 'House', 1, 0)
df['is_flat'] = np.where(df['dwelling_cat'] == 'Flat', 1, 0)
df['is_semi_detached'] = np.where(df['dwelling_subtype'].str.contains('Semi-Detached|Semi-detached'), 1, 0)
df['is_terraced'] = np.where(df['dwelling_subtype'].str.contains('Terraced|End of Terrace'), 1, 0)
df['is_detached'] = np.where((df['dwelling_subtype'] == 'Detached') |
                             (df['dwelling_subtype'].str.contains('Detached Bungalow|Detached Villa')), 1, 0)
df['is_apartment'] = np.where(df['dwelling_subtype'] == 'Apartment', 1, 0)
df['is_ground_flat'] = np.where(df['dwelling_subtype'] == 'Ground Flat', 1, 0)
df['is_townhouse'] = np.where(df['dwelling_subtype'] == 'Town House', 1, 0)
# Add a days on market (dom) feature - round to nearest day (12 hours)
# dt_first_listed arrives ISO-8601 style with a trailing 'Z' and 'T' separator; normalise before parsing
df['dom'] = ((pd.to_datetime(df['dt_last_seen']) -
              pd.to_datetime(df['dt_first_listed'].str.strip('Z').str.replace('T', ' '))) +
             datetime.timedelta(hours=12)).dt.days
# Sale Price features
df['is_fixed_price'] = np.where(df['current_price_type'].str.contains('Fixed'), 1, 0)
df['price_changes'] = prices_df.groupby('pid').count()['price'] - 1
# Sale Status features
# NOTE(review): the 'offer' pattern is lower-case while other status matches are
# title-case - confirm the stored status text really is lower-case here.
df['is_under_offer'] = np.where(df['current_status'].str.contains('offer'), 1, 0)
df['is_sold'] = np.where(df['current_status'].str.contains('Sold'), 1, 0)
df['is_forsale'] = np.where(df['current_status'].str.contains('For Sale'), 1, 0)
df['status_changes'] = status_df.groupby('pid').count()['status'] - 1
# m-factor is 'motivation-factor' described in part 4
# (np.where evaluates both branches, so dom==0 rows emit a harmless divide warning)
df['m_factor'] = np.where(df['dom'] > 0, np.round(100 * df['price_changes'] / df['dom'], 2), 0)
# The target (Log10 Price)
df['log_price'] = np.log10(df['current_price'])
# Helper function for converting from listing location in easting / northing coordinates to a full postcode
def lookup_postcode_from_eastings_northings(easting, northing, postcodes_df):
    """Return the 'Postcode' of the row in postcodes_df nearest to the given point.

    Distance is Euclidean in Web Mercator coordinates over the 'wm-eastings' /
    'wm-northings' columns (squared distance suffices for the argmin, so no sqrt).

    Fixes over the previous version:
    - no longer mutates the caller's dataframe (it wrote a 'distance' column);
    - looks the winning row up by label with .loc - the old .iloc on the label
      returned by idxmin() was only correct while the index equalled positions.
    """
    dist_sq = ((postcodes_df['wm-eastings'] - easting) ** 2 +
               (postcodes_df['wm-northings'] - northing) ** 2)
    return postcodes_df.loc[dist_sq.idxmin(), 'Postcode']
We are using the open source Codepoint data (UK government, Ordnance Survey data) to convert an easting / northing coordinate to a full postcode so that we can then generate a feature based on either postcode district (coarse granularity) or postcode sector (fine granularity). The features are added as one-hot attributes.
# Look up the postcode from lat / lon
# LOAD APPROPRIATE POSTCODE FILE
# Load the CodePoint file and add Lon/Lat conversion to it (from Eastings/Northings)
# Source: https://osdatahub.os.uk/downloads/open/CodePointOpen
# Attribution: Contains public sector information licensed under the Open Government Licence v3.0.
# Column layout of the CodePoint Open CSV (the file itself is headerless)
colnames = ['Postcode','Positional_quality_indicator','Eastings','Northings','Country_code','NHS_regional_HA_code','NHS_HA_code','Admin_county_code','Admin_district_code','Admin_ward_code']
postcodes_df = pd.read_csv('codepo_gb\\Data\\CSV\\eh.csv', names=colnames)
# Keep only the Edinburgh-area districts.
# NOTE(review): 'EH1' / 'EH2' (no trailing space) also match EH10-EH19 / EH20-EH29
# - presumably intended since those are city districts; confirm.
postcodes_df = postcodes_df[postcodes_df['Postcode'].str.contains('EH1|EH2|EH3 |EH4 |EH5 |EH6 |EH7 |EH8 |EH9 |EH30|EH31|EH32')]
# Just retain postcode and Eastings/Northings columns
postcodes_df = postcodes_df[['Postcode','Eastings','Northings']]
postcodes_df.reset_index(inplace=True, drop=True)
# Add the OSGB36 / EPSG27700 Eastings/Northings to Web Mercator (WGS84) Lon/Lat conversion
# Lat/Long Web Mercator projection (WGS84)
v84 = pyproj.Proj(proj="latlong", towgs84="0,0,0", ellps="WGS84")
# OSGB36 (EPSG:27700) projection
v36 = pyproj.Proj(proj="latlong", k=0.9996012717, ellps="airy",
                  # Datum transformation to WGS84 (Helmert parameters)
                  towgs84="446.448, -125.157, 542.060, 0.1502, 0.2470, 0.8421,-20.4894")
# UK National Grid is EPSG:27700 or OSGB36
vgrid = pyproj.Proj("EPSG:27700")
# Converts OSGB36 (EPSG:27700) Eastings / Northings to OSGB36 longitude / latitude
vlon36, vlat36 = vgrid(postcodes_df['Eastings'].values, postcodes_df['Northings'].values, inverse=True)
# Convert lat/lon from OSGB36 to Web Mercator lat/lon using a projection defn (see above)
transformer = pyproj.Transformer.from_proj(v36, v84)
converted = transformer.transform(vlon36, vlat36)
# transform() returns the coordinates in the same (lon, lat) order as the inputs
postcodes_df['longitude'] = np.round(converted[0],6)
postcodes_df['latitude'] = np.round(converted[1],6)
# Convert UK Eastings / Northings to web mercator Easting / Northings
transformer = pyproj.Transformer.from_crs("EPSG:27700", "EPSG:3857")
converted = transformer.transform(postcodes_df['Eastings'], postcodes_df['Northings'])
postcodes_df['wm-eastings'] = np.round(converted[0],6)
postcodes_df['wm-northings'] = np.round(converted[1],6)
# Assign each property the nearest CodePoint postcode (brute-force nearest
# neighbour per row - O(rows x postcodes), acceptable at this data size)
df['postcode_full'] = df.apply(lambda x: lookup_postcode_from_eastings_northings(x['eastings'], x['northings'], postcodes_df), axis=1)
# District = outward code, e.g. 'EH3'; sector = district + first digit of the inward code, e.g. 'EH3 9'
df['postcode_district'] = [x.split(' ')[0] for x in df['postcode_full']]
df['postcode_sector'] = [x.split(' ')[0] + ' ' + x.split(' ')[-1][0] for x in df['postcode_full']]
# One hot encode the area postcode districts and sectors
postcode_districts_df = pd.get_dummies(df['postcode_district'])
postcode_district_feature_names = postcode_districts_df.columns
df = df.join(postcode_districts_df)
postcode_sectors_df = pd.get_dummies(df['postcode_sector'])
postcode_sector_feature_names = postcode_sectors_df.columns
df = df.join(postcode_sectors_df)
The first thing we need to do is remove all the 'Other' properties from the data. These are things like Garages, land for sale and so on.
# Keep only houses and flats; the 'Other' category covers garages, land etc.
print('Number of properties before removing \"Others\":', len(df))
keep_mask = df['dwelling_cat'] != 'Other'
df = df.loc[keep_mask]
print('Number of properties after removing \"Others\":', len(df))
Number of properties before removing "Others": 7650 Number of properties after removing "Others": 7474
Let's take a look at the non-binary features we'll initially be using in our model:
# non-binary features (to be standardised)
nb_feature_names = ["beds","baths","latitude","longitude","dom","price_changes","status_changes","m_factor"]
# Summary statistics of the candidate numeric features (pre-standardisation)
df[nb_feature_names].describe()
| beds | baths | latitude | longitude | dom | price_changes | status_changes | m_factor | |
|---|---|---|---|---|---|---|---|---|
| count | 7474.000000 | 7474.000000 | 7474.000000 | 7474.000000 | 7474.000000 | 7474.000000 | 7474.000000 | 7474.000000 |
| mean | 2.638614 | 1.467220 | 55.930873 | -3.210222 | 93.219160 | 0.214075 | 0.341049 | 0.339620 |
| std | 1.172234 | 1.512094 | 0.217301 | 1.140148 | 83.733674 | 0.542022 | 0.574873 | 1.101422 |
| min | 0.000000 | 0.000000 | 41.697300 | -72.722829 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 2.000000 | 1.000000 | 55.921700 | -3.236875 | 38.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 2.000000 | 1.000000 | 55.942113 | -3.195746 | 73.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 3.000000 | 2.000000 | 55.959346 | -3.147716 | 124.000000 | 0.000000 | 1.000000 | 0.000000 |
| max | 10.000000 | 114.000000 | 56.039905 | -1.423048 | 1071.000000 | 6.000000 | 5.000000 | 25.000000 |
# Pairwise scatter plots / histograms of the numeric features to spot anomalies
pd.plotting.scatter_matrix(df[nb_feature_names], figsize=(14,14))
plt.show()
We can see the following anomalies:
Investigation of the entries in the database revealed that these were otherwise typical house / flat listings but that the data had been obviously entered in error. The total number of affected entries is small (<<1%) and so we will simply drop these properties completely from the data.
print('Number of properties before cleaning:', len(df))
# Drop the erroneous lat/long entries - they fall well outside the Edinburgh area
# (Series.between is inclusive at both ends, matching the original >= / <= bounds)
lat_ok = df['latitude'].between(55.7, 56.1)
df = df[lat_ok]
lon_ok = df['longitude'].between(-3.6, -2.6)
df = df[lon_ok]
# Drop the baths outlier (a listing claiming over 100 baths!)
df = df[df['baths'] <= 10]
print('Number of properties after cleaning:', len(df))
Number of properties before cleaning: 7474 Number of properties after cleaning: 7459
# Re-plot the scatter matrix to confirm the cleaned distributions
pd.plotting.scatter_matrix(df[nb_feature_names], figsize=(14,14))
plt.show()
The cleaned data distributions now all look much better.
Before we do any modelling of the dataset, we will split off an (unseen) test set (20% of the data) that we will use at the very end to get an estimate of the model performance.
# Hold out 20% as the final unseen test set; tv_df is the train/validation pool
tv_df, unseen_test_df = train_test_split(df, test_size=0.2, random_state=42)
# Standardise the numerical features
# (scaler is fitted on tv_df only, so no information leaks from the unseen test set)
nb_feature_names = ["beds","baths","latitude","longitude","dom","price_changes","status_changes","m_factor"]
scaler = StandardScaler().set_output(transform="pandas")
tv_df[nb_feature_names] = scaler.fit_transform(tv_df[nb_feature_names])
tv_df[nb_feature_names].describe()
| beds | baths | latitude | longitude | dom | price_changes | status_changes | m_factor | |
|---|---|---|---|---|---|---|---|---|
| count | 5.967000e+03 | 5.967000e+03 | 5.967000e+03 | 5.967000e+03 | 5.967000e+03 | 5.967000e+03 | 5.967000e+03 | 5.967000e+03 |
| mean | -9.764455e-17 | -1.214603e-16 | 4.234439e-15 | 3.506868e-15 | 2.619732e-17 | 1.190787e-18 | -5.953936e-18 | -2.679271e-17 |
| std | 1.000084e+00 | 1.000084e+00 | 1.000084e+00 | 1.000084e+00 | 1.000084e+00 | 1.000084e+00 | 1.000084e+00 | 1.000084e+00 |
| min | -2.246990e+00 | -1.875799e+00 | -4.141363e+00 | -2.699389e+00 | -1.092261e+00 | -3.866743e-01 | -5.935836e-01 | -2.966163e-01 |
| 25% | -5.412729e-01 | -5.840630e-01 | -3.684722e-01 | -4.918111e-01 | -6.492996e-01 | -3.866743e-01 | -5.935836e-01 | -2.966163e-01 |
| 50% | -5.412729e-01 | -5.840630e-01 | 1.862591e-01 | -4.049936e-02 | -2.413091e-01 | -3.866743e-01 | -5.935836e-01 | -2.966163e-01 |
| 75% | 3.115857e-01 | 7.076731e-01 | 6.585393e-01 | 4.781900e-01 | 3.531913e-01 | -3.866743e-01 | 1.145206e+00 | -2.966163e-01 |
| max | 6.281596e+00 | 1.104156e+01 | 2.879644e+00 | 4.078616e+00 | 1.139225e+01 | 1.083189e+01 | 8.100362e+00 | 2.185871e+01 |
After Standardisation we see that the means are all very close to 0 and the standard deviations are all equal to 1.
We will use cross-validation with 10-folds as the basis for evaluating different models. The scoring metrics will be the coefficient of determination (R^2) which explains how well the model predicts the outcome and Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), Mean Absolute Percentage Error (MAPE). We also want an estimate of the model performance which is less biased by the particular sampling of the cross-validation folds. So, to this end, we also perform cross-validation 10 times, each time selecting a different seed for the fold selection and then averaging the overall results. We achieve this with the following function:
def get_test_scores(model, X, y, scoring, num_rounds=10):
    """Repeated 10-fold cross-validation of `model` on (X, y).

    Runs `num_rounds` independent 10-fold CVs, each with a different shuffle
    seed, and averages the per-round mean scores - an estimate less biased by
    any particular fold split.

    Returns a (test_scores, train_scores) pair of dicts keyed by
    'test_<metric>' / 'train_<metric>' for each metric in `scoring`.
    """
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()
    for i in range(num_rounds):
        cv = KFold(n_splits=10, random_state=i, shuffle=True)
        scores = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=True)
        # Iterate over `scoring` directly - the old `zip(scoring, score_names)`
        # never used score_name, silently truncated if the two lists diverged,
        # and coupled this helper to a module-level global.
        for score in scoring:
            test_df.at[i, 'test_' + score] = scores['test_' + score].mean()
            train_df.at[i, 'train_' + score] = scores['train_' + score].mean()
    return test_df.mean(axis=0).to_dict(), train_df.mean(axis=0).to_dict()
# sklearn scorer keys and the short display names used in the results tables (same order)
scoring = ['r2','neg_root_mean_squared_error','neg_mean_absolute_error', 'neg_mean_absolute_percentage_error']
score_names = ['R^2', 'RMSE', 'MAE', 'MAPE']
Our first predictive model will be a naive prediction over which we can determine if subsequent models have any better skill. The primary listing feature for a property is how many bedrooms it has so we will use this as our naive baseline prediction. We will perform linear regression using this single feature and the log10 of the price as the target.
# Accumulates one row of CV metrics per model evaluated in this notebook
results_df = pd.DataFrame()
# Let's start with a naive prediction - linear regression with just the number of bedrooms
y = tv_df['log_price'].values
X = tv_df[['beds']].values
scores, _ = get_test_scores(LinearRegression(), X, y, scoring)
for score, score_name in zip(scoring, score_names):
    results_df.at['Naive',score_name] = scores['test_' + score]
display(results_df)
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| Naive | 0.459261 | -0.171257 | -0.133765 | -0.024356 |
We now build our full feature set - the one-hot (binary) features plus the standardised (non-binary) parametrics:
# One-hot (binary) candidate features.
# NOTE(review): 'new_build' is not created anywhere in this notebook - presumably
# it comes straight from the addresses table; confirm.
onehot_feature_names = [
    "is_house",
    "new_build",
    "is_terraced",
    "is_semi_detached",
    "is_detached",
    "is_apartment",
    "is_ground_flat",
    "is_townhouse",
    "is_fixed_price",
    "is_under_offer",
    "is_sold"
]
# Full candidate feature set: standardised numeric + one-hot binary
all_feature_names = nb_feature_names + onehot_feature_names
We will use the following regressors 'out-of-the-box' to select one to go forward with: Linear Regression, Decision Tree, Random Forest, XG-Boost and K-Nearest-Neighbours.
# Evaluate a set of out-of-the-box regressors on the full feature set
y = tv_df['log_price'].values
X = tv_df[all_feature_names].values
# Label -> model mapping (dict preserves insertion order, so rows appear as before)
models = {
    'LR': LinearRegression(),
    'DT': DecisionTreeRegressor(),
    'RF': RandomForestRegressor(),
    'XGB': XGBRegressor(),
    'KNN': KNeighborsRegressor(),
}
for model_label, model in models.items():
    scores, _ = get_test_scores(model, X, y, scoring)
    for score, score_name in zip(scoring, score_names):
        results_df.at[model_label, score_name] = scores['test_' + score]
display(results_df)
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| Naive | 0.459261 | -0.171257 | -0.133765 | -0.024356 |
| LR | 0.600303 | -0.147170 | -0.113796 | -0.020705 |
| DT | 0.632995 | -0.140943 | -0.095719 | -0.017396 |
| RF | 0.799147 | -0.104267 | -0.073076 | -0.013279 |
| XGB | 0.820373 | -0.098588 | -0.070993 | -0.012922 |
| KNN | 0.657843 | -0.136155 | -0.101451 | -0.018482 |
We make the following observations:
Whilst we recognise that feature selection optimisation and hyper-parameter tuning could change the rankings, for the purposes of this demo, we choose the XG-Boost regressor to go forward with. A more comprehensive approach would take both forward and perform the following optimisations on both.
We define the following 4-plot visualisation of the residuals to gain an insight into model performance:
def plot_residuals_analysis(X, y, preds):
    """4-plot residual diagnostics: predicted-vs-observed, residuals-vs-predicted,
    standardised-residuals histogram and a Q-Q plot.

    X is unused but kept in the signature for compatibility with existing callers.
    """
    residuals = y - preds
    residuals_mean = np.mean(residuals)
    residuals_std = np.std(residuals)
    residuals_standardised = (residuals - residuals_mean) / residuals_std
    # Coefficient of Determination (R^2)
    # The ratio (percentage if x100) of the variation in target explained by the explanatory variables
    r2 = np.round(1.0 - (np.sum((y - preds)**2) / np.sum((y - np.mean(y))**2)), 3)
    plt.figure(figsize=(12,12))
    # Predicted vs Observed
    plt.subplot(2,2,1)
    plt.scatter(y, preds)
    plt.plot([4.5,7],[4.5,7], color='red')  # identity line over the log10-price range
    plt.xlabel('Observed')
    plt.ylabel('Predicted')
    plt.title('Predicted vs Observed. R^2: ' + str(r2))
    plt.grid()
    # Residuals vs Predicted
    plt.subplot(2,2,2)
    plt.scatter(preds, residuals_standardised)
    plt.xlabel('Predicted')
    plt.ylabel('Residuals Standardised')
    plt.grid()
    plt.title('Residuals vs Predicted')
    # Residuals Histogram
    plt.subplot(2,2,3)
    bins = np.arange(-4,4,0.08)
    a, b = np.histogram(residuals_standardised, bins=bins)
    # b holds the bin edges (one more than the counts): plot counts against the
    # left edges. b[:-1] replaces the magic b[:99], which silently broke if the
    # bin range / width above ever changed.
    plt.bar(b[:-1], a, width=0.08)
    plt.grid()
    plt.xlabel('Standardised Residuals')
    plt.ylabel('Frequency')
    plt.title('Standardised Residuals Histogram')
    # plt.plot(bins, scipy.stats.norm(np.mean(residuals_standardised), np.std(residuals_standardised)).pdf(bins), color='red')
    # QQ Plot
    ax = plt.subplot(2,2,4)
    sm.qqplot(residuals_standardised, line ='45', ax=ax)
    plt.xlim(-5,5)
    plt.ylim(-7.5,7.5)
    plt.grid()
    plt.title('Standardised Residuals Q-Q Plot')
    plt.tight_layout()
    plt.show()
# Out-of-fold predictions from 10-fold CV (each prediction comes from a model
# that never saw that row during fitting)
cv = KFold(n_splits=10, random_state=42, shuffle=True)
preds = cross_val_predict(XGBRegressor(), X, y, cv=cv)
plot_residuals_analysis(X, y, preds)
We make the following observations:
Overall, these plots tell us that we have a robust statistical basis for our model.
# Back-transform the log10 predictions to £ and compute the signed percentage error
tv_df['preds'] = 10**preds
tv_df['pct_err'] = 100*(tv_df['preds'] - tv_df['current_price'])/tv_df['current_price']
After reviewing the actual sales listings for the 'outlier' properties, we make the following observations:
There's a small (14) group of properties that are special situations where the asking price of the property is below approximately £70k and looking at the actual listings show that they are distressed (abnormal) sales.
There is 1 property that was listed as a 2-bed flat in Edinburgh but in fact was a portfolio of 6 properties spread around Scotland.
These 15 properties will be removed.
# Remove the distressed sales (asking price below 10**4.82, roughly £66k)
tv_df = tv_df[tv_df['log_price'] >= 4.82]
# Removing this just by reference to its position in the table but in reality we would refer to its actual ID or address
# (drops the single most under-predicted row - the 6-property portfolio listing)
tv_df = tv_df.sort_values('pct_err', ascending=True).iloc[1:]
Let's now re-fit the model and look at the residuals with the outliers removed:
# Re-inspect the residuals with the outliers removed
y = tv_df['log_price'].values
X = tv_df[all_feature_names].values
cv = KFold(n_splits=10, random_state=42, shuffle=True)
preds = cross_val_predict(XGBRegressor(), X, y, cv=cv)
plot_residuals_analysis(X, y, preds)
# Score the outlier-free data and append a row to the results table
y = tv_df['log_price'].values
X = tv_df[all_feature_names].values
scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
for score, score_name in zip(scoring, score_names):
    results_df.at['XGB - Outliers', score_name] = scores['test_' + score]
Let's update the model performance table with the outliers removed:
results_df
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| Naive | 0.459261 | -0.171257 | -0.133765 | -0.024356 |
| LR | 0.600303 | -0.147170 | -0.113796 | -0.020705 |
| DT | 0.632995 | -0.140943 | -0.095719 | -0.017396 |
| RF | 0.799147 | -0.104267 | -0.073076 | -0.013279 |
| XGB | 0.820373 | -0.098588 | -0.070993 | -0.012922 |
| KNN | 0.657843 | -0.136155 | -0.101451 | -0.018482 |
| XGB - Outliers | 0.834264 | -0.093816 | -0.069136 | -0.012557 |
Having selected our regressor, checked the residuals and removed the outliers that we don't wish to model, we now turn to feature selection. We want to make sure that none of the features in our model are negatively impacting the performance. In a linear regressor we can look at the regressed coefficients for each feature to understand the relative importance. In Trees, Random Forests and XG-Boost we can look at a returned attribute called feature_importances_ to gauge the relative importance of the features from the regressor's perspective:
# Fit the XGBoost regressor on the full feature set and rank the features
# by its internal importance measure
y = tv_df['log_price'].values
X = tv_df[all_feature_names].values
model = XGBRegressor()
model.fit(X, y)
# Build the Feature/Importance table in one go and sort most-important first
fi_df = pd.DataFrame({'Feature': all_feature_names,
                      'Importance': np.round(model.feature_importances_, 3)})
fi_df = fi_df.sort_values('Importance', ascending=False)
fi_df
| Feature | Importance | |
|---|---|---|
| 0 | beds | 0.215 |
| 9 | new_build | 0.194 |
| 1 | baths | 0.159 |
| 12 | is_detached | 0.098 |
| 13 | is_apartment | 0.081 |
| 8 | is_house | 0.054 |
| 3 | longitude | 0.038 |
| 16 | is_fixed_price | 0.029 |
| 2 | latitude | 0.025 |
| 15 | is_townhouse | 0.019 |
| 18 | is_sold | 0.018 |
| 10 | is_terraced | 0.013 |
| 11 | is_semi_detached | 0.011 |
| 17 | is_under_offer | 0.011 |
| 7 | m_factor | 0.009 |
| 4 | dom | 0.008 |
| 6 | status_changes | 0.007 |
| 14 | is_ground_flat | 0.006 |
| 5 | price_changes | 0.005 |
Here, we will start with the most important feature (as identified by the regressor) and then add and evaluate the model performance for each subsequent feature:
# Start with the first most important feature, evaluate the metrics, then add the other features one by one
fs_df = pd.DataFrame()
for idx in range(1, len(fi_df)+1):
    # Top-idx features in descending importance order
    sub_feature_names = fi_df.iloc[:idx]['Feature'].values
    y = tv_df['log_price'].values
    X = tv_df[sub_feature_names].values
    scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
    for score, score_name in zip(scoring, score_names):
        # Each row is labelled with the feature added at that step
        fs_df.at[sub_feature_names[-1], score_name] = scores['test_' + score]
fs_df
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| beds | 0.476601 | -0.166817 | -0.132100 | -0.024025 |
| new_build | 0.502626 | -0.162617 | -0.128681 | -0.023393 |
| baths | 0.557600 | -0.153345 | -0.121539 | -0.022107 |
| is_detached | 0.565145 | -0.152014 | -0.119777 | -0.021784 |
| is_apartment | 0.581233 | -0.149183 | -0.117102 | -0.021297 |
| is_house | 0.583870 | -0.148708 | -0.116689 | -0.021225 |
| longitude | 0.731404 | -0.119420 | -0.087927 | -0.015979 |
| is_fixed_price | 0.734413 | -0.118748 | -0.087586 | -0.015916 |
| latitude | 0.843408 | -0.091192 | -0.066206 | -0.012017 |
| is_townhouse | 0.843097 | -0.091276 | -0.066281 | -0.012031 |
| is_sold | 0.841454 | -0.091768 | -0.066742 | -0.012115 |
| is_terraced | 0.842457 | -0.091462 | -0.066448 | -0.012061 |
| is_semi_detached | 0.842475 | -0.091463 | -0.066387 | -0.012051 |
| is_under_offer | 0.842369 | -0.091491 | -0.066531 | -0.012077 |
| m_factor | 0.840081 | -0.092152 | -0.067314 | -0.012220 |
| dom | 0.834394 | -0.093776 | -0.069027 | -0.012536 |
| status_changes | 0.833361 | -0.094077 | -0.069227 | -0.012572 |
| is_ground_flat | 0.833178 | -0.094125 | -0.069348 | -0.012595 |
| price_changes | 0.834455 | -0.093763 | -0.069061 | -0.012544 |
# Plot the incremental-feature-addition curves: R^2 (left) and error metrics (right)
plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.plot(fs_df.index.values, fs_df['R^2'])
plt.xticks(rotation=90)
plt.title('Coefficient of Determination vs Feature Set')
plt.xlabel('Incremental Feature Addition')
plt.ylabel('R^2')
plt.grid()
plt.subplot(1,2,2)
# The three (negated-sklearn) error metrics on shared axes
plt.plot(fs_df.index.values, fs_df['RMSE'], label='RMSE')
plt.plot(fs_df.index.values, fs_df['MAE'], label='MAE')
plt.plot(fs_df.index.values, fs_df['MAPE'], label='MAPE')
plt.xticks(rotation=90)
plt.title('Cross-Validation Error Metrics vs Feature Set')
plt.xlabel('Incremental Feature Addition')
plt.ylabel('Metric Value')
plt.legend()
plt.grid()
plt.show()
Because of this last point, we will evaluate the feature selection one more way - by starting with all the features and then evaluating whether removing a feature would improve the model performance or not. If so, we remove it and then repeat until we have removed all the features that are negatively impacting performance:
# Start with all the features, remove one, evaluate scores and decide whether to keep or not. Repeat with remaining.
# Greedy backward elimination: each pass drops the single feature whose removal
# gives the largest strictly-positive CV-score gain; stops when no removal helps.
metric_name = 'test_r2'
remaining_feature_names = all_feature_names.copy()
y = tv_df['log_price'].values  # target never changes - hoisted out of the loop
while True:
    X = tv_df[remaining_feature_names].values
    all_features_scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
    print('Num remaining features:', len(remaining_feature_names))
    print('Score with remaining features:', np.round(all_features_scores[metric_name], 4))
    max_improved_score = 0
    # None object replaces the previous fragile 'None' string sentinel
    # (print() renders it identically, so the logged output is unchanged)
    max_improved_score_name = None
    for removed_feature_name in remaining_feature_names:
        sub_feature_names = [x for x in remaining_feature_names if x != removed_feature_name]
        X = tv_df[sub_feature_names].values
        scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
        improvement = scores[metric_name] - all_features_scores[metric_name]
        if improvement > max_improved_score:
            max_improved_score = improvement
            max_improved_score_name = removed_feature_name
    print('Score improved best by removing feature:', max_improved_score_name)
    print('Score improved by:', np.round(max_improved_score, 4))
    if max_improved_score_name is None:
        break
    print('Removing:', max_improved_score_name)
    print('')
    remaining_feature_names.remove(max_improved_score_name)
Num remaining features: 19 Score with remaining features: 0.8343 Score improved best by removing feature: dom Score improved by: 0.0043 Removing: dom Num remaining features: 18 Score with remaining features: 0.8386 Score improved best by removing feature: m_factor Score improved by: 0.0022 Removing: m_factor Num remaining features: 17 Score with remaining features: 0.8408 Score improved best by removing feature: price_changes Score improved by: 0.0011 Removing: price_changes Num remaining features: 16 Score with remaining features: 0.8418 Score improved best by removing feature: is_apartment Score improved by: 0.0009 Removing: is_apartment Num remaining features: 15 Score with remaining features: 0.8427 Score improved best by removing feature: is_sold Score improved by: 0.001 Removing: is_sold Num remaining features: 14 Score with remaining features: 0.8437 Score improved best by removing feature: status_changes Score improved by: 0.0006 Removing: status_changes Num remaining features: 13 Score with remaining features: 0.8443 Score improved best by removing feature: is_under_offer Score improved by: 0.0008 Removing: is_under_offer Num remaining features: 12 Score with remaining features: 0.8452 Score improved best by removing feature: is_ground_flat Score improved by: 0.0002 Removing: is_ground_flat Num remaining features: 11 Score with remaining features: 0.8454 Score improved best by removing feature: is_semi_detached Score improved by: 0.0001 Removing: is_semi_detached Num remaining features: 10 Score with remaining features: 0.8454 Score improved best by removing feature: None Score improved by: 0
remaining_feature_names
['beds', 'baths', 'latitude', 'longitude', 'is_house', 'new_build', 'is_terraced', 'is_detached', 'is_townhouse', 'is_fixed_price']
We see that we are left with a feature list that is very similar (but not identical) to the first method. In this case we do end up with a very slightly better performance and so will use this as our optimised set of features to go forward with.
# Score the backward-eliminated feature set and append a row to the results table
y = tv_df['log_price'].values
X = tv_df[remaining_feature_names].values
scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
for score, score_name in zip(scoring, score_names):
    results_df.at['XGB - Outliers + Feat Opt.', score_name] = scores['test_' + score]
results_df
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| Naive | 0.459261 | -0.171257 | -0.133765 | -0.024356 |
| LR | 0.600303 | -0.147170 | -0.113796 | -0.020705 |
| DT | 0.632995 | -0.140943 | -0.095719 | -0.017396 |
| RF | 0.799147 | -0.104267 | -0.073076 | -0.013279 |
| XGB | 0.820373 | -0.098588 | -0.070993 | -0.012922 |
| KNN | 0.657843 | -0.136155 | -0.101451 | -0.018482 |
| XGB - Outliers | 0.834264 | -0.093816 | -0.069136 | -0.012557 |
| XGB - Outliers + Feat Opt. | 0.845432 | -0.090608 | -0.065819 | -0.011945 |
Now we want to see if adding postcode sector location helps the performance of the model:
# Augment the optimised feature set with the one-hot postcode-sector columns
features_with_postcode_sectors = remaining_feature_names + postcode_sector_feature_names.to_list()
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values
scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
scores
{'test_r2': 0.85193999309906,
'test_neg_root_mean_squared_error': -0.08866673508671732,
'test_neg_mean_absolute_error': -0.06389402697447671,
'test_neg_mean_absolute_percentage_error': -0.01159143201926165}
We can see that adding postcode sector features is positive. Here's the updated performance table:
# Record the postcode-sector run in the results table.
for metric_key, metric_label in zip(scoring, score_names):
    results_df.at['XGB - Outliers + Feat Opt + Postcode Sec', metric_label] = scores[f'test_{metric_key}']
results_df
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| Naive | 0.459261 | -0.171257 | -0.133765 | -0.024356 |
| LR | 0.600303 | -0.147170 | -0.113796 | -0.020705 |
| DT | 0.632995 | -0.140943 | -0.095719 | -0.017396 |
| RF | 0.799147 | -0.104267 | -0.073076 | -0.013279 |
| XGB | 0.820373 | -0.098588 | -0.070993 | -0.012922 |
| KNN | 0.657843 | -0.136155 | -0.101451 | -0.018482 |
| XGB - Outliers | 0.834264 | -0.093816 | -0.069136 | -0.012557 |
| XGB - Outliers + Feat Opt. | 0.845432 | -0.090608 | -0.065819 | -0.011945 |
| XGB - Outliers + Feat Opt + Postcode Sec | 0.851940 | -0.088667 | -0.063894 | -0.011591 |
We now turn to potentially further improving the model performance by tuning the hyper-parameters of the XG-Boost regressor. I usually take an iterative approach to this depending on how much compute / time I want to throw at the problem. I usually start, however, by taking the approach outlined in the paper 'Random Search for Hyper-Parameter Optimization' by Bergstra & Bengio (2012). It is much more efficient at searching a hitherto unknown optimisation space with multiple variables than, say, a brute-force grid search.
# Target and feature matrix for hyper-parameter tuning: train / validation
# data with the optimised features plus the postcode-sector one-hots.
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values
%%time
# This cell takes hour(s) to run so skip and load pre-run in next cell if already run
# XGBoost hyper-parameter tuning
result_list = []
for trial in range(1000):
learning_rate = random.uniform(0.1, 0.3)
max_depth = random.randint(5, 21)
min_child_weight = random.randint(1, 10)
max_delta_step = random.randint(0, 10)
subsample = random.uniform(0.3, 1.0)
l2 = random.randint(1, 10)
model = XGBRegressor(learning_rate = learning_rate,
max_depth = max_depth, min_child_weight=min_child_weight,
max_delta_step=max_delta_step, subsample=subsample,
reg_lambda=l2)
test_scores, train_scores = get_test_scores(model, X, y, scoring, num_rounds=10)
result_list.append({'learning_rate':learning_rate,
'max_depth':max_depth,
'min_child_weight':min_child_weight,
'max_delta_step':max_delta_step,
'subsample':subsample,
'l2':l2 }
| test_scores
| train_scores
)
# print('Trial: ', trial, 'Test R^2: ', test_scores['test_r2'])
hyper_df = pd.DataFrame(result_list)
hyper_df.to_csv('blog-hyper-parameter-tuning.csv')
CPU times: total: 8d 20h 34min 21s Wall time: 14h 12min 52s
# Load the pre-computed tuning results and rank the trials by mean test R^2.
hyper_df = pd.read_csv('blog-hyper-parameter-tuning.csv', index_col=0)
hyper_df.sort_values(by='test_r2', ascending=False)
| learning_rate | max_depth | min_child_weight | max_delta_step | subsample | l2 | test_r2 | test_neg_root_mean_squared_error | test_neg_mean_absolute_error | test_neg_mean_absolute_percentage_error | train_r2 | train_neg_root_mean_squared_error | train_neg_mean_absolute_error | train_neg_mean_absolute_percentage_error | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 149 | 0.101028 | 20 | 5 | 4 | 0.773805 | 3 | 0.863260 | -0.085212 | -0.060920 | -0.011044 | 0.959900 | -0.046269 | -0.033455 | -0.006081 |
| 836 | 0.119428 | 20 | 5 | 10 | 0.802562 | 2 | 0.862890 | -0.085320 | -0.060635 | -0.010994 | 0.971459 | -0.039032 | -0.028280 | -0.005147 |
| 470 | 0.139690 | 21 | 4 | 0 | 0.797019 | 5 | 0.862810 | -0.085356 | -0.060549 | -0.010977 | 0.976561 | -0.035371 | -0.025250 | -0.004594 |
| 516 | 0.122226 | 19 | 4 | 10 | 0.720593 | 9 | 0.862735 | -0.085380 | -0.060962 | -0.011051 | 0.958067 | -0.047315 | -0.033714 | -0.006121 |
| 968 | 0.105230 | 21 | 8 | 9 | 0.797111 | 3 | 0.862684 | -0.085395 | -0.061217 | -0.011098 | 0.953915 | -0.049604 | -0.035939 | -0.006529 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 820 | 0.114304 | 6 | 10 | 1 | 0.760050 | 9 | 0.814053 | -0.099417 | -0.073993 | -0.013434 | 0.843786 | -0.091335 | -0.068281 | -0.012407 |
| 182 | 0.132569 | 5 | 1 | 1 | 0.752993 | 7 | 0.813523 | -0.099559 | -0.074393 | -0.013508 | 0.850650 | -0.089306 | -0.067233 | -0.012227 |
| 735 | 0.116081 | 5 | 1 | 2 | 0.364070 | 9 | 0.813519 | -0.099560 | -0.074365 | -0.013498 | 0.842695 | -0.091653 | -0.068871 | -0.012514 |
| 34 | 0.101536 | 5 | 3 | 2 | 0.512211 | 4 | 0.813195 | -0.099644 | -0.074717 | -0.013568 | 0.842008 | -0.091853 | -0.069337 | -0.012606 |
| 250 | 0.103713 | 5 | 10 | 8 | 0.370739 | 10 | 0.808812 | -0.100817 | -0.075356 | -0.013677 | 0.829182 | -0.095509 | -0.071478 | -0.012980 |
1000 rows × 14 columns
# Score the model with XGBoost's library-default hyper-parameters so the
# tuned result can be compared against an untuned baseline.
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values
defvals = {'learning_rate': 0.3,
           'max_depth': 6,
           'min_child_weight': 1,
           'max_delta_step': 0,
           'subsample': 1,
           'l2': 1,
           }
learning_rate = defvals['learning_rate']
max_depth = defvals['max_depth']
min_child_weight = defvals['min_child_weight']
max_delta_step = defvals['max_delta_step']
subsample = defvals['subsample']
l2 = defvals['l2']
model = XGBRegressor(learning_rate=learning_rate,
                     max_depth=max_depth,
                     min_child_weight=min_child_weight,
                     max_delta_step=max_delta_step,
                     subsample=subsample,
                     reg_lambda=l2)
default_test_scores, default_train_scores = get_test_scores(model, X, y, scoring)
# Pull the best trial (highest mean test R^2) out of the random search and
# re-score it so it can be plotted against the default baseline.
optvals = hyper_df.sort_values('test_r2', ascending=False).iloc[0]
learning_rate = optvals['learning_rate']
# randint-drawn parameters come back as floats from the CSV; cast to int
max_depth = int(optvals['max_depth'])
min_child_weight = int(optvals['min_child_weight'])
max_delta_step = int(optvals['max_delta_step'])
subsample = optvals['subsample']
l2 = int(optvals['l2'])
model = XGBRegressor(
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    max_delta_step=max_delta_step,
    subsample=subsample,
    reg_lambda=l2,
)
optimum_test_scores, optimum_train_scores = get_test_scores(model, X, y, scoring)
# Show the winning trial's hyper-parameters and scores
optvals
learning_rate 0.101028 max_depth 20.000000 min_child_weight 5.000000 max_delta_step 4.000000 subsample 0.773805 l2 3.000000 test_r2 0.863260 test_neg_root_mean_squared_error -0.085212 test_neg_mean_absolute_error -0.060920 test_neg_mean_absolute_percentage_error -0.011044 train_r2 0.959900 train_neg_root_mean_squared_error -0.046269 train_neg_mean_absolute_error -0.033455 train_neg_mean_absolute_percentage_error -0.006081 Name: 149, dtype: float64
# One scatter panel per hyper-parameter: sampled value vs mean test R^2.
# Red markers show the tuned optimum and the library-default settings.
plt.figure(figsize=(10,10))
for panel, col in enumerate(hyper_df.columns[:6], start=1):
    plt.subplot(3, 2, panel)
    plt.scatter(hyper_df[col], hyper_df['test_r2'])
    plt.plot(optvals[col], optimum_test_scores['test_r2'], marker='o', color='red')
    plt.plot(defvals[col], default_test_scores['test_r2'], marker='o', color='red')
    plt.xlabel(col)
    plt.ylabel('R^2')
    plt.grid()
    plt.title(col + ' Opt. Val: ' + str(np.round(optvals[col],3)))
plt.tight_layout()
plt.show()
# Train vs test score per metric: shows the degree of over-fitting for each
# trial. Red markers show the tuned optimum and the default settings.
plt.figure(figsize=(10,8))
for panel, score in enumerate(scoring, start=1):
    plt.subplot(2, 2, panel)
    plt.scatter(hyper_df['train_' + score], hyper_df['test_' + score])
    plt.plot(optvals['train_' + score], optvals['test_' + score], marker='o', color='red')
    plt.plot(default_train_scores['train_' + score], default_test_scores['test_' + score], marker='o', color='red')
    plt.grid()
    plt.xlabel('Train ' + score)
    plt.ylabel('Test ' + score)
    plt.title(score)
plt.tight_layout()
plt.show()
# Record the hyper-tuned run in the results table.
for metric_key, metric_label in zip(scoring, score_names):
    results_df.at['XGB - Outliers + Feat Opt + Postcode Sec + HyperTune', metric_label] = optimum_test_scores[f'test_{metric_key}']
results_df
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| Naive | 0.459261 | -0.171257 | -0.133765 | -0.024356 |
| LR | 0.600303 | -0.147170 | -0.113796 | -0.020705 |
| DT | 0.632995 | -0.140943 | -0.095719 | -0.017396 |
| RF | 0.799147 | -0.104267 | -0.073076 | -0.013279 |
| XGB | 0.820373 | -0.098588 | -0.070993 | -0.012922 |
| KNN | 0.657843 | -0.136155 | -0.101451 | -0.018482 |
| XGB - Outliers | 0.834264 | -0.093816 | -0.069136 | -0.012557 |
| XGB - Outliers + Feat Opt. | 0.845432 | -0.090608 | -0.065819 | -0.011945 |
| XGB - Outliers + Feat Opt + Postcode Sec | 0.851940 | -0.088667 | -0.063894 | -0.011591 |
| XGB - Outliers + Feat Opt + Postcode Sec + HyperTune | 0.863260 | -0.085212 | -0.060920 | -0.011044 |
We can see that through the process of outlier elimination, feature selection optimisation and hyper-parameter tuning, we have improved the cross-validation metrics across the board. It's now time to evaluate our model on the unseen data we set aside at the very beginning of the process.
# Fit the model with hyper-tuned parameters to all the train / validation data
# (cross-validation only ever trained on 90% of it at a time)
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values
xgb = model.fit(X, y)
# Transform the unseen test set with the scaler originally fitted to the train / validation set
std_unseen_test_df = unseen_test_df.copy()
std_unseen_test_df[nb_feature_names] = scaler.transform(std_unseen_test_df[nb_feature_names])
# Predict the unseen test set values using the fitted model
unseen_test_y = std_unseen_test_df['log_price'].values
unseen_test_X = std_unseen_test_df[features_with_postcode_sectors].values
preds = xgb.predict(unseen_test_X)
# Negate RMSE / MAE / MAPE so they follow the sign convention of sklearn's
# neg_* CV scorers used for every other row of results_df.
results_df.at['Unseen Test Set','R^2'] = r2_score(unseen_test_y, preds)
results_df.at['Unseen Test Set','RMSE'] = -np.sqrt(mean_squared_error(unseen_test_y, preds))
results_df.at['Unseen Test Set','MAE'] = -mean_absolute_error(unseen_test_y, preds)
# BUG FIX: the minus sign was missing here, so MAPE was stored with the
# opposite sign to every other row in the table.
results_df.at['Unseen Test Set','MAPE'] = -mean_absolute_percentage_error(unseen_test_y, preds)
results_df
| R^2 | RMSE | MAE | MAPE | |
|---|---|---|---|---|
| Naive | 0.459261 | -0.171257 | -0.133765 | -0.024356 |
| LR | 0.600303 | -0.147170 | -0.113796 | -0.020705 |
| DT | 0.632995 | -0.140943 | -0.095719 | -0.017396 |
| RF | 0.799147 | -0.104267 | -0.073076 | -0.013279 |
| XGB | 0.820373 | -0.098588 | -0.070993 | -0.012922 |
| KNN | 0.657843 | -0.136155 | -0.101451 | -0.018482 |
| XGB - Outliers | 0.834264 | -0.093816 | -0.069136 | -0.012557 |
| XGB - Outliers + Feat Opt. | 0.845432 | -0.090608 | -0.065819 | -0.011945 |
| XGB - Outliers + Feat Opt + Postcode Sec | 0.851940 | -0.088667 | -0.063894 | -0.011591 |
| XGB - Outliers + Feat Opt + Postcode Sec + HyperTune | 0.863260 | -0.085212 | -0.060920 | -0.011044 |
| Unseen Test Set | 0.866544 | -0.087153 | -0.060425 | 0.010997 |
We can see that the scoring on the unseen test set is very close to the cross-validation results, which is reassuring. We would perhaps expect a model trained on all the training / validation data to be better than one trained on only 90% of it (as is the case in the cross-validation). Where we are on the performance vs data volume curve can be estimated as follows:
# Estimate where we sit on the performance vs data-volume curve by
# cross-validating the tuned model on progressively larger random subsets.
xvals = []
yvals = []
for num_samples in range(500, len(tv_df), 100):
    sample_df = tv_df.sample(num_samples)
    y = sample_df['log_price'].values
    X = sample_df[features_with_postcode_sectors].values
    score = cross_val_score(model, X, y, cv=cv).mean()
    xvals.append(num_samples)
    yvals.append(score)
# All the data
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values
score = cross_val_score(model, X, y, cv=cv).mean()
# BUG FIX: this final point uses the full dataset, so plot it at len(tv_df);
# the original re-appended the last loop value of num_samples, placing the
# full-data score at the wrong x position.
xvals.append(len(tv_df))
yvals.append(score)
plt.plot(xvals, yvals)
plt.grid()
plt.xlabel('Num Samples')
plt.ylabel('R^2')
plt.title('XGBoost R^2 vs Number of Data Points')
plt.show()
We can see that as we add more data we are still improving the model performance in the order of the difference we see between a model trained on 90% vs 100% of the training / validation data. For example, we are in the region of the curve where adding, say, 1000 data points will improve the R^2 score around 0.5%-1%.
We now release a final model, trained on all the available data. First we need to standardise all the data in the non-binary features. Recall that the dataframe that contains all the data (training / validation and the unseen test set) was simply called df:
# Make a deep copy so we keep the original df intact when we standardise
final_df = df.copy()
# Standardise the numerical (non-binary) features for the final model (some were dropped during optimisation)
final_nb_feature_names = ["beds","baths","latitude","longitude"]
final_scaler = StandardScaler().set_output(transform="pandas")
# BUG FIX: assign back to the same reduced column list that was transformed.
# The original assigned to nb_feature_names (the pre-optimisation, larger
# list), which no longer matches the 4 transformed columns.
final_df[final_nb_feature_names] = final_scaler.fit_transform(final_df[final_nb_feature_names])
y = final_df['log_price'].values
X = final_df[features_with_postcode_sectors].values
# Instantiate an XGBoost regressor with the optimum hyper-parameters found
# during the random search, then fit it on every available data point.
learning_rate = optvals['learning_rate']
max_depth = int(optvals['max_depth'])
min_child_weight = int(optvals['min_child_weight'])
max_delta_step = int(optvals['max_delta_step'])
subsample = optvals['subsample']
l2 = int(optvals['l2'])
final_model = XGBRegressor(
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    max_delta_step=max_delta_step,
    subsample=subsample,
    reg_lambda=l2,
)
final_xgb_model = final_model.fit(X, y)
OK, so now we've built a model, let's put it to some use. We'll build an interactive property value predictor based on full postcodes. We'll take the dataframe that contains all the postcodes and their locations (longitude / latitude) and then we'll add the remaining model features to describe a generic two-bedroom, 1 bath flat (as an example). We'll then standardise using the scaler and run this through the fitted model. The output will be a prediction for a two bed, 1 bath flat at postcode resolution for the whole of the Edinburgh area. We'll render this on an interactive map using Bokeh. First, we need our postcode dataframe:
# Base prediction frame: one row per postcode, with WGS84 (longitude/latitude)
# and Web-Mercator (wm-eastings/wm-northings) coordinates for mapping
postcode_predictor_df = postcodes_df[['Postcode','longitude','latitude','wm-eastings','wm-northings']].copy()
postcode_predictor_df.head(1)
| Postcode | longitude | latitude | wm-eastings | wm-northings | |
|---|---|---|---|---|---|
| 0 | EH1 1AD | -3.192588 | 55.948899 | -355397.470027 | 7.548249e+06 |
We'll add in all the required features for the model and assign the values for a 2-bed, 1 bath flat:
# Recap the model's optimised feature list (excluding postcode-sector one-hots)
remaining_feature_names
['beds', 'baths', 'latitude', 'longitude', 'is_house', 'new_build', 'is_terraced', 'is_detached', 'is_townhouse', 'is_fixed_price']
# Describe a generic 2-bed, 1-bath flat at every postcode location
for feature, value in [('beds', 2),
                       ('baths', 1),
                       ('is_house', 0),
                       ('new_build', 0),
                       ('is_terraced', 0),
                       ('is_detached', 0),
                       ('is_townhouse', 0),
                       ('is_fixed_price', 0)]:
    postcode_predictor_df[feature] = value
# One hot encode the postcode sector (outcode plus first character of the incode)
postcode_predictor_df['postcode_sector'] = postcode_predictor_df['Postcode'].apply(
    lambda pc: pc.split(' ')[0] + ' ' + pc.split(' ')[-1][0])
postcode_predictor_df = postcode_predictor_df.join(pd.get_dummies(postcode_predictor_df['postcode_sector']))
Next we standardise the non-binary features with the final model scaler and then feed the entire feature set into the final model to obtain a prediction (in £k) for each postcode:
# Standardise the non-binary features with the final fitted scaler
postcode_predictor_df[final_nb_feature_names] = final_scaler.transform(
    postcode_predictor_df[final_nb_feature_names])
# Form the feature matrix and predict for every postcode
X = postcode_predictor_df[features_with_postcode_sectors].values
preds = final_xgb_model.predict(X)
# Convert the log-price prediction to a whole-number value in £k
# (assumes log_price is log10 of the price in £ — the -3.0 rescales £ to £k)
postcode_predictor_df['predicted_value'] = (10 ** (preds - 3.0)).astype('int')
Finally, we can render a map with glyphs for each postcode. The colour of the glyph represents the predicted value according to the colour mapper on the right. This chart is interactive. Use the controls on the right to zoom in and move around the map. Hover over any one of the glyphs to see the predicted value in £ for that postcode. Those that are familiar with the Edinburgh area will note that the model has represented the value of a 2-bed, 1-bath flat quite well across the various districts of the city.
# Render an OSM in Bokeh using x-y range values for Edinburgh area (determined previously)
p = figure(x_range=(-391222,-321222), y_range=(7501548, 7571548),
           x_axis_type="mercator", y_axis_type="mercator",
           height=400, width=800)
# Add the OSM
p.add_tile("OpenStreetMap Mapnik")
source = ColumnDataSource(postcode_predictor_df[['Postcode','wm-eastings','wm-northings','predicted_value']])
# Clip the top of the colour scale at 80% of the max so a few expensive
# outliers don't wash out the colour contrast across the rest of the city.
lcval = postcode_predictor_df['predicted_value'].min()
ucval = postcode_predictor_df['predicted_value'].max() * 0.8
# BUG FIX: ucval already carries the 0.8 factor; the original multiplied by
# 0.8 again here, clipping the colour scale at 64% of the max instead of 80%.
mapper = linear_cmap('predicted_value', bp.Plasma256, lcval, ucval)
glyph = Scatter(x='wm-eastings', y='wm-northings', fill_color=mapper, fill_alpha=0.6, size=4, line_color=mapper, line_alpha=0)
g1 = p.add_glyph(source, glyph)
color_bar = ColorBar(color_mapper=mapper['transform'], location=(0,0), title='Predicted Price £k')
p.add_layout(color_bar, 'right')
hover = HoverTool(renderers=[g1],tooltips = [('Postcode','@Postcode'),
                                             ('Est. Value','£@predicted_value k'),
                                             ])
p.add_tools(hover)
show(p)
Another use case for the model would be to predict the value for each home that is on the market. In this case we would use leave-one-out cross-validation (LOO-CV) to obtain the predicted value for each property for when it was held out. In practice, I have found that it's not necessary to go to the extreme of LOO-CV for this dataset / model - a 10-fold cross-validation repeated 10 times and averaged provides a reasonable, low-variance estimate of the values equivalent to LOO-CV. An example of this use case is shown in the final dashboard for this project.
# jupyter nbconvert --to html --no-prompt adsg_house_hunting2_part5.ipynb