import sqlite3
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from pyproj import Transformer
import pyproj
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import statsmodels.api as sm
from sklearn.inspection import permutation_importance
import random
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from pickle import dump
import sklearn
import xgboost
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.transform import linear_cmap
import bokeh.palettes as bp
from bokeh.models import Scatter
from bokeh.models import ColorBar
from bokeh.models import HoverTool
from bokeh.io import show
from bokeh.io import output_notebook
output_notebook(hide_banner=True)


print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))

pandas==2.0.3
numpy==1.26.0
pyproj==3.6.1
statsmodels.api==0.14.0
sklearn==1.3.1
xgboost==1.7.3


database_path = 'databases/forsale2_hashed.db'


# df is our main feature / target dataframe (from the propery addresses table) to which we will add various additional features
# Here we are loading all the data in the databse tables. 
# If it gets too large we can limit to last X entries using DESC LIMIT X
con = sqlite3.connect(database_path)
df = pd.read_sql("SELECT * FROM addresses NATURAL JOIN dates", con, index_col='pid')
prices_df = pd.read_sql("SELECT * FROM prices NATURAL JOIN dates", con, index_col='pid')
status_df = pd.read_sql("SELECT * FROM status NATURAL JOIN dates", con, index_col='pid')
con.close()

# Impute 1 bath where baths is missing ('None' in database)
df['baths'] = np.where(df['baths'] == 'None', 1, df['baths'])

# Convert to numerical
df['beds'] = df['beds'].astype('int')
df['baths'] = df['baths'].astype('int')
df['latitude'] = df['latitude'].astype('float')
df['longitude'] = df['longitude'].astype('float')

# Impute 'Fixed(NP)' where the price type wasn't entered ('' in database)
prices_df['price_type'] = np.where(prices_df['price_type'] == '', 'Fixed(NP)', prices_df['price_type'])

# Get the last price and price type and add to the main df
df['current_price_type'] = prices_df.groupby('pid').tail(1)['price_type']
df['current_price'] = prices_df.groupby('pid').tail(1)['price']

# Drop any prices not convertible to numerical (they are of no use to us anyway), and convert
print('Number of properties before removing invalid prices:', len(df))
df = df[df['current_price'].str.isdigit()]
df['current_price'] = df['current_price'].astype('int')
print('Number of properties after removing invalid prices:', len(df))

# Replace term 'Active' with 'For Sale' - Active has a broader context here and includes 'Under Offer' etc
status_df['status'] = np.where(status_df['status'] == 'Active', 'For Sale', status_df['status'])

# Get last status and add to main df
df['current_status'] = status_df.groupby('pid').tail(1)['status']

# Database stores longitude / latitude property location
# We need Web Mercator since the purpose is to plot them on an opensource map.
origin_shift = np.pi * 6378137
df['eastings'] = df['longitude'] * origin_shift / 180.0
df['northings'] = np.log(np.tan((90 + df['latitude']) * np.pi / 360.0)) * origin_shift / np.pi

Number of properties before removing invalid prices: 7670
Number of properties after removing invalid prices: 7650


# Add a higher level house / flat catagorical feature - dwelling_cat
# Human labelling here as many terms are used in the listings
flats_str = 'Flat|Apartment|Duplex|Ground Flat|Maisonette|Studio|Penthouse|Ground Maisonette|Triplex'

houses_str = 'Town House|Mews|Terraced|Semi-Detached|End of Terrace|Detached|House|Semi-Detached Bungalow|Detached Villa| \
              Detached Bungalow|Villa|Bungalow|Terraced Bungalow|Cottage|Link Detached House|Semi-detached Villa| \
              Barn Conversion|Lodge|Character Property'

# If it's not a Flat, test if it's a House and if not, it's an 'Other'
df['dwelling_cat'] = np.where(df['dwelling_subtype'].str.contains(flats_str), 'Flat', \
            np.where(df['dwelling_subtype'].str.contains(houses_str), 'House', 'Other'))

# Add some one-hot features around building category / sub-type
df['is_house'] = np.where(df['dwelling_cat'] == 'House', 1, 0)
df['is_flat'] = np.where(df['dwelling_cat'] == 'Flat', 1, 0)
df['is_semi_detached'] = np.where(df['dwelling_subtype'].str.contains('Semi-Detached|Semi-detached'), 1, 0)
df['is_terraced'] = np.where(df['dwelling_subtype'].str.contains('Terraced|End of Terrace'), 1, 0)
df['is_detached'] = np.where((df['dwelling_subtype'] == 'Detached') | 
                             (df['dwelling_subtype'].str.contains('Detached Bungalow|Detached Villa')),1, 0)
df['is_apartment'] = np.where(df['dwelling_subtype'] == 'Apartment', 1, 0)
df['is_ground_flat'] = np.where(df['dwelling_subtype'] == 'Ground Flat', 1, 0)
df['is_townhouse'] = np.where(df['dwelling_subtype'] == 'Town House',1 , 0)    
    
# Add a days on market (dom) feature - round to nearest day (12 hours)
df['dom'] = ((pd.to_datetime(df['dt_last_seen']) - 
              pd.to_datetime(df['dt_first_listed'].str.strip('Z').str.replace('T',' '))) + 
             datetime.timedelta(hours=12)).dt.days

# Sale Price features
df['is_fixed_price'] = np.where(df['current_price_type'].str.contains('Fixed'), 1, 0)
df['price_changes'] = prices_df.groupby('pid').count()['price'] - 1

# Sale Status features
df['is_under_offer'] = np.where(df['current_status'].str.contains('offer'), 1, 0)
df['is_sold'] = np.where(df['current_status'].str.contains('Sold'), 1, 0)
df['is_forsale'] = np.where(df['current_status'].str.contains('For Sale'), 1, 0)
df['status_changes'] = status_df.groupby('pid').count()['status'] - 1

# m-factor is 'motivation-factor' described in part 4
df['m_factor'] = np.where(df['dom'] > 0, np.round(100 * df['price_changes'] / df['dom'],2), 0)

# The target (Log10 Price)
df['log_price'] = np.log10(df['current_price'])


# Helper function for converting from listing location in easting / northing coordinates to a full postcode
def lookup_postcode_from_eastings_northings(easting, northing, postcodes_df):
    # Euclidean distance to all postcodes
    postcodes_df['distance'] = np.sqrt((postcodes_df['wm-northings']-northing)**2 + (postcodes_df['wm-eastings']-easting)**2)
    
    # Find the smallest distance
    idx_min = postcodes_df['distance'].idxmin()
    postcode = postcodes_df.iloc[idx_min]['Postcode']

    return postcode


# Look up the postcode from lat / lon
# LOAD APPROPRIATE POSTCODE FILE

# Load the CodePoint file and add Lon/Lat conversion to it (from Eastings/Northings)
# Source: https://osdatahub.os.uk/downloads/open/CodePointOpen
# Attribution: Contains public sector information licensed under the Open Government Licence v3.0.
colnames = ['Postcode','Positional_quality_indicator','Eastings','Northings','Country_code','NHS_regional_HA_code','NHS_HA_code','Admin_county_code','Admin_district_code','Admin_ward_code']

postcodes_df = pd.read_csv('codepo_gb\\Data\\CSV\\eh.csv', names=colnames)    
postcodes_df = postcodes_df[postcodes_df['Postcode'].str.contains('EH1|EH2|EH3 |EH4 |EH5 |EH6 |EH7 |EH8 |EH9 |EH30|EH31|EH32')]

# Just retain postcode and Eastings/Northings columns
postcodes_df = postcodes_df[['Postcode','Eastings','Northings']]
    
postcodes_df.reset_index(inplace=True, drop=True)

# Add the OSGB36 / EPSG27700 Eastings/Northings to Web Mercator (WGS84) Lon/Lat conversion

# Lat/Long Web Mercator projection (WGS84)
v84 = pyproj.Proj(proj="latlong", towgs84="0,0,0", ellps="WGS84")

# OSGB36 (EPSG:27700) projection
v36 = pyproj.Proj(proj="latlong", k=0.9996012717, ellps="airy",
    # Datum transformation to WGS84
    towgs84="446.448, -125.157, 542.060, 0.1502, 0.2470, 0.8421,-20.4894")

# UK National Grid is EPSG:27700 or OSGB36
vgrid = pyproj.Proj("EPSG:27700")

# Converts OSGB36 (EPSG:27700) Eastings / Northings to OSGB36 longitude / latitude
vlon36, vlat36 = vgrid(postcodes_df['Eastings'].values, postcodes_df['Northings'].values, inverse=True)
    
# Convert lat/lon from OSGB36 to Web Mercator lat/lon using a projection defn (see above)
transformer = pyproj.Transformer.from_proj(v36, v84)
converted = transformer.transform(vlon36, vlat36)
    
postcodes_df['longitude'] = np.round(converted[0],6)
postcodes_df['latitude'] = np.round(converted[1],6)
    
# Convert UK Eastings / Northings to web mercator Easting / Northings
transformer = pyproj.Transformer.from_crs("EPSG:27700", "EPSG:3857")
converted = transformer.transform(postcodes_df['Eastings'], postcodes_df['Northings'])
    
postcodes_df['wm-eastings'] = np.round(converted[0],6)
postcodes_df['wm-northings'] = np.round(converted[1],6)
    
df['postcode_full'] = df.apply(lambda x: lookup_postcode_from_eastings_northings(x['eastings'], x['northings'], postcodes_df), axis=1)
df['postcode_district'] = [x.split(' ')[0] for x in df['postcode_full']]
df['postcode_sector'] = [x.split(' ')[0] + ' ' + x.split(' ')[-1][0] for x in df['postcode_full']]

# One hot encode the area postcode districts and sectors
postcode_districts_df = pd.get_dummies(df['postcode_district'])
postcode_district_feature_names = postcode_districts_df.columns
df = df.join(postcode_districts_df)

postcode_sectors_df = pd.get_dummies(df['postcode_sector'])
postcode_sector_feature_names = postcode_sectors_df.columns
df = df.join(postcode_sectors_df)


print('Number of properties before removing \"Others\":', len(df))

df = df[df['dwelling_cat'] != 'Other']

print('Number of properties after removing \"Others\":', len(df))

Number of properties before removing "Others": 7650
Number of properties after removing "Others": 7474


# non-binary features (to be standardised)
nb_feature_names = ["beds","baths","latitude","longitude","dom","price_changes","status_changes","m_factor"]
df[nb_feature_names].describe()


pd.plotting.scatter_matrix(df[nb_feature_names], figsize=(14,14))
plt.show()


print('Number of properties before cleaning:', len(df))

# Remove the incorrect / outlier lat/long entries - they are well outside the Edinburgh area
df = df[(df['latitude'] >= 55.7) & (df['latitude'] <= 56.1)]
df = df[(df['longitude'] >= -3.6) & (df['longitude'] <= -2.6)]

# Remove the baths outlier (it has over 100 baths!)
df = df[df['baths'] <= 10]
    
print('Number of properties after cleaning:', len(df))

Number of properties before cleaning: 7474
Number of properties after cleaning: 7459


pd.plotting.scatter_matrix(df[nb_feature_names], figsize=(14,14))
plt.show()


tv_df, unseen_test_df = train_test_split(df, test_size=0.2, random_state=42)


# Standardise the numerical features
nb_feature_names = ["beds","baths","latitude","longitude","dom","price_changes","status_changes","m_factor"]
scaler = StandardScaler().set_output(transform="pandas")
tv_df[nb_feature_names] = scaler.fit_transform(tv_df[nb_feature_names])


tv_df[nb_feature_names].describe()


def get_test_scores(model, X, y, scoring, num_rounds=10):
    
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()

    for i in range(num_rounds):
        cv = KFold(n_splits=10, random_state=i, shuffle=True)
        
        scores = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=True)
        
        for score, score_name in zip(scoring, score_names):
            test_df.at[i, 'test_' + score] = scores['test_' + score].mean()            
            train_df.at[i, 'train_' + score] = scores['train_' + score].mean()          

    return test_df.mean(axis=0).to_dict(), train_df.mean(axis=0).to_dict()


scoring = ['r2','neg_root_mean_squared_error','neg_mean_absolute_error', 'neg_mean_absolute_percentage_error']
score_names = ['R^2', 'RMSE', 'MAE', 'MAPE']


results_df = pd.DataFrame()

# Let's start with a naive prediction - linear regression with just the number of bedrooms
y = tv_df['log_price'].values
X = tv_df[['beds']].values

scores, _ = get_test_scores(LinearRegression(), X, y, scoring)

for score, score_name in zip(scoring, score_names):
    results_df.at['Naive',score_name] = scores['test_' + score]

display(results_df)


onehot_feature_names = [
    "is_house",
    "new_build",
    "is_terraced",
    "is_semi_detached",
    "is_detached",
    "is_apartment",
    "is_ground_flat",
    "is_townhouse",
    "is_fixed_price",
    "is_under_offer",
    "is_sold"
]


all_feature_names = nb_feature_names + onehot_feature_names


# Let's evaluate some regressors with all the features
y = tv_df['log_price'].values
X = tv_df[all_feature_names].values

regressors = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(), XGBRegressor(), KNeighborsRegressor()]
 
regressor_names = ['LR','DT','RF','XGB','KNN']
for regressor, regressor_name in zip(regressors, regressor_names):
    scores, _ = get_test_scores(regressor, X, y, scoring)
    for score, score_name in zip(scoring, score_names):
        results_df.at[regressor_name, score_name] = scores['test_' + score]

display(results_df)


def plot_residuals_analysis(X, y, preds):

    residuals = y - preds
    residuals_mean = np.mean(residuals)
    residuals_std = np.std(residuals)
    residuals_standardised = (residuals - residuals_mean) / residuals_std
    
    # Coefficient of Determination (R^2)
    # The ratio (percentage if x100) of the variation in target explained by the explanatory variables
    r2 = np.round(1.0 - (np.sum((y - preds)**2) / np.sum( ( y - np.mean(y) )**2 )),3)
    
    plt.figure(figsize=(12,12))

    # Predicted vs Observed
    plt.subplot(2,2,1)
    plt.scatter(y, preds)
    plt.plot([4.5,7],[4.5,7], color='red')
    plt.xlabel('Observed')
    plt.ylabel('Predicted')
    plt.title('Predicted vs Observed. R^2: ' + str(r2))
    plt.grid()

    # Residuals vs Predicted
    plt.subplot(2,2,2)
    plt.scatter(preds, residuals_standardised)
    plt.xlabel('Predicted')
    plt.ylabel('Residuals Standardised')
    plt.grid()
    plt.title('Residuals vs Predicted')
    
    # Residuals Histogram
    plt.subplot(2,2,3)
    bins = np.arange(-4,4,0.08)
    a, b = np.histogram(residuals_standardised, bins=bins)
    plt.bar(b[:99], a, width=0.08)
    plt.grid()
    plt.xlabel('Standardised Residuals')
    plt.ylabel('Frequency')
    plt.title('Standardised Residuals Histogram')
#    plt.plot(bins, scipy.stats.norm(np.mean(residuals_standardised), np.std(residuals_standardised)).pdf(bins), color='red')
    
    # QQ Plot
    ax = plt.subplot(2,2,4)
    sm.qqplot(residuals_standardised, line ='45', ax=ax)
    plt.xlim(-5,5)
    plt.ylim(-7.5,7.5)
    plt.grid()
    plt.title('Standardised Residuals Q-Q Plot')

    plt.tight_layout()
    plt.show()


cv = KFold(n_splits=10, random_state=42, shuffle=True)
preds = cross_val_predict(XGBRegressor(), X, y, cv=cv)
plot_residuals_analysis(X, y, preds)


tv_df['preds'] = 10**preds
tv_df['pct_err'] = 100*(tv_df['preds'] - tv_df['current_price'])/tv_df['current_price']


# Remove the distressed sales
tv_df = tv_df[tv_df['log_price'] >= 4.82]
# Removing this just by reference to it's position in the table but in reality we would refer to it's actual ID or address
tv_df = tv_df.sort_values('pct_err', ascending=True).iloc[1:]


y = tv_df['log_price'].values
X = tv_df[all_feature_names].values
cv = KFold(n_splits=10, random_state=42, shuffle=True)
preds = cross_val_predict(XGBRegressor(), X, y, cv=cv)
plot_residuals_analysis(X, y, preds)


y = tv_df['log_price'].values
X = tv_df[all_feature_names].values

scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
for score, score_name in zip(scoring, score_names):
    results_df.at['XGB - Outliers', score_name] = scores['test_' + score]


results_df


# Let's get the feature importances with XGBoost regressor
y = tv_df['log_price'].values
X = tv_df[all_feature_names].values

model = XGBRegressor()
model.fit(X, y)
importance = model.feature_importances_
fi_df = pd.DataFrame()
fi_df['Feature'] = all_feature_names
fi_df['Importance'] = np.round(importance,3)
fi_df = fi_df.sort_values('Importance', ascending=False)


fi_df


# Start with the first most important feature, evaluate the metrics then add add the other features one by one
fs_df = pd.DataFrame()
for idx in range(1, len(fi_df)+1):
    sub_feature_names = fi_df.iloc[:idx]['Feature'].values
    
    y = tv_df['log_price'].values
    X = tv_df[sub_feature_names].values

    scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
    for score, score_name in zip(scoring, score_names):
        fs_df.at[sub_feature_names[-1], score_name] = scores['test_' + score]


fs_df


plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.plot(fs_df.index.values, fs_df['R^2'])
plt.xticks(rotation=90)
plt.title('Coefficient of Determination vs Feature Set')
plt.xlabel('Incremental Feature Addition')
plt.ylabel('R^2')
plt.grid()

plt.subplot(1,2,2)
plt.plot(fs_df.index.values, fs_df['RMSE'], label='RMSE')
plt.plot(fs_df.index.values, fs_df['MAE'], label='MAE')
plt.plot(fs_df.index.values, fs_df['MAPE'], label='MAPE')
plt.xticks(rotation=90)
plt.title('Cross-Validation Error Metrics vs Feature Set')
plt.xlabel('Incremental Feature Addition')
plt.ylabel('Metric Value')
plt.legend()
plt.grid()

plt.show()


# Start with all the features, remove one, evaluate scores and decide whether to keep or not. Repeat with remaining.

metric_name = 'test_r2'
    
not_done =  True
remaining_feature_names = all_feature_names.copy()

while not_done:
    y = tv_df['log_price'].values
    X = tv_df[remaining_feature_names].values

    all_features_scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)

    print('Num remaining features:', len(remaining_feature_names))
    print('Score with remaining features:', np.round(all_features_scores[metric_name],4))

    max_improved_score = 0
    max_improved_score_name = 'None'

    for removed_feature_name in remaining_feature_names:
        sub_feature_names = [x for x in remaining_feature_names if x != removed_feature_name]

        y = tv_df['log_price'].values
        X = tv_df[sub_feature_names].values

        scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
    
        if scores[metric_name] - all_features_scores[metric_name] > max_improved_score:
            max_improved_score = scores[metric_name] - all_features_scores[metric_name]
            max_improved_score_name = removed_feature_name
        
    print('Score improved best by removing feature:', max_improved_score_name)
    print('Score improved by:', np.round(max_improved_score,4))
    
    if max_improved_score_name != 'None':
        print('Removing:', max_improved_score_name)
        print('')
        remaining_feature_names.remove(max_improved_score_name)
    else:
        not_done = False

Num remaining features: 19
Score with remaining features: 0.8343
Score improved best by removing feature: dom
Score improved by: 0.0043
Removing: dom

Num remaining features: 18
Score with remaining features: 0.8386
Score improved best by removing feature: m_factor
Score improved by: 0.0022
Removing: m_factor

Num remaining features: 17
Score with remaining features: 0.8408
Score improved best by removing feature: price_changes
Score improved by: 0.0011
Removing: price_changes

Num remaining features: 16
Score with remaining features: 0.8418
Score improved best by removing feature: is_apartment
Score improved by: 0.0009
Removing: is_apartment

Num remaining features: 15
Score with remaining features: 0.8427
Score improved best by removing feature: is_sold
Score improved by: 0.001
Removing: is_sold

Num remaining features: 14
Score with remaining features: 0.8437
Score improved best by removing feature: status_changes
Score improved by: 0.0006
Removing: status_changes

Num remaining features: 13
Score with remaining features: 0.8443
Score improved best by removing feature: is_under_offer
Score improved by: 0.0008
Removing: is_under_offer

Num remaining features: 12
Score with remaining features: 0.8452
Score improved best by removing feature: is_ground_flat
Score improved by: 0.0002
Removing: is_ground_flat

Num remaining features: 11
Score with remaining features: 0.8454
Score improved best by removing feature: is_semi_detached
Score improved by: 0.0001
Removing: is_semi_detached

Num remaining features: 10
Score with remaining features: 0.8454
Score improved best by removing feature: None
Score improved by: 0


remaining_feature_names

['beds',
 'baths',
 'latitude',
 'longitude',
 'is_house',
 'new_build',
 'is_terraced',
 'is_detached',
 'is_townhouse',
 'is_fixed_price']


y = tv_df['log_price'].values
X = tv_df[remaining_feature_names].values

scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
for score, score_name in zip(scoring, score_names):
    results_df.at['XGB - Outliers + Feat Opt.', score_name] = scores['test_' + score]


results_df


features_with_postcode_sectors = remaining_feature_names + postcode_sector_feature_names.to_list()
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values

scores, _ = get_test_scores(XGBRegressor(), X, y, scoring)
scores

{'test_r2': 0.85193999309906,
 'test_neg_root_mean_squared_error': -0.08866673508671732,
 'test_neg_mean_absolute_error': -0.06389402697447671,
 'test_neg_mean_absolute_percentage_error': -0.01159143201926165}


for score, score_name in zip(scoring, score_names):
    results_df.at['XGB - Outliers + Feat Opt + Postcode Sec', score_name] = scores['test_' + score]


results_df


y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values


%%time
# This cell takes hour(s) to run so skip and load pre-run in next cell if already run

# XGBoost hyper-parameter tuning
result_list = []
for trial in range(1000):
    learning_rate = random.uniform(0.1, 0.3)
    max_depth = random.randint(5, 21)
    min_child_weight = random.randint(1, 10)
    max_delta_step = random.randint(0, 10)
    subsample = random.uniform(0.3, 1.0)
    l2 = random.randint(1, 10)
    
    model = XGBRegressor(learning_rate = learning_rate,
                          max_depth = max_depth, min_child_weight=min_child_weight,
                          max_delta_step=max_delta_step, subsample=subsample,
                          reg_lambda=l2)
    
    test_scores, train_scores = get_test_scores(model, X, y, scoring, num_rounds=10)
    
    result_list.append({'learning_rate':learning_rate, 
                        'max_depth':max_depth, 
                        'min_child_weight':min_child_weight,
                        'max_delta_step':max_delta_step, 
                        'subsample':subsample,
                        'l2':l2 }
                        | test_scores
                        | train_scores
                        )
    
#    print('Trial: ', trial, 'Test R^2: ', test_scores['test_r2'])
    
hyper_df = pd.DataFrame(result_list)
hyper_df.to_csv('blog-hyper-parameter-tuning.csv')

CPU times: total: 8d 20h 34min 21s
Wall time: 14h 12min 52s


hyper_df = pd.read_csv('blog-hyper-parameter-tuning.csv', index_col=0)
hyper_df.sort_values('test_r2', ascending=False)


y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values


# Default scores
learning_rate = 0.3
max_depth = 6
min_child_weight = 1
max_delta_step = 0
subsample = 1
l2 = 1
    
model = XGBRegressor(learning_rate = learning_rate,
                          max_depth = max_depth, min_child_weight=min_child_weight,
                          max_delta_step=max_delta_step, subsample=subsample,
                          reg_lambda=l2)
        
default_test_scores, default_train_scores = get_test_scores(model, X, y, scoring)

defvals = {'learning_rate':0.3,
          'max_depth':6,
          'min_child_weight':1,
          'max_delta_step':0,
          'subsample':1,
          'l2':1,
          }


optvals = hyper_df.sort_values('test_r2', ascending=False).iloc[0]

learning_rate = optvals['learning_rate']
max_depth = int(optvals['max_depth'])
min_child_weight = int(optvals['min_child_weight'])
max_delta_step = int(optvals['max_delta_step'])
subsample = optvals['subsample']
l2 = int(optvals['l2'])

model = XGBRegressor(learning_rate = learning_rate,
                          max_depth = max_depth, min_child_weight=min_child_weight,
                          max_delta_step=max_delta_step, subsample=subsample,
                          reg_lambda=l2)
        
optimum_test_scores, optimum_train_scores = get_test_scores(model, X, y, scoring)


optvals

learning_rate                                0.101028
max_depth                                   20.000000
min_child_weight                             5.000000
max_delta_step                               4.000000
subsample                                    0.773805
l2                                           3.000000
test_r2                                      0.863260
test_neg_root_mean_squared_error            -0.085212
test_neg_mean_absolute_error                -0.060920
test_neg_mean_absolute_percentage_error     -0.011044
train_r2                                     0.959900
train_neg_root_mean_squared_error           -0.046269
train_neg_mean_absolute_error               -0.033455
train_neg_mean_absolute_percentage_error    -0.006081
Name: 149, dtype: float64


idx = 1
plt.figure(figsize=(10,10))
for col in hyper_df.columns[:6]:
    plt.subplot(3,2,idx)
    plt.scatter(hyper_df[col], hyper_df['test_r2'])
    plt.plot(optvals[col], optimum_test_scores['test_r2'], marker='o', color='red')
    plt.plot(defvals[col], default_test_scores['test_r2'], marker='o', color='red')
    plt.xlabel(col)
    plt.ylabel('R^2')
    plt.grid()
    plt.title(col + ' Opt. Val: ' + str(np.round(optvals[col],3)))
    idx += 1
plt.tight_layout()
plt.show()


idx = 1
plt.figure(figsize=(10,8))
for score in scoring:
    plt.subplot(2,2,idx)
    plt.scatter(hyper_df['train_' + score], hyper_df['test_' + score])
    plt.plot(optvals['train_' + score], optvals['test_' + score], marker='o', color='red')
    plt.plot(default_train_scores['train_' + score], default_test_scores['test_' + score], marker='o', color='red')
    plt.grid()
    plt.xlabel('Train ' + score)
    plt.ylabel('Test ' + score)
    plt.title(score)
    idx += 1
plt.tight_layout()
plt.show()


for score, score_name in zip(scoring, score_names):
    results_df.at['XGB - Outliers + Feat Opt + Postcode Sec + HyperTune', score_name] = optimum_test_scores['test_' + score]


results_df


# Fit model with hyper-tuned paramaters to all the train / validation data
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values
xgb = model.fit(X, y)


# Transform the unseen test set with the scaler originally fitted to the train / validation set
std_unseen_test_df = unseen_test_df.copy()
std_unseen_test_df[nb_feature_names] = scaler.transform(std_unseen_test_df[nb_feature_names])


# Predict the unseen test set values using the fitted model
unseen_test_y = std_unseen_test_df['log_price'].values
unseen_test_X = std_unseen_test_df[features_with_postcode_sectors].values
preds = xgb.predict(unseen_test_X)

results_df.at['Unseen Test Set','R^2'] = r2_score(unseen_test_y, preds)
results_df.at['Unseen Test Set','RMSE'] = -np.sqrt(mean_squared_error(unseen_test_y, preds))
results_df.at['Unseen Test Set','MAE'] = -mean_absolute_error(unseen_test_y, preds)
results_df.at['Unseen Test Set','MAPE'] = mean_absolute_percentage_error(unseen_test_y, preds)


results_df


xvals = []
yvals = []
for num_samples in range(500, len(tv_df), 100):
    sample_df = tv_df.sample(num_samples)

    y = sample_df['log_price'].values
    X = sample_df[features_with_postcode_sectors].values

    score = cross_val_score(model, X, y, cv=cv).mean()
    xvals.append(num_samples)
    yvals.append(score)
    
# All the data
y = tv_df['log_price'].values
X = tv_df[features_with_postcode_sectors].values

score = cross_val_score(model, X, y, cv=cv).mean()
xvals.append(num_samples)
yvals.append(score)

plt.plot(xvals, yvals)
plt.grid()
plt.xlabel('Num Samples')
plt.ylabel('R^2')
plt.title('XGBoost R^2 vs Number of Data Points')
plt.show()


# Make a deep copy so we keep the original df intact when we standardise
final_df = df.copy()


# Standardise the numerical (non-binary) features for the final model (some were dropped during optimisation)
final_nb_feature_names = ["beds","baths","latitude","longitude"]
final_scaler = StandardScaler().set_output(transform="pandas")
final_df[nb_feature_names] = final_scaler.fit_transform(final_df[final_nb_feature_names])


y = final_df['log_price'].values
X = final_df[features_with_postcode_sectors].values

# Instantiate an XGBoost regressor with the optimum hyper-parameters
learning_rate = optvals['learning_rate']
max_depth = int(optvals['max_depth'])
min_child_weight = int(optvals['min_child_weight'])
max_delta_step = int(optvals['max_delta_step'])
subsample = optvals['subsample']
l2 = int(optvals['l2'])

final_model = XGBRegressor(learning_rate = learning_rate,
                          max_depth = max_depth, min_child_weight=min_child_weight,
                          max_delta_step=max_delta_step, subsample=subsample,
                          reg_lambda=l2)

final_xgb_model = final_model.fit(X, y)


postcode_predictor_df = postcodes_df[['Postcode','longitude','latitude','wm-eastings','wm-northings']].copy()
postcode_predictor_df.head(1)


remaining_feature_names

['beds',
 'baths',
 'latitude',
 'longitude',
 'is_house',
 'new_build',
 'is_terraced',
 'is_detached',
 'is_townhouse',
 'is_fixed_price']


# Add the remaining features
postcode_predictor_df['beds'] = 2
postcode_predictor_df['baths'] = 1
postcode_predictor_df['is_house'] = 0
postcode_predictor_df['new_build'] = 0
postcode_predictor_df['is_terraced'] = 0
postcode_predictor_df['is_detached'] = 0
postcode_predictor_df['is_townhouse'] = 0
postcode_predictor_df['is_fixed_price'] = 0

# One hot encode the postcode sector
postcode_predictor_df['postcode_sector'] = [x.split(' ')[0] + ' ' + x.split(' ')[-1][0] for x in postcode_predictor_df['Postcode']]
postcode_predictor_df = postcode_predictor_df.join(pd.get_dummies(postcode_predictor_df['postcode_sector']))


# Standardise the non-binary features with the final fitted scaler
postcode_predictor_df[final_nb_feature_names] = final_scaler.transform(postcode_predictor_df[final_nb_feature_names])

# Form the feature matrix
X = postcode_predictor_df[features_with_postcode_sectors].values

# Get the predictions
preds = final_xgb_model.predict(X)
postcode_predictor_df['predicted_value'] = (10**(preds-3.0)).astype('int')


# Render an OSM in Bokeh using x-y range values for Edinburgh area (determined previously)
p = figure(x_range=(-391222,-321222), y_range=(7501548, 7571548),
            x_axis_type="mercator", y_axis_type="mercator",
            height=400, width=800)

# Add the OSM
p.add_tile("OpenStreetMap Mapnik")

source = ColumnDataSource(postcode_predictor_df[['Postcode','wm-eastings','wm-northings','predicted_value']])

lcval = postcode_predictor_df['predicted_value'].min()
ucval = postcode_predictor_df['predicted_value'].max() * 0.8

mapper = linear_cmap('predicted_value', bp.Plasma256, lcval, ucval*0.8)

glyph = Scatter(x='wm-eastings', y='wm-northings', fill_color=mapper, fill_alpha=0.6, size=4, line_color=mapper, line_alpha=0)
g1 = p.add_glyph(source, glyph)

    
color_bar = ColorBar(color_mapper=mapper['transform'], location=(0,0), title='Predicted Price £k')
p.add_layout(color_bar, 'right')

hover = HoverTool(renderers=[g1],tooltips = [('Postcode','@Postcode'),
                                            ('Est. Value','£@predicted_value k'),
                                            ])

p.add_tools(hover)
show(p)


# jupyter nbconvert --to html --no-prompt adsg_house_hunting2_part5.ipynb

	beds	baths	latitude	longitude	dom	price_changes	status_changes	m_factor
count	7474.000000	7474.000000	7474.000000	7474.000000	7474.000000	7474.000000	7474.000000	7474.000000
mean	2.638614	1.467220	55.930873	-3.210222	93.219160	0.214075	0.341049	0.339620
std	1.172234	1.512094	0.217301	1.140148	83.733674	0.542022	0.574873	1.101422
min	0.000000	0.000000	41.697300	-72.722829	0.000000	0.000000	0.000000	0.000000
25%	2.000000	1.000000	55.921700	-3.236875	38.000000	0.000000	0.000000	0.000000
50%	2.000000	1.000000	55.942113	-3.195746	73.000000	0.000000	0.000000	0.000000
75%	3.000000	2.000000	55.959346	-3.147716	124.000000	0.000000	1.000000	0.000000
max	10.000000	114.000000	56.039905	-1.423048	1071.000000	6.000000	5.000000	25.000000

	beds	baths	latitude	longitude	dom	price_changes	status_changes	m_factor
count	5.967000e+03	5.967000e+03	5.967000e+03	5.967000e+03	5.967000e+03	5.967000e+03	5.967000e+03	5.967000e+03
mean	-9.764455e-17	-1.214603e-16	4.234439e-15	3.506868e-15	2.619732e-17	1.190787e-18	-5.953936e-18	-2.679271e-17
std	1.000084e+00	1.000084e+00	1.000084e+00	1.000084e+00	1.000084e+00	1.000084e+00	1.000084e+00	1.000084e+00
min	-2.246990e+00	-1.875799e+00	-4.141363e+00	-2.699389e+00	-1.092261e+00	-3.866743e-01	-5.935836e-01	-2.966163e-01
25%	-5.412729e-01	-5.840630e-01	-3.684722e-01	-4.918111e-01	-6.492996e-01	-3.866743e-01	-5.935836e-01	-2.966163e-01
50%	-5.412729e-01	-5.840630e-01	1.862591e-01	-4.049936e-02	-2.413091e-01	-3.866743e-01	-5.935836e-01	-2.966163e-01
75%	3.115857e-01	7.076731e-01	6.585393e-01	4.781900e-01	3.531913e-01	-3.866743e-01	1.145206e+00	-2.966163e-01
max	6.281596e+00	1.104156e+01	2.879644e+00	4.078616e+00	1.139225e+01	1.083189e+01	8.100362e+00	2.185871e+01

	R^2	RMSE	MAE	MAPE
Naive	0.459261	-0.171257	-0.133765	-0.024356
LR	0.600303	-0.147170	-0.113796	-0.020705
DT	0.632995	-0.140943	-0.095719	-0.017396
RF	0.799147	-0.104267	-0.073076	-0.013279
XGB	0.820373	-0.098588	-0.070993	-0.012922
KNN	0.657843	-0.136155	-0.101451	-0.018482

	R^2	RMSE	MAE	MAPE
Naive	0.459261	-0.171257	-0.133765	-0.024356
LR	0.600303	-0.147170	-0.113796	-0.020705
DT	0.632995	-0.140943	-0.095719	-0.017396
RF	0.799147	-0.104267	-0.073076	-0.013279
XGB	0.820373	-0.098588	-0.070993	-0.012922
KNN	0.657843	-0.136155	-0.101451	-0.018482
XGB - Outliers	0.834264	-0.093816	-0.069136	-0.012557

	R^2	RMSE	MAE	MAPE
beds	0.476601	-0.166817	-0.132100	-0.024025
new_build	0.502626	-0.162617	-0.128681	-0.023393
baths	0.557600	-0.153345	-0.121539	-0.022107
is_detached	0.565145	-0.152014	-0.119777	-0.021784
is_apartment	0.581233	-0.149183	-0.117102	-0.021297
is_house	0.583870	-0.148708	-0.116689	-0.021225
longitude	0.731404	-0.119420	-0.087927	-0.015979
is_fixed_price	0.734413	-0.118748	-0.087586	-0.015916
latitude	0.843408	-0.091192	-0.066206	-0.012017
is_townhouse	0.843097	-0.091276	-0.066281	-0.012031
is_sold	0.841454	-0.091768	-0.066742	-0.012115
is_terraced	0.842457	-0.091462	-0.066448	-0.012061
is_semi_detached	0.842475	-0.091463	-0.066387	-0.012051
is_under_offer	0.842369	-0.091491	-0.066531	-0.012077
m_factor	0.840081	-0.092152	-0.067314	-0.012220
dom	0.834394	-0.093776	-0.069027	-0.012536
status_changes	0.833361	-0.094077	-0.069227	-0.012572
is_ground_flat	0.833178	-0.094125	-0.069348	-0.012595
price_changes	0.834455	-0.093763	-0.069061	-0.012544

Data Extraction & Transformation¶

Feature Engineering¶

Data Cleaning¶

Train / Validation / Test Split¶

Standardisation¶

Model Evaluation Methodology¶

Naive Prediction¶

Regressor Selection¶

Residuals Analysis¶

Outlier Checking & Removal¶

Feature Selection Optimisation¶

Hyper-Parameter Tuning¶

Final Model Release¶

Inference¶

	Feature	Importance
0	beds	0.215
9	new_build	0.194
1	baths	0.159
12	is_detached	0.098
13	is_apartment	0.081
8	is_house	0.054
3	longitude	0.038
16	is_fixed_price	0.029
2	latitude	0.025
15	is_townhouse	0.019
18	is_sold	0.018
10	is_terraced	0.013
11	is_semi_detached	0.011
17	is_under_offer	0.011
7	m_factor	0.009
4	dom	0.008
6	status_changes	0.007
14	is_ground_flat	0.006
5	price_changes	0.005

	learning_rate	max_depth	min_child_weight	max_delta_step	subsample	l2	test_r2	test_neg_root_mean_squared_error	test_neg_mean_absolute_error	test_neg_mean_absolute_percentage_error	train_r2	train_neg_root_mean_squared_error	train_neg_mean_absolute_error	train_neg_mean_absolute_percentage_error
149	0.101028	20	5	4	0.773805	3	0.863260	-0.085212	-0.060920	-0.011044	0.959900	-0.046269	-0.033455	-0.006081
836	0.119428	20	5	10	0.802562	2	0.862890	-0.085320	-0.060635	-0.010994	0.971459	-0.039032	-0.028280	-0.005147
470	0.139690	21	4	0	0.797019	5	0.862810	-0.085356	-0.060549	-0.010977	0.976561	-0.035371	-0.025250	-0.004594
516	0.122226	19	4	10	0.720593	9	0.862735	-0.085380	-0.060962	-0.011051	0.958067	-0.047315	-0.033714	-0.006121
968	0.105230	21	8	9	0.797111	3	0.862684	-0.085395	-0.061217	-0.011098	0.953915	-0.049604	-0.035939	-0.006529
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
820	0.114304	6	10	1	0.760050	9	0.814053	-0.099417	-0.073993	-0.013434	0.843786	-0.091335	-0.068281	-0.012407
182	0.132569	5	1	1	0.752993	7	0.813523	-0.099559	-0.074393	-0.013508	0.850650	-0.089306	-0.067233	-0.012227
735	0.116081	5	1	2	0.364070	9	0.813519	-0.099560	-0.074365	-0.013498	0.842695	-0.091653	-0.068871	-0.012514
34	0.101536	5	3	2	0.512211	4	0.813195	-0.099644	-0.074717	-0.013568	0.842008	-0.091853	-0.069337	-0.012606
250	0.103713	5	10	8	0.370739	10	0.808812	-0.100817	-0.075356	-0.013677	0.829182	-0.095509	-0.071478	-0.012980