import pandas as pd
import numpy as np
import re
import json
import timeit
from datetime import datetime
# Cufflinks wrapper on plotly
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
cufflinks.go_offline()
# Set global theme
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
from shapely.geometry import Polygon, MultiPolygon
import geopandas as gpd
import matplotlib.pyplot as plt
import geojsoncontour
from scipy.interpolate import griddata
from numpy import linspace
from sklearn.model_selection import train_test_split
#Selecting a central city point to center all maps around - the Swietokrzyska metro station
center_coors=52.235176, 21.008393
This notebook is the third part of a Warsaw real estate market analysis project. In this part we build an initial model for real estate benchmarking, with a focus on feature selection and error analysis based on spatial data. This allows us to determine whether we can improve model accuracy with data enrichment - especially with data describing the building's location, which is the key driver of price and is not present in the offer itself.
## Reading data from GitHub
df = pd.read_excel(r"https://raw.githubusercontent.com/Jan-Majewski/Project_Portfolio/master/03_Real_Estate_pricing_in_Warsaw/RE_models_input_enriched.xlsx")
df.head()
Id | offer_date | Area | Price | latitude | longitude | build_year | building_floors_num | rooms_num | City | ... | time_return_transit_5PM | distance_return_driving_5PM | time_return_driving_5PM | price_decrease_from_20k | price_decrease_per_10min | restaurant_price_level | restaurant_mean_rating | restaurant_mean_popularity | restaurant_count | restaurant_ratings_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 54693264 | 2020-07-07 | 59.31 | 414576 | 52.344113 | 20.944478 | 2020 | 3 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
1 | 60139185 | 2020-07-07 | 80.00 | 618000 | 52.343998 | 20.944001 | 1998 | 4 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
2 | 60654016 | 2020-07-07 | 65.12 | 620000 | 52.345895 | 20.943476 | 2012 | 4 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
3 | 60647324 | 2020-07-07 | 102.56 | 614334 | 52.342630 | 20.942790 | 2020 | 3 | 4 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
4 | 59470914 | 2020-07-07 | 52.00 | 398000 | 52.344822 | 20.941277 | 2021 | 5 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
5 rows × 75 columns
df.shape
(25240, 75)
# Replacing the categorical value "more" with a numeric stand-in (10) so rooms_num can be treated as numeric
df['rooms_num']=np.where(df['rooms_num']=="more",10,df['rooms_num'])
## Removing columns not needed in this project
df.drop(columns=['floor_no',"City","district_old",'price_decrease_from_20k','price_decrease_per_10min'],inplace=True)
df.columns
Index(['Id', 'offer_date', 'Area', 'Price', 'latitude', 'longitude', 'build_year', 'building_floors_num', 'rooms_num', 'subdistrict', 'market', 'Building_material', 'Building_ownership', 'Building_type', 'Construction_status', 'Heating', 'Windows_type', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_oven', 'Equipment_types_stove', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement', 'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift', 'Extras_types_separate_kitchen', 'Extras_types_terrace', 'Extras_types_two_storey', 'Extras_types_usable_room', 'Media_types_cable-television', 'Media_types_cable_television', 'Media_types_electricity', 'Media_types_internet', 'Media_types_phone', 'Media_types_sewage', 'Media_types_water', 'Security_types_alarm', 'Security_types_anti_burglary_door', 'Security_types_closed_area', 'Security_types_entryphone', 'Security_types_monitoring', 'Security_types_roller_shutters', 'floor_num', 'district', 'unit_price', 'lon_mod', 'lat_mod', 'index', 'grid_price', 'sample_size', 'geo_Id', 'distance_transit_8AM', 'time_transit_8AM', 'distance_driving_8AM', 'time_driving_8AM', 'distance_return_transit_5PM', 'time_return_transit_5PM', 'distance_return_driving_5PM', 'time_return_driving_5PM', 'restaurant_price_level', 'restaurant_mean_rating', 'restaurant_mean_popularity', 'restaurant_count', 'restaurant_ratings_count'], dtype='object')
# Averaging distance and transport time from morning and evening
df["distance_driving"]=(df.distance_driving_8AM+df.distance_return_driving_5PM)/2
df["distance_transit"]=(df.distance_transit_8AM+df.distance_return_transit_5PM)/2
df["time_driving"]=(df.time_driving_8AM+df.time_return_driving_5PM)/2
df["time_transit"]=(df.time_return_transit_5PM+df.time_transit_8AM)/2
# Removing morning and evening transport data to leave only averages
df.drop(columns=['distance_transit_8AM', 'time_transit_8AM', 'distance_driving_8AM',
'time_driving_8AM', 'distance_return_transit_5PM',
'time_return_transit_5PM', 'distance_return_driving_5PM',
'time_return_driving_5PM'],inplace=True)
#Mapping districts between the East and West banks of the Vistula River
east_bank_dict={
'Wola':0,
'Subburbs':0,
'Downtown':0,
'Ochota':0,
'Mokotow':0,
'Bialoleka':1,
'Bielany':0,
'Bemowo':0,
'Southern Praga':1,
'Wilanow':0,
'Praga':1,
'Ursynow':0,
'Zoliborz':0,
'Wlochy':0,
'Targowek':1,
'Wawer':1,
"Other":0}
df["east_bank"]=df.district.apply(lambda x: east_bank_dict[x])
Let's investigate categorical data composition to see how complete and diverse each category looks.
#Unique features with key property characteristics
unique_features=["district","market","Building_material","Building_ownership","Building_type","Construction_status","Heating",
"Windows_type"]
# Iterating through each category allows us to create 8 graphs with 10 lines of code
for feature in unique_features:
    feature_list=["Id"]
    feature_list.append(feature)
    df_temp=df[feature_list].groupby(feature, as_index=False).count()
    df_temp.rename(columns={"Id":"share"},inplace=True)
    df_temp.sort_values(by="share",inplace=True, ascending=False)
    df_temp["share"]=df_temp["share"]/df.shape[0]
    df_temp["share"]=np.around(df_temp["share"],3)*100
    df_temp["colour"]=np.where(df_temp[feature]=="not_specified","missing_data","valid_data")
    fig = px.bar(df_temp, x=feature, y='share', color="colour",
                 color_discrete_sequence=["blue", "red"],
                 category_orders={"colour": ["valid_data", "missing_data"]},)
    print("\n Feature summary for {} - Share of category within whole sample".format(feature))
    fig.show()
    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")
[Output: eight bar charts, one per feature (district, market, Building_material, Building_ownership, Building_type, Construction_status, Heating, Windows_type), each showing the share of every category within the whole sample, with "not_specified" bars highlighted in red]
Are there any clear relationships between unit price and the categorical features? Let's investigate using box plots.
for feature in unique_features:
    feature_list=["unit_price"]
    feature_list.append(feature)
    df_temp=df[feature_list]
    print("\n Unit price distribution split by {}".format(feature))
    fig = px.box(df_temp, y="unit_price", x=feature, points="suspectedoutliers")
    fig.update_yaxes(range=[5000, 25000])
    fig.show()
    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")
[Output: eight box plots of unit_price, one split by each of the same categorical features]
We need to transform the categorical data into one-hot columns before feeding it to ML models. As I will use the same data for DNNs, I will also drop one column for each categorical feature to avoid collinearity.
from sklearn.preprocessing import OneHotEncoder
#Extracting categorical features
df_cat=df[unique_features]
enc=OneHotEncoder()
enc.fit(df_cat)
one_hot_val=enc.transform(df_cat).toarray().astype(int)
one_hot_columns=enc.get_feature_names(unique_features)
OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error', sparse=True)
df_one_hot=pd.DataFrame(one_hot_val, columns=one_hot_columns)
df_one_hot.columns
Index(['district_Bemowo', 'district_Bialoleka', 'district_Bielany', 'district_Downtown', 'district_Mokotow', 'district_Ochota', 'district_Other', 'district_Praga', 'district_Southern Praga', 'district_Subburbs', 'district_Targowek', 'district_Ursynow', 'district_Wawer', 'district_Wilanow', 'district_Wlochy', 'district_Wola', 'district_Zoliborz', 'market_primary', 'market_secondary', 'Building_material_breezeblock', 'Building_material_brick', 'Building_material_cellular_concrete', 'Building_material_concrete', 'Building_material_concrete_plate', 'Building_material_hydroton', 'Building_material_not_specified', 'Building_material_other', 'Building_material_reinforced_concrete', 'Building_material_silikat', 'Building_material_wood', 'Building_ownership_co_operative_ownership', 'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe', 'Building_ownership_full_ownership', 'Building_ownership_not_specified', 'Building_ownership_share', 'Building_type_apartment', 'Building_type_block', 'Building_type_detached', 'Building_type_house', 'Building_type_infill', 'Building_type_loft', 'Building_type_not_specified', 'Building_type_ribbon', 'Building_type_tenement', 'Construction_status_not_specified', 'Construction_status_ready_to_use', 'Construction_status_to_completion', 'Construction_status_to_renovation', 'Heating_boiler_room', 'Heating_electrical', 'Heating_gas', 'Heating_not_specified', 'Heating_other', 'Heating_tiled_stove', 'Heating_urban', 'Windows_type_aluminium', 'Windows_type_not_specified', 'Windows_type_plastic', 'Windows_type_wooden'], dtype='object')
# Collecting indices of one-hot columns that encode "not_specified" categories
not_specified_idx=[]
for i in range(0,df_one_hot.columns.shape[0]):
    if "not_specified" in df_one_hot.columns[i]:
        not_specified_idx.append(i)
not_specified_idx=np.asarray(not_specified_idx)
Let's drop all "not_specified" columns, as well as "market_secondary" and the "Mokotow" district, to avoid collinearity.
drop_collinear_cols=list(df_one_hot.columns[not_specified_idx])
drop_collinear_cols.append('district_Mokotow')
drop_collinear_cols.append('market_secondary')
#Function subtracting list2 elements from list1
def list_diff(list1, list2):
    out = []
    for ele in list1:
        if ele not in list2:
            out.append(ele)
    return out
#Subtracting collinear columns from categorical columns and modifying the categorical data DataFrame
df_cat_columns=list_diff(df_one_hot.columns,drop_collinear_cols)
df_cat=df_one_hot[df_cat_columns]
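As a side note, newer scikit-learn versions can drop a reference level automatically instead of the manual column subtraction above; a minimal sketch, assuming scikit-learn >= 0.21 (where the drop parameter of OneHotEncoder exists):
from sklearn.preprocessing import OneHotEncoder
# drop="first" removes the first (alphabetical) category of every feature, avoiding
# collinearity like the manual subtraction above - note the reference levels differ,
# as we deliberately chose the "not_specified" columns instead
enc_drop = OneHotEncoder(drop="first")
# one_hot_val = enc_drop.fit_transform(df_cat).toarray().astype(int)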
columns_base=list_diff(df.columns, unique_features)
# Removing columns without any predictive value and columns not within the initial dataset (added to the data source after the initial iteration)
drop_columns_base=[
'Price','latitude','longitude','lon_mod','lat_mod','grid_price','sample_size','City','subdistrict',
'offer_date',"index","geo_Id","east_bank", "floor_no"
]
columns_base=list_diff(columns_base, drop_columns_base)
# Removing GoogleMaps and Restaurant data from initial data set as they were added after initial error analysis
columns_google_maps=['distance_driving','distance_transit', 'time_driving', 'time_transit']
columns_restaurants=[ 'restaurant_price_level', 'restaurant_mean_rating','restaurant_mean_popularity', 'restaurant_count',"restaurant_ratings_count"]
columns_base=list_diff(columns_base, columns_google_maps)
columns_base=list_diff(columns_base, columns_restaurants)
columns_base
['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_oven', 'Equipment_types_stove', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement', 'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift', 'Extras_types_separate_kitchen', 'Extras_types_terrace', 'Extras_types_two_storey', 'Extras_types_usable_room', 'Media_types_cable-television', 'Media_types_cable_television', 'Media_types_electricity', 'Media_types_internet', 'Media_types_phone', 'Media_types_sewage', 'Media_types_water', 'Security_types_alarm', 'Security_types_anti_burglary_door', 'Security_types_closed_area', 'Security_types_entryphone', 'Security_types_monitoring', 'Security_types_roller_shutters', 'floor_num', 'unit_price']
# Combining base numerical data with categorical data
df_num=df[columns_base]
ml_data=pd.concat([df_num,df_cat],axis=1)
ml_data.columns[:]
Index(['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_oven', 'Equipment_types_stove', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement', 'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift', 'Extras_types_separate_kitchen', 'Extras_types_terrace', 'Extras_types_two_storey', 'Extras_types_usable_room', 'Media_types_cable-television', 'Media_types_cable_television', 'Media_types_electricity', 'Media_types_internet', 'Media_types_phone', 'Media_types_sewage', 'Media_types_water', 'Security_types_alarm', 'Security_types_anti_burglary_door', 'Security_types_closed_area', 'Security_types_entryphone', 'Security_types_monitoring', 'Security_types_roller_shutters', 'floor_num', 'unit_price', 'district_Bemowo', 'district_Bialoleka', 'district_Bielany', 'district_Downtown', 'district_Ochota', 'district_Other', 'district_Praga', 'district_Southern Praga', 'district_Subburbs', 'district_Targowek', 'district_Ursynow', 'district_Wawer', 'district_Wilanow', 'district_Wlochy', 'district_Wola', 'district_Zoliborz', 'market_primary', 'Building_material_breezeblock', 'Building_material_brick', 'Building_material_cellular_concrete', 'Building_material_concrete', 'Building_material_concrete_plate', 'Building_material_hydroton', 'Building_material_other', 'Building_material_reinforced_concrete', 'Building_material_silikat', 'Building_material_wood', 'Building_ownership_co_operative_ownership', 'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe', 'Building_ownership_full_ownership', 'Building_ownership_share', 'Building_type_apartment', 'Building_type_block', 'Building_type_detached', 'Building_type_house', 'Building_type_infill', 'Building_type_loft', 'Building_type_ribbon', 'Building_type_tenement', 'Construction_status_ready_to_use', 'Construction_status_to_completion', 'Construction_status_to_renovation', 'Heating_boiler_room', 'Heating_electrical', 'Heating_gas', 'Heating_other', 'Heating_tiled_stove', 'Heating_urban', 'Windows_type_aluminium', 'Windows_type_plastic', 'Windows_type_wooden'], dtype='object')
# Splitting data into X and y, removing price outliers
X=ml_data.copy()
X=X.query("unit_price<=25000 and unit_price>5000")
y=X.unit_price
X.drop(columns=["unit_price","Id"],inplace=True)
from sklearn.feature_selection import SelectKBest, f_regression
bestfeatures = SelectKBest(score_func=f_regression, k="all")
# Selecting X features carrying most information about y
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Formatting scores into single DataFrame
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']
featureScores.nlargest(50,'Score').head(20)
Feature | Score | |
---|---|---|
39 | district_Downtown | 7061.087788 |
37 | district_Bialoleka | 4019.670677 |
74 | Building_type_tenement | 1944.798855 |
52 | market_primary | 1795.763210 |
68 | Building_type_block | 1738.136823 |
44 | district_Subburbs | 1285.386747 |
67 | Building_type_apartment | 1260.089487 |
1 | build_year | 1248.918973 |
86 | Windows_type_wooden | 1072.417253 |
75 | Construction_status_ready_to_use | 764.632223 |
76 | Construction_status_to_completion | 631.575191 |
35 | floor_num | 582.200143 |
50 | district_Wola | 576.595866 |
2 | building_floors_num | 572.941746 |
13 | Extras_types_balcony | 570.238415 |
11 | Extras_types_air_conditioning | 536.278968 |
83 | Heating_urban | 438.529114 |
65 | Building_ownership_full_ownership | 402.054301 |
45 | district_Targowek | 323.404624 |
3 | rooms_num | 311.584572 |
# Investigating how many features made it past the 200 f_regression score threshold
featureScores.query("Score>200").shape
(33, 2)
# Creating list of selected top features
top_features=featureScores.query("Score>200").Feature.unique()
top_features
array(['build_year', 'building_floors_num', 'rooms_num', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_balcony', 'Extras_types_garden', 'Extras_types_lift', 'floor_num', 'district_Bemowo', 'district_Bialoleka', 'district_Downtown', 'district_Subburbs', 'district_Targowek', 'district_Wawer', 'district_Wola', 'district_Zoliborz', 'market_primary', 'Building_material_brick', 'Building_ownership_full_ownership', 'Building_type_apartment', 'Building_type_block', 'Building_type_tenement', 'Construction_status_ready_to_use', 'Construction_status_to_completion', 'Heating_urban', 'Windows_type_aluminium', 'Windows_type_plastic', 'Windows_type_wooden'], dtype=object)
# Visualizing selected features
selected_features=featureScores.query("Score>200")
selected_features.sort_values(by="Score",inplace=True,ascending=True)
fig = px.bar(selected_features, y='Feature', x='Score',orientation='h')
fig.show()
X=X[top_features]
random_state=15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# Extracting train and test indices for a later merge with additional data or geographic coordinates
test_idx=np.asarray(X_test.index)
train_idx=np.asarray(X_train.index)
X_test_coors=df[[ "unit_price",'latitude', 'longitude',"lat_mod","lon_mod"]].iloc[test_idx]
X_test_coors.reset_index(inplace=True,drop=True)
X_test.shape
(4987, 33)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)
X_train.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)
from sklearn.ensemble import RandomForestRegressor
# Selecting an RF regressor with some regularization parameters
rf_model=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
max_depth=25, max_features=15, max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.01,
min_impurity_split=None, min_samples_leaf=5,
n_estimators=500, n_jobs=2, oob_score=False,
random_state=10, verbose=0, warm_start=False)
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
#Investigating Train set performance
rf_model.score(X_train,y_train)
0.7757492984608529
#Investigating Test set performance
rf_model.score(X_test,y_test)
0.6655517982906214
We still seem to have some overfitting - this could probably be handled with better hyperparameter tuning, as sketched below. As the overfitting is not too bad for a Random Forest model, I will proceed with the current setup.
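For completeness, a minimal tuning sketch with scikit-learn's RandomizedSearchCV - the search space below is purely illustrative, not the settings used in this project:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Hypothetical search space - ranges are illustrative assumptions, not tuned values
param_distributions = {
    "max_depth": [15, 20, 25, 30],
    "max_features": [10, 15, 20],
    "min_samples_leaf": [2, 5, 10],
    "n_estimators": [300, 500, 800],
}
search = RandomizedSearchCV(
    RandomForestRegressor(random_state=10, n_jobs=2),
    param_distributions=param_distributions,
    n_iter=20,   # number of random parameter combinations to try
    cv=3,        # 3-fold cross-validation on the training set
    scoring="r2",
    random_state=10,
)
# search.fit(X_train, y_train) would then expose search.best_params_ and search.best_estimator_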
# Investigating model feature importance
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="Initial model - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
# Function summarizing key performance metrics for any sklearn model
def performance_summary(model, X_test, y_test):
    y_hat=model.predict(X_test)
    df_summary=pd.DataFrame(y_hat, columns=["y_hat"])
    df_summary["y_true"]=y_test
    df_summary["abs_error"]=np.abs(df_summary.y_true-df_summary.y_hat)
    df_summary["error"]=df_summary.y_hat-df_summary.y_true
    df_summary["relative_error"]=(df_summary["error"]/df_summary.y_true)
    df_summary["relative_abs_error"]=(df_summary["abs_error"]/df_summary.y_true)
    print("Share of forecasts within 25% absolute error {:.3f}\n".format(df_summary.query("relative_abs_error<0.25").shape[0]/df_summary.shape[0]))
    print("Share of forecasts within 10% absolute error {:.3f}\n".format(df_summary.query("relative_abs_error<0.10").shape[0]/df_summary.shape[0]))
    print("Share of forecasts within 5% absolute error {:.3f}\n".format(df_summary.query("relative_abs_error<0.05").shape[0]/df_summary.shape[0]))
    return df_summary
# Function combining test data with coordinates and creating aggregated grid data for error mapping
def test_error_mapping(df_summary, X_test_coors):
    df_summary_dnn_coors=pd.concat([df_summary,X_test_coors], axis=1)
    df_summary_dnn_coors[["relative_error","relative_abs_error"]]=df_summary_dnn_coors[["relative_error","relative_abs_error"]]*100
    # lat_mod and lon_mod are coordinates rounded into a grid, which creates around 600 tiles in Warsaw and lets us analyse the mean error in each
    df_map=df_summary_dnn_coors[["lat_mod","lon_mod","relative_error","relative_abs_error","abs_error","error","y_hat","y_true"]]\
        .groupby(["lat_mod","lon_mod"], as_index=False).mean()
    return df_map
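For reference, grid coordinates like lat_mod and lon_mod are typically produced by rounding raw coordinates to a fixed step; a hypothetical sketch follows - the actual step used to build the ~600 Warsaw tiles is defined in an earlier part of this project, so the 0.01-degree step below is only an assumption:
GRID_STEP = 0.01  # hypothetical tile size in degrees
def to_grid(coords, step=GRID_STEP):
    # Snap raw coordinates to the centre of their grid tile
    return np.round(coords / step) * step
# e.g. to_grid(np.array([52.2348])) -> array([52.23])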
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.912
Share of forecasts within 10% absolute error 0.634
Share of forecasts within 5% absolute error 0.375
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11159.682329 | 11128.766393 | 1177.861976 | 30.915936 | 0.025094 | 0.102908 |
std | 2403.227325 | 3094.201353 | 1346.995704 | 1789.155814 | 0.146392 | 0.107090 |
min | 5832.991302 | 5083.000000 | 0.277364 | -12708.867441 | -0.557246 | 0.000024 |
25% | 9495.179696 | 8951.000000 | 312.572331 | -584.721788 | -0.051806 | 0.030674 |
50% | 10932.310587 | 10577.000000 | 752.587469 | 135.429784 | 0.013567 | 0.071721 |
75% | 12502.698582 | 12546.000000 | 1536.980399 | 881.170739 | 0.088319 | 0.136689 |
max | 20986.553687 | 25000.000000 | 12708.867441 | 8824.520863 | 1.122775 | 1.122775 |
# Loading a map filter, which removes irrelevant areas from the contour plot used for the heatmap
map_filter=pd.read_csv(r"https://raw.githubusercontent.com/Jan-Majewski/Project_Portfolio/master/03_Real_Estate_pricing_in_Warsaw/map_filter.csv",delimiter=",",header=None)
def validate_json(json_dict, df_contour):
    '''
    Contour plots do not display correctly if even one of the contour layers is
    not populated with values and its geometry is empty - this function removes
    such instances.
    '''
    valid_idx=[]
    valid_features=[]
    for i in range(0,df_contour.shape[0]):
        if len(json_dict["features"][i]["geometry"]["coordinates"])>2:
            valid_idx.append(i)
            valid_features.append(json_dict["features"][i])
    json_dict["features"]=valid_features
    df_contour=df_contour.iloc[valid_idx,:]
    return json_dict, df_contour
# Function to interpolate grid data into a finer 100x100 mesh used for heatmaps
def create_grid_data(x, y, z, grid_size):
    xi = linspace(x.min(), x.max(), grid_size)
    yi = linspace(y.min(), y.max(), grid_size)
    zi = griddata((x, y), z, (xi[None,:], yi[:,None]), method="linear")
    return xi, yi, zi
class GeoHeatmap:
    '''
    Class used to create a geojson contour and the corresponding data DataFrame,
    which is later used by plotly to create a choropleth heatmap
    '''
    def __init__(self, x, y, z, zstep):
        xi,yi,zi=create_grid_data(x,y,z,100)
        # Filtering created grid with map_filter
        zi=zi*map_filter
        cs = plt.contourf(xi,yi,zi,range(-50,int(np.nanmax(zi)),zstep),cmap=plt.cm.jet)
        # cs = plt.contourf(xi,yi,zi,range(int(np.nanmin(zi)),int(np.nanmax(zi)),zstep),cmap=plt.cm.jet)
        plt.close()
        geojson = geojsoncontour.contourf_to_geojson(
            contourf=cs,
            ndigits=3,
        )
        # json.loads is safer than eval for parsing the geojson string
        json_dict=json.loads(geojson)
        arr_temp=np.ones([len(json_dict["features"]),2])
        for i in range(0, len(json_dict["features"])):
            json_dict["features"][i]["id"]=i
            arr_temp[i,0]=i
            arr_temp[i,1]=float(json_dict["features"][i]["properties"]["title"])
        df_contour=pd.DataFrame(arr_temp, columns=["Id","value"])
        self.json_dict, self.df_contour=validate_json(json_dict, df_contour)

    def plot(self, title, zmin, zmax):
        geojson=self.json_dict
        df=self.df_contour
        trace = go.Choroplethmapbox(
            geojson=geojson,
            locations=df.Id,
            z=df.value,
            colorscale="jet",
            name="",
            hovertemplate="<b>Relative error:</b><br> %{z:,.0f}%<br>",
            showscale=True,
            zauto=False,
            zmax=zmax,
            zmin=zmin,
            marker_line_width=0,
            marker=dict(opacity=0.5),
        )
        layout = go.Layout(
            title=title,
            legend=dict(orientation="h"),
            height = 800,
            # top, bottom, left and right margins
            margin = dict(t = 80, b = 0, l = 0, r = 0),
            font = dict(color = 'darkgrey', size = 18),
            mapbox = dict(
                center = dict(
                    lat = center_coors[0],
                    lon = center_coors[1]
                ),
                # default level of zoom
                zoom = 10.3,
                # default map style
                style = "carto-positron"
            )
        )
        data=[trace]
        figure=dict(
            data=data,
            layout=layout,
        )
        iplot(figure)
# Creating a coordinate grid with the average error, used as map input
df_map=test_error_mapping(df_summary,X_test_coors)
# Initializing GeoHeatmap Class
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
# Plotting error
error_GeoHeatmap.plot("Initial model - Relative error heatmap [%]",-30,30)
# Adding east_bank feature to test and train set based on previously extracted idx
X_train["east_bank"]=df["east_bank"].iloc[train_idx].values
X_test["east_bank"]=df["east_bank"].iloc[test_idx].values
# Training the model on the enriched data source
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
# Analyzing model performance
rf_model.score(X_test,y_test)
0.6708279238284628
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.916
Share of forecasts within 10% absolute error 0.637
Share of forecasts within 5% absolute error 0.385
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11161.353513 | 11128.766393 | 1163.803175 | 32.587120 | 0.024825 | 0.101492 |
std | 2411.597380 | 3094.201353 | 1340.451781 | 1774.953048 | 0.144598 | 0.105934 |
min | 5845.777954 | 5083.000000 | 0.101406 | -12736.130278 | -0.535934 | 0.000014 |
25% | 9459.796185 | 8951.000000 | 310.949220 | -558.505103 | -0.050330 | 0.029884 |
50% | 10975.166644 | 10577.000000 | 750.628936 | 143.799449 | 0.013885 | 0.070732 |
75% | 12511.045358 | 12546.000000 | 1505.049911 | 882.976851 | 0.088799 | 0.135543 |
max | 20841.224622 | 25000.000000 | 12736.130278 | 8893.177414 | 1.185477 | 1.185477 |
# Analyzing the east_bank flag's effect on feature importance
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X_train.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="East bank flag added - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
df_map=test_error_mapping(df_summary,X_test_coors)
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
error_GeoHeatmap.plot("East-bank flag added - Relative error heatmap [%]",-30,30)
# Features added from Google Maps
columns_google_maps
['distance_driving', 'distance_transit', 'time_driving', 'time_transit']
# Repeating feature selection with enriched data source
df_num=df[columns_base+["east_bank"]+columns_google_maps]
ml_data=pd.concat([df_num,df_cat],axis=1)
X=ml_data.copy()
X=X.query("unit_price<=25000 and unit_price>5000")
y=X.unit_price
X.drop(columns=["unit_price","Id"],inplace=True)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']
featureScores.sort_values(by="Score",inplace=True,ascending=True)
featureScores.nlargest(10,'Score')
Feature | Score | |
---|---|---|
38 | distance_transit | 15110.951918 |
37 | distance_driving | 14331.765525 |
40 | time_transit | 11729.905071 |
39 | time_driving | 9771.050976 |
44 | district_Downtown | 7061.087788 |
36 | east_bank | 4161.069394 |
42 | district_Bialoleka | 4019.670677 |
79 | Building_type_tenement | 1944.798855 |
57 | market_primary | 1795.763210 |
73 | Building_type_block | 1738.136823 |
trace0=go.Bar(
x=featureScores.tail(10).Score,
y=featureScores.tail(10).Feature,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="GoogleMaps data added - f_regression scores",
xaxis=dict(title="Importance"),
yaxis=dict(title="Score")
))
iplot(figure)
# Analyzing features passing f_regression score threshold
featureScores.query("Score>200").shape
(38, 2)
top_features=featureScores.query("Score>200").Feature.unique()
# Repeating process of splitting data - same random state ensures comparability
X=X[top_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)
# Training model on the data source with Google Maps data
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
rf_model.score(X_test,y_test)
0.7389784345233201
# Performance analysis
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.937
Share of forecasts within 10% absolute error 0.690
Share of forecasts within 5% absolute error 0.439
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11161.112366 | 11128.766393 | 1020.064673 | 32.345973 | 0.021116 | 0.088994 |
std | 2517.794293 | 3094.201353 | 1207.601444 | 1580.504794 | 0.129155 | 0.095945 |
min | 5738.905381 | 5083.000000 | 0.185510 | -12380.710792 | -0.526839 | 0.000020 |
25% | 9326.003461 | 8951.000000 | 249.986888 | -479.112622 | -0.042638 | 0.024725 |
50% | 10946.202199 | 10577.000000 | 638.837969 | 96.950829 | 0.009605 | 0.061140 |
75% | 12547.364005 | 12546.000000 | 1327.224263 | 755.154385 | 0.076540 | 0.119597 |
max | 21818.906420 | 25000.000000 | 12380.710792 | 7263.202510 | 1.220820 | 1.220820 |
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="Googlemaps enriched model - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
df_map=test_error_mapping(df_summary,X_test_coors)
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
error_GeoHeatmap.plot("Google maps data added - Relative error heatmap [%]",-30,30)
# Features added based on local restaurant data (within 1 km)
columns_restaurants
['restaurant_price_level', 'restaurant_mean_rating', 'restaurant_mean_popularity', 'restaurant_count', 'restaurant_ratings_count']
# Repeating feature selection process
df_num=df[columns_base+["east_bank"]+columns_google_maps+columns_restaurants]
ml_data=pd.concat([df_num,df_cat],axis=1)
X=ml_data.copy()
X=X.query("unit_price<=25000 and unit_price>5000")
y=X.unit_price
X.drop(columns=["unit_price","Id"],inplace=True)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']
featureScores.nlargest(50,'Score').head(20)
Specs | Score | |
---|---|---|
38 | distance_transit | 15110.951918 |
37 | distance_driving | 14331.765525 |
40 | time_transit | 11729.905071 |
39 | time_driving | 9771.050976 |
44 | restaurant_count | 8054.954714 |
43 | restaurant_mean_popularity | 7599.663197 |
49 | district_Downtown | 7061.087788 |
45 | restaurant_ratings_count | 5505.569986 |
41 | restaurant_price_level | 4783.021585 |
42 | restaurant_mean_rating | 4512.937869 |
36 | east_bank | 4161.069394 |
47 | district_Bialoleka | 4019.670677 |
84 | Building_type_tenement | 1944.798855 |
62 | market_primary | 1795.763210 |
78 | Building_type_block | 1738.136823 |
54 | district_Subburbs | 1285.386747 |
77 | Building_type_apartment | 1260.089487 |
1 | build_year | 1248.918973 |
96 | Windows_type_wooden | 1072.417253 |
85 | Construction_status_ready_to_use | 764.632223 |
# Investigating number of features passing importance threshold
featureScores.query("Score>200").shape
(43, 2)
top_features=featureScores.query("Score>200").Specs.unique()
X=X[top_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)
# Training and scoring the model
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
rf_model.score(X_test,y_test)
0.7423337887710196
# Model performance analysis
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.939
Share of forecasts within 10% absolute error 0.693
Share of forecasts within 5% absolute error 0.446
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11157.505698 | 11128.766393 | 1011.659073 | 28.739305 | 0.020655 | 0.088274 |
std | 2520.718485 | 3094.201353 | 1201.356772 | 1570.379357 | 0.128461 | 0.095578 |
min | 5753.684067 | 5083.000000 | 0.328317 | -12689.720184 | -0.539988 | 0.000033 |
25% | 9303.500195 | 8951.000000 | 253.678220 | -471.524392 | -0.042963 | 0.024451 |
50% | 10954.167837 | 10577.000000 | 616.500692 | 89.566120 | 0.009131 | 0.059925 |
75% | 12509.783497 | 12546.000000 | 1333.592176 | 747.952363 | 0.075358 | 0.119922 |
max | 21777.652215 | 25000.000000 | 12689.720184 | 7792.430903 | 1.230854 | 1.230854 |
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="Final model - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
df_map=test_error_mapping(df_summary,X_test_coors)
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
error_GeoHeatmap.plot("Restaurant data added - Relative error heatmap [%]",-30,30)