import pandas as pd
import numpy as np
import re
import json
import timeit
from datetime import datetime
# Cufflinks wrapper on plotly
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
cufflinks.go_offline()
# Set global theme
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
from shapely.geometry import Polygon, MultiPolygon
import geopandas as gpd
import matplotlib.pyplot as plt
import geojsoncontour
from scipy.interpolate import griddata
from numpy import linspace
from sklearn.model_selection import train_test_split
#Selecting a central city point to center all maps around - the Swietokrzyska metro station
center_coors=52.235176, 21.008393
This notebook is the third part of a Warsaw real estate market analysis project. In this part we build an initial model for real estate benchmarking, with a focus on feature selection and error analysis based on spatial data. This allows us to determine whether we can improve model accuracy with data enrichment - especially with data describing the building's location, which is the key driver of price and is not present in the offer itself.
## Reading data from GitHub
df = pd.read_excel(r"https://raw.githubusercontent.com/Jan-Majewski/Project_Portfolio/master/03_Real_Estate_pricing_in_Warsaw/RE_models_input_enriched.xlsx")
df.head()
Id | offer_date | Area | Price | latitude | longitude | build_year | building_floors_num | rooms_num | City | ... | time_return_transit_5PM | distance_return_driving_5PM | time_return_driving_5PM | price_decrease_from_20k | price_decrease_per_10min | restaurant_price_level | restaurant_mean_rating | restaurant_mean_popularity | restaurant_count | restaurant_ratings_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 54693264 | 2020-07-07 | 59.31 | 414576 | 52.344113 | 20.944478 | 2020 | 3 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
1 | 60139185 | 2020-07-07 | 80.00 | 618000 | 52.343998 | 20.944001 | 1998 | 4 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
2 | 60654016 | 2020-07-07 | 65.12 | 620000 | 52.345895 | 20.943476 | 2012 | 4 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
3 | 60647324 | 2020-07-07 | 102.56 | 614334 | 52.342630 | 20.942790 | 2020 | 3 | 4 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
4 | 59470914 | 2020-07-07 | 52.00 | 398000 | 52.344822 | 20.941277 | 2021 | 5 | 3 | warszawa | ... | 48.183333 | 15.96 | 25.65 | 12215.632653 | 2446.388382 | 1.75 | 4.325 | 233.8 | 10 | 2338 |
5 rows × 75 columns
df.shape
(25240, 75)
# Replacing the categorical value "more" with a numeric stand-in (10) so rooms_num can be treated as numeric
df['rooms_num']=np.where(df['rooms_num']=="more",10,df['rooms_num'])
## Removing columns not needed in this project
df.drop(columns=['floor_no',"City","district_old",'price_decrease_from_20k','price_decrease_per_10min'],inplace=True)
df.columns
Index(['Id', 'offer_date', 'Area', 'Price', 'latitude', 'longitude', 'build_year', 'building_floors_num', 'rooms_num', 'subdistrict', 'market', 'Building_material', 'Building_ownership', 'Building_type', 'Construction_status', 'Heating', 'Windows_type', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_oven', 'Equipment_types_stove', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement', 'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift', 'Extras_types_separate_kitchen', 'Extras_types_terrace', 'Extras_types_two_storey', 'Extras_types_usable_room', 'Media_types_cable-television', 'Media_types_cable_television', 'Media_types_electricity', 'Media_types_internet', 'Media_types_phone', 'Media_types_sewage', 'Media_types_water', 'Security_types_alarm', 'Security_types_anti_burglary_door', 'Security_types_closed_area', 'Security_types_entryphone', 'Security_types_monitoring', 'Security_types_roller_shutters', 'floor_num', 'district', 'unit_price', 'lon_mod', 'lat_mod', 'index', 'grid_price', 'sample_size', 'geo_Id', 'distance_transit_8AM', 'time_transit_8AM', 'distance_driving_8AM', 'time_driving_8AM', 'distance_return_transit_5PM', 'time_return_transit_5PM', 'distance_return_driving_5PM', 'time_return_driving_5PM', 'restaurant_price_level', 'restaurant_mean_rating', 'restaurant_mean_popularity', 'restaurant_count', 'restaurant_ratings_count'], dtype='object')
# Averaging distance and transport time from morning and evening
df["distance_driving"]=(df.distance_driving_8AM+df.distance_return_driving_5PM)/2
df["distance_transit"]=(df.distance_transit_8AM+df.distance_return_transit_5PM)/2
df["time_driving"]=(df.time_driving_8AM+df.time_return_driving_5PM)/2
df["time_transit"]=(df.time_return_transit_5PM+df.time_transit_8AM)/2
# Removing morning and evening transport data to leave only averages
df.drop(columns=['distance_transit_8AM', 'time_transit_8AM', 'distance_driving_8AM',
'time_driving_8AM', 'distance_return_transit_5PM',
'time_return_transit_5PM', 'distance_return_driving_5PM',
'time_return_driving_5PM'],inplace=True)
#Mapping districts between the East and West banks of the Vistula River
east_bank_dict={
'Wola':0,
'Subburbs':0,
'Downtown':0,
'Ochota':0,
'Mokotow':0,
'Bialoleka':1,
'Bielany':0,
'Bemowo':0,
'Southern Praga':1,
'Wilanow':0,
'Praga':1,
'Ursynow':0,
'Zoliborz':0,
'Wlochy':0,
'Targowek':1,
'Wawer':1,
"Other":0}
df["east_bank"]=df.district.apply(lambda x: east_bank_dict[x])
Let's investigate categorical data composition to see how complete and diverse each category looks.
#Unique features with key property characteristics
unique_features=["district","market","Building_material","Building_ownership","Building_type","Construction_status","Heating",
"Windows_type"]
# Iterating through each category allows us to create 8 graphs with 10 lines of code
for feature in unique_features:
    feature_list=["Id"]
    feature_list.append(feature)
    df_temp=df[feature_list].groupby(feature, as_index=False).count()
    df_temp.rename(columns={"Id":"share"},inplace=True)
    df_temp.sort_values(by="share",inplace=True, ascending=False)
    df_temp["share"]=df_temp["share"]/df.shape[0]
    df_temp["share"]=np.around(df_temp["share"],3)*100
    df_temp["colour"]=np.where(df_temp[feature]=="not_specified","missing_data","valid_data")
    fig = px.bar(df_temp, x=feature, y='share', color="colour",
                 color_discrete_sequence=["blue", "red"],
                 category_orders={"colour": ["valid_data", "missing_data"]},)
    print("\n Feature summary for {} - Share of category within whole sample".format(feature))
    fig.show()
    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")
[Output: eight bar charts, one per feature (district, market, Building_material, Building_ownership, Building_type, Construction_status, Heating, Windows_type), each showing the share of every category within the whole sample, with "not_specified" bars highlighted in red]
Are there any clear relationships between unit price and the categorical features? Let's investigate using box plots.
for feature in unique_features:
    feature_list=["unit_price"]
    feature_list.append(feature)
    df_temp=df[feature_list]
    print("\n Unit price distribution split by {}".format(feature))
    fig = px.box(df_temp, y="unit_price", x=feature, points="suspectedoutliers")
    fig.update_yaxes(range=[5000, 25000])
    fig.show()
    print("----------------------------------------------------------------------------------------------------------------------------\n\n\n")
[Output: eight box plots of unit_price, one split by each of the same categorical features]
We need to transform the categorical data into one-hot columns before feeding it to ML models. As I will use the same data for DNNs, I will also drop one column for each categorical feature to avoid collinearity.
from sklearn.preprocessing import OneHotEncoder
#Extracting categorical features
df_cat=df[unique_features]
enc=OneHotEncoder()
enc.fit(df_cat)
one_hot_val=enc.transform(df_cat).toarray().astype(int)
one_hot_columns=enc.get_feature_names(unique_features)
OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error', sparse=True)
df_one_hot=pd.DataFrame(one_hot_val, columns=one_hot_columns)
df_one_hot.columns
Index(['district_Bemowo', 'district_Bialoleka', 'district_Bielany', 'district_Downtown', 'district_Mokotow', 'district_Ochota', 'district_Other', 'district_Praga', 'district_Southern Praga', 'district_Subburbs', 'district_Targowek', 'district_Ursynow', 'district_Wawer', 'district_Wilanow', 'district_Wlochy', 'district_Wola', 'district_Zoliborz', 'market_primary', 'market_secondary', 'Building_material_breezeblock', 'Building_material_brick', 'Building_material_cellular_concrete', 'Building_material_concrete', 'Building_material_concrete_plate', 'Building_material_hydroton', 'Building_material_not_specified', 'Building_material_other', 'Building_material_reinforced_concrete', 'Building_material_silikat', 'Building_material_wood', 'Building_ownership_co_operative_ownership', 'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe', 'Building_ownership_full_ownership', 'Building_ownership_not_specified', 'Building_ownership_share', 'Building_type_apartment', 'Building_type_block', 'Building_type_detached', 'Building_type_house', 'Building_type_infill', 'Building_type_loft', 'Building_type_not_specified', 'Building_type_ribbon', 'Building_type_tenement', 'Construction_status_not_specified', 'Construction_status_ready_to_use', 'Construction_status_to_completion', 'Construction_status_to_renovation', 'Heating_boiler_room', 'Heating_electrical', 'Heating_gas', 'Heating_not_specified', 'Heating_other', 'Heating_tiled_stove', 'Heating_urban', 'Windows_type_aluminium', 'Windows_type_not_specified', 'Windows_type_plastic', 'Windows_type_wooden'], dtype='object')
# Collecting indices of one-hot columns that encode "not_specified" categories
not_specified_idx=[]
for i in range(0,df_one_hot.columns.shape[0]):
    if "not_specified" in df_one_hot.columns[i]:
        not_specified_idx.append(i)
not_specified_idx=np.asarray(not_specified_idx)
Let's drop all "not_specified" columns, as well as "market_secondary" and the "Mokotow" district, to avoid collinearity.
drop_collinear_cols=list(df_one_hot.columns[not_specified_idx])
drop_collinear_cols.append('district_Mokotow')
drop_collinear_cols.append('market_secondary')
#Function subtracting list2 elements from list1
def list_diff(list1, list2):
    out = []
    for ele in list1:
        if ele not in list2:
            out.append(ele)
    return out
#Subtracting collinear columns from categorical columns and modifying the categorical data DataFrame
df_cat_columns=list_diff(df_one_hot.columns,drop_collinear_cols)
df_cat=df_one_hot[df_cat_columns]
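As a side note, newer scikit-learn versions can drop a reference level automatically instead of the manual column subtraction above; a minimal sketch, assuming scikit-learn >= 0.21 (where the drop parameter of OneHotEncoder exists):
from sklearn.preprocessing import OneHotEncoder
# drop="first" removes the first (alphabetical) category of every feature, avoiding
# collinearity like the manual subtraction above - note the reference levels differ,
# as we deliberately chose the "not_specified" columns instead
enc_drop = OneHotEncoder(drop="first")
# one_hot_val = enc_drop.fit_transform(df_cat).toarray().astype(int)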
columns_base=list_diff(df.columns, unique_features)
# Removing columns without any predictive value and columns not within the initial dataset (added to the data source after the initial iteration)
drop_columns_base=[
'Price','latitude','longitude','lon_mod','lat_mod','grid_price','sample_size','City','subdistrict',
'offer_date',"index","geo_Id","east_bank", "floor_no"
]
columns_base=list_diff(columns_base, drop_columns_base)
# Removing GoogleMaps and Restaurant data from initial data set as they were added after initial error analysis
columns_google_maps=['distance_driving','distance_transit', 'time_driving', 'time_transit']
columns_restaurants=[ 'restaurant_price_level', 'restaurant_mean_rating','restaurant_mean_popularity', 'restaurant_count',"restaurant_ratings_count"]
columns_base=list_diff(columns_base, columns_google_maps)
columns_base=list_diff(columns_base, columns_restaurants)
columns_base
['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_oven', 'Equipment_types_stove', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement', 'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift', 'Extras_types_separate_kitchen', 'Extras_types_terrace', 'Extras_types_two_storey', 'Extras_types_usable_room', 'Media_types_cable-television', 'Media_types_cable_television', 'Media_types_electricity', 'Media_types_internet', 'Media_types_phone', 'Media_types_sewage', 'Media_types_water', 'Security_types_alarm', 'Security_types_anti_burglary_door', 'Security_types_closed_area', 'Security_types_entryphone', 'Security_types_monitoring', 'Security_types_roller_shutters', 'floor_num', 'unit_price']
# Combining base numerical data with categorical data
df_num=df[columns_base]
ml_data=pd.concat([df_num,df_cat],axis=1)
ml_data.columns[:]
Index(['Id', 'Area', 'build_year', 'building_floors_num', 'rooms_num', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_oven', 'Equipment_types_stove', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_attic', 'Extras_types_balcony', 'Extras_types_basement', 'Extras_types_garage', 'Extras_types_garden', 'Extras_types_lift', 'Extras_types_separate_kitchen', 'Extras_types_terrace', 'Extras_types_two_storey', 'Extras_types_usable_room', 'Media_types_cable-television', 'Media_types_cable_television', 'Media_types_electricity', 'Media_types_internet', 'Media_types_phone', 'Media_types_sewage', 'Media_types_water', 'Security_types_alarm', 'Security_types_anti_burglary_door', 'Security_types_closed_area', 'Security_types_entryphone', 'Security_types_monitoring', 'Security_types_roller_shutters', 'floor_num', 'unit_price', 'district_Bemowo', 'district_Bialoleka', 'district_Bielany', 'district_Downtown', 'district_Ochota', 'district_Other', 'district_Praga', 'district_Southern Praga', 'district_Subburbs', 'district_Targowek', 'district_Ursynow', 'district_Wawer', 'district_Wilanow', 'district_Wlochy', 'district_Wola', 'district_Zoliborz', 'market_primary', 'Building_material_breezeblock', 'Building_material_brick', 'Building_material_cellular_concrete', 'Building_material_concrete', 'Building_material_concrete_plate', 'Building_material_hydroton', 'Building_material_other', 'Building_material_reinforced_concrete', 'Building_material_silikat', 'Building_material_wood', 'Building_ownership_co_operative_ownership', 'Building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe', 'Building_ownership_full_ownership', 'Building_ownership_share', 'Building_type_apartment', 'Building_type_block', 'Building_type_detached', 'Building_type_house', 'Building_type_infill', 'Building_type_loft', 'Building_type_ribbon', 'Building_type_tenement', 'Construction_status_ready_to_use', 'Construction_status_to_completion', 'Construction_status_to_renovation', 'Heating_boiler_room', 'Heating_electrical', 'Heating_gas', 'Heating_other', 'Heating_tiled_stove', 'Heating_urban', 'Windows_type_aluminium', 'Windows_type_plastic', 'Windows_type_wooden'], dtype='object')
# Splitting data into X and y, removing price outliers
X=ml_data.copy()
X=X.query("unit_price<=25000 and unit_price>5000")
y=X.unit_price
X.drop(columns=["unit_price","Id"],inplace=True)
from sklearn.feature_selection import SelectKBest, f_regression
bestfeatures = SelectKBest(score_func=f_regression, k="all")
# Selecting X features carrying most information about y
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Formatting scores into single DataFrame
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']
featureScores.nlargest(50,'Score').head(20)
Feature | Score | |
---|---|---|
39 | district_Downtown | 7061.087788 |
37 | district_Bialoleka | 4019.670677 |
74 | Building_type_tenement | 1944.798855 |
52 | market_primary | 1795.763210 |
68 | Building_type_block | 1738.136823 |
44 | district_Subburbs | 1285.386747 |
67 | Building_type_apartment | 1260.089487 |
1 | build_year | 1248.918973 |
86 | Windows_type_wooden | 1072.417253 |
75 | Construction_status_ready_to_use | 764.632223 |
76 | Construction_status_to_completion | 631.575191 |
35 | floor_num | 582.200143 |
50 | district_Wola | 576.595866 |
2 | building_floors_num | 572.941746 |
13 | Extras_types_balcony | 570.238415 |
11 | Extras_types_air_conditioning | 536.278968 |
83 | Heating_urban | 438.529114 |
65 | Building_ownership_full_ownership | 402.054301 |
45 | district_Targowek | 323.404624 |
3 | rooms_num | 311.584572 |
# Investigating how many features made it past the 200 f_regression score threshold
featureScores.query("Score>200").shape
(33, 2)
# Creating list of selected top features
top_features=featureScores.query("Score>200").Feature.unique()
top_features
array(['build_year', 'building_floors_num', 'rooms_num', 'Equipment_types_dishwasher', 'Equipment_types_fridge', 'Equipment_types_furniture', 'Equipment_types_tv', 'Equipment_types_washing_machine', 'Extras_types_air_conditioning', 'Extras_types_balcony', 'Extras_types_garden', 'Extras_types_lift', 'floor_num', 'district_Bemowo', 'district_Bialoleka', 'district_Downtown', 'district_Subburbs', 'district_Targowek', 'district_Wawer', 'district_Wola', 'district_Zoliborz', 'market_primary', 'Building_material_brick', 'Building_ownership_full_ownership', 'Building_type_apartment', 'Building_type_block', 'Building_type_tenement', 'Construction_status_ready_to_use', 'Construction_status_to_completion', 'Heating_urban', 'Windows_type_aluminium', 'Windows_type_plastic', 'Windows_type_wooden'], dtype=object)
# Visualizing selected features
selected_features=featureScores.query("Score>200")
selected_features.sort_values(by="Score",inplace=True,ascending=True)
fig = px.bar(selected_features, y='Feature', x='Score',orientation='h')
fig.show()
X=X[top_features]
random_state=15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# Extracting train and test indices for a later merge with additional data or geographic coordinates
test_idx=np.asarray(X_test.index)
train_idx=np.asarray(X_train.index)
X_test_coors=df[[ "unit_price",'latitude', 'longitude',"lat_mod","lon_mod"]].iloc[test_idx]
X_test_coors.reset_index(inplace=True,drop=True)
X_test.shape
(4987, 33)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)
X_train.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)
from sklearn.ensemble import RandomForestRegressor
# Selecting an RF regressor with some regularization parameters
rf_model=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
max_depth=25, max_features=15, max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.01,
min_impurity_split=None, min_samples_leaf=5,
n_estimators=500, n_jobs=2, oob_score=False,
random_state=10, verbose=0, warm_start=False)
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
#Investigating Train set performance
rf_model.score(X_train,y_train)
0.7757492984608529
#Investigating Test set performance
rf_model.score(X_test,y_test)
0.6655517982906214
We still seem to have some overfitting - this could probably be handled with better hyperparameter tuning, as sketched below. As the overfitting is not too bad for a Random Forest model, I will proceed with the current setup.
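For completeness, a minimal tuning sketch with scikit-learn's RandomizedSearchCV - the search space below is purely illustrative, not the settings used in this project:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Hypothetical search space - ranges are illustrative assumptions, not tuned values
param_distributions = {
    "max_depth": [15, 20, 25, 30],
    "max_features": [10, 15, 20],
    "min_samples_leaf": [2, 5, 10],
    "n_estimators": [300, 500, 800],
}
search = RandomizedSearchCV(
    RandomForestRegressor(random_state=10, n_jobs=2),
    param_distributions=param_distributions,
    n_iter=20,   # number of random parameter combinations to try
    cv=3,        # 3-fold cross-validation on the training set
    scoring="r2",
    random_state=10,
)
# search.fit(X_train, y_train) would then expose search.best_params_ and search.best_estimator_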
# Investigating model feature importance
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="Initial model - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
# Function summarizing key performance metrics for any sklearn model
def performance_summary(model, X_test, y_test):
    y_hat=model.predict(X_test)
    df_summary=pd.DataFrame(y_hat, columns=["y_hat"])
    df_summary["y_true"]=y_test
    df_summary["abs_error"]=np.abs(df_summary.y_true-df_summary.y_hat)
    df_summary["error"]=df_summary.y_hat-df_summary.y_true
    df_summary["relative_error"]=(df_summary["error"]/df_summary.y_true)
    df_summary["relative_abs_error"]=(df_summary["abs_error"]/df_summary.y_true)
    print("Share of forecasts within 25% absolute error {:.3f}\n".format(df_summary.query("relative_abs_error<0.25").shape[0]/df_summary.shape[0]))
    print("Share of forecasts within 10% absolute error {:.3f}\n".format(df_summary.query("relative_abs_error<0.10").shape[0]/df_summary.shape[0]))
    print("Share of forecasts within 5% absolute error {:.3f}\n".format(df_summary.query("relative_abs_error<0.05").shape[0]/df_summary.shape[0]))
    return df_summary
# Function combining test data with coordinates and creating aggregated grid data for error mapping
def test_error_mapping(df_summary, X_test_coors):
    df_summary_dnn_coors=pd.concat([df_summary,X_test_coors], axis=1)
    df_summary_dnn_coors[["relative_error","relative_abs_error"]]=df_summary_dnn_coors[["relative_error","relative_abs_error"]]*100
    # lat_mod and lon_mod are coordinates rounded into a grid, which creates around 600 tiles in Warsaw and lets us analyse the mean error in each
    df_map=df_summary_dnn_coors[["lat_mod","lon_mod","relative_error","relative_abs_error","abs_error","error","y_hat","y_true"]]\
        .groupby(["lat_mod","lon_mod"], as_index=False).mean()
    return df_map
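For reference, grid coordinates like lat_mod and lon_mod are typically produced by rounding raw coordinates to a fixed step; a hypothetical sketch follows - the actual step used to build the ~600 Warsaw tiles is defined in an earlier part of this project, so the 0.01-degree step below is only an assumption:
GRID_STEP = 0.01  # hypothetical tile size in degrees
def to_grid(coords, step=GRID_STEP):
    # Snap raw coordinates to the centre of their grid tile
    return np.round(coords / step) * step
# e.g. to_grid(np.array([52.2348])) -> array([52.23])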
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.912
Share of forecasts within 10% absolute error 0.634
Share of forecasts within 5% absolute error 0.375
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11159.682329 | 11128.766393 | 1177.861976 | 30.915936 | 0.025094 | 0.102908 |
std | 2403.227325 | 3094.201353 | 1346.995704 | 1789.155814 | 0.146392 | 0.107090 |
min | 5832.991302 | 5083.000000 | 0.277364 | -12708.867441 | -0.557246 | 0.000024 |
25% | 9495.179696 | 8951.000000 | 312.572331 | -584.721788 | -0.051806 | 0.030674 |
50% | 10932.310587 | 10577.000000 | 752.587469 | 135.429784 | 0.013567 | 0.071721 |
75% | 12502.698582 | 12546.000000 | 1536.980399 | 881.170739 | 0.088319 | 0.136689 |
max | 20986.553687 | 25000.000000 | 12708.867441 | 8824.520863 | 1.122775 | 1.122775 |
# Loading a map filter, which removes irrelevant areas from the contour plot used for the heatmap
map_filter=pd.read_csv(r"https://raw.githubusercontent.com/Jan-Majewski/Project_Portfolio/master/03_Real_Estate_pricing_in_Warsaw/map_filter.csv",delimiter=",",header=None)
def validate_json(json_dict, df_contour):
    '''
    Contour plots do not display correctly if even one of the contour layers is
    not populated with values and its geometry is empty - this function removes
    such instances.
    '''
    valid_idx=[]
    valid_features=[]
    for i in range(0,df_contour.shape[0]):
        if len(json_dict["features"][i]["geometry"]["coordinates"])>2:
            valid_idx.append(i)
            valid_features.append(json_dict["features"][i])
    json_dict["features"]=valid_features
    df_contour=df_contour.iloc[valid_idx,:]
    return json_dict, df_contour
# Function to interpolate grid data into a finer 100x100 mesh used for heatmaps
def create_grid_data(x, y, z, grid_size):
    xi = linspace(x.min(), x.max(), grid_size)
    yi = linspace(y.min(), y.max(), grid_size)
    zi = griddata((x, y), z, (xi[None,:], yi[:,None]), method="linear")
    return xi, yi, zi
class GeoHeatmap:
    '''
    Class used to create a geojson contour and the corresponding data DataFrame,
    which is later used by plotly to create a choropleth heatmap
    '''
    def __init__(self, x, y, z, zstep):
        xi,yi,zi=create_grid_data(x,y,z,100)
        # Filtering created grid with map_filter
        zi=zi*map_filter
        cs = plt.contourf(xi,yi,zi,range(-50,int(np.nanmax(zi)),zstep),cmap=plt.cm.jet)
        # cs = plt.contourf(xi,yi,zi,range(int(np.nanmin(zi)),int(np.nanmax(zi)),zstep),cmap=plt.cm.jet)
        plt.close()
        geojson = geojsoncontour.contourf_to_geojson(
            contourf=cs,
            ndigits=3,
        )
        # json.loads is safer than eval for parsing the geojson string
        json_dict=json.loads(geojson)
        arr_temp=np.ones([len(json_dict["features"]),2])
        for i in range(0, len(json_dict["features"])):
            json_dict["features"][i]["id"]=i
            arr_temp[i,0]=i
            arr_temp[i,1]=float(json_dict["features"][i]["properties"]["title"])
        df_contour=pd.DataFrame(arr_temp, columns=["Id","value"])
        self.json_dict, self.df_contour=validate_json(json_dict, df_contour)

    def plot(self, title, zmin, zmax):
        geojson=self.json_dict
        df=self.df_contour
        trace = go.Choroplethmapbox(
            geojson=geojson,
            locations=df.Id,
            z=df.value,
            colorscale="jet",
            name="",
            hovertemplate="<b>Relative error:</b><br> %{z:,.0f}%<br>",
            showscale=True,
            zauto=False,
            zmax=zmax,
            zmin=zmin,
            marker_line_width=0,
            marker=dict(opacity=0.5),
        )
        layout = go.Layout(
            title=title,
            legend=dict(orientation="h"),
            height = 800,
            # top, bottom, left and right margins
            margin = dict(t = 80, b = 0, l = 0, r = 0),
            font = dict(color = 'darkgrey', size = 18),
            mapbox = dict(
                center = dict(
                    lat = center_coors[0],
                    lon = center_coors[1]
                ),
                # default level of zoom
                zoom = 10.3,
                # default map style
                style = "carto-positron"
            )
        )
        data=[trace]
        figure=dict(
            data=data,
            layout=layout,
        )
        iplot(figure)
# Creating a coordinate grid with the average error, used as map input
df_map=test_error_mapping(df_summary,X_test_coors)
# Initializing GeoHeatmap Class
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
# Plotting error
error_GeoHeatmap.plot("Initial model - Relative error heatmap [%]",-30,30)
# Adding east_bank feature to test and train set based on previously extracted idx
X_train["east_bank"]=df["east_bank"].iloc[train_idx].values
X_test["east_bank"]=df["east_bank"].iloc[test_idx].values
# Training the model on the enriched data source
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
# Analyzing model performance
rf_model.score(X_test,y_test)
0.6708279238284628
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.916
Share of forecasts within 10% absolute error 0.637
Share of forecasts within 5% absolute error 0.385
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11161.353513 | 11128.766393 | 1163.803175 | 32.587120 | 0.024825 | 0.101492 |
std | 2411.597380 | 3094.201353 | 1340.451781 | 1774.953048 | 0.144598 | 0.105934 |
min | 5845.777954 | 5083.000000 | 0.101406 | -12736.130278 | -0.535934 | 0.000014 |
25% | 9459.796185 | 8951.000000 | 310.949220 | -558.505103 | -0.050330 | 0.029884 |
50% | 10975.166644 | 10577.000000 | 750.628936 | 143.799449 | 0.013885 | 0.070732 |
75% | 12511.045358 | 12546.000000 | 1505.049911 | 882.976851 | 0.088799 | 0.135543 |
max | 20841.224622 | 25000.000000 | 12736.130278 | 8893.177414 | 1.185477 | 1.185477 |
# Analyzing the east_bank flag's effect on feature importance
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X_train.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="East bank flag added - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
df_map=test_error_mapping(df_summary,X_test_coors)
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
error_GeoHeatmap.plot("East-bank flag added - Relative error heatmap [%]",-30,30)
# Features added from Google Maps
columns_google_maps
['distance_driving', 'distance_transit', 'time_driving', 'time_transit']
# Repeating feature selection with enriched data source
df_num=df[columns_base+["east_bank"]+columns_google_maps]
ml_data=pd.concat([df_num,df_cat],axis=1)
X=ml_data.copy()
X=X.query("unit_price<=25000 and unit_price>5000")
y=X.unit_price
X.drop(columns=["unit_price","Id"],inplace=True)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']
featureScores.sort_values(by="Score",inplace=True,ascending=True)
featureScores.nlargest(10,'Score')
Feature | Score | |
---|---|---|
38 | distance_transit | 15110.951918 |
37 | distance_driving | 14331.765525 |
40 | time_transit | 11729.905071 |
39 | time_driving | 9771.050976 |
44 | district_Downtown | 7061.087788 |
36 | east_bank | 4161.069394 |
42 | district_Bialoleka | 4019.670677 |
79 | Building_type_tenement | 1944.798855 |
57 | market_primary | 1795.763210 |
73 | Building_type_block | 1738.136823 |
trace0=go.Bar(
x=featureScores.tail(10).Score,
y=featureScores.tail(10).Feature,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="GoogleMaps data added - f_regression scores",
xaxis=dict(title="Importance"),
yaxis=dict(title="Score")
))
iplot(figure)
# Analyzing features passing f_regression score threshold
featureScores.query("Score>200").shape
(38, 2)
top_features=featureScores.query("Score>200").Feature.unique()
# Repeating process of splitting data - same random state ensures comparability
X=X[top_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)
# Training model on the data source with Google Maps data
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
rf_model.score(X_test,y_test)
0.7389784345233201
# Performance analysis
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.937
Share of forecasts within 10% absolute error 0.690
Share of forecasts within 5% absolute error 0.439
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11161.112366 | 11128.766393 | 1020.064673 | 32.345973 | 0.021116 | 0.088994 |
std | 2517.794293 | 3094.201353 | 1207.601444 | 1580.504794 | 0.129155 | 0.095945 |
min | 5738.905381 | 5083.000000 | 0.185510 | -12380.710792 | -0.526839 | 0.000020 |
25% | 9326.003461 | 8951.000000 | 249.986888 | -479.112622 | -0.042638 | 0.024725 |
50% | 10946.202199 | 10577.000000 | 638.837969 | 96.950829 | 0.009605 | 0.061140 |
75% | 12547.364005 | 12546.000000 | 1327.224263 | 755.154385 | 0.076540 | 0.119597 |
max | 21818.906420 | 25000.000000 | 12380.710792 | 7263.202510 | 1.220820 | 1.220820 |
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="Googlemaps enriched model - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
df_map=test_error_mapping(df_summary,X_test_coors)
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
error_GeoHeatmap.plot("Google maps data added - Relative error heatmap [%]",-30,30)
# Features added based on local restaurant data (within 1 km)
columns_restaurants
['restaurant_price_level', 'restaurant_mean_rating', 'restaurant_mean_popularity', 'restaurant_count', 'restaurant_ratings_count']
# Repeating feature selection process
df_num=df[columns_base+["east_bank"]+columns_google_maps+columns_restaurants]
ml_data=pd.concat([df_num,df_cat],axis=1)
X=ml_data.copy()
X=X.query("unit_price<=25000 and unit_price>5000")
y=X.unit_price
X.drop(columns=["unit_price","Id"],inplace=True)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']
featureScores.nlargest(50,'Score').head(20)
Specs | Score | |
---|---|---|
38 | distance_transit | 15110.951918 |
37 | distance_driving | 14331.765525 |
40 | time_transit | 11729.905071 |
39 | time_driving | 9771.050976 |
44 | restaurant_count | 8054.954714 |
43 | restaurant_mean_popularity | 7599.663197 |
49 | district_Downtown | 7061.087788 |
45 | restaurant_ratings_count | 5505.569986 |
41 | restaurant_price_level | 4783.021585 |
42 | restaurant_mean_rating | 4512.937869 |
36 | east_bank | 4161.069394 |
47 | district_Bialoleka | 4019.670677 |
84 | Building_type_tenement | 1944.798855 |
62 | market_primary | 1795.763210 |
78 | Building_type_block | 1738.136823 |
54 | district_Subburbs | 1285.386747 |
77 | Building_type_apartment | 1260.089487 |
1 | build_year | 1248.918973 |
96 | Windows_type_wooden | 1072.417253 |
85 | Construction_status_ready_to_use | 764.632223 |
# Investigating number of features passing importance threshold
featureScores.query("Score>200").shape
(43, 2)
top_features=featureScores.query("Score>200").Specs.unique()
X=X[top_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)
# Training and scoring the model
rf_model.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=25, max_features=15, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.01, min_impurity_split=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2, oob_score=False, random_state=10, verbose=0, warm_start=False)
rf_model.score(X_test,y_test)
0.7423337887710196
# Model performance analysis
df_summary=performance_summary(rf_model, X_test, y_test)
Share of forecasts within 25% absolute error 0.939
Share of forecasts within 10% absolute error 0.693
Share of forecasts within 5% absolute error 0.446
df_summary.describe()
y_hat | y_true | abs_error | error | relative_error | relative_abs_error | |
---|---|---|---|---|---|---|
count | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 | 4987.000000 |
mean | 11157.505698 | 11128.766393 | 1011.659073 | 28.739305 | 0.020655 | 0.088274 |
std | 2520.718485 | 3094.201353 | 1201.356772 | 1570.379357 | 0.128461 | 0.095578 |
min | 5753.684067 | 5083.000000 | 0.328317 | -12689.720184 | -0.539988 | 0.000033 |
25% | 9303.500195 | 8951.000000 | 253.678220 | -471.524392 | -0.042963 | 0.024451 |
50% | 10954.167837 | 10577.000000 | 616.500692 | 89.566120 | 0.009131 | 0.059925 |
75% | 12509.783497 | 12546.000000 | 1333.592176 | 747.952363 | 0.075358 | 0.119922 |
max | 21777.652215 | 25000.000000 | 12689.720184 | 7792.430903 | 1.230854 | 1.230854 |
importance_df=pd.DataFrame(rf_model.feature_importances_,columns=["coefficients"])
importance_df["features"]=X.columns
importance_df.sort_values(by="coefficients",inplace=True)
trace0=go.Bar(
x=importance_df.tail(10).coefficients,
y=importance_df.tail(10).features,
orientation="h",
width=0.5,
marker=dict(
color="blue",
opacity=0.5
),
)
data=[trace0]
figure=go.Figure(
data=data,
layout=go.Layout(
title="Final model - Feature importance",
xaxis=dict(title="Importance"),
yaxis=dict(title="Feature")
))
iplot(figure)
df_map=test_error_mapping(df_summary,X_test_coors)
error_GeoHeatmap=GeoHeatmap(df_map.lon_mod,df_map.lat_mod, df_map.relative_error,5)
error_GeoHeatmap.plot("Restaurant data added - Relative error heatmap [%]",-30,30)