#!/usr/bin/env python
# coding: utf-8

# # Data Driven Modeling
# ### (Theme of this semester: CODING AS LITERACY)
#

# ### PhD seminar series at Chair for Computer Aided Architectural Design (CAAD), ETH Zurich
#
# [Vahid Moosavi](https://vahidmoosavi.com)
#

# # 13th Session
#
# 14 March 2017
#
#
# # Data Driven Modeling in Practice: The Real Estate Market

# ### Topics to be discussed
# * Main elements of the full-stack development of a data driven project.
# * Today: the Real Estate Project
#
#
# * **What are the key questions in the real estate market?**
# * **Where and how to get the data?**
# * **Data wrangling: big data is always messy**
# * **Machine learning part**
# * **Deploying the model on a server**
#
# ![](images/RealEstate.png)
#
#
#
#
# # Data collection for this project
# * Web crawling of real estate search portals: Homegate.ch, Immoscout.ch, ...
# * https://www.crummy.com/software/BeautifulSoup/
# * Available open source data sets, including demographics, transport data, education, environment, ...
# * Open Street Map: https://osmbuildings.org/?lat=40.71098&lon=-74.00276&zoom=15&rotation=0&tilt=30
# * GeoAdmin.ch: https://map.geo.admin.ch/
# * Mapzen: http://tangrams.github.io/gui-demo/
# * Mapbox Studio: https://www.mapbox.com/mapbox-studio/
# * Data driven styling: https://www.mapbox.com/blog/population-inspector/?utm_source=nov-newsletter&utm_medium=email&utm_content=blog-track-pop&utm_campaign=nov-newsletter
# * Google Geocoding APIs: https://console.developers.google.com/apis/library?project=googcse20150904

# # Data Cleaning

# ### Data resampling and synthetic data generation
# * We will talk about this in another session on resampling and bootstrapping

# ### Geocoding of the data

# In[19]:


import warnings
warnings.filterwarnings("ignore")
import datetime
import pandas as pd
# import pandas.io.data
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import sys
import sompylib.sompy as SOM
from pandas import Series, DataFrame
from ipywidgets import interact, HTML, FloatSlider
get_ipython().run_line_magic('matplotlib', 'inline')


# # Basic analysis of the cleaned data

# In[10]:


path = './Data/Real_Estate_Market/All_Listing_latlng_2017_2_23.txt'
listing_with_latlong = pd.read_csv(path)
listing_with_latlong.head()


# In[11]:


print listing_with_latlong.shape

# Kinds of features we are interested in
cols = ['ID', 'ZIP', 'Date', 'Type', 'Rooms', 'Floor',
        'Living space', 'Floor space', 'Room height', 'Volume',
        'Year built', 'Last renovation', 'Net rent', 'Additional expenses',
        'Rent', 'lng', 'lat']
# 'Available','lng','lat','Public transport','Shopping','Kindergarten','Primary school','Secondary school','Motorway']
listing = listing_with_latlong[cols]

# Types of listing we are interested in
Types = ['Apartment', 'Attic compartment', 'Attic flat', 'Bachelor flat', 'Bifamiliar house', 'Cellar compartment', 'Chalet',
         'Duplex', 'Farm house', 'Granny flat', 'Home', 'Roof flat', 'Row house', 'Rustic house', 'Single house', 'Studio',
         'Terrace flat', 'Terrace house', 'Villa']
Types = ['Apartment']
id_type = []
for i in range(listing.shape[0]):
    if listing['Type'].values[i] in Types:
        id_type.append(i)
listing = listing.ix[id_type]

cols = ['Rent', 'ZIP', 'Type', 'Rooms', 'Year built', 'Last renovation', 'Living space', 'Floor', 'lng', 'lat']
listing = listing[cols]

# Map the categorical floor labels to numbers
try:
    ind_by = listing['Floor'] == 'GF'
    listing['Floor'].ix[ind_by] = 0
    ind_by = listing['Floor'] == 'Underground'
    listing['Floor'].ix[ind_by] = -1
except:
    print 'floor is not in the columns'

listing['Rent'] = listing['Rent'].values[:].astype(float)

# Remove outliers based on global statistics, calculated previously
# (Supply_stat is assumed to hold the percentile table of the full data set, e.g. from:)
# Supply_stat = listing.describe(percentiles=[.001,.01,.2,.5,.99,.995,.999])
# Clip each feature to its 0.1% / 99.9% percentiles
for f in ['Rooms', 'Living space', 'Rent']:
    mx = Supply_stat[f].ix['99.9%']
    ind = listing[f] > mx
    listing[f].ix[ind] = mx
    mn = Supply_stat[f].ix['0.1%']
    ind = listing[f] < mn
    listing[f].ix[ind] = mn

# Optionally, drop listings outside a lat/lng bounding box (kept commented out)
# ind0 = listing['lat'].values[:]>(Zlatbound[1]+eps)
# ind1 = listing['lat'].values[:]<(Zlatbound[0]-eps)
# ind2 = listing['lng'].values[:]>(Zlngbound[1]+eps)
# ind3 = listing['lng'].values[:]<(Zlngbound[0]-eps)
# indremove = (ind0 + ind1 + ind2 + ind3)
# indkeep = [not i for i in indremove]
# print indremove.sum()
# listing = listing.ix[indkeep]

listing = listing[['Rent', 'ZIP', 'Rooms', 'Year built', 'Living space', 'Floor', 'lng', 'lat']]
listing = listing.dropna()
listing.index = range(listing.shape[0])
print listing.shape
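
# ### Side note: how the geocoding step can be done
# The "Geocoding of the data" step above is not shown in code. The cell below is a minimal sketch of how a
# listing address could be turned into the `lng`/`lat` columns used in this notebook, via the Google Geocoding
# web service listed among the data sources. It is an illustration only: `YOUR_API_KEY`, the helper name and
# the example address are placeholders, not the actual geocoder used for this data set.

# In[ ]:


import requests

def geocode_address(address, api_key):
    """Return (lng, lat) of the first Google Geocoding API match, or None."""
    url = 'https://maps.googleapis.com/maps/api/geocode/json'
    resp = requests.get(url, params={'address': address, 'key': api_key})
    results = resp.json().get('results', [])
    if not results:
        return None
    loc = results[0]['geometry']['location']
    return loc['lng'], loc['lat']

# Example call (needs a valid key):
# print(geocode_address('Raemistrasse 101, 8092 Zurich', 'YOUR_API_KEY'))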

# In[12]:


plt.plot(listing.lng, listing.lat, '.', markersize=1)


# In[13]:


listing.head()


# In[14]:


Supply_stat


# ### Basic Analysis

# In[15]:


def hist1d_Supply_query(Area, Min_Rooms, Min_Size, Max_Rent):
    Supply_Threshold = 1
    Max_Size = 500
    Max_Rooms = 11
    Min_Rent = 100
    q = Area
    fig = plt.figure(figsize=(12, 3))
    font = {'size': 8}
    plt.rc('font', **font)
    ind_q_Supply = (listing['ZIP'] == q)
    q_data_Supply = listing.ix[ind_q_Supply]
    import itertools
    import numpy as np
    if Min_Rooms >= Max_Rooms:
        Max_Rooms = Min_Rooms
    if Min_Size >= Max_Size:
        Max_Size = Min_Size
    if Min_Rent >= Max_Rent:
        Max_Rent = Min_Rent

    def check_search(df):
        amn = df['Rooms'] >= Min_Rooms
        amx = df['Rooms'] <= Max_Rooms
        bmn = df['Living space'] >= Min_Size
        bmx = df['Living space'] <= Max_Size
        cmn = df['Rent'] >= Min_Rent
        cmx = df['Rent'] <= Max_Rent
        return amn & bmn & cmn & amx & bmx & cmx

    if q_data_Supply.shape[0] >= 1:
        rooms = []
        sizes = []
        prices = []
        print "**************************************************************"
        print 'Number of unique Supply Ads: {}.'.format(q_data_Supply.shape[0])
        ax = plt.subplot(1, 3, 1)
        plt.title('Supply: Room')
        rooms = q_data_Supply['Rooms'].dropna().values[:]
        room_bin = range(int(Supply_stat['Rooms'].ix['1%']), int(Supply_stat['Rooms'].ix['99.5%'] + 1))
        a = plt.hist(rooms, bins=room_bin, alpha=1, color='white', linewidth=.5, edgecolor='black', rwidth=1, normed=False)
        ax.yaxis.grid(True)

        ax = plt.subplot(1, 3, 2)
        plt.title('Supply: Size')
        mn = int(Supply_stat['Living space'].ix['1%'])
        mx = int(Supply_stat['Living space'].ix['99.5%'])
        R = mx - mn
        # stp = int(R/30)
        stp = 20
        size_bin = range(mn, mx + stp, stp)
        sizes = q_data_Supply['Living space'].dropna().values[:]
        a = plt.hist(sizes, bins=size_bin, alpha=1, color='white', linewidth=.5, edgecolor='black', normed=False)
        ax.yaxis.grid(True)

        ax = plt.subplot(1, 3, 3)
        plt.title('Supply: Price')
        mn = int(Supply_stat['Rent'].ix['1%'])
        mx = int(Supply_stat['Rent'].ix['99.5%'])
        R = mx - mn
        stp = 200
        price_bin = range(mn, mx + stp, stp)
        prices = q_data_Supply['Rent'].dropna().values[:]
        a = plt.hist(prices, bins=price_bin, alpha=1, color='white', linewidth=.5, edgecolor='black', normed=False)
        ax.yaxis.grid(True)
        plt.tight_layout()
        mn = int(Supply_stat['Rent'].ix['1%'])
        mx = int(Supply_stat['Rent'].ix['99.5%'])
        plt.xlim(mn, mx)
        plt.title('Relative Supply Distributions: Price')
    else:
        print "\n**************************************************************"
        print 'Not enough Supply in this area with the zip code {}.'.format(Area)
        return

    q_data_Supply_bounded = q_data_Supply.ix[check_search(q_data_Supply)]
    if q_data_Supply_bounded.shape[0] >= 1:
        rooms = []
        sizes = []
        prices = []
        print "**************************************************************"
        print 'Number of unique Supply Ads based on your query: {}.'.format(q_data_Supply_bounded.shape[0])
        ax = plt.subplot(1, 3, 1)
        plt.title('No. of rooms')
        rooms = q_data_Supply_bounded['Rooms'].dropna().values[:]
        room_bin = range(int(Supply_stat['Rooms'].ix['1%']), int(Supply_stat['Rooms'].ix['99.5%'] + 1))
        a = plt.hist(rooms, bins=room_bin, alpha=1, color='red', linewidth=.5, edgecolor='black', rwidth=1, normed=False)
        ax.yaxis.grid(True)

        ax = plt.subplot(1, 3, 2)
        plt.title('Living space (m^2)')
        mn = int(Supply_stat['Living space'].ix['1%'])
        mx = int(Supply_stat['Living space'].ix['99.5%'])
        R = mx - mn
        # stp = int(R/30)
        stp = 20
        size_bin = range(mn, mx + stp, stp)
        sizes = q_data_Supply_bounded['Living space'].dropna().values[:]
        a = plt.hist(sizes, bins=size_bin, alpha=1, color='red', linewidth=.5, edgecolor='black', normed=False)
        ax.yaxis.grid(True)

        ax = plt.subplot(1, 3, 3)
        plt.title('Monthly rent')
        mn = int(Supply_stat['Rent'].ix['1%'])
        mx = int(Supply_stat['Rent'].ix['99.5%'])
        R = mx - mn
        stp = 200
        price_bin = range(mn, mx + stp, stp)
        prices = q_data_Supply_bounded['Rent'].dropna().values[:]
        a = plt.hist(prices, bins=price_bin, alpha=1, color='red', linewidth=.5, edgecolor='black', normed=False)
        ax.yaxis.grid(True)
        plt.tight_layout()
    else:
        print "\n**************************************************************"
        print 'Based on your query there is not enough Supply for this area with the zip code {}.'.format(Area)
        return


# In[16]:


interact(hist1d_Supply_query, Area=(8001, 8008, 1), Min_Rooms=(1, 6, 1), Min_Size=(10, 100, 10), Max_Rent=(1000, 6000, 100));


# In[17]:


def spatial_query_supply(Min_Rooms, Min_Size, Max_Rent, whattoplot='percentofcoverage'):
    Supply_Threshold = 1
    Max_Size = 500
    Max_Rooms = 11
    Min_Rent = 100
    cmapname = "RdYlBu_r"
    # cmapname="B"
    import itertools
    import numpy as np
    if Min_Rooms >= Max_Rooms:
        Max_Rooms = Min_Rooms
    if Min_Size >= Max_Size:
        Max_Size = Min_Size
    if Min_Rent >= Max_Rent:
        Max_Rent = Min_Rent

    def check_search(df):
        amn = df['Rooms'].values[:] >= Min_Rooms
        amx = df['Rooms'].values[:] <= Max_Rooms
        bmn = df['Living space'].values[:] >= Min_Size
        bmx = df['Living space'].values[:] <= Max_Size
        cmn = df['Rent'].values[:] >= Min_Rent
        cmx = df['Rent'].values[:] <= Max_Rent
        return (amn * bmn * cmn * amx * bmx * cmx).sum()

    Complete_data_zip = listing.copy()
    Complete_data_zip.index = Complete_data_zip['ZIP']
    geo_info = []
    total_demand = []
    percentofchange = []
    long_lat_zip_all = Complete_data_zip.groupby(by='ZIP')['ZIP', 'lng', 'lat'].first()
    zip_GB = Complete_data_zip.groupby(by='ZIP')
    long_lat_zip_specific = zip_GB['ZIP', 'lng', 'lat'].first()
    ind_zip = zip_GB.size() > Supply_Threshold
    long_lat_zip_sel = long_lat_zip_specific.ix[ind_zip]
    total_specific_supply = zip_GB.size()[ind_zip]
    total_supply = total_specific_supply.values[:]
    total_interest_in_property1 = Complete_data_zip.ix[ind_zip].groupby(by='ZIP').apply(check_search)
    total_cases = total_interest_in_property1.values[:]
    percentofcoverage = 100 * total_cases / (total_specific_supply.values[:]).astype(float)

    # To plot
    fig = plt.figure(figsize=(12, 8))
    if whattoplot == 'total_cases':
        ax = fig.add_subplot(1, 1, 1)
        md = np.median(total_cases)
        sd = np.std(total_cases)
        mn = md - 2 * sd
        mx = md + 2.5 * sd
        mn = np.min(total_cases)
        sc = plt.scatter(long_lat_zip_sel.lng, long_lat_zip_sel.lat, c=total_cases, s=20, vmin=mn, vmax=mx, marker='o', edgecolor='None', cmap=cmapname, alpha=1)
        ticklabels = np.round(np.linspace(mn, mx, 5), decimals=3).astype(int).astype(str)
        ticklabels[-1] = ">" + ticklabels[-1]
        cbar = plt.colorbar(sc, ticks=np.round(np.linspace(mn, mx, 5), decimals=3).astype(int), shrink=0.4)
        cbar.ax.set_yticklabels(ticklabels)
        plt.xticks([])
        plt.yticks([])
        plt.title("Total number of available cases")
        plt.axis('off')

    if whattoplot == 'percentofcoverage':
        ax = fig.add_subplot(1, 1, 1)
        md = np.median(percentofcoverage)
        sd = np.std(percentofcoverage)
        mn = np.min(percentofcoverage)
        mx = np.max(percentofcoverage)
        sc = plt.scatter(long_lat_zip_sel.lng, long_lat_zip_sel.lat, c=percentofcoverage, s=20, vmin=mn, vmax=mx, marker='o', edgecolor='None', cmap=cmapname, alpha=1)
        ticklabels = np.round(np.linspace(mn, mx, 5), decimals=3).astype(int).astype(str)
        ticklabels[-1] = ticklabels[-1]
        cbar = plt.colorbar(sc, ticks=np.round(np.linspace(mn, mx, 5), decimals=3).astype(int), shrink=0.4)
        cbar.ax.set_yticklabels(ticklabels)
        plt.xticks([])
        plt.yticks([])
        plt.title("percent of coverage")
        plt.axis('off')

    plt.tight_layout()
    font = {'size': 12}
    plt.rc('font', **font)
    plt.tight_layout()


# In[18]:


interact(spatial_query_supply, whattoplot=(['total_cases', 'percentofcoverage']), Min_Rooms=(1, 6, 1), Min_Size=(10, 100, 10), Max_Rent=(1000, 5000, 100));


# # Automated Property Evaluation
# ### More of Machine Learning

# In[28]:


Mat_all = listing.copy()
listing.head()


# # Out of sample prediction
# ## Train test Split

# In[29]:


sel_cols = range(1, Mat_all.shape[1])
X = Mat_all.values[:, sel_cols]
y = Mat_all.values[:, 0]

import xgboost as xgb
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.333)

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model


def my_custom_loss_func(ground_truth, predictions):
    return np.median(100 * np.abs(predictions - ground_truth) / ground_truth)


n_estimators = 200

regr = DecisionTreeRegressor()
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'DecisionTreeRegressor', my_custom_loss_func(y_test, preds)

regr = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators, n_jobs=-1)
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'BaggingRegressor', my_custom_loss_func(y_test, preds)

regr = RandomForestRegressor(n_estimators=n_estimators, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'RandomForestRegressor', my_custom_loss_func(y_test, preds)

regr = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'ExtraTreesRegressor', my_custom_loss_func(y_test, preds)

regr = xgb.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=n_estimators, silent=True, objective='reg:linear',
                        nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
                        colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0,
                        missing=None).fit(X_train, y_train)
preds = regr.predict(X_test)
print 'XGBoost', my_custom_loss_func(y_test, preds)

degree = 2
poly = PolynomialFeatures(degree=degree)
X_train_ = poly.fit_transform(X_train)
regr = linear_model.LinearRegression()
regr.fit(X_train_, y_train)
X_test_ = poly.fit_transform(X_test)
preds = regr.predict(X_test_)
print 'Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1]), my_custom_loss_func(y_test, preds)

degree = 3
poly = PolynomialFeatures(degree=degree)
X_train_ = poly.fit_transform(X_train)
regr = linear_model.LinearRegression()
regr.fit(X_train_, y_train)
X_test_ = poly.fit_transform(X_test)
preds = regr.predict(X_test_)
print 'Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1]), my_custom_loss_func(y_test, preds)

degree = 4
poly = PolynomialFeatures(degree=degree)
X_train_ = poly.fit_transform(X_train)
regr = linear_model.LinearRegression()
regr.fit(X_train_, y_train)
X_test_ = poly.fit_transform(X_test)
preds = regr.predict(X_test_)
print 'Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1]), my_custom_loss_func(y_test, preds)

degree = 5
poly = PolynomialFeatures(degree=degree)
X_train_ = poly.fit_transform(X_train)
regr = linear_model.LinearRegression()
regr.fit(X_train_, y_train)
X_test_ = poly.fit_transform(X_test)
preds = regr.predict(X_test_)
print 'Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1]), my_custom_loss_func(y_test, preds)
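
# The per-model blocks above repeat the same fit/predict/score pattern. For readability, the same comparison
# can be written as a loop over a dictionary of models; the cell below is a hedged sketch of that refactor
# (same split, same median-absolute-percentage-error metric), with an illustrative subset of the models,
# not an additional experiment.

# In[ ]:


models = {
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=200, random_state=0, n_jobs=-1),
    'ExtraTreesRegressor': ExtraTreesRegressor(n_estimators=200, random_state=0, n_jobs=-1),
}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # median absolute percentage error, as in my_custom_loss_func above
    print('{}: {:.2f}%'.format(name, np.median(100 * np.abs(preds - y_test) / y_test)))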

# ## This method of out of sample prediction is data demanding
# ## Further, it might be overfitted to test data
# #### There is a better way than this...

# # Cross Validation
# ## K-fold

# In[17]:


from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import make_scorer


def my_custom_loss_func(ground_truth, predictions):
    return np.median(-100 * np.abs(predictions - ground_truth) / ground_truth)


loss = make_scorer(my_custom_loss_func, greater_is_better=False)
# loss = 'neg_median_absolute_error'
# loss = None

sel_cols = range(1, Mat_all.shape[1])
X = Mat_all.values[:, sel_cols]
y = Mat_all.values[:, 0]

n_estimators = 100

regr = DecisionTreeRegressor()
scores = cross_val_score(regr, X, y, cv=3, scoring=loss)
print 'DecisionTreeRegressor', np.median(scores)

regr = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators, n_jobs=-1)
scores = cross_val_score(regr, X, y, cv=3, scoring=loss)
print 'BaggingRegressor', np.median(scores)

regr = RandomForestRegressor(n_estimators=n_estimators, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
scores = cross_val_score(regr, X, y, cv=3, scoring=loss)
print 'RandomForestRegressor', np.median(scores)

regr = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
scores = cross_val_score(regr, X, y, cv=3, scoring=loss)
print 'ExtraTreesRegressor', np.median(scores)
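
# For reference, a hedged sketch of what `cross_val_score` with `cv=3` is doing under the hood, written out
# explicitly with `KFold`: the data is split into k folds, the model is trained on k-1 folds and scored on the
# held-out fold, and one score per fold is collected. The helper name is illustrative only.

# In[ ]:


from sklearn.model_selection import KFold


def kfold_scores(model, X, y, n_splits=3):
    # One score per fold: train on k-1 folds, evaluate on the held-out fold.
    scores = []
    for train_idx, test_idx in KFold(n_splits=n_splits).split(X):
        model.fit(X[train_idx], y[train_idx])
        preds = model.predict(X[test_idx])
        # median absolute percentage error on the held-out fold
        scores.append(np.median(100 * np.abs(preds - y[test_idx]) / y[test_idx]))
    return scores

# print(kfold_scores(DecisionTreeRegressor(), X, y))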

# # Grid Search with Cross Validation
# ## Meta-Parameter optimization

# In[37]:


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import make_scorer


def my_custom_loss_func(ground_truth, predictions):
    return np.median(-100 * np.abs(predictions - ground_truth) / ground_truth)


loss = make_scorer(my_custom_loss_func, greater_is_better=False)
# loss = 'neg_median_absolute_error'
# loss = None

sel_cols = range(1, Mat_all.shape[1])
X = Mat_all.values[:, sel_cols]
y = Mat_all.values[:, 0]

n_estimators = 50

# model = DecisionTreeRegressor()
# regr = GridSearchCV(model,param_grid={'min_samples_split':[2,10]},cv=3,scoring=loss)
# regr.fit(X,y)
# print 'DecisionTreeRegressor',regr.best_score_ ,regr.best_params_

# model = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators,n_jobs=-1)
# regr = GridSearchCV(model,param_grid={'n_estimators':[50,100]},cv=3,scoring=loss)
# regr.fit(X,y)
# print 'BaggingRegressor', regr.best_score_ ,regr.best_params_

# model = RandomForestRegressor(n_estimators=n_estimators, max_depth=None,min_samples_split=2, random_state=0,n_jobs=-1)
# regr = GridSearchCV(model,param_grid={'n_estimators':[50,100],'min_samples_split':[2,10]},cv=3,scoring=loss)
# regr.fit(X,y)
# print 'RandomForestRegressor', regr.best_score_ ,regr.best_params_

# model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=None,min_samples_split=2, random_state=0,n_jobs=-1)
# regr = GridSearchCV(model,param_grid={'n_estimators':[50,100],'min_samples_split':[2,10]},cv=3,scoring=loss)
# regr.fit(X,y)
# print 'ExtraTreesRegressor', regr.best_score_ ,regr.best_params_

n_estimators = 2000
model = xgb.XGBRegressor(max_depth=X_train.shape[1], learning_rate=0.1, n_estimators=n_estimators, silent=True,
                         objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
                         subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                         scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
regr = GridSearchCV(model, param_grid={'max_depth': [3, X_train.shape[1]], 'n_estimators': [1000, 2000]}, cv=3, scoring=loss)
regr.fit(X, y)
print 'XGBoost', regr.best_score_, regr.best_params_


# # Distribution of Error matters a lot!

# In[42]:


sel_cols = range(1, Mat_all.shape[1])
X = Mat_all.values[:, sel_cols]
y = Mat_all.values[:, 0]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model


def my_custom_loss_func(ground_truth, predictions):
    return 100 * (predictions - ground_truth) / ground_truth


ERRS = pd.DataFrame()

n_estimators = 200

regr = DecisionTreeRegressor()
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'DecisionTreeRegressor', np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['DecisionTreeRegressor'] = my_custom_loss_func(y_test, preds)

regr = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators, n_jobs=-1)
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'BaggingRegressor', np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['BaggingRegressor'] = my_custom_loss_func(y_test, preds)

regr = RandomForestRegressor(n_estimators=n_estimators, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'RandomForestRegressor', np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['RandomForestRegressor'] = my_custom_loss_func(y_test, preds)

regr = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'ExtraTreesRegressor', np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['ExtraTreesRegressor'] = my_custom_loss_func(y_test, preds)

n_estimators = 2000
regr = xgb.XGBRegressor(max_depth=X_train.shape[1], learning_rate=0.1, n_estimators=n_estimators, silent=True,
                        objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
                        subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                        scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
regr.fit(X_train, y_train)
preds = regr.predict(X_test)
print 'XGBoost', np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['XGBoost'] = my_custom_loss_func(y_test, preds)
degree = 2
poly = PolynomialFeatures(degree=degree)
X_train_ = poly.fit_transform(X_train)
regr = linear_model.LinearRegression()
regr.fit(X_train_, y_train)
X_test_ = poly.fit_transform(X_test)
preds = regr.predict(X_test_)
print 'Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1]), np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1])] = my_custom_loss_func(y_test, preds)

degree = 3
poly = PolynomialFeatures(degree=degree)
X_train_ = poly.fit_transform(X_train)
regr = linear_model.LinearRegression()
regr.fit(X_train_, y_train)
X_test_ = poly.fit_transform(X_test)
preds = regr.predict(X_test_)
print 'Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1]), np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1])] = my_custom_loss_func(y_test, preds)

degree = 4
poly = PolynomialFeatures(degree=degree)
X_train_ = poly.fit_transform(X_train)
regr = linear_model.LinearRegression()
regr.fit(X_train_, y_train)
X_test_ = poly.fit_transform(X_test)
preds = regr.predict(X_test_)
print 'Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1]), np.median(np.abs(my_custom_loss_func(y_test, preds)))
ERRS['Polynomial_degree:{} with dimensions: {} '.format(degree, X_train_.shape[1])] = my_custom_loss_func(y_test, preds)


# In[43]:


ERRS.head()


# In[46]:


font = {'family': 'normal', 'weight': 'normal', 'size': 7}
fig = plt.figure(figsize=(15, 15))
for i in range(ERRS.shape[1]):
    plt.subplot(3, 3, i + 1)
    plt.hist(ERRS.values[:, i], bins=500, normed=True, color='black', alpha=.5)
    plt.title(ERRS.columns[i])
    plt.rc('font', **font)
    plt.xlim(-50, 50)


# ## Looking at the error distribution more quantitatively

# In[45]:


quality = [1, 5, 10, 15, 20]
percentiles = pd.DataFrame(data=np.zeros((len(quality), ERRS.shape[1])), index=quality, columns=ERRS.columns)
for percent in quality:
    for m in range(ERRS.shape[1]):
        a = np.abs(ERRS.values[:, m])
        n = float(a.shape[0])
        # share of predictions whose absolute percentage error is within `percent` percent
        percentiles.ix[percent, m] = 100 * a[a <= percent].shape[0] / n
percentiles

med_error = pd.DataFrame(data=np.median(np.abs(ERRS.values[:]), axis=0)[np.newaxis, :], columns=ERRS.columns, index=['median'])
med_error

percentiles = pd.concat((med_error, percentiles))
percentiles.T


# # Other possible use cases
# * **Spatial price sensitivity analysis for developers and constructors**
# * **Trend analysis and forecasting for banks**
# * **Estimating the dynamics of different regions**
# * **Personalized real estate search for end users**
# * **Many other use cases for other stakeholders: banks, real estate agents, mortgage lenders, movers, ...**
#
# * **Further: using machine learning for data collection**
#     * Logging users' interactions
#     * Receiving feedback from users

# ## Deploying the trained model on a web application
# * **We need to serve the trained model like an API.**

# In[107]:


# from sklearn.externals import joblib

# # Dump the trained model
# joblib.dump(regr, 'model.pkl')

# # Load it like this on the server
# regr = joblib.load('model.pkl')


# * **To do so, we need a "Web Framework".**
#     * Flask is a nice, minimal and easy one!
# * **Further, we need a host and server which works with Flask.**
#     * https://www.pythonanywhere.com/
#
# I put them together here in a complete web application!
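
# ### A sketch of the Flask app
# As a hedged illustration of what such a web application might look like: the cell below sketches a minimal
# Flask app that loads the dumped `model.pkl` and serves predictions over an HTTP endpoint. The route name,
# port and expected feature order are assumptions for illustration, not the exact application that was deployed.
# It is kept commented out, like the joblib cell above, since it needs the dumped model and a running server.

# In[ ]:


# from flask import Flask, request, jsonify
# from sklearn.externals import joblib

# app = Flask(__name__)
# model = joblib.load('model.pkl')  # the trained regressor dumped above

# @app.route('/predict', methods=['POST'])
# def predict():
#     # Expects a JSON body such as:
#     # {"features": [ZIP, Rooms, Year built, Living space, Floor, lng, lat]}
#     # i.e. the same column order used to train the model.
#     features = request.get_json()['features']
#     rent = model.predict([features])[0]
#     return jsonify({'predicted_rent': float(rent)})

# if __name__ == '__main__':
#     app.run(port=5000)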