#!/usr/bin/env python
# coding: utf-8
"""Exploratory analysis and rent modelling for the listings in ``houses.csv``.

This file is an export of a Jupyter notebook; the ``# In[n]`` markers are the
original cell boundaries.  The script cleans the data, prints and plots some
descriptive statistics, and then fits three regressors for ``rent``:

1. a plain linear regression,
2. a linear regression wrapped in RANSAC (to discount outliers),
3. a pipeline searched by TPOT, scored on a held-out test split.
"""

# In[5]:

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# Shared seed so RANSAC, the train/test split and TPOT give repeatable results.
RANDOM_STATE = 42


# In[6]:

# The inline-plot magic only exists inside IPython/Jupyter; skip it when the
# file is executed as a plain Python script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass


# In[8]:

df = pd.read_csv("houses.csv")
print(len(df))


# In[9]:

df.info()


# In[10]:

print(df.head())


# ## Data Cleaning

# In[11]:

# Treat identifiers and zip codes as labels (strings) rather than numbers.
df["immo_id"] = df["immo_id"].astype(str)
df["zip_code"] = df["zip_code"].astype(str)
df2 = df.drop(columns=["time_dest", "time_dest2", "time_dest3"])
print(df2.describe())


# ## Interesting statistics

# ### Providers with most apartment offers

# In[12]:

print(df2.groupby("contact_name").size().sort_values(ascending=False))


# ### Districts with most apartment offers

# In[13]:

print(df2.groupby("district").size().sort_values(ascending=False))


# In[24]:

# Keep only listings below the sqm/area thresholds.  The explicit copy makes
# df3 an independent frame, so the column assignments further down do not
# write into a slice of df2 (SettingWithCopyWarning).
df3 = df2[(df2["sqm"] < 600) & (df2["area"] < 15000)].copy()
print(len(df3))


# In[27]:

df3.plot(x="sqm", y="rent", c="area", kind="scatter",
         colormap=cm.Set1, figsize=(15, 10))


# In[28]:

df3.plot(x="lng", y="lat", c="rent", kind="scatter",
         figsize=(15, 10), colormap=cm.Blues)


# In[29]:

df3.hist(bins=20, figsize=(15, 10),
         column=["area", "lat", "lng", "media_count", "rent", "rooms", "sqm"])


# In[30]:

# Restrict to numeric/boolean columns explicitly: the frame also holds string
# columns (immo_id, zip_code, ...), which recent pandas versions refuse to
# correlate instead of silently dropping them.
corr = df3.select_dtypes(include=["number", "bool"]).corr()


# In[31]:

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            center=0.0,
            cmap=sns.diverging_palette(5, 250, as_cmap=True),
            annot=True,
            fmt=".2f",
            ax=ax)


# In[38]:

sns.pairplot(df3, vars=["sqm", "rooms", "rent", "media_count", "area"],
             kind="reg")


# In[41]:

# One-hot encoding of zip_codes
hot_zip = pd.get_dummies(df3["zip_code"])

# Only `cellar` is converted to an integer feature; the original notebook had
# the same conversion for balcony, garden, kitchen and private commented out.
df3["cellar"] = df3["cellar"].astype(int)


# In[42]:

X = pd.concat((df3[["sqm", "rooms", "area", "cellar"]], hot_zip), axis=1)
# 1-D target: the estimators then return 1-D predictions that can be stored
# directly as a DataFrame column.
y = df3["rent"]


# In[43]:

print(X.head())


# In[44]:

# a simple linear model doesn't perform too well
# (note: this R^2 is computed on the training data itself)
simple = linear_model.LinearRegression()
simple.fit(X, y)
y_pred = simple.predict(X)
print(r2_score(y, y_pred))


# In[45]:

# Robustly fit linear model with RANSAC (RANdom SAmple Consensus) algorithm
regressor = linear_model.RANSACRegressor(linear_model.LinearRegression(),
                                         random_state=RANDOM_STATE)


# In[46]:

regressor.fit(X, y)


# In[47]:

inlier_mask = regressor.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
# Share of samples RANSAC classified as outliers, in percent.
print(u'%.1f%% der Wohnungen als Ausreißer identifiziert'
      % (outlier_mask.sum() * 100.0 / outlier_mask.size))


# In[48]:

y_pred = regressor.predict(X)
df3["rent_predicted"] = y_pred


# In[49]:

# R^2 of the RANSAC fit, evaluated on the inliers only.
r2_ransac = r2_score(y[inlier_mask], y_pred[inlier_mask])


# In[51]:

fig, ax = plt.subplots(figsize=(15, 10))
# x/y are passed by keyword: newer seaborn releases reject positional data.
sns.regplot(x=df3.loc[inlier_mask, "rent"],
            y=df3.loc[inlier_mask, "rent_predicted"],
            ax=ax)
ax.scatter(df3.loc[outlier_mask, "rent"],
           df3.loc[outlier_mask, "rent_predicted"],
           alpha=0.4, c='r')
# NOTE(review): the title lists zip_code, rooms and sqm, but the model above
# is also fitted on `area` and `cellar` -- confirm the intended wording.
ax.set_title('Prediction of price for houses in Brandenburg\n(zip_code, rooms, sqm)')
ax.text(250, 2000, r'$R^2=%.2f$' % r2_ransac)
plt.tight_layout()
# plt.savefig('LinReg-rent-apartment.png', dpi=150)


# In[52]:

# Positive error: actual rent is above the prediction; negative: below it.
df3["rent_error"] = df3["rent"] - df3["rent_predicted"]


# In[53]:

top = df3.sort_values("rent_error")


# In[54]:

print(top)


# In[56]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=RANDOM_STATE)


# In[57]:

tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2,
                     scoring="r2", random_state=RANDOM_STATE)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))


# In[59]:

# Held-out R^2 of the pipeline TPOT selected.
y_pred_test = tpot.predict(X_test)
print(r2_score(y_test, y_pred_test))


# In[ ]: