In [1]:
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import seaborn as sns
/Users/andi/.local/share/virtualenvs/ImmoSpider-M0SYrQN4/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/Users/andi/.local/share/virtualenvs/ImmoSpider-M0SYrQN4/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
In [2]:
%matplotlib inline
In [3]:
df = pd.read_csv('apartments_train.csv')
len(df)
Out[3]:
31932
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31932 entries, 0 to 31931
Data columns (total 24 columns):
address         26863 non-null object
area            0 non-null float64
balcony         31932 non-null bool
cellar          0 non-null float64
city            31932 non-null object
contact_name    25134 non-null object
district        31932 non-null object
extra_costs     31932 non-null float64
garden          31932 non-null bool
immo_id         31932 non-null int64
kitchen         31932 non-null bool
lat             26740 non-null float64
lng             26740 non-null float64
media_count     31932 non-null int64
private         31932 non-null bool
rent            31932 non-null float64
rooms           31932 non-null float64
sqm             31932 non-null float64
time_dest       0 non-null float64
time_dest2      0 non-null float64
time_dest3      0 non-null float64
title           31932 non-null object
url             31932 non-null object
zip_code        31932 non-null int64
dtypes: bool(4), float64(11), int64(3), object(6)
memory usage: 5.0+ MB
In [5]:
df.head()
Out[5]:
address area balcony cellar city contact_name district extra_costs garden immo_id ... private rent rooms sqm time_dest time_dest2 time_dest3 title url zip_code
0 Holteistraße 17 NaN True NaN Berlin Felix Henne Friedrichshain (Friedrichshain) 360.0 False 104509324 ... False 1095.00 3.0 82.52 NaN NaN NaN Wohnen am Stadtpark https://www.immobilienscout24.de/expose/104509324 10245
1 Kremmener Straße 11 NaN True NaN Berlin Ihr Makler in Berlin! Mitte (Mitte) 282.0 False 102537923 ... False 1824.00 3.0 102.48 NaN NaN NaN Außergewöhnliches Industrieloft mit 2 edlen Bä... https://www.immobilienscout24.de/expose/102537923 10435
2 Rudolf-Schwarz-Str. 26 NaN False NaN Berlin NaN Prenzlauer Berg (Prenzlauer Berg) 137.0 False 106461482 ... False 655.00 2.0 54.56 NaN NaN NaN Gemütliche 2 Zimmer Wohnung in Prenzlauer Berg... https://www.immobilienscout24.de/expose/106461482 10407
3 NaN NaN True NaN Berlin Michael Latteck Wilmersdorf (Wilmersdorf) 122.0 False 106325874 ... False 899.85 2.0 59.99 NaN NaN NaN attraktive Pärchen-Wohnung am Güntzel-Kiez! Pa... https://www.immobilienscout24.de/expose/106325874 10717
4 Bredowstr. 5 NaN False NaN Berlin Vermietungsteam Stadthaus Verwaltungsgesellschaft Tiergarten (Tiergarten) 97.0 False 106316967 ... False 482.00 1.0 37.59 NaN NaN NaN Moabit! - Westfälisches Viertel! - 1 Zimmer - ... https://www.immobilienscout24.de/expose/106316967 10551

5 rows × 24 columns

Data Cleaning

In [6]:
df.immo_id = df.immo_id.astype(str)
df.zip_code = df.zip_code.astype(str)
df2=df.drop(columns=["time_dest", "time_dest2", "time_dest3", "area", "cellar"])
df2.describe()
Out[6]:
area cellar extra_costs lat lng media_count rent rooms sqm
count 0.0 0.0 31932.000000 26740.000000 26740.000000 31932.000000 31932.000000 31932.000000 31932.000000
mean NaN NaN 221.975585 52.508081 13.396624 10.970468 979.486026 2.490923 78.363778
std NaN NaN 1396.253013 0.045758 0.098108 6.313705 6761.499857 1.220792 44.324234
min NaN NaN 0.000000 52.373770 13.116530 0.000000 0.000000 1.000000 0.000000
25% NaN NaN 130.000000 52.479450 13.330520 7.000000 520.000000 2.000000 53.760000
50% NaN NaN 180.000000 52.510890 13.395005 10.000000 719.000000 2.000000 68.900000
75% NaN NaN 250.000000 52.537560 13.452780 14.000000 1098.000000 3.000000 92.142500
max NaN NaN 208597.090000 52.656390 13.726850 77.000000 1000000.000000 100.000000 2489.660000

Interesting statistics

Providers with most apartment offers

In [7]:
df2.groupby("contact_name").size().sort_values(ascending=False)
Out[7]:
contact_name
Vermietungsteam Stadthaus Verwaltungsgesellschaft            948
Britta C. Gruttmann                                          417
Ingo Pries                                                   393
Petra Hubertus                                               348
Manuel Lan                                                   298
Melanie Schröder                                             283
Sandra Smarslik                                              281
Thomas Voss                                                  267
Schönlife Immobilien                                         265
Kathrin Aschurow                                             253
Ihr Makler in Berlin!                                        252
Alexandra Nepomnyashcha                                      223
Sandra Linke                                                 213
Beate Akitoye                                                198
Hans-Jürgen  Kloss                                           194
Pia Unterlauf                                                186
Diana Wilhelm                                                183
BIDDEX IMMOBILIEN                                            172
Petra Hubertus reco immobilienbüro                           171
Andrej Schafranek                                            165
Sonja Stanisch                                               160
Vermietungsteam Wiesenpark Quartier                          158
Ulrich Keiling                                               155
Vermietungsabteilung -  Engel & Völkers Berlin Mitte GmbH    154
Konstanze Mühlbach                                           151
Frau Schulze Vermietung                                      150
Corinne Adolph                                               144
Giulia Valentino                                             139
Dana Marsig                                                  136
Michael Chibac                                               135
                                                            ... 
Dirk Friebe                                                    1
Dirk Elsner                                                    1
Vanessa Murri                                                  1
Maik Wehner                                                    1
Dipl.-Ing. Ulrike Stübner                                      1
Ehrhard Hawelka                                                1
MaklerBüro Wille  Frau Schardt- [email protected]       1
Manfred Klinkert                                               1
Manuel Hennings                                                1
Sven Tillmanns                                                 1
Dieter Huber                                                   1
Manuel Martinez                                                1
Maciej Auda                                                    1
MICHAEL DIETRICH                                               1
Dirk Wippermann                                                1
M. Solms                                                       1
M. Kleinschmidt Immobilien                                     1
Vanessa Mummert                                                1
Dominique McRae                                                1
Doreen  Smolarek                                               1
Doreen Kärgel                                                  1
M. Karge                                                       1
M. Jens                                                        1
Luise Schirmer                                                 1
Dorothea Friedrich                                             1
Lucia Demmel                                                   1
Liebermann Vernen                                              1
Liane Polzin                                                   1
Leipzig Engel & Völkers                                        1
Simone Burgold                                                 1
Length: 1656, dtype: int64

Districts with most apartment offers

In [8]:
df2.groupby("district").size().sort_values(ascending=False)
Out[8]:
district
Mitte (Mitte)                              2535
Charlottenburg (Charlottenburg)            2401
Prenzlauer Berg (Prenzlauer Berg)          1982
Friedrichshain (Friedrichshain)            1825
Neukölln (Neukölln)                        1615
Wedding (Wedding)                          1543
Tiergarten (Tiergarten)                    1524
Wilmersdorf (Wilmersdorf)                  1425
Kreuzberg (Kreuzberg)                      1133
Spandau (Spandau)                          1088
Schöneberg (Schöneberg)                     998
Köpenick (Köpenick)                         764
Lichterfelde (Steglitz)                     679
Weißensee (Weißensee)                       658
Reinickendorf (Reinickendorf)               626
Steglitz (Steglitz)                         590
Pankow (Pankow)                             589
Adlershof (Treptow)                         518
Marzahn (Marzahn)                           450
Niederschönhausen (Pankow)                  409
Hellersdorf (Hellersdorf)                   399
Zehlendorf (Zehlendorf)                     385
Lichtenberg (Lichtenberg)                   379
Karlshorst (Lichtenberg)                    343
Tempelhof (Tempelhof)                       342
Niederschöneweide (Treptow)                 323
Dahlem (Zehlendorf)                         319
Kaulsdorf (Hellersdorf)                     293
Mariendorf (Tempelhof)                      292
Staaken (Spandau)                           277
                                           ... 
Neu-Hohenschönhausen (Hohenschönhausen)     110
Baumschulenweg (Treptow)                    104
Treptow (Treptow)                            94
Mahlsdorf (Hellersdorf)                      92
Wittenau (Reinickendorf)                     92
Französisch Buchholz (Pankow)                84
Grünau (Köpenick)                            75
Kladow (Spandau)                             74
Nikolassee (Zehlendorf)                      73
Wannsee (Zehlendorf)                         64
Siemensstadt (Spandau)                       63
Biesdorf (Marzahn)                           55
Plänterwald (Treptow)                        52
Frohnau (Reinickendorf)                      51
Hermsdorf (Reinickendorf)                    40
Buch (Pankow)                                35
Lübars (Reinickendorf)                       28
Haselhorst (Spandau)                         28
Konradshöhe (Reinickendorf)                  27
Rahnsdorf (Köpenick)                         26
Heinersdorf (Weißensee)                      23
Heiligensee (Reinickendorf)                  22
Gatow (Spandau)                              19
Müggelheim (Köpenick)                        13
Bohnsdorf (Treptow)                          12
Blankenburg (Weißensee)                       9
Schmöckwitz (Köpenick)                        6
Rummelsburg (Lichtenberg)                     4
Malchow (Hohenschönhausen)                    2
Wartenberg (Hohenschönhausen)                 1
Length: 79, dtype: int64
In [15]:
df3=df2[(df2.rent<2000) & (df2.sqm<200) & (df2.rooms < 10) & (df2.extra_costs < 2000)]
len(df3)
Out[15]:
30011
In [16]:
df3.plot(x="sqm", y="rent", c="rooms", kind="scatter", colormap=cm.Set1, figsize=(15,10))
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x11de9cc18>
In [17]:
df3.plot(x="lng", y="lat", c="rent", kind="scatter", figsize=(15,10), colormap=cm.Blues)
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x120c73438>
In [18]:
df3.hist(bins=20,figsize=(15,10), column=["extra_costs","lat","lng","media_count","rent","rooms","sqm"])
Out[18]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x121451f98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x121489048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1217216d8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x121748d68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12177a3c8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12177a400>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1217d50f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1217fb780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x121824e10>]],
      dtype=object)
In [19]:
corr=df3.corr()
In [20]:
fig, ax = plt.subplots(figsize=(15,10)) 
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns, center=0.0, 
            cmap=sns.diverging_palette(5, 250, as_cmap=True), annot=True, fmt=".2f", ax=ax)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1218e7470>
In [22]:
# df4=df3.dropna()
sns.pairplot(df3, vars=["sqm","rooms","rent","media_count", "extra_costs"], kind="reg")
/Users/andi/.local/share/virtualenvs/ImmoSpider-M0SYrQN4/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[22]:
<seaborn.axisgrid.PairGrid at 0x11b9952e8>
In [25]:
import numpy as np
from sklearn import linear_model
from sklearn.metrics import r2_score
In [26]:
# One-hot encoding of zip_codes
hot_zip = pd.get_dummies(df3.zip_code)
df3.balcony = df3.balcony.astype(int)
df3.garden = df3.garden.astype(int)
df3.kitchen = df3.kitchen.astype(int)
df3.private = df3.private.astype(int)
/Users/andi/.local/share/virtualenvs/ImmoSpider-M0SYrQN4/lib/python3.6/site-packages/pandas/core/generic.py:4405: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
In [27]:
X = pd.concat((df3[["sqm","rooms","balcony","garden","kitchen","private"]], hot_zip), axis=1)
y = df3[["rent"]]
In [28]:
X.head()
Out[28]:
sqm rooms balcony garden kitchen private 10059 10115 10117 10119 ... 14167 14169 14179 14193 14195 14197 14199 14974 19719 20147
0 82.52 3.0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 102.48 3.0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 54.56 2.0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 59.99 2.0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 37.59 1.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 241 columns

In [29]:
# a simple linear model doesn't perform too well
simple = linear_model.LinearRegression()
simple.fit(X,y)
y_pred = simple.predict(X)
r2_score(y, y_pred)
/Users/andi/.local/share/virtualenvs/ImmoSpider-M0SYrQN4/lib/python3.6/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  linalg.lstsq(X, y)
Out[29]:
0.805445021495854
In [30]:
# Robustly fit linear model with RANSAC (RANdom SAmple Consensus) algorithm 
regressor = linear_model.RANSACRegressor(linear_model.LinearRegression())
In [31]:
regressor.fit(X,y)
Out[31]:
RANSACRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
        is_data_valid=None, is_model_valid=None, loss='absolute_loss',
        max_skips=inf, max_trials=100, min_samples=None, random_state=None,
        residual_metric=None, residual_threshold=None, stop_n_inliers=inf,
        stop_probability=0.99, stop_score=inf)
In [55]:
inlier_mask = regressor.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
print(u'%.1f%% der Wohnungen als Ausreißer identifiziert' % (sum(outlier_mask)*100.0/(sum(outlier_mask)+sum(inlier_mask))))
24.0% der Wohnungen als Ausreißer identifiziert
In [56]:
y_pred = regressor.predict(X)
df3["rent_predicted"] = y_pred
/Users/andi/.local/share/virtualenvs/ImmoSpider-M0SYrQN4/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
In [57]:
r2_ransac = r2_score(y[inlier_mask], y_pred[inlier_mask])
In [60]:
fig, ax = plt.subplots(figsize=(15,10)) 
sns.regplot(df3['rent'][inlier_mask],
            df3['rent_predicted'][inlier_mask])
plt.scatter(df3['rent'][outlier_mask],
            df3['rent_predicted'][outlier_mask],
            alpha=0.4,
            c='r')

plt.title('Prediction of rent for apartments in Berlin')
plt.text(250, 2000, r'$R^2=%.2f$' % r2_ransac)
plt.tight_layout()
# plt.savefig('LinReg-rent-apartment.png', dpi=150)
/Users/andi/.local/share/virtualenvs/ImmoSpider-M0SYrQN4/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval