import pandas as pd
df = pd.read_csv("train.csv")
df['Target']
0       4
1       4
2       4
3       4
4       4
       ..
9552    2
9553    2
9554    2
9555    2
9556    2
Name: Target, Length: 9557, dtype: int64
df.dtypes
Id                  object
v2a1               float64
hacdor               int64
rooms                int64
hacapo               int64
                    ...
SQBovercrowding    float64
SQBdependency      float64
SQBmeaned          float64
agesq                int64
Target               int64
Length: 143, dtype: object
import sys
!"{sys.executable}" -m pip install --upgrade sklearn
Requirement already up-to-date: sklearn in /usr/local/lib/python3.7/site-packages (0.0)
Requirement already satisfied, skipping upgrade: scikit-learn in /usr/local/lib/python3.7/site-packages (from sklearn) (0.22.2.post1)
Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in /usr/local/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.14.5)
Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in /usr/local/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.1.0)
Requirement already satisfied, skipping upgrade: joblib>=0.11 in /usr/local/lib/python3.7/site-packages (from scikit-learn->sklearn) (0.14.1)
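An aside: the PyPI name `sklearn` is only a thin placeholder package that depends on `scikit-learn`, which is why pip reports it as version 0.0 above. A minimal equivalent of the cell above that installs the real package directly:

!"{sys.executable}" -m pip install --upgrade scikit-learn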
df_short = df[['idhogar', 'bedrooms',
'rooms', 'age', 'male',
'meaneduc', 'Target']]
import seaborn as sns
%matplotlib inline
sns.pairplot(df_short.drop('idhogar', axis=1))
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:29: RuntimeWarning: invalid value encountered in reduce
  return umr_minimum(a, axis, None, out, keepdims)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:26: RuntimeWarning: invalid value encountered in reduce
  return umr_maximum(a, axis, None, out, keepdims)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-a7da63723f18> in <module>()
----> 1 sns.pairplot(df_short.drop('idhogar', axis=1))

[... intermediate frames in seaborn/axisgrid.py, matplotlib/pyplot.py,
matplotlib/axes/_axes.py, and numpy/lib/function_base.py elided ...]

ValueError: max must be larger than min in range parameter.
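The traceback looks intimidating, but the root cause is simple: the diagonal histograms cannot compute a bin range for a column containing NaNs, and `meaneduc` has missing values (confirmed by the `isna()` check further down). A minimal sketch that works around it by dropping incomplete rows before plotting:

sns.pairplot(df_short.drop('idhogar', axis=1).dropna())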
df_short.plot.scatter(x='age', y='Target')
<matplotlib.axes._subplots.AxesSubplot at 0x10cee8320>
sns.boxplot(y='age', x='Target', data=df_short)
<matplotlib.axes._subplots.AxesSubplot at 0x10d02beb8>
sns.violinplot(y='age', x='Target', data=df_short)
<matplotlib.axes._subplots.AxesSubplot at 0x10b27ff28>
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
X = df_short[['age']]
y = df_short['Target']
linreg.fit(X, y)
/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_base.py:533: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  linalg.lstsq(X, y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
linreg.coef_
array([0.00549434])
linreg.intercept_
3.1138160973114517
df_short['Target'].mean()
3.302291514073454
df_short['Target'].value_counts()
4    5996
2    1597
3    1209
1     755
Name: Target, dtype: int64
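Class 4 accounts for 5996 of 9557 rows, roughly 63%, which explains why the mean sits above 3.3. A quick sketch (not in the original run) that shows the imbalance as proportions instead of raw counts:

df_short['Target'].value_counts(normalize=True)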
import numpy as np
x = np.linspace(df_short['age'].min(), df_short['age'].max(), 200)
linreg.predict(np.array([[10], [20]]))
array([3.16875952, 3.22370294])
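These two predictions can be reproduced by hand from the fitted parameters as intercept + coef * age; for age 10 that is 3.1138 + 0.00549 * 10 ≈ 3.1688, matching the array above:

linreg.intercept_ + linreg.coef_[0] * np.array([10, 20])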
import matplotlib.pyplot as plt
plt.plot(df_short['age'], df_short['Target'], 'o', alpha=0.1)
plt.plot(x, linreg.predict(x.reshape(-1, 1)))
[<matplotlib.lines.Line2D at 0x10b4fa1d0>]
linreg2 = LinearRegression()
df_short.isna().any()
idhogar     False
bedrooms    False
rooms       False
age         False
male        False
meaneduc     True
Target      False
dtype: bool
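Only `meaneduc` has missing values. A quick follow-up check (not in the original run) to see how many rows are affected:

df_short['meaneduc'].isna().sum()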
df_short['meaneduc'] = df_short['meaneduc'].fillna(0)
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
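The warning appears because `df_short` is a slice of `df`, so pandas cannot tell whether the assignment reaches the original frame. One way to avoid it, sketched here, is to take an explicit copy before mutating (assigning through `.loc` works as well):

df_short = df_short.copy()   # break the link to df before assigning
df_short['meaneduc'] = df_short['meaneduc'].fillna(0)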
linreg2.fit(df_short.drop(['idhogar', 'Target'], axis=1),
df_short['Target'])
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
linreg2
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
((linreg.predict(X) - y) ** 2).mean()
1.0050158300039478
((linreg2.predict(df_short.drop(['idhogar', 'Target'], axis=1)) - y) ** 2).mean()
0.8624010835941041
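The same numbers are available through scikit-learn's built-in metric; a cross-check sketch:

from sklearn.metrics import mean_squared_error
mean_squared_error(y, linreg.predict(X))   # ≈ 1.005
mean_squared_error(y, linreg2.predict(df_short.drop(['idhogar', 'Target'], axis=1)))   # ≈ 0.862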
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
df_short.drop(['idhogar', 'Target'], axis=1),
y,
)
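Note that `train_test_split` shuffles randomly and holds out 25% of the rows by default, so every MSE below will shift slightly from run to run. A sketch of the same split with the defaults made explicit and a seed pinned for reproducibility (the seed value 0 is arbitrary):

X_train, X_test, y_train, y_test = train_test_split(
    df_short.drop(['idhogar', 'Target'], axis=1),
    y,
    test_size=0.25,   # the default holdout fraction, spelled out
    random_state=0,   # arbitrary seed so the split reproduces
)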
linreg2.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
((linreg2.predict(X_test) - y_test) ** 2).mean()
0.8823821201209054
((linreg2.predict(X_train) - y_train) ** 2).mean()
0.855857284896824
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=2)
tree.fit(X_train, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best')
from sklearn.tree import plot_tree
plot_tree(tree, feature_names=X_train.columns, fontsize=12)
[Text(170.9,187.033,'meaneduc <= 6.708\nmse = 1.017\nsamples = 7167\nvalue = 3.3'),
 Text(85.45,112.22,'age <= 14.5\nmse = 1.2\nsamples = 2178\nvalue = 2.816'),
 Text(42.725,37.4067,'mse = 1.129\nsamples = 463\nvalue = 2.397'),
 Text(128.175,37.4067,'mse = 1.159\nsamples = 1715\nvalue = 2.929'),
 Text(256.35,112.22,'meaneduc <= 10.292\nmse = 0.79\nsamples = 4989\nvalue = 3.512'),
 Text(213.625,37.4067,'mse = 0.985\nsamples = 2366\nvalue = 3.31'),
 Text(299.075,37.4067,'mse = 0.544\nsamples = 2623\nvalue = 3.694')]
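The `Text` objects above are just the labels of the rendered figure. For a transcript, scikit-learn's `export_text` gives a more readable view of the same tree; a sketch:

from sklearn.tree import export_text
print(export_text(tree, feature_names=list(X_train.columns)))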
((tree.predict(X_test) - y_test) ** 2).mean()
0.9157655392666136
((tree.predict(X_train) - y_train) ** 2).mean()
0.8744096213243181
tree = DecisionTreeRegressor(max_depth=10)
tree.fit(X_train, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best')
((tree.predict(X_test) - y_test) ** 2).mean()
0.9751601165366853
((tree.predict(X_train) - y_train) ** 2).mean()
0.58269011250063
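Compared with the depth-2 tree, the deeper tree fits the training set far better (0.58 vs 0.87) yet does worse on held-out data (0.98 vs 0.92): the classic overfitting signature. A quick sketch (not in the original run) that states the gap directly:

test_mse = ((tree.predict(X_test) - y_test) ** 2).mean()
train_mse = ((tree.predict(X_train) - y_train) ** 2).mean()
print(f"generalization gap at depth 10: {test_mse - train_mse:.3f}")   # ≈ 0.39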
test_mses = []
train_mses = []
depths = range(1, 20)
for max_depth in depths:
tree = DecisionTreeRegressor(max_depth=max_depth)
tree.fit(X_train, y_train)
test_mses.append(((tree.predict(X_test) - y_test) ** 2).mean())
train_mses.append(((tree.predict(X_train) - y_train) ** 2).mean())
plt.plot(depths, test_mses, label="test mse")
plt.plot(depths, train_mses, label="train mse")
plt.legend()
<matplotlib.legend.Legend at 0x1129dc0f0>
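The curves behave in the usual way: train MSE falls monotonically with depth while test MSE bottoms out at a shallow depth and then climbs. A sketch (exact values depend on the unseeded split above) for reading the best depth off the sweep programmatically:

best_depth = depths[int(np.argmin(test_mses))]   # depth with the lowest test MSE
print(best_depth, min(test_mses))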