import pandas as pd
df = pd.read_csv("train.csv")
df['Target']
0       4
1       4
2       4
3       4
4       4
       ..
9552    2
9553    2
9554    2
9555    2
9556    2
Name: Target, Length: 9557, dtype: int64
df.dtypes
Id                  object
v2a1               float64
hacdor               int64
rooms                int64
hacapo               int64
                    ...
SQBovercrowding    float64
SQBdependency      float64
SQBmeaned          float64
agesq                int64
Target               int64
Length: 143, dtype: object
import sys
!"{sys.executable}" -m pip install --upgrade sklearn
Requirement already up-to-date: sklearn in /usr/local/lib/python3.7/site-packages (0.0)
Requirement already satisfied, skipping upgrade: scikit-learn in /usr/local/lib/python3.7/site-packages (from sklearn) (0.22.2.post1)
Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in /usr/local/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.14.5)
Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in /usr/local/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.1.0)
Requirement already satisfied, skipping upgrade: joblib>=0.11 in /usr/local/lib/python3.7/site-packages (from scikit-learn->sklearn) (0.14.1)
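An aside: the PyPI name `sklearn` is only a thin placeholder package that depends on `scikit-learn`, which is why pip reports it as version 0.0 above. A minimal equivalent of the cell above that installs the real package directly:

!"{sys.executable}" -m pip install --upgrade scikit-learn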
df_short = df[['idhogar', 'bedrooms',
'rooms', 'age', 'male',
'meaneduc', 'Target']]
import seaborn as sns
%matplotlib inline
sns.pairplot(df_short.drop('idhogar', axis=1))
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:29: RuntimeWarning: invalid value encountered in reduce
  return umr_minimum(a, axis, None, out, keepdims)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:26: RuntimeWarning: invalid value encountered in reduce
  return umr_maximum(a, axis, None, out, keepdims)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-a7da63723f18> in <module>()
----> 1 sns.pairplot(df_short.drop('idhogar', axis=1))

[... intermediate frames in seaborn/axisgrid.py, matplotlib/pyplot.py,
matplotlib/axes/_axes.py, and numpy/lib/function_base.py elided ...]

ValueError: max must be larger than min in range parameter.
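The traceback looks intimidating, but the root cause is simple: the diagonal histograms cannot compute a bin range for a column containing NaNs, and `meaneduc` has missing values (confirmed by the `isna()` check further down). A minimal sketch that works around it by dropping incomplete rows before plotting:

sns.pairplot(df_short.drop('idhogar', axis=1).dropna())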
df_short.plot.scatter(x='age', y='Target')
<matplotlib.axes._subplots.AxesSubplot at 0x10cee8320>
sns.boxplot(y='age', x='Target', data=df_short)
<matplotlib.axes._subplots.AxesSubplot at 0x10d02beb8>
sns.violinplot(y='age', x='Target', data=df_short)
<matplotlib.axes._subplots.AxesSubplot at 0x10b27ff28>
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
X = df_short[['age']]
y = df_short['Target']
linreg.fit(X, y)
/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_base.py:533: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  linalg.lstsq(X, y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
linreg.coef_
array([0.00549434])
linreg.intercept_
3.1138160973114517
df_short['Target'].mean()
3.302291514073454
df_short['Target'].value_counts()
4    5996
2    1597
3    1209
1     755
Name: Target, dtype: int64
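Class 4 accounts for 5996 of 9557 rows, roughly 63%, which explains why the mean sits above 3.3. A quick sketch (not in the original run) that shows the imbalance as proportions instead of raw counts:

df_short['Target'].value_counts(normalize=True)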
import numpy as np
x = np.linspace(df_short['age'].min(), df_short['age'].max(), 200)
linreg.predict(np.array([[10], [20]]))
array([3.16875952, 3.22370294])
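These two predictions can be reproduced by hand from the fitted parameters as intercept + coef * age; for age 10 that is 3.1138 + 0.00549 * 10 ≈ 3.1688, matching the array above:

linreg.intercept_ + linreg.coef_[0] * np.array([10, 20])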
import matplotlib.pyplot as plt
plt.plot(df_short['age'], df_short['Target'], 'o', alpha=0.1)
plt.plot(x, linreg.predict(x.reshape(-1, 1)))
[<matplotlib.lines.Line2D at 0x10b4fa1d0>]
linreg2 = LinearRegression()
df_short.isna().any()
idhogar     False
bedrooms    False
rooms       False
age         False
male        False
meaneduc     True
Target      False
dtype: bool
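Only `meaneduc` has missing values. A quick follow-up check (not in the original run) to see how many rows are affected:

df_short['meaneduc'].isna().sum()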
df_short['meaneduc'] = df_short['meaneduc'].fillna(0)
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
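The warning appears because `df_short` is a slice of `df`, so pandas cannot tell whether the assignment reaches the original frame. One way to avoid it, sketched here, is to take an explicit copy before mutating (assigning through `.loc` works as well):

df_short = df_short.copy()   # break the link to df before assigning
df_short['meaneduc'] = df_short['meaneduc'].fillna(0)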
linreg2.fit(df_short.drop(['idhogar', 'Target'], axis=1),
df_short['Target'])
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
linreg2
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
((linreg.predict(X) - y) ** 2).mean()
1.0050158300039478
((linreg2.predict(df_short.drop(['idhogar', 'Target'], axis=1)) - y) ** 2).mean()
0.8624010835941041
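The same numbers are available through scikit-learn's built-in metric; a cross-check sketch:

from sklearn.metrics import mean_squared_error
mean_squared_error(y, linreg.predict(X))   # ≈ 1.005
mean_squared_error(y, linreg2.predict(df_short.drop(['idhogar', 'Target'], axis=1)))   # ≈ 0.862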
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
df_short.drop(['idhogar', 'Target'], axis=1),
y,
)
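Note that `train_test_split` shuffles randomly and holds out 25% of the rows by default, so every MSE below will shift slightly from run to run. A sketch of the same split with the defaults made explicit and a seed pinned for reproducibility (the seed value 0 is arbitrary):

X_train, X_test, y_train, y_test = train_test_split(
    df_short.drop(['idhogar', 'Target'], axis=1),
    y,
    test_size=0.25,   # the default holdout fraction, spelled out
    random_state=0,   # arbitrary seed so the split reproduces
)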
linreg2.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
((linreg2.predict(X_test) - y_test) ** 2).mean()
0.8823821201209054
((linreg2.predict(X_train) - y_train) ** 2).mean()
0.855857284896824
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=2)
tree.fit(X_train, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best')
from sklearn.tree import plot_tree
plot_tree(tree, feature_names=X_train.columns, fontsize=12)
[Text(170.9,187.033,'meaneduc <= 6.708\nmse = 1.017\nsamples = 7167\nvalue = 3.3'),
 Text(85.45,112.22,'age <= 14.5\nmse = 1.2\nsamples = 2178\nvalue = 2.816'),
 Text(42.725,37.4067,'mse = 1.129\nsamples = 463\nvalue = 2.397'),
 Text(128.175,37.4067,'mse = 1.159\nsamples = 1715\nvalue = 2.929'),
 Text(256.35,112.22,'meaneduc <= 10.292\nmse = 0.79\nsamples = 4989\nvalue = 3.512'),
 Text(213.625,37.4067,'mse = 0.985\nsamples = 2366\nvalue = 3.31'),
 Text(299.075,37.4067,'mse = 0.544\nsamples = 2623\nvalue = 3.694')]
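The `Text` objects above are just the labels of the rendered figure. For a transcript, scikit-learn's `export_text` gives a more readable view of the same tree; a sketch:

from sklearn.tree import export_text
print(export_text(tree, feature_names=list(X_train.columns)))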
((tree.predict(X_test) - y_test) ** 2).mean()
0.9157655392666136
((tree.predict(X_train) - y_train) ** 2).mean()
0.8744096213243181
tree = DecisionTreeRegressor(max_depth=10)
tree.fit(X_train, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best')
((tree.predict(X_test) - y_test) ** 2).mean()
0.9751601165366853
((tree.predict(X_train) - y_train) ** 2).mean()
0.58269011250063
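Compared with the depth-2 tree, the deeper tree fits the training set far better (0.58 vs 0.87) yet does worse on held-out data (0.98 vs 0.92): the classic overfitting signature. A quick sketch (not in the original run) that states the gap directly:

test_mse = ((tree.predict(X_test) - y_test) ** 2).mean()
train_mse = ((tree.predict(X_train) - y_train) ** 2).mean()
print(f"generalization gap at depth 10: {test_mse - train_mse:.3f}")   # ≈ 0.39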
test_mses = []
train_mses = []
depths = range(1, 20)
for max_depth in depths:
tree = DecisionTreeRegressor(max_depth=max_depth)
tree.fit(X_train, y_train)
test_mses.append(((tree.predict(X_test) - y_test) ** 2).mean())
train_mses.append(((tree.predict(X_train) - y_train) ** 2).mean())
plt.plot(depths, test_mses, label="test mse")
plt.plot(depths, train_mses, label="train mse")
plt.legend()
<matplotlib.legend.Legend at 0x1129dc0f0>
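The curves behave in the usual way: train MSE falls monotonically with depth while test MSE bottoms out at a shallow depth and then climbs. A sketch (exact values depend on the unseeded split above) for reading the best depth off the sweep programmatically:

best_depth = depths[int(np.argmin(test_mses))]   # depth with the lowest test MSE
print(best_depth, min(test_mses))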