import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
# loading the dataset directly from sklearn (available up to scikit-learn 1.1; see the fallback below)
boston = datasets.load_boston()
X = boston.data
y = boston.target
print(boston.feature_names)
print(boston.DESCR)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**

    :Number of Instances: 506

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers
that address regression problems.

.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan, R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
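Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions the same data can still be assembled by hand; the sketch below follows the loading snippet given in the scikit-learn deprecation notice and needs network access to lib.stat.cmu.edu:

# fallback for scikit-learn >= 1.2, where load_boston no longer exists
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# data rows come in pairs: the first carries 11 features, the second the
# last 2 features plus MEDV, which is the regression target
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]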
# put the features in a DataFrame and compute their pairwise correlations
bos = pd.DataFrame(boston.data, columns=boston.feature_names)
correlation_matrix = bos.corr().round(2)
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(ax=ax, data=correlation_matrix, annot=True, cmap="YlGnBu")
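Since the heatmap itself is image output that is not reproduced here, a quick text view of the strongest off-diagonal correlations can be handy. A sketch (pairs is a name introduced here, and each pair appears twice because the matrix is symmetric):

# mask the diagonal, then rank entries by absolute correlation;
# stack() drops the NaN diagonal entries
pairs = correlation_matrix.where(~np.eye(len(correlation_matrix), dtype=bool))
print(pairs.stack().abs().sort_values(ascending=False).head(6))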
# hold out 40% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
plt.subplots(figsize=(5,5))
plt.xlim(0., 1.1*np.max(y_test))
plt.ylim(0., 1.1*np.max(y_test))
plt.xlabel("Actual house price (k$)", fontsize=16)
plt.ylabel("Predited house price (k$)", fontsize=16)
plt.scatter(y_test, y_pred_lr)
# reference diagonal: points on it would be perfect predictions
xv = np.linspace(0., 1.1*np.max(y_test), 100)
plt.plot(xv, xv)
rms = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print(rms)
4.672548554009624
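cross_val_score is imported above but never used; a 5-fold cross-validated RMSE for the same baseline depends less on one particular split. A sketch (the value it prints is not reproduced here):

# average negative MSE over 5 folds, then convert back to an RMSE
scores = cross_val_score(LinearRegression(), X, y,
                         scoring='neg_mean_squared_error', cv=5)
print(np.sqrt(-scores.mean()))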
from sklearn.neural_network import MLPRegressor
# one hidden layer of 100 logistic units; note the trailing comma:
# (100,) is a one-element tuple, bare (100) is just the int 100
mlp = MLPRegressor(hidden_layer_sizes=(100,), activation='logistic', random_state=1, max_iter=5000)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
plt.subplots(figsize=(5,5))
plt.xlim(0., 1.1*np.max(y_test))
plt.ylim(0., 1.1*np.max(y_test))
plt.xlabel("Actual house price (k$)", fontsize=16)
plt.ylabel("Predited house price (k$)", fontsize=16)
plt.scatter(y_test, y_pred_mlp)
xv = np.linspace(0., 1.1*np.max(y_test), 100)
plt.plot(xv, xv, 'black')
rms = np.sqrt(mean_squared_error(y_test, y_pred_mlp))
print(f"root mean square error {rms:.2f}")
plt.savefig("boston_house_prices.pdf")
root mean square error 4.07
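MLPs are sensitive to input scale, and the Boston features span very different ranges (TAX in the hundreds, NOX below one), so standardizing often helps. A sketch using a Pipeline so the scaler is fit on the training data only; scaled_mlp is a name introduced here and its RMSE is not reproduced:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# same network as above, but fed standardized features
scaled_mlp = make_pipeline(StandardScaler(),
                           MLPRegressor(hidden_layer_sizes=(100,), activation='logistic',
                                        random_state=1, max_iter=5000))
scaled_mlp.fit(X_train, y_train)
print(np.sqrt(mean_squared_error(y_test, scaled_mlp.predict(X_test))))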
# plot the training loss per epoch, as recorded in mlp.loss_curve_
plt.plot(mlp.loss_curve_)
plt.xlabel('epochs', fontsize=16)
plt.ylabel('training loss', fontsize=16)
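The fitted model also exposes scalar convergence diagnostics; both attributes below are standard on MLPRegressor:

# n_iter_: epochs actually run; best_loss_: lowest training loss reached
print(mlp.n_iter_, mlp.best_loss_)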
from sklearn.model_selection import GridSearchCV
# search over the width of the single hidden layer with 5-fold CV
param_grid = [
    {'hidden_layer_sizes': [(50,), (100,), (200,), (300,)]}
]
rgr = GridSearchCV(
    MLPRegressor(activation='logistic', random_state=1, max_iter=5000),
    param_grid, scoring='neg_mean_squared_error', cv=5)
rgr.fit(X_train, y_train)
rgr.cv_results_
{'mean_fit_time': array([3.40978374, 3.23036819, 3.21716728, 3.04543743]),
 'std_fit_time': array([0.78036379, 0.27373415, 0.85777762, 1.01502716]),
 'mean_score_time': array([0.00091372, 0.00087595, 0.0012547 , 0.00152955]),
 'std_score_time': array([2.35679031e-04, 7.47978020e-05, 2.05420131e-04, 3.88200935e-04]),
 'param_hidden_layer_sizes': masked_array(data=[(50,), (100,), (200,), (300,)],
                                          mask=[False, False, False, False],
                                          fill_value='?', dtype=object),
 'params': [{'hidden_layer_sizes': (50,)}, {'hidden_layer_sizes': (100,)},
            {'hidden_layer_sizes': (200,)}, {'hidden_layer_sizes': (300,)}],
 'split0_test_score': array([-28.69741356, -28.1151561 , -29.65411621, -30.70174501]),
 'split1_test_score': array([-47.20081903, -23.71839834, -24.9899842 , -23.6763428 ]),
 'split2_test_score': array([-31.47135197, -27.84980916, -26.52270407, -26.07012025]),
 'split3_test_score': array([-15.71402673, -15.77632564, -15.30586113, -17.46546595]),
 'split4_test_score': array([-31.39937117, -29.31457562, -25.91779979, -30.22523071]),
 'mean_test_score': array([-30.89659649, -24.95485297, -24.47809308, -25.62778094]),
 'std_test_score': array([10.01756087, 4.96285508, 4.84636516, 4.84786173]),
 'rank_test_score': array([4, 2, 1, 3], dtype=int32)}
# convert the (negative) mean CV scores back to an RMSE per grid point
np.sqrt(-rgr.cv_results_['mean_test_score'])
array([5.5584707 , 4.99548326, 4.94753404, 5.06238886])
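GridSearchCV also records the winner directly; (200,) ranks first in cv_results_ above, and its CV RMSE matches the third entry of the array:

print(rgr.best_params_)           # {'hidden_layer_sizes': (200,)}
print(np.sqrt(-rgr.best_score_))  # about 4.95, the best CV RMSE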