import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
# loading the dataset directly from sklearn (available up to scikit-learn 1.1; see the fallback below)
boston = datasets.load_boston()
X = boston.data
y = boston.target
print(boston.feature_names)
print(boston.DESCR)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**

    :Number of Instances: 506

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers
that address regression problems.

.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan, R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
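Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions the same data can still be assembled by hand; the sketch below follows the loading snippet given in the scikit-learn deprecation notice and needs network access to lib.stat.cmu.edu:

# fallback for scikit-learn >= 1.2, where load_boston no longer exists
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# data rows come in pairs: the first carries 11 features, the second the
# last 2 features plus MEDV, which is the regression target
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]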
# put the features in a DataFrame and compute their pairwise correlations
bos = pd.DataFrame(boston.data, columns=boston.feature_names)
correlation_matrix = bos.corr().round(2)
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(ax=ax, data=correlation_matrix, annot=True, cmap="YlGnBu")
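Since the heatmap itself is image output that is not reproduced here, a quick text view of the strongest off-diagonal correlations can be handy. A sketch (pairs is a name introduced here, and each pair appears twice because the matrix is symmetric):

# mask the diagonal, then rank entries by absolute correlation;
# stack() drops the NaN diagonal entries
pairs = correlation_matrix.where(~np.eye(len(correlation_matrix), dtype=bool))
print(pairs.stack().abs().sort_values(ascending=False).head(6))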
# hold out 40% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
plt.subplots(figsize=(5,5))
plt.xlim(0., 1.1*np.max(y_test))
plt.ylim(0., 1.1*np.max(y_test))
plt.xlabel("Actual house price (k$)", fontsize=16)
plt.ylabel("Predited house price (k$)", fontsize=16)
plt.scatter(y_test, y_pred_lr)
# reference diagonal: points on it would be perfect predictions
xv = np.linspace(0., 1.1*np.max(y_test), 100)
plt.plot(xv, xv)
rms = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print(rms)
4.672548554009624
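cross_val_score is imported above but never used; a 5-fold cross-validated RMSE for the same baseline depends less on one particular split. A sketch (the value it prints is not reproduced here):

# average negative MSE over 5 folds, then convert back to an RMSE
scores = cross_val_score(LinearRegression(), X, y,
                         scoring='neg_mean_squared_error', cv=5)
print(np.sqrt(-scores.mean()))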
from sklearn.neural_network import MLPRegressor
# one hidden layer of 100 logistic units; note the trailing comma:
# (100,) is a one-element tuple, bare (100) is just the int 100
mlp = MLPRegressor(hidden_layer_sizes=(100,), activation='logistic', random_state=1, max_iter=5000)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
plt.subplots(figsize=(5,5))
plt.xlim(0., 1.1*np.max(y_test))
plt.ylim(0., 1.1*np.max(y_test))
plt.xlabel("Actual house price (k$)", fontsize=16)
plt.ylabel("Predited house price (k$)", fontsize=16)
plt.scatter(y_test, y_pred_mlp)
xv = np.linspace(0., 1.1*np.max(y_test), 100)
plt.plot(xv, xv, 'black')
rms = np.sqrt(mean_squared_error(y_test, y_pred_mlp))
print(f"root mean square error {rms:.2f}")
plt.savefig("boston_house_prices.pdf")
root mean square error 4.07
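MLPs are sensitive to input scale, and the Boston features span very different ranges (TAX in the hundreds, NOX below one), so standardizing often helps. A sketch using a Pipeline so the scaler is fit on the training data only; scaled_mlp is a name introduced here and its RMSE is not reproduced:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# same network as above, but fed standardized features
scaled_mlp = make_pipeline(StandardScaler(),
                           MLPRegressor(hidden_layer_sizes=(100,), activation='logistic',
                                        random_state=1, max_iter=5000))
scaled_mlp.fit(X_train, y_train)
print(np.sqrt(mean_squared_error(y_test, scaled_mlp.predict(X_test))))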
# plot the training loss per epoch, as recorded in mlp.loss_curve_
plt.plot(mlp.loss_curve_)
plt.xlabel('epochs', fontsize=16)
plt.ylabel('training loss', fontsize=16)
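The fitted model also exposes scalar convergence diagnostics; both attributes below are standard on MLPRegressor:

# n_iter_: epochs actually run; best_loss_: lowest training loss reached
print(mlp.n_iter_, mlp.best_loss_)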
from sklearn.model_selection import GridSearchCV
# search over the width of the single hidden layer with 5-fold CV
param_grid = [
    {'hidden_layer_sizes': [(50,), (100,), (200,), (300,)]}
]
rgr = GridSearchCV(
    MLPRegressor(activation='logistic', random_state=1, max_iter=5000),
    param_grid, scoring='neg_mean_squared_error', cv=5)
rgr.fit(X_train, y_train)
rgr.cv_results_
{'mean_fit_time': array([3.40978374, 3.23036819, 3.21716728, 3.04543743]),
 'std_fit_time': array([0.78036379, 0.27373415, 0.85777762, 1.01502716]),
 'mean_score_time': array([0.00091372, 0.00087595, 0.0012547 , 0.00152955]),
 'std_score_time': array([2.35679031e-04, 7.47978020e-05, 2.05420131e-04, 3.88200935e-04]),
 'param_hidden_layer_sizes': masked_array(data=[(50,), (100,), (200,), (300,)],
                                          mask=[False, False, False, False],
                                          fill_value='?', dtype=object),
 'params': [{'hidden_layer_sizes': (50,)}, {'hidden_layer_sizes': (100,)},
            {'hidden_layer_sizes': (200,)}, {'hidden_layer_sizes': (300,)}],
 'split0_test_score': array([-28.69741356, -28.1151561 , -29.65411621, -30.70174501]),
 'split1_test_score': array([-47.20081903, -23.71839834, -24.9899842 , -23.6763428 ]),
 'split2_test_score': array([-31.47135197, -27.84980916, -26.52270407, -26.07012025]),
 'split3_test_score': array([-15.71402673, -15.77632564, -15.30586113, -17.46546595]),
 'split4_test_score': array([-31.39937117, -29.31457562, -25.91779979, -30.22523071]),
 'mean_test_score': array([-30.89659649, -24.95485297, -24.47809308, -25.62778094]),
 'std_test_score': array([10.01756087, 4.96285508, 4.84636516, 4.84786173]),
 'rank_test_score': array([4, 2, 1, 3], dtype=int32)}
# convert the (negative) mean CV scores back to an RMSE per grid point
np.sqrt(-rgr.cv_results_['mean_test_score'])
array([5.5584707 , 4.99548326, 4.94753404, 5.06238886])
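GridSearchCV also records the winner directly; (200,) ranks first in cv_results_ above, and its CV RMSE matches the third entry of the array:

print(rgr.best_params_)           # {'hidden_layer_sizes': (200,)}
print(np.sqrt(-rgr.best_score_))  # about 4.95, the best CV RMSE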