Data Set Characteristics:
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive attributes. The median value (attribute 14, MEDV) is usually the target.
:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
:Missing Attribute Values: None
:Creator: Harrison, D. and Rubinfeld, D.L.
This is a copy of UCI ML housing dataset. https://archive.ics.uci.edu/ml/machine-learning-databases/housing/
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter.
The Boston house-price data has been used in many machine learning papers that address regression problems.
import sys
assert sys.version_info >= (3, 6)
import numpy
assert numpy.__version__ >= "1.17.3"
import numpy as np
import pandas
assert pandas.__version__ >= "0.25.1"
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# dataset_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
dataset_path = 'data/housing/boston/housing.data'
housing = pd.read_csv(dataset_path, sep=r'\s+', header=None)  # whitespace-separated, no header row
housing.head()
housing.shape
housing.columns = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
housing.head()
Counting the number of missing values for each feature
housing.isnull().sum()
housing.describe()
h = housing.hist(bins=20, figsize=(20,15))
correlation_matrix = np.corrcoef(housing.values.T)
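Equivalently, pandas computes the same Pearson correlation matrix directly and returns it as a labeled DataFrame; this is shown here only as a cross-check.

# Same Pearson correlations as np.corrcoef above, with row/column labels attached.
housing.corr()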
Visualizing the correlation matrix
import seaborn
assert seaborn.__version__ >= '0.9.0'
import seaborn as sns
f, ax = plt.subplots(figsize=(10, 8))
hm = sns.heatmap(data=correlation_matrix,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 yticklabels=housing.columns,
                 xticklabels=housing.columns,
                 linewidths=.1, ax=ax)
# fix for mpl bug that cuts off top/bottom of seaborn viz
# See the discussion here (https://github.com/mwaskom/seaborn/issues/1773)
b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) # update the ylim(bottom, top) values
Let us investigate how RM and LSTAT vary with MEDV
plt.figure(figsize=(20,5))
features = ["RM", "LSTAT"]
for i, col in enumerate(features):
    plt.subplot(1, len(features), i + 1)
    plt.scatter(housing[col], housing['MEDV'])
    plt.xlabel(col)
    plt.title(col)
    plt.ylabel('MEDV')
X = housing['RM'].values.reshape(-1,1)
y = housing['MEDV'].values.reshape(-1,1)
import sklearn
assert sklearn.__version__ >= '0.21.3'
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X, y)
print("Intercept = {}, Slope{}".format(lm.intercept_, lm.coef_))
from sklearn.model_selection import train_test_split
X = pd.DataFrame(np.c_[housing['RM'], housing['LSTAT']], columns=['RM', 'LSTAT'])
y = housing['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
print("X_train.shape {}, X_test.shape {}".format(X_train.shape, X_test.shape))
print("y_train.shape {}, y_test.shape {}".format(y_train.shape, y_test.shape))
lm = LinearRegression()
lm.fit(X_train, y_train)
y_train_predicted = lm.predict(X_train)
y_test_predicted = lm.predict(X_test)
plt.figure(figsize=(6, 5))
plt.scatter(y_train_predicted, y_train_predicted - y_train, color = "blue", label = "Training data")
plt.scatter(y_test_predicted, y_test_predicted - y_test, color = 'red', label = "Test data")
plt.ylabel("Residuals")
plt.xlabel("Predicted values")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='green')
plt.axis([-10,50, -30,20])
On the training set, $R^2$ is bounded between 0 and 1, but on the test set it can become negative: the model may fit unseen data worse than simply predicting the mean. If $R^2 = 1$, the model fits the data perfectly, which corresponds to $MSE = 0$.
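To make the definition concrete, here is a minimal sketch that computes $R^2 = 1 - SS_{res}/SS_{tot}$ by hand; r2_manual is a helper introduced here for illustration, and it should match sklearn's r2_score below.

# R^2 = 1 - SS_res / SS_tot. It goes negative when the model's squared error
# (SS_res) exceeds that of a constant predictor of the mean (SS_tot).
def r2_manual(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1 - ss_res / ss_tot

print('Manual R2 on the test set: %.3f' % r2_manual(y_test, y_test_predicted))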
from sklearn.metrics import mean_squared_error, r2_score
mse_train = mean_squared_error(y_train, y_train_predicted)
mse_test = mean_squared_error(y_test, y_test_predicted)
print('MSE training: %.3f, test: %.3f' %(mse_train, mse_test))
print('RMSE training: %.3f, RMSE test: %.3f' % (np.sqrt(mse_train), np.sqrt(mse_test)))
print('R2 score training: %.3f, R2 score test: %.3f' %(r2_score(y_train, y_train_predicted),
r2_score(y_test, y_test_predicted)))
def plot_learning_curves(model, X_train, X_test, y_train, y_test):
    """Plot the learning curves of the given model using the training and the test sets."""
    train_errors, test_errors = [], []
    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predicted = model.predict(X_train[:m])
        y_test_predicted = model.predict(X_test)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predicted))
        test_errors.append(mean_squared_error(y_test, y_test_predicted))
    plt.plot(np.sqrt(train_errors), 'r-+', linewidth=1, label="Training set")
    plt.plot(np.sqrt(test_errors), 'b-', linewidth=2, label="Test set")
    plt.legend(loc="upper right")
    plt.xlabel('Training set size')
    plt.ylabel('RMSE')
plt.figure(figsize=(6, 4))
plt.title('Predicting Boston housing prices: learning curves')
plot_learning_curves(lm, X_train, X_test, y_train, y_test)
plt.savefig('figures/learning_curves.pdf')
from sklearn.linear_model import Ridge
X = housing.drop('MEDV', axis=1)
y = housing['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
ridge_reg = Ridge(alpha=2, solver="cholesky", random_state=100)
ridge_reg.fit(X_train, y_train)
y_pred_train_ridge = ridge_reg.predict(X_train)
y_pred_ridge = ridge_reg.predict(X_test)
plt.scatter(y_test, y_pred_ridge)
plt.xlabel("Actual value ($Y$)")
plt.ylabel(r"Predicted value ($\hat{Y}$)")
plt.savefig('figures/ridge_regression_plot.pdf')
plt.show()
We can see a diagonal trend, indicating a well-fitting model.
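To back the visual impression with numbers, we can reuse the metrics imported earlier on the ridge predictions.

# Quantify the ridge fit on both splits (complements the scatter plot above).
print('Ridge R2 train: %.3f, test: %.3f' % (r2_score(y_train, y_pred_train_ridge),
                                            r2_score(y_test, y_pred_ridge)))
print('Ridge RMSE train: %.3f, test: %.3f' % (np.sqrt(mean_squared_error(y_train, y_pred_train_ridge)),
                                              np.sqrt(mean_squared_error(y_test, y_pred_ridge))))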
plt.figure(figsize=(6, 4))
plot_learning_curves(ridge_reg, X_train, X_test, y_train, y_test)
plt.savefig('figures/learning_curve_ridge_regression.pdf')
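The alpha=2 above is a hand-picked value. As a sketch, scikit-learn's RidgeCV can select alpha by cross-validation; the candidate grid below is an arbitrary illustration, not a tuned choice.

from sklearn.linear_model import RidgeCV

# Select alpha by (generalized) cross-validation over an illustrative grid.
ridge_cv = RidgeCV(alphas=[0.1, 0.5, 1, 2, 5, 10, 20])
ridge_cv.fit(X_train, y_train)
print('Selected alpha:', ridge_cv.alpha_)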
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.3)
lasso.fit(X_train, y_train)
y_pred_train_lasso = lasso.predict(X_train)
y_pred_lasso = lasso.predict(X_test)
plt.scatter(y_test, y_pred_lasso)
plt.title('Lasso Boston housing price prediction')
plt.xlabel("Actual value ($Y$)")
plt.ylabel(r"Predicted value ($\hat{Y}$)")
plt.savefig('figures/lasso_regression_plot.pdf')
plt.show()
plt.figure(figsize=(6, 4))
plot_learning_curves(lasso, X_train, X_test, y_train, y_test)
plt.savefig('figures/learning_curve_lasso_regression.pdf')
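A distinguishing property of the lasso is that its L1 penalty drives some coefficients exactly to zero, effectively performing feature selection; inspecting lasso.coef_ shows which features survive at alpha=0.3.

# L1 regularization zeroes out some coefficients; zero entries are dropped features.
for name, coef in zip(X_train.columns, lasso.coef_):
    print('%8s: %7.3f' % (name, coef))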
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=100)
elastic_net.fit(X_train, y_train)
y_pred_train_elastic = elastic_net.predict(X_train)
y_pred_elastic = elastic_net.predict(X_test)
plt.scatter(y_test, y_pred_elastic)
plt.title('Elastic Net Boston housing price prediction')
plt.xlabel("Actual value ($Y$)")
plt.ylabel(r"Predicted value ($\hat{Y}$)")
plt.savefig('figures/elastic_net_plot.pdf')
plt.show()
plt.figure(figsize=(6, 4))
plot_learning_curves(elastic_net, X_train, X_test, y_train, y_test)
plt.savefig('figures/learning_curve_elastic_net_regression.pdf')
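As a wrap-up, the three regularized models can be compared on the same held-out split. This is a rough comparison only, since none of the hyperparameters above were tuned.

# Side-by-side test-set comparison of the three regularized models.
for name, y_pred in [('Ridge', y_pred_ridge),
                     ('Lasso', y_pred_lasso),
                     ('Elastic Net', y_pred_elastic)]:
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('%12s  RMSE: %.3f  R2: %.3f' % (name, rmse, r2_score(y_test, y_pred)))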