#!/usr/bin/env python
# coding: utf-8

# Copyright (c) 2015, 2016 [Sebastian Raschka](sebastianraschka.com)
# https://github.com/rasbt/python-machine-learning-book
# [MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt)

# # Python Machine Learning - Code Examples

# # Chapter 10 - Predicting Continuous Target Variables with Regression Analysis

# Note that the optional watermark extension is a small IPython notebook plugin that I
# developed to make the code reproducible. You can just skip the following line(s).

# In[1]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,sklearn,seaborn")

# *The use of `watermark` is optional. You can install this IPython extension via
# "`pip install watermark`". For more information, please see:
# https://github.com/rasbt/watermark.*
#
# ### Overview

# - [Introducing a simple linear regression model](#Introducing-a-simple-linear-regression-model)
# - [Exploring the Housing Dataset](#Exploring-the-Housing-Dataset)
# - [Visualizing the important characteristics of a dataset](#Visualizing-the-important-characteristics-of-a-dataset)
# - [Implementing an ordinary least squares linear regression model](#Implementing-an-ordinary-least-squares-linear-regression-model)
# - [Solving regression for regression parameters with gradient descent](#Solving-regression-for-regression-parameters-with-gradient-descent)
# - [Estimating the coefficient of a regression model via scikit-learn](#Estimating-the-coefficient-of-a-regression-model-via-scikit-learn)
# - [Fitting a robust regression model using RANSAC](#Fitting-a-robust-regression-model-using-RANSAC)
# - [Evaluating the performance of linear regression models](#Evaluating-the-performance-of-linear-regression-models)
# - [Using regularized methods for regression](#Using-regularized-methods-for-regression)
# - [Turning a linear regression model into a curve - polynomial regression](#Turning-a-linear-regression-model-into-a-curve---polynomial-regression)
# - [Modeling nonlinear relationships in the Housing Dataset](#Modeling-nonlinear-relationships-in-the-Housing-Dataset)
# - [Dealing with nonlinear relationships using random forests](#Dealing-with-nonlinear-relationships-using-random-forests)
# - [Decision tree regression](#Decision-tree-regression)
# - [Random forest regression](#Random-forest-regression)
# - [Summary](#Summary)
#
# In[2]:

from IPython.display import Image
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

# Added version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version


# # Introducing a simple linear regression model

# In[4]:

Image(filename='./images/10_01.png', width=500)
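# As a quick reference (my annotation, not an original notebook cell), the simple
# linear regression model illustrated in the figure above can be written as
#
# $$y = w_0 + w_1 x,$$
#
# where $w_0$ is the intercept (bias unit) and $w_1$ is the weight of the single
# explanatory variable $x$; the multivariate case generalizes to
# $y = w_0 + \sum_j w_j x_j$.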
#
# # Exploring the Housing dataset

# Source: [https://archive.ics.uci.edu/ml/datasets/Housing](https://archive.ics.uci.edu/ml/datasets/Housing)
#
# Attributes:
#
# 1. CRIM      per capita crime rate by town
# 2. ZN        proportion of residential land zoned for lots over 
#                  25,000 sq.ft.
# 3. INDUS     proportion of non-retail business acres per town
# 4. CHAS      Charles River dummy variable (= 1 if tract bounds 
#                  river; 0 otherwise)
# 5. NOX       nitric oxides concentration (parts per 10 million)
# 6. RM        average number of rooms per dwelling
# 7. AGE       proportion of owner-occupied units built prior to 1940
# 8. DIS       weighted distances to five Boston employment centres
# 9. RAD       index of accessibility to radial highways
# 10. TAX      full-value property-tax rate per $10,000
# 11. PTRATIO  pupil-teacher ratio by town
# 12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
#                  by town
# 13. LSTAT    % lower status of the population
# 14. MEDV     Median value of owner-occupied homes in $1000's
# 
# In[5]:

import pandas as pd

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                 'housing/housing.data',
                 header=None,
                 sep='\s+')

df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.head()
#
# ### Note:
#
# If the link to the Housing dataset provided above does not work for you, you can find a
# local copy in this repository at
# [./../datasets/housing/housing.data](./../datasets/housing/housing.data).
#
# Or you could fetch it via

# In[6]:

df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/housing/housing.data',
                 header=None,
                 sep='\s+')

df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.head()
#
# ## Visualizing the important characteristics of a dataset

# In[7]:

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid', context='notebook')
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']

# note: in newer seaborn versions, pairplot's `size` argument has been renamed to `height`
sns.pairplot(df[cols], size=2.5)
plt.tight_layout()
# plt.savefig('./figures/scatter.png', dpi=300)
plt.show()


# In[8]:

import numpy as np

cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.5)
hm = sns.heatmap(cm,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 15},
                 yticklabels=cols,
                 xticklabels=cols)

# plt.tight_layout()
# plt.savefig('./figures/corr_mat.png', dpi=300)
plt.show()


# In[9]:

sns.reset_orig()
get_ipython().run_line_magic('matplotlib', 'inline')
#
# # Implementing an ordinary least squares linear regression model

# ...

# ## Solving regression for regression parameters with gradient descent

# In[10]:

class LinearRegressionGD(object):

    def __init__(self, eta=0.001, n_iter=20):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])
        self.cost_ = []

        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            cost = (errors**2).sum() / 2.0
            self.cost_.append(cost)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return self.net_input(X)


# In[11]:

X = df[['RM']].values
y = df['MEDV'].values


# In[12]:

from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()


# In[13]:

lr = LinearRegressionGD()
lr.fit(X_std, y_std)


# In[14]:

plt.plot(range(1, lr.n_iter+1), lr.cost_)
plt.ylabel('SSE')
plt.xlabel('Epoch')
plt.tight_layout()
# plt.savefig('./figures/cost.png', dpi=300)
plt.show()


# In[15]:

def lin_regplot(X, y, model):
    plt.scatter(X, y, c='lightblue')
    plt.plot(X, model.predict(X), color='red', linewidth=2)
    return


# In[16]:

lin_regplot(X_std, y_std, lr)
plt.xlabel('Average number of rooms [RM] (standardized)')
plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
plt.tight_layout()
# plt.savefig('./figures/gradient_fit.png', dpi=300)
plt.show()


# In[17]:

print('Slope: %.3f' % lr.w_[1])
print('Intercept: %.3f' % lr.w_[0])


# In[18]:

num_rooms_std = sc_x.transform(np.array([[5.0]]))
price_std = lr.predict(num_rooms_std)
print("Price in $1000's: %.3f" % sc_y.inverse_transform(price_std))
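# As a short aside (my annotation, not an original notebook cell): the weight update
# performed in `fit` above is batch gradient descent on the sum-of-squared-errors cost
#
# $$J(w) = \frac{1}{2} \sum_i \big(y^{(i)} - \hat{y}^{(i)}\big)^2,$$
#
# so each epoch applies $w_j := w_j + \eta \sum_i \big(y^{(i)} - \hat{y}^{(i)}\big)\, x_j^{(i)}$,
# with $x_0^{(i)} = 1$ for the intercept term stored in `w_[0]`.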
#
# ## Estimating the coefficient of a regression model via scikit-learn

# In[19]:

from sklearn.linear_model import LinearRegression


# In[20]:

slr = LinearRegression()
slr.fit(X, y)
y_pred = slr.predict(X)
print('Slope: %.3f' % slr.coef_[0])
print('Intercept: %.3f' % slr.intercept_)


# In[21]:

lin_regplot(X, y, slr)
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.tight_layout()
# plt.savefig('./figures/scikit_lr_fit.png', dpi=300)
plt.show()


# **Normal Equations** alternative:

# In[22]:

# adding a column vector of "ones"
Xb = np.hstack((np.ones((X.shape[0], 1)), X))
w = np.zeros(X.shape[1])
z = np.linalg.inv(np.dot(Xb.T, Xb))
w = np.dot(z, np.dot(Xb.T, y))

print('Slope: %.3f' % w[1])
print('Intercept: %.3f' % w[0])
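# A small numerical note (my addition, not an original notebook cell): explicitly
# inverting $X^T X$ as above can be numerically unstable when the design matrix is
# ill-conditioned. The sketch below solves the same least-squares problem with NumPy's
# `np.linalg.lstsq` on the same `Xb` and `y`, and should reproduce the slope and
# intercept printed above.

w_lstsq = np.linalg.lstsq(Xb, y, rcond=None)[0]
print('Slope: %.3f' % w_lstsq[1])
print('Intercept: %.3f' % w_lstsq[0])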
#
# # Fitting a robust regression model using RANSAC

# In[23]:

from sklearn.linear_model import RANSACRegressor

if Version(sklearn_version) < '0.18':
    ransac = RANSACRegressor(LinearRegression(),
                             max_trials=100,
                             min_samples=50,
                             residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                             residual_threshold=5.0,
                             random_state=0)
else:
    ransac = RANSACRegressor(LinearRegression(),
                             max_trials=100,
                             min_samples=50,
                             loss='absolute_loss',
                             residual_threshold=5.0,
                             random_state=0)

ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='lightgreen', marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/ransac_fit.png', dpi=300)
plt.show()


# In[24]:

print('Slope: %.3f' % ransac.estimator_.coef_[0])
print('Intercept: %.3f' % ransac.estimator_.intercept_)
#
# # Evaluating the performance of linear regression models

# In[25]:

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1].values
y = df['MEDV'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)


# In[26]:

slr = LinearRegression()
slr.fit(X_train, y_train)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)


# In[27]:

plt.scatter(y_train_pred, y_train_pred - y_train,
            c='blue', marker='o', label='Training data')
plt.scatter(y_test_pred, y_test_pred - y_test,
            c='lightgreen', marker='s', label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.tight_layout()
# plt.savefig('./figures/slr_residuals.png', dpi=300)
plt.show()


# In[28]:

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))
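# For reference (my annotation, not an original notebook cell), the two metrics
# printed above are
#
# $$MSE = \frac{1}{n} \sum_{i=1}^{n} \big(y^{(i)} - \hat{y}^{(i)}\big)^2
# \qquad \text{and} \qquad
# R^2 = 1 - \frac{SSE}{SST},$$
#
# where $SSE$ is the sum of squared errors and $SST = \sum_i \big(y^{(i)} - \mu_y\big)^2$
# is the total sum of squares, so $R^2$ measures the fraction of response variance
# captured by the model.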
#
# # Using regularized methods for regression

# In[29]:

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)


# In[30]:

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))
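# Lasso (L1 penalty) is only one of the regularized regression models in scikit-learn.
# Below is a minimal sketch (my addition, not an original notebook cell) of the two other
# common choices, Ridge (L2 penalty) and ElasticNet (mixed L1/L2 penalty), fitted on the
# same split for comparison; the `alpha` and `l1_ratio` values are arbitrary
# illustrations, not tuned settings.

from sklearn.linear_model import Ridge, ElasticNet

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
print('Ridge test R^2: %.3f' % r2_score(y_test, ridge.predict(X_test)))

elanet = ElasticNet(alpha=1.0, l1_ratio=0.5)
elanet.fit(X_train, y_train)
print('ElasticNet test R^2: %.3f' % r2_score(y_test, elanet.predict(X_test)))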
#
# # Turning a linear regression model into a curve - polynomial regression

# In[31]:

X = np.array([258.0, 270.0, 294.0,
              320.0, 342.0, 368.0,
              396.0, 446.0, 480.0, 586.0])[:, np.newaxis]

y = np.array([236.4, 234.4, 252.8,
              298.6, 314.2, 342.2,
              360.8, 368.0, 391.2, 390.8])


# In[32]:

from sklearn.preprocessing import PolynomialFeatures

lr = LinearRegression()
pr = LinearRegression()
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)


# In[33]:

# fit linear features
lr.fit(X, y)
X_fit = np.arange(250, 600, 10)[:, np.newaxis]
y_lin_fit = lr.predict(X_fit)

# fit quadratic features
pr.fit(X_quad, y)
y_quad_fit = pr.predict(quadratic.fit_transform(X_fit))

# plot results
plt.scatter(X, y, label='training points')
plt.plot(X_fit, y_lin_fit, label='linear fit', linestyle='--')
plt.plot(X_fit, y_quad_fit, label='quadratic fit')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/poly_example.png', dpi=300)
plt.show()


# In[34]:

y_lin_pred = lr.predict(X)
y_quad_pred = pr.predict(X_quad)


# In[35]:

print('Training MSE linear: %.3f, quadratic: %.3f' % (
        mean_squared_error(y, y_lin_pred),
        mean_squared_error(y, y_quad_pred)))
print('Training R^2 linear: %.3f, quadratic: %.3f' % (
        r2_score(y, y_lin_pred),
        r2_score(y, y_quad_pred)))
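# To make the feature transformation explicit (my annotation, not an original notebook
# cell): `PolynomialFeatures(degree=2)` expands a single input value $x$ into the row
# $[1, x, x^2]$, so the "quadratic" model is still an ordinary linear regression, just
# fitted on the expanded feature matrix. For example:

print(quadratic.fit_transform(np.array([[3.0]])))  # expected output: [[1. 3. 9.]]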
#
# ## Modeling nonlinear relationships in the Housing Dataset

# In[36]:

X = df[['LSTAT']].values
y = df['MEDV'].values

regr = LinearRegression()

# create quadratic features
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# fit features
X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]

regr = regr.fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))


# plot results
plt.scatter(X, y, label='training points', color='lightgray')

plt.plot(X_fit, y_lin_fit,
         label='linear (d=1), $R^2=%.2f$' % linear_r2,
         color='blue',
         lw=2,
         linestyle=':')

plt.plot(X_fit, y_quad_fit,
         label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2,
         color='red',
         lw=2,
         linestyle='-')

plt.plot(X_fit, y_cubic_fit,
         label='cubic (d=3), $R^2=%.2f$' % cubic_r2,
         color='green',
         lw=2,
         linestyle='--')

plt.xlabel('% lower status of the population [LSTAT]')
plt.ylabel('Price in $1000\'s [MEDV]')
plt.legend(loc='upper right')
plt.tight_layout()
# plt.savefig('./figures/polyhouse_example.png', dpi=300)
plt.show()


# Transforming the dataset:

# In[37]:

X = df[['LSTAT']].values
y = df['MEDV'].values

# transform features
X_log = np.log(X)
y_sqrt = np.sqrt(y)

# fit features
X_fit = np.arange(X_log.min()-1, X_log.max()+1, 1)[:, np.newaxis]

regr = regr.fit(X_log, y_sqrt)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y_sqrt, regr.predict(X_log))

# plot results
plt.scatter(X_log, y_sqrt, label='training points', color='lightgray')

plt.plot(X_fit, y_lin_fit,
         label='linear (d=1), $R^2=%.2f$' % linear_r2,
         color='blue',
         lw=2)

plt.xlabel('log(% lower status of the population [LSTAT])')
plt.ylabel('$\sqrt{Price \; in \; \$1000\'s [MEDV]}$')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('./figures/transform_example.png', dpi=300)
plt.show()
#
# # Dealing with nonlinear relationships using random forests

# ...

# ## Decision tree regression

# In[38]:

from sklearn.tree import DecisionTreeRegressor

X = df[['LSTAT']].values
y = df['MEDV'].values

tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X, y)

sort_idx = X.flatten().argsort()

lin_regplot(X[sort_idx], y[sort_idx], tree)
plt.xlabel('% lower status of the population [LSTAT]')
plt.ylabel('Price in $1000\'s [MEDV]')
# plt.savefig('./figures/tree_regression.png', dpi=300)
plt.show()
#
# ## Random forest regression

# In[39]:

X = df.iloc[:, :-1].values
y = df['MEDV'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1)


# In[40]:

from sklearn.ensemble import RandomForestRegressor

# note: in newer scikit-learn versions, criterion='mse' has been renamed to 'squared_error'
forest = RandomForestRegressor(n_estimators=1000,
                               criterion='mse',
                               random_state=1,
                               n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))


# In[41]:

plt.scatter(y_train_pred,
            y_train_pred - y_train,
            c='black',
            marker='o',
            s=35,
            alpha=0.5,
            label='Training data')
plt.scatter(y_test_pred,
            y_test_pred - y_test,
            c='lightgreen',
            marker='s',
            s=35,
            alpha=0.7,
            label='Test data')

plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.tight_layout()
# plt.savefig('./figures/slr_residuals.png', dpi=300)
plt.show()
#
# # Summary

# ...