#!/usr/bin/env python
# coding: utf-8

# #### Example of Regression Analysis Using the Boston Housing Data Set.

# In[1]:

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
import numpy as np
import pandas as pd
import pylab as pl

# In[2]:

# Note: load_boston is only available in scikit-learn versions prior to 1.2
from sklearn.datasets import load_boston
boston = load_boston()

# In[3]:

print(boston.DESCR)

# In[4]:

print(boston.feature_names)

# In[5]:

print(boston.data.shape)
print(boston.target.shape)

# In[6]:

bostonDF = pd.DataFrame(boston.data, columns=boston.feature_names)
bostonDF.head()

# In[7]:

np.set_printoptions(precision=2, linewidth=120, suppress=True, edgeitems=7)

# In[8]:

print(boston.data)

# In[9]:

# The attribute MEDV (Median Value) is the target attribute (response variable)
print(boston.target[:20])

# In[10]:

# In order to do multiple regression we need to add a column of 1s to serve as the coefficient for x0 (the intercept term)
x = np.array([np.concatenate((v, [1])) for v in boston.data])
y = boston.target

# In[11]:

# First 10 rows of the data
print(x[:10])

# In[12]:

# Create linear regression object
linreg = LinearRegression()

# Train the model using the training set
linreg.fit(x, y)

# In[13]:

# Let's see predictions for the first 10 instances and compare them to the actual MEDV values
for i in range(10):
    pred = linreg.predict(np.array([x[i]]))[0]
    print("%2d \t %2.2f \t %2.2f" % (i, pred, y[i]))

# #### Compute RMSE on training data

# In[14]:

# First, let's compute errors on all training instances
p = linreg.predict(x)   # p is the array of predicted values

# Now we can construct an array of errors
err = abs(p - y)

# Let's see the error on the first 10 predictions
print(err[:10])

# In[15]:

# Dot product of the error vector with itself gives us the sum of squared errors
total_error = np.dot(err, err)

# Finally compute RMSE
rmse_train = np.sqrt(total_error / len(p))
print("RMSE on Training Data: ", rmse_train)

# In[16]:

# We can view the regression coefficients
print('Regression Coefficients: \n', linreg.coef_)

# In[17]:

# Let's put some names to the faces
for i in range(len(boston.feature_names)):
    print("%7s %2.2f" % (boston.feature_names[i], linreg.coef_[i]))

# In[18]:

# The following function can be used to plot the model coefficients
get_ipython().run_line_magic('matplotlib', 'inline')

def plot_coefficients(model, n_features, feature_names):
    # Skip the last coefficient, which belongs to the added column of 1s
    pl.barh(range(n_features), model.coef_[:-1], align='center')
    pl.yticks(np.arange(n_features), feature_names)
    pl.xlabel("Coefficient Value")
    pl.ylabel("Feature")
    pl.ylim(-1, n_features)

plot_coefficients(linreg, len(boston.feature_names), boston.feature_names)

# In[19]:

print("Linear Regression Intercept: ", linreg.intercept_)

# In[20]:

# Plot predicted against actual values (in the training data)
get_ipython().run_line_magic('matplotlib', 'inline')
pl.plot(p, y, 'ro', markersize=5)
pl.plot([0, 50], [0, 50], 'g-')
pl.xlabel('Predicted')
pl.ylabel('Actual')
pl.show()

# #### Let's now compute RMSE using 10-fold cross-validation

# In[21]:

def cross_validate(model, X, y, n, verbose=False):
    # model: regression model to be trained
    # X: the data matrix
    # y: the target variable array
    # n: the number of folds for cross-validation
    # Returns mean RMSE across all folds
    from sklearn.model_selection import KFold
    # shuffle=True is required when a random_state is specified in recent scikit-learn versions
    kf = KFold(n_splits=n, shuffle=True, random_state=22)
    xval_err = 0
    f = 1
    for train, test in kf.split(X):
        model.fit(X[train], y[train])
        p = model.predict(X[test])
        e = p - y[test]
        rmse = np.sqrt(np.dot(e, e) / len(X[test]))
        if verbose:
            print("Fold %2d RMSE: %.4f" % (f, rmse))
        xval_err += rmse
        f += 1
    return xval_err / n
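# As a quick cross-check (a minimal sketch, assuming scikit-learn >= 0.22 for the
# 'neg_root_mean_squared_error' scorer), the built-in cross_val_score utility can
# compute per-fold RMSE much like the hand-rolled cross_validate() function above:
#
# from sklearn.model_selection import cross_val_score, KFold
# cv_scores = cross_val_score(LinearRegression(), x, y,
#                             cv=KFold(n_splits=10, shuffle=True, random_state=22),
#                             scoring='neg_root_mean_squared_error')
# print("Mean RMSE via cross_val_score:", -cv_scores.mean())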
# In[22]:

rmse_10cv = cross_validate(linreg, x, y, 10, verbose=True)

# In[23]:

method_name = 'Simple Linear Regression'
print('Method: %s' % method_name)
print('RMSE on training: %.4f' % rmse_train)
print('RMSE on 10-fold CV: %.4f' % rmse_10cv)

# #### Let's try Ridge Regression:

# In[24]:

# Create a ridge regression object with regularization coefficient alpha = 0.8
ridge = Ridge(alpha=0.8)

# Train the model using the training set
ridge.fit(x, y)

# In[25]:

# Compute RMSE on training data
p = ridge.predict(x)
err = p - y
total_error = np.dot(err, err)
rmse_train = np.sqrt(total_error / len(p))

# Compute RMSE using 10-fold x-validation
rmse_10cv = cross_validate(ridge, x, y, 10, verbose=True)

method_name = 'Ridge Regression'
print("\n")
print('Method: %s' % method_name)
print('RMSE on training: %.4f' % rmse_train)
print('RMSE on 10-fold CV: %.4f' % rmse_10cv)

# #### We can try different values of alpha and observe the impact on x-validation RMSE

# In[26]:

print('Ridge Regression')
print('alpha\t RMSE_train\t RMSE_10cv\n')
alpha = np.linspace(.01, 20, 50)
t_rmse = np.array([])
cv_rmse = np.array([])

for a in alpha:
    ridge = Ridge(alpha=a)

    # computing the RMSE on training data
    ridge.fit(x, y)
    p = ridge.predict(x)
    err = p - y
    total_error = np.dot(err, err)
    rmse_train = np.sqrt(total_error / len(p))

    # computing the RMSE for x-validation
    rmse_10cv = cross_validate(ridge, x, y, 10)

    t_rmse = np.append(t_rmse, [rmse_train])
    cv_rmse = np.append(cv_rmse, [rmse_10cv])
    print('{:.3f}\t {:.4f}\t\t {:.4f}'.format(a, rmse_train, rmse_10cv))

# In[27]:

fig = pl.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
ax.plot(alpha, t_rmse, label='RMSE-Train')
ax.plot(alpha, cv_rmse, label='RMSE_XVal')
pl.legend(('RMSE-Train', 'RMSE_XVal'))
pl.ylabel('RMSE')
pl.xlabel('Alpha')
pl.show()

# #### To make comparisons across methods easier, let's parametrize the regression methods:

# In[28]:

a = 0.01
for name, met in [
        ('linear regression', LinearRegression()),
        ('lasso', Lasso(alpha=a)),
        ('ridge', Ridge(alpha=a)),
        ('elastic-net', ElasticNet(alpha=a))
        ]:
    # computing the RMSE on training data
    met.fit(x, y)
    p = met.predict(x)
    e = p - y
    total_error = np.dot(e, e)
    rmse_train = np.sqrt(total_error / len(p))

    # computing the RMSE for x-validation
    rmse_10cv = cross_validate(met, x, y, 10)

    print('Method: %s' % name)
    print('RMSE on training: %.4f' % rmse_train)
    print('RMSE on 10-fold CV: %.4f' % rmse_10cv)
    print("\n")
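# A related convenience (a minimal sketch; the alphas grid and cv=10 are
# illustrative choices): RidgeCV selects the regularization strength by
# built-in cross-validation instead of the manual alpha sweep above.
#
# from sklearn.linear_model import RidgeCV
# ridge_cv = RidgeCV(alphas=np.linspace(0.01, 20, 50), cv=10)
# ridge_cv.fit(x, y)
# print("alpha selected by RidgeCV:", ridge_cv.alpha_)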
# #### Now let's try to do regression via Stochastic Gradient Descent.

# In[29]:

# SGD is very sensitive to varying-sized feature values. So, first we need to do feature scaling.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)
x_s = scaler.transform(x)

sgdreg = SGDRegressor(penalty='l2', alpha=0.01, max_iter=300)

# Compute RMSE on training data
sgdreg.fit(x_s, y)
p = sgdreg.predict(x_s)
err = p - y
total_error = np.dot(err, err)
rmse_train = np.sqrt(total_error / len(p))

# Compute RMSE using 10-fold x-validation
from sklearn.model_selection import KFold
n = 10
kf = KFold(n_splits=n, shuffle=True, random_state=22)
xval_err = 0
f = 1
for train, test in kf.split(x):
    scaler = StandardScaler()
    scaler.fit(x[train])   # Don't cheat - fit only on training data
    xtrain_s = scaler.transform(x[train])
    xtest_s = scaler.transform(x[test])   # apply the same transformation to test data
    sgdreg.fit(xtrain_s, y[train])
    p = sgdreg.predict(xtest_s)
    e = p - y[test]
    rmse = np.sqrt(np.dot(e, e) / len(x[test]))
    print("Fold %2d RMSE: %.4f" % (f, rmse))
    xval_err += rmse
    f += 1
rmse_10cv = xval_err / n

method_name = 'Stochastic Gradient Descent Regression'
print('Method: %s' % method_name)
print('RMSE on training: %.4f' % rmse_train)
print('RMSE on 10-fold CV: %.4f' % rmse_10cv)

# #### Instead of Scikit-learn, let's implement the closed form solution for linear regression

# In[30]:

def standRegres(xArr, yArr):
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws

# In[31]:

w = standRegres(x, y)

# In[32]:

print(w)

# In[33]:

def ridgeRegres(xArr, yArr, lam=0.2):
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    if np.linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws

# In[34]:

w_ridge = ridgeRegres(x, y, 0.5)
print(w_ridge)

# #### Now that we have the regression coefficients, we can compute the predictions:

# In[35]:

xMat = np.mat(x)
yMat = np.mat(y)
yHat = xMat * w_ridge

# In[36]:

yHat.shape

# In[37]:

print(yHat[0:10])

# In[38]:

print(yMat.T[0:10])

# #### Model evaluation and cross-validation can be performed as before.

# In[39]:

# You can "ravel" the 2d matrix above to get a 1d NumPy array more suitable for use in the earlier functions.
print(yHat.A.ravel())
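# The same closed-form ridge solution can also be written with plain NumPy arrays
# (a minimal sketch; the helper name ridge_closed_form is ours, and np.linalg.solve
# is used instead of an explicit matrix inverse):

def ridge_closed_form(X, y, lam=0.2):
    # Solve (X^T X + lam * I) w = X^T y for the coefficient vector w
    XtX = X.T @ X + lam * np.eye(X.shape[1])
    return np.linalg.solve(XtX, X.T @ y)

w_np = ridge_closed_form(x, y, 0.5)
print(w_np)             # coefficients; the last entry corresponds to the added column of 1s
print((x @ w_np)[:10])  # predictions for the first 10 instances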