#!/usr/bin/env python
# coding: utf-8
# # Chapter 6 - Linear Model Selection and Regularization
# [6.2 Shrinkage Methods](#6.2-Shrinkage-Methods)
# > [6.2.1 Ridge Regression](#6.2.1-Ridge-Regression)
# > [6.2.2 The Lasso](#6.2.2-The-Lasso)
# > [6.2.3 Selecting the Tuning Parameter](#6.2.3-Selecting-the-Tuning-Parameter)
#
# [6.3 Dimension Reduction Methods](#6.3-Dimension-Reduction-Methods)
# > [6.3.1 Principal Components Regression](#6.3.1-Principal-Components-Regression)
#
# [6.5 Lab 1: Subset Selection Methods](#6.5-Lab-1:-Subset-Selection-Methods)
# > [6.5.1 Best Subset Selection](#6.5.1-Best-Subset-Selection)
# > [6.5.2 Forward and Backward Stepwise Selection](#6.5.2-Forward-and-Backward-Stepwise-Selection)
#
# [6.6 Lab 2: Ridge Regression and the Lasso](#6.6-Lab-2:-Ridge-Regression-and-the-Lasso)
# > [6.6.1 Ridge Regression](#6.6.1-Ridge-Regression)
# > [6.6.2 The Lasso](#6.6.2-The-Lasso)
#
# [6.7 Lab 3: PCR and PLS Regression](#6.7-Lab-3:-PCR-and-PLS-Regression)
# > [6.7.1 Principal Components Regression](#6.7.1-Principal-Components-Regression)
# > [6.7.2 Partial Least Squares](#6.7.2-Partial-Least-Squares)
# In[381]:
import pandas as pd
import numpy as np
from numpy import linalg
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import itertools
from itertools import combinations
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-white')
# In[2]:
df_credit = pd.read_csv('Data/Credit.csv', index_col=0)
df_credit["Gender"] = df_credit["Gender"].astype('category')
df_credit["Student"] = df_credit["Student"].astype('category')
df_credit["Married"] = df_credit["Married"].astype('category')
df_credit["Ethnicity"] = df_credit["Ethnicity"].astype('category')
df_credit.describe()
# In[3]:
df_credit.describe(include='category')
# In[4]:
df_credit.info()
# ## 6.2 Shrinkage Methods
# In[5]:
y = df_credit.Balance
X = df_credit[df_credit.columns.difference(['Balance'])]
# Use K-1 dummy columns for K categories, so a Yes/No factor becomes a single 0/1 column
X = pd.get_dummies(X, drop_first=True)
X_scaled = scale(X)
X.head(3)
# ### 6.2.1 Ridge Regression
# In[6]:
n_lambdas = 200
lambdas = np.logspace(5, -2, n_lambdas)
coefs = []
scores = []
for lam in lambdas:
    ridge = Ridge(alpha=lam)
    # it is important to scale the data before Ridge regression,
    # since the penalty is not invariant to the scale of the predictors
    ridge.fit(X_scaled, y)
    coefs.append(ridge.coef_)
    scores.append(ridge.score(X_scaled, y))
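# Since scaling matters here, an equivalent and more self-contained pattern is to bundle the
# scaler and the model in a Pipeline, so the standardization is learned inside fit().
# A minimal sketch (alpha=1.0 is an arbitrary example value):
from sklearn.pipeline import make_pipeline
ridge_pipe = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
ridge_pipe.fit(X, y)
ridge_pipe.score(X, y)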
# In[7]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
# get the four largest (in abs value) coefficient positions
ind = np.argpartition(np.abs(coefs[-1]), -4)[-4:]
# first plot
ax1.plot(lambdas, coefs)
ax1.set_xscale('log')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Standardized Coefficients')
ax1.legend(np.array(ax1.get_lines())[ind], X.columns[ind])
# second plot
no_ridge_norm = linalg.norm(coefs[-1])
norm_coefs = linalg.norm(coefs/no_ridge_norm, axis=1)
ax2.plot(norm_coefs, coefs)
ax2.set_xlabel('Normalized Ridge coefficients')
ax2.set_ylabel('Standardized Coefficients')
ax2.legend(np.array(ax2.get_lines())[ind], X.columns[ind]);
# third
ax3.plot(lambdas, scores)
ax3.set_xscale('log')
ax3.set_xlabel('lambda')
ax3.set_ylabel('R^2')
ax3.set_title('R^2 as a function of regularization');
# ### 6.2.2 The Lasso
# In[8]:
n_lambdas = 200
lambdas = np.logspace(3, 0.5, n_lambdas)
coefs = []
scores = []
for lam in lambdas:
    lasso = Lasso(alpha=lam)
    lasso.fit(X_scaled, y)
    coefs.append(lasso.coef_)
    scores.append(lasso.score(X_scaled, y))
# In[9]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
# get the four largest (in abs value) coefficient positions
ind = np.argpartition(np.abs(coefs[-1]), -4)[-4:]
# first plot
ax1.plot(lambdas, coefs)
ax1.set_xscale('log')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Standardized Coefficients')
ax1.legend(np.array(ax1.get_lines())[ind], X.columns[ind])
# second plot
no_lasso_norm = linalg.norm(coefs[-1], ord=1)
norm_coefs = linalg.norm(coefs/no_lasso_norm, axis=1, ord=1)
ax2.plot(norm_coefs, coefs)
ax2.set_xlabel('Normalized Lasso coefficients')
ax2.set_ylabel('Standardized Coefficients')
ax2.legend(np.array(ax2.get_lines())[ind], X.columns[ind]);
# third
ax3.plot(lambdas, scores)
ax3.set_xscale('log')
ax3.set_xlabel('lambda')
ax3.set_ylabel('R^2')
ax3.set_title('R^2 as a function of regularization');
# The lambda values here differ from those in the text because scikit-learn and R's glmnet
# minimize slightly different objective functions: scikit-learn's Lasso uses
# (1/(2N)) * RSS + alpha * ||beta||_1, while the book (6.7) uses RSS + lambda * ||beta||_1,
# so the two penalty parameters differ by roughly a factor of 2N (N = len(y)).
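# A small numeric illustration of that conversion (assuming scikit-learn's documented
# Lasso objective (1/(2N)) * RSS + alpha * ||beta||_1 versus the book's RSS + lambda * ||beta||_1):
n = len(y)
alpha_example = 10.0                 # an arbitrary scikit-learn alpha
lambda_islr = 2 * n * alpha_example  # roughly the equivalent penalty in the book's parameterization
print(f"alpha = {alpha_example} in scikit-learn corresponds to lambda ~ {lambda_islr:.0f} in the book's notation")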
# ### 6.2.3 Selecting the Tuning Parameter
# In[10]:
lambdas = np.logspace(0.5, -4, 100)
ridgeCV = RidgeCV(alphas=lambdas, store_cv_values=True)
ridgeCV.fit(X_scaled, y)
MSE_alphas = np.mean(ridgeCV.cv_values_, axis=0)
# In[11]:
fig, ax = plt.subplots(1, 1, figsize=(5,4))
ax.plot(lambdas, MSE_alphas)
ax.axvline(ridgeCV.alpha_, color='k', linestyle='--')
ax.set_xscale('log')
ax.set_xlabel('lambda')
ax.set_ylabel('CV MSE');
# ## 6.3 Dimension Reduction Methods
# ### 6.3.1 Principal Components Regression
# In[12]:
components = list(range(1, X_scaled.shape[1]+1))
coefs = []
scores = []
for comp in components:
    pca = PCA(n_components=comp)
    # project onto the first comp principal components
    X_pca_comp = pca.fit_transform(X_scaled)
    linear = LinearRegression(fit_intercept=False)
    linear.fit(X_pca_comp, y)
    coefs.append(linear.coef_)
    scores.append(linear.score(X_pca_comp, y))
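# An equivalent but cheaper sketch: the leading principal components do not change as more
# components are added, so we could fit the full PCA once and slice columns instead of
# refitting inside the loop; the R^2 values should match the loop above.
X_pca_full = PCA().fit_transform(X_scaled)
scores_alt = [LinearRegression(fit_intercept=False)
              .fit(X_pca_full[:, :comp], y)
              .score(X_pca_full[:, :comp], y)
              for comp in components]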
# In[382]:
# pad the shorter coefficient vectors with zeros so that every row has length p
coefs = np.array(list(itertools.zip_longest(*coefs, fillvalue=0))).T
# In[14]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))
# get the four largest (in abs value) coefficient positions
ind = np.argpartition(np.abs(coefs[-1]), -4)[-4:]
# first plot
ax1.plot(components, coefs)
ax1.set_xlabel('Number of PCA components')
ax1.set_ylabel('Standardized Coefficients')
ax1.legend(np.array(ax1.get_lines())[ind], X.columns[ind])
# second plot
ax2.plot(components, scores)
ax2.set_xlabel('Number of PCA components')
ax2.set_ylabel('R^2')
ax2.set_title('R^2 as a function of the number of components');
# note: this is not the same as Figure 6.20 in the text;
# I'm not sure exactly what that figure is displaying
# ## 6.5 Lab 1: Subset Selection Methods
# In[93]:
df_hitters = pd.read_csv('Data/Hitters.csv', index_col=0).dropna()
df_hitters.index.name = 'Player'
df_hitters.index = df_hitters.index.map(lambda x: x.replace('-', '', 1))
df_hitters["League"] = df_hitters["League"].astype('category')
df_hitters["Division"] = df_hitters["Division"].astype('category')
df_hitters["NewLeague"] = df_hitters["NewLeague"].astype('category')
# In[94]:
df_hitters.head()
# In[126]:
X = df_hitters[df_hitters.columns.difference(['Salary'])]
X = pd.get_dummies(X, drop_first=True)
y = df_hitters['Salary']
# ### 6.5.1 Best Subset Selection
# In[88]:
# we could also use Exhaustive Feature Selector from mlxtend
# http://rasbt.github.io/mlxtend/user_guide/feature_selection/ExhaustiveFeatureSelector/
def best_subset(estimator, X, y, max_size=8, cv=5):
    n_features = X.shape[1]
    subsets = (combinations(range(n_features), k + 1)
               for k in range(min(n_features, max_size)))
    best_size_subset = []
    for subsets_k in subsets:  # for each list of subsets of the same size
        best_score = -np.inf
        best_subset = None
        for subset in subsets_k:  # for each subset
            estimator.fit(X.iloc[:, list(subset)], y)
            # get the subset with the best score among subsets of the same size
            score = estimator.score(X.iloc[:, list(subset)], y)
            if score > best_score:
                best_score, best_subset = score, subset
        # to compare subsets of different sizes we must use CV;
        # first store the best subset of each size
        best_size_subset.append(best_subset)
    # compare the best subsets of each size
    best_score = -np.inf
    best_subset = None
    list_scores = []
    for subset in best_size_subset:
        score = cross_val_score(estimator, X.iloc[:, list(subset)], y, cv=cv).mean()
        list_scores.append(score)
        if score > best_score:
            best_score, best_subset = score, subset
    return best_subset, best_score, best_size_subset, list_scores
# In[75]:
# very slow! the results are hard-coded in the next cell
#lm = LinearRegression()
#best_subset(lm, X, y, max_size=8, cv=5)
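# A hedged sketch of the mlxtend alternative mentioned above (also very slow; note that it
# scores every subset by CV instead of pre-selecting the best subset of each size):
#from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
#efs = EFS(LinearRegression(), min_features=1, max_features=8, scoring='r2', cv=5)
#efs = efs.fit(X, y)
#efs.best_idx_, efs.best_score_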
# In[76]:
# results of best_subset(lm, X, y, max_size=8, cv=5)
best_subset, best_score, best_size_subset, list_scores = (
    (1, 5, 9, 11, 14, 16),
    0.41901698933468134,
    [(5,),
     (5, 9),
     (5, 9, 11),
     (5, 9, 11, 16),
     (1, 5, 9, 11, 16),
     (1, 5, 9, 11, 14, 16),
     (2, 3, 4, 9, 11, 14, 16),
     (1, 4, 6, 7, 9, 11, 14, 16)],
    [0.2814147499343555,
     0.3666886710091333,
     0.364425331129966,
     0.37839204327295717,
     0.39425582486482524,
     0.41901698933468134,
     0.39758595297949645,
     0.40534090030548126])
# In[78]:
# same results as the book
for subset in best_size_subset:
    print(len(subset), X.columns[list(subset)].values)
# In[86]:
fig, ax = plt.subplots(1, 1, figsize=(5,4))
ax.plot(range(1,9), list_scores, '.-')
ax.set_xlabel('Subset length')
ax.set_ylabel('R^2');
# ### 6.5.2 Forward and Backward Stepwise Selection
# In[124]:
# Use the Sequential Feature Selector from mlxtend
# http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
lr = LinearRegression()
sfs = SFS(lr,
          k_features=(1, len(X.columns)),
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=10)
sfs = sfs.fit(X.values, y)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
print('\nSequential Forward Selection:')
print(X.columns[list(sfs.k_feature_idx_)].tolist())
print('CV Score:')
print(sfs.k_score_)
# In[125]:
lr = LinearRegression()
sbs = SFS(lr,
          k_features=(1, len(X.columns)),
          forward=False,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=10)
sbs = sbs.fit(X.values, y)
fig = plot_sfs(sbs.get_metric_dict(), kind='std_err')
plt.title('Sequential Backward Selection (w. StdErr)')
plt.grid()
plt.show()
print('\nSequential Backward Selection:')
print(X.columns[list(sbs.k_feature_idx_)].tolist())
print('CV Score:')
print(sbs.k_score_)
# ## 6.6 Lab 2: Ridge Regression and the Lasso
# In[177]:
# same data split as the book
X_train = pd.read_csv('Data/Hitters_X_train.csv', index_col=0)
y_train = pd.read_csv('Data/Hitters_y_train.csv', index_col=0)
X_test = pd.read_csv('Data/Hitters_X_test.csv', index_col=0)
y_test = pd.read_csv('Data/Hitters_y_test.csv', index_col=0)
# use the same transformation for train and test
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
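# If the book's saved split is not available, a random half/half split (as in the book's lab)
# gives qualitatively similar results. A sketch, left commented out so the saved split is kept
# (the scaling above would need to be re-run after it):
#X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=1)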
# In[228]:
def find_nearest_idx(array, value):
    """Return the index of the element of `array` closest to `value`."""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx
# ### 6.6.1 Ridge Regression
# In[367]:
# alpha = lambda/2 (rough correspondence between scikit-learn's alpha and the book's lambda)
n_lambdas = 100
lambdas = np.logspace(5, -2, n_lambdas)
alphas = lambdas/2
coefs = []
scores = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    coefs.append(ridge.coef_)
    scores.append(mean_squared_error(y_test, ridge.predict(X_test_scaled)))
# y_train is a one-column DataFrame, so each coef_ has shape (1, p); drop the extra axis
coefs = np.array(coefs)[:, 0, :]
scores = np.array(scores)
# In[368]:
# norm of coefs
norm_coefs = linalg.norm(coefs, axis=1)
norm_coefs_line = plt.plot(alphas, norm_coefs)
plt.plot(alphas, coefs)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('Norm of Ridge coefficients');
plt.legend(norm_coefs_line, ['Total norm']);
# In[369]:
idx_4 = find_nearest_idx(lambdas, 4)
# note: the grid only reaches 1e5, so this picks the largest lambda in the grid
idx_10e10 = find_nearest_idx(lambdas, 1e10)
# Ridge with alpha=0 (same as normal LinearRegression)
ridge = Ridge(alpha=0)
ridge.fit(X_train_scaled, y_train)
lr_MSE = mean_squared_error(y_test, ridge.predict(X_test_scaled))
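# Quick sanity check (assumption: with alpha=0 the Ridge solution should reproduce ordinary
# least squares up to numerical precision on this data):
ols = LinearRegression().fit(X_train_scaled, y_train)
np.allclose(ols.coef_, ridge.coef_)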
# In[370]:
# MSE of the null model: predict the training-set mean salary for every test observation
null_MSE = mean_squared_error(y_test, np.full(y_test.shape, y_train.values.mean()))
print(f'MSE for intercept-only model: {null_MSE:.0f}')
print(f'MSE for lambda = 0: {lr_MSE:.0f}')
print(f'MSE for lambda = 4: {scores[idx_4]:.0f}')
print(f'MSE for lambda = 10^10: {scores[idx_10e10]:.0f}')
# In[371]:
ridgeCV = RidgeCV(alphas=alphas, cv=10, scoring='neg_mean_squared_error')
ridgeCV.fit(X_train_scaled, y_train)
ridgeCV_alpha = ridgeCV.alpha_
# In[373]:
ridge = Ridge(alpha=ridgeCV_alpha)
ridge.fit(X_train_scaled, y_train)
ridge_MSE = mean_squared_error(y_test, ridge.predict(X_test_scaled))
print(f'MSE for the best lambda = {ridgeCV_alpha*2:.0f}: {ridge_MSE:.0f}')
# In[374]:
# all data: X and y now hold the Hitters data, so rebuild the scaled design matrix first
X_scaled = scale(X)
ridge = Ridge(alpha=ridgeCV_alpha)
ridge.fit(X_scaled, y)
# much larger than in the book
pd.Series(np.array((ridge.intercept_, *ridge.coef_)), index=['Intercept', *X.columns])
# ### 6.6.2 The Lasso
# In[375]:
n_alphas = 200
alphas = np.logspace(3, -2, n_alphas)
coefs = []
scores = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train_scaled, y_train)
    coefs.append(lasso.coef_)
    scores.append(mean_squared_error(y_test, lasso.predict(X_test_scaled)))
# In[376]:
# norm of coefs
norm_coefs = linalg.norm(coefs, axis=1)
norm_coefs_line = plt.plot(alphas, norm_coefs)
plt.plot(alphas, coefs)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('Norm of Lasso coefficients');
plt.legend(norm_coefs_line, ['Total norm']);
# In[377]:
lassoCV = LassoCV(alphas=alphas, cv=10, max_iter=10000)
lassoCV.fit(X_train_scaled, y_train.values.ravel())
lassoCV_alpha = lassoCV.alpha_
# In[378]:
lasso = Lasso(alpha=lassoCV_alpha)
lasso.fit(X_train_scaled, y_train)
lasso_MSE = mean_squared_error(y_test, lasso.predict(X_test_scaled))
print(f'MSE for the best lambda = {lassoCV_alpha*2:.0f}: {lasso_MSE:.0f}')
# In[379]:
# all data
lasso = Lasso(alpha=lassoCV_alpha)
lasso.fit(X_scaled, y)
# much larger than in the book
coefs_lasso = pd.Series(np.array((lasso.intercept_, *lasso.coef_)), index=['Intercept', *X.columns])
coefs_lasso = coefs_lasso[coefs_lasso != 0]
coefs_lasso
# ## 6.7 Lab 3: PCR and PLS Regression
# ### 6.7.1 Principal Components Regression
# In[486]:
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
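# A related sketch: a fractional n_components asks PCA for the smallest number of components
# that explains at least that share of the variance (assumes scikit-learn's float n_components
# behaviour with the full SVD solver):
PCA(n_components=0.90, svd_solver='full').fit(X_train_scaled).n_components_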
# In[472]:
components = range(1, X_scaled.shape[1]+1)
scores = []
linear = LinearRegression()
# intercept-only model (zero components)
score = -cross_val_score(linear, np.ones((len(y_train), 1)), y_train,
                         cv=10, scoring='neg_mean_squared_error')
scores.append((score.mean(), score.std()))
for comp in components:
    score = -cross_val_score(linear, X_train_pca[:, :comp], y_train,
                             cv=10, scoring='neg_mean_squared_error')
    scores.append((score.mean(), score.std()))
scores = np.array(scores)
# In[482]:
plt.plot(range(0, X_scaled.shape[1]+1), scores[:, 0], '.-')
plt.xlabel('PCA components')
plt.ylabel('CV MSE');
# In[494]:
X_test_pca = pca.transform(X_test_scaled)
# optimum number of components: index 0 is the intercept-only model,
# so the argmin equals the number of components to keep
opt_comp = scores[:, 0].argmin()
linear = LinearRegression()
# fit only the optimum number of components
linear.fit(X_train_pca[:,:opt_comp], y_train)
# get test sample error
pred = linear.predict(X_test_pca[:,:opt_comp])
mean_squared_error(y_test, pred)
# ### 6.7.2 Partial Least Squares
# In[532]:
components = range(1, X_scaled.shape[1]+1)
scores = []
for comp in components:
    pls = PLSRegression(n_components=comp)
    score = -cross_val_score(pls, X_train_scaled, y_train, cv=10,
                             scoring='neg_mean_squared_error')
    scores.append((score.mean(), score.std()))
scores = np.array(scores)
# In[533]:
plt.plot(range(1, X_scaled.shape[1]+1), scores[:,0], '.-')
plt.xlabel('PLS components')
plt.ylabel('CV MSE');
# In[535]:
# optimum number of components
opt_comp = scores[:,0].argmin()+1
pls = PLSRegression(n_components=opt_comp)
pls.fit(X_train_scaled, y_train)
mean_squared_error(y_test, pls.predict(X_test_scaled))
# In[ ]: