#!/usr/bin/env python
# coding: utf-8
# # Chapter 6 - Linear Model Selection and Regularization
# [6.2 Shrinkage Methods](#6.2-Shrinkage-Methods)
# > [6.2.1 Ridge Regression](#6.2.1-Ridge-Regression)
# > [6.2.2 The Lasso](#6.2.2-The-Lasso)
# > [6.2.3 Selecting the Tuning Parameter](#6.2.3-Selecting-the-Tuning-Parameter)
#
# [6.3 Dimension Reduction Methods](#6.3-Dimension-Reduction-Methods)
# > [6.3.1 Principal Components Regression](#6.3.1-Principal-Components-Regression)
#
# [6.5 Lab 1: Subset Selection Methods](#6.5-Lab-1:-Subset-Selection-Methods)
# > [6.5.1 Best Subset Selection](#6.5.1-Best-Subset-Selection)
# > [6.5.2 Forward and Backward Stepwise Selection](#6.5.2-Forward-and-Backward-Stepwise-Selection)
#
# [6.6 Lab 2: Ridge Regression and the Lasso](#6.6-Lab-2:-Ridge-Regression-and-the-Lasso)
# > [6.6.1 Ridge Regression](#6.6.1-Ridge-Regression)
# > [6.6.2 The Lasso](#6.6.2-The-Lasso)
#
# [6.7 Lab 3: PCR and PLS Regression](#6.7-Lab-3:-PCR-and-PLS-Regression)
# > [6.7.1 Principal Components Regression](#6.7.1-Principal-Components-Regression)
# > [6.7.2 Partial Least Squares](#6.7.2-Partial-Least-Squares)
# In[381]:
import pandas as pd
import numpy as np
from numpy import linalg
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import itertools
from itertools import combinations
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-white')
# In[2]:
df_credit = pd.read_csv('Data/Credit.csv', index_col=0)
df_credit["Gender"] = df_credit["Gender"].astype('category')
df_credit["Student"] = df_credit["Student"].astype('category')
df_credit["Married"] = df_credit["Married"].astype('category')
df_credit["Ethnicity"] = df_credit["Ethnicity"].astype('category')
df_credit.describe()
# In[3]:
df_credit.describe(include='category')
# In[4]:
df_credit.info()
# ## 6.2 Shrinkage Methods
# In[5]:
y = df_credit.Balance
X = df_credit[df_credit.columns.difference(['Balance'])]
# Use K-1 dummy columns for K categories, so a Yes/No factor becomes a single 0/1 column
X = pd.get_dummies(X, drop_first=True)
X_scaled = scale(X)
X.head(3)
# ### 6.2.1 Ridge Regression
# In[6]:
n_lambdas = 200
lambdas = np.logspace(5, -2, n_lambdas)
coefs = []
scores = []
for lam in lambdas:
    ridge = Ridge(alpha=lam)
    # it is important to scale the data before Ridge regression,
    # since the penalty is not invariant to the scale of the predictors
    ridge.fit(X_scaled, y)
    coefs.append(ridge.coef_)
    scores.append(ridge.score(X_scaled, y))
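# Since scaling matters here, an equivalent and more self-contained pattern is to bundle the
# scaler and the model in a Pipeline, so the standardization is learned inside fit().
# A minimal sketch (alpha=1.0 is an arbitrary example value):
from sklearn.pipeline import make_pipeline
ridge_pipe = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
ridge_pipe.fit(X, y)
ridge_pipe.score(X, y)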
# In[7]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
# get the four largest (in abs value) coefficient positions
ind = np.argpartition(np.abs(coefs[-1]), -4)[-4:]
# first plot
ax1.plot(lambdas, coefs)
ax1.set_xscale('log')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Standardized Coefficients')
ax1.legend(np.array(ax1.get_lines())[ind], X.columns[ind])
# second plot
no_ridge_norm = linalg.norm(coefs[-1])
norm_coefs = linalg.norm(coefs/no_ridge_norm, axis=1)
ax2.plot(norm_coefs, coefs)
ax2.set_xlabel('Normalized Ridge coefficients')
ax2.set_ylabel('Standardized Coefficients')
ax2.legend(np.array(ax2.get_lines())[ind], X.columns[ind]);
# third
ax3.plot(lambdas, scores)
ax3.set_xscale('log')
ax3.set_xlabel('lambda')
ax3.set_ylabel('R^2')
ax3.set_title('R^2 as a function of regularization');
# ### 6.2.2 The Lasso
# In[8]:
n_lambdas = 200
lambdas = np.logspace(3, 0.5, n_lambdas)
coefs = []
scores = []
for lam in lambdas:
    lasso = Lasso(alpha=lam)
    lasso.fit(X_scaled, y)
    coefs.append(lasso.coef_)
    scores.append(lasso.score(X_scaled, y))
# In[9]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
# get the four largest (in abs value) coefficient positions
ind = np.argpartition(np.abs(coefs[-1]), -4)[-4:]
# first plot
ax1.plot(lambdas, coefs)
ax1.set_xscale('log')
ax1.set_xlabel('lambda')
ax1.set_ylabel('Standardized Coefficients')
ax1.legend(np.array(ax1.get_lines())[ind], X.columns[ind])
# second plot
no_lasso_norm = linalg.norm(coefs[-1], ord=1)
norm_coefs = linalg.norm(coefs/no_lasso_norm, axis=1, ord=1)
ax2.plot(norm_coefs, coefs)
ax2.set_xlabel('Normalized Lasso coefficients')
ax2.set_ylabel('Standardized Coefficients')
ax2.legend(np.array(ax2.get_lines())[ind], X.columns[ind]);
# third
ax3.plot(lambdas, scores)
ax3.set_xscale('log')
ax3.set_xlabel('lambda')
ax3.set_ylabel('R^2')
ax3.set_title('R^2 as a function of regularization');
# The lambda values here differ from those in the text because scikit-learn and R's glmnet
# minimize slightly different objective functions: scikit-learn's Lasso uses
# (1/(2N)) * RSS + alpha * ||beta||_1, while the book (6.7) uses RSS + lambda * ||beta||_1,
# so the two penalty parameters differ by roughly a factor of 2N (N = len(y)).
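# A small numeric illustration of that conversion (assuming scikit-learn's documented
# Lasso objective (1/(2N)) * RSS + alpha * ||beta||_1 versus the book's RSS + lambda * ||beta||_1):
n = len(y)
alpha_example = 10.0                 # an arbitrary scikit-learn alpha
lambda_islr = 2 * n * alpha_example  # roughly the equivalent penalty in the book's parameterization
print(f"alpha = {alpha_example} in scikit-learn corresponds to lambda ~ {lambda_islr:.0f} in the book's notation")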
# ### 6.2.3 Selecting the Tuning Parameter
# In[10]:
lambdas = np.logspace(0.5, -4, 100)
ridgeCV = RidgeCV(alphas=lambdas, store_cv_values=True)
ridgeCV.fit(X_scaled, y)
MSE_alphas = np.mean(ridgeCV.cv_values_, axis=0)
# In[11]:
fig, ax = plt.subplots(1, 1, figsize=(5,4))
ax.plot(lambdas, MSE_alphas)
ax.axvline(ridgeCV.alpha_, color='k', linestyle='--')
ax.set_xscale('log')
ax.set_xlabel('lambda')
ax.set_ylabel('CV MSE');
# ## 6.3 Dimension Reduction Methods
# ### 6.3.1 Principal Components Regression
# In[12]:
components = list(range(1, X_scaled.shape[1]+1))
coefs = []
scores = []
for comp in components:
    pca = PCA(n_components=comp)
    # project onto the first comp principal components
    X_pca_comp = pca.fit_transform(X_scaled)
    linear = LinearRegression(fit_intercept=False)
    linear.fit(X_pca_comp, y)
    coefs.append(linear.coef_)
    scores.append(linear.score(X_pca_comp, y))
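# An equivalent but cheaper sketch: the leading principal components do not change as more
# components are added, so we could fit the full PCA once and slice columns instead of
# refitting inside the loop; the R^2 values should match the loop above.
X_pca_full = PCA().fit_transform(X_scaled)
scores_alt = [LinearRegression(fit_intercept=False)
              .fit(X_pca_full[:, :comp], y)
              .score(X_pca_full[:, :comp], y)
              for comp in components]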
# In[382]:
# pad the shorter coefficient vectors with zeros so that every row has length p
coefs = np.array(list(itertools.zip_longest(*coefs, fillvalue=0))).T
# In[14]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))
# get the four largest (in abs value) coefficient positions
ind = np.argpartition(np.abs(coefs[-1]), -4)[-4:]
# first plot
ax1.plot(components, coefs)
ax1.set_xlabel('Number of PCA components')
ax1.set_ylabel('Standardized Coefficients')
ax1.legend(np.array(ax1.get_lines())[ind], X.columns[ind])
# second plot
ax2.plot(components, scores)
ax2.set_xlabel('Number of PCA components')
ax2.set_ylabel('R^2')
ax2.set_title('R^2 as a function of the number of components');
# note: this is not the same as Figure 6.20 in the text;
# I'm not sure exactly what that figure is displaying
# ## 6.5 Lab 1: Subset Selection Methods
# In[93]:
df_hitters = pd.read_csv('Data/Hitters.csv', index_col=0).dropna()
df_hitters.index.name = 'Player'
df_hitters.index = df_hitters.index.map(lambda x: x.replace('-', '', 1))
df_hitters["League"] = df_hitters["League"].astype('category')
df_hitters["Division"] = df_hitters["Division"].astype('category')
df_hitters["NewLeague"] = df_hitters["NewLeague"].astype('category')
# In[94]:
df_hitters.head()
# In[126]:
X = df_hitters[df_hitters.columns.difference(['Salary'])]
X = pd.get_dummies(X, drop_first=True)
y = df_hitters['Salary']
# ### 6.5.1 Best Subset Selection
# In[88]:
# we could also use Exhaustive Feature Selector from mlxtend
# http://rasbt.github.io/mlxtend/user_guide/feature_selection/ExhaustiveFeatureSelector/
def best_subset(estimator, X, y, max_size=8, cv=5):
    n_features = X.shape[1]
    subsets = (combinations(range(n_features), k + 1)
               for k in range(min(n_features, max_size)))
    best_size_subset = []
    for subsets_k in subsets:  # for each list of subsets of the same size
        best_score = -np.inf
        best_subset = None
        for subset in subsets_k:  # for each subset
            estimator.fit(X.iloc[:, list(subset)], y)
            # get the subset with the best score among subsets of the same size
            score = estimator.score(X.iloc[:, list(subset)], y)
            if score > best_score:
                best_score, best_subset = score, subset
        # to compare subsets of different sizes we must use CV;
        # first store the best subset of each size
        best_size_subset.append(best_subset)
    # compare the best subsets of each size
    best_score = -np.inf
    best_subset = None
    list_scores = []
    for subset in best_size_subset:
        score = cross_val_score(estimator, X.iloc[:, list(subset)], y, cv=cv).mean()
        list_scores.append(score)
        if score > best_score:
            best_score, best_subset = score, subset
    return best_subset, best_score, best_size_subset, list_scores
# In[75]:
# very slow! the results are hard-coded in the next cell
#lm = LinearRegression()
#best_subset(lm, X, y, max_size=8, cv=5)
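# A hedged sketch of the mlxtend alternative mentioned above (also very slow; note that it
# scores every subset by CV instead of pre-selecting the best subset of each size):
#from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
#efs = EFS(LinearRegression(), min_features=1, max_features=8, scoring='r2', cv=5)
#efs = efs.fit(X, y)
#efs.best_idx_, efs.best_score_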
# In[76]:
# results of best_subset(lm, X, y, max_size=8, cv=5)
best_subset, best_score, best_size_subset, list_scores = (
    (1, 5, 9, 11, 14, 16),
    0.41901698933468134,
    [(5,),
     (5, 9),
     (5, 9, 11),
     (5, 9, 11, 16),
     (1, 5, 9, 11, 16),
     (1, 5, 9, 11, 14, 16),
     (2, 3, 4, 9, 11, 14, 16),
     (1, 4, 6, 7, 9, 11, 14, 16)],
    [0.2814147499343555,
     0.3666886710091333,
     0.364425331129966,
     0.37839204327295717,
     0.39425582486482524,
     0.41901698933468134,
     0.39758595297949645,
     0.40534090030548126])
# In[78]:
# same results as the book
for subset in best_size_subset:
    print(len(subset), X.columns[list(subset)].values)
# In[86]:
fig, ax = plt.subplots(1, 1, figsize=(5,4))
ax.plot(range(1,9), list_scores, '.-')
ax.set_xlabel('Subset length')
ax.set_ylabel('R^2');
# ### 6.5.2 Forward and Backward Stepwise Selection
# In[124]:
# Use the Sequential Feature Selector from mlxtend
# http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
lr = LinearRegression()
sfs = SFS(lr,
          k_features=(1, len(X.columns)),
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=10)
sfs = sfs.fit(X.values, y)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
print('\nSequential Forward Selection:')
print(X.columns[list(sfs.k_feature_idx_)].tolist())
print('CV Score:')
print(sfs.k_score_)
# In[125]:
lr = LinearRegression()
sbs = SFS(lr,
          k_features=(1, len(X.columns)),
          forward=False,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=10)
sbs = sbs.fit(X.values, y)
fig = plot_sfs(sbs.get_metric_dict(), kind='std_err')
plt.title('Sequential Backward Selection (w. StdErr)')
plt.grid()
plt.show()
print('\nSequential Backward Selection:')
print(X.columns[list(sbs.k_feature_idx_)].tolist())
print('CV Score:')
print(sbs.k_score_)
# ## 6.6 Lab 2: Ridge Regression and the Lasso
# In[177]:
# same data split as the book
X_train = pd.read_csv('Data/Hitters_X_train.csv', index_col=0)
y_train = pd.read_csv('Data/Hitters_y_train.csv', index_col=0)
X_test = pd.read_csv('Data/Hitters_X_test.csv', index_col=0)
y_test = pd.read_csv('Data/Hitters_y_test.csv', index_col=0)
# use the same transformation for train and test
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
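# If the book's saved split is not available, a random half/half split (as in the book's lab)
# gives qualitatively similar results. A sketch, left commented out so the saved split is kept
# (the scaling above would need to be re-run after it):
#X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5, random_state=1)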
# In[228]:
def find_nearest_idx(array, value):
    """Return the index of the element of `array` closest to `value`."""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx
# ### 6.6.1 Ridge Regression
# In[367]:
# alpha = lambda/2 (rough correspondence between scikit-learn's alpha and the book's lambda)
n_lambdas = 100
lambdas = np.logspace(5, -2, n_lambdas)
alphas = lambdas/2
coefs = []
scores = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    coefs.append(ridge.coef_)
    scores.append(mean_squared_error(y_test, ridge.predict(X_test_scaled)))
# y_train is a one-column DataFrame, so each coef_ has shape (1, p); drop the extra axis
coefs = np.array(coefs)[:, 0, :]
scores = np.array(scores)
# In[368]:
# norm of coefs
norm_coefs = linalg.norm(coefs, axis=1)
norm_coefs_line = plt.plot(alphas, norm_coefs)
plt.plot(alphas, coefs)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('Norm of Ridge coefficients');
plt.legend(norm_coefs_line, ['Total norm']);
# In[369]:
idx_4 = find_nearest_idx(lambdas, 4)
# note: the grid only reaches 1e5, so this picks the largest lambda in the grid
idx_10e10 = find_nearest_idx(lambdas, 1e10)
# Ridge with alpha=0 (same as normal LinearRegression)
ridge = Ridge(alpha=0)
ridge.fit(X_train_scaled, y_train)
lr_MSE = mean_squared_error(y_test, ridge.predict(X_test_scaled))
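# Quick sanity check (assumption: with alpha=0 the Ridge solution should reproduce ordinary
# least squares up to numerical precision on this data):
ols = LinearRegression().fit(X_train_scaled, y_train)
np.allclose(ols.coef_, ridge.coef_)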
# In[370]:
# MSE of the null model: predict the training-set mean salary for every test observation
null_MSE = mean_squared_error(y_test, np.full(y_test.shape, y_train.values.mean()))
print(f'MSE for intercept-only model: {null_MSE:.0f}')
print(f'MSE for lambda = 0: {lr_MSE:.0f}')
print(f'MSE for lambda = 4: {scores[idx_4]:.0f}')
print(f'MSE for lambda = 10^10: {scores[idx_10e10]:.0f}')
# In[371]:
ridgeCV = RidgeCV(alphas=alphas, cv=10, scoring='neg_mean_squared_error')
ridgeCV.fit(X_train_scaled, y_train)
ridgeCV_alpha = ridgeCV.alpha_
# In[373]:
ridge = Ridge(alpha=ridgeCV_alpha)
ridge.fit(X_train_scaled, y_train)
ridge_MSE = mean_squared_error(y_test, ridge.predict(X_test_scaled))
print(f'MSE for the best lambda = {ridgeCV_alpha*2:.0f}: {ridge_MSE:.0f}')
# In[374]:
# all data: X and y now hold the Hitters data, so rebuild the scaled design matrix first
X_scaled = scale(X)
ridge = Ridge(alpha=ridgeCV_alpha)
ridge.fit(X_scaled, y)
# much larger than in the book
pd.Series(np.array((ridge.intercept_, *ridge.coef_)), index=['Intercept', *X.columns])
# ### 6.6.2 The Lasso
# In[375]:
n_alphas = 200
alphas = np.logspace(3, -2, n_alphas)
coefs = []
scores = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train_scaled, y_train)
    coefs.append(lasso.coef_)
    scores.append(mean_squared_error(y_test, lasso.predict(X_test_scaled)))
# In[376]:
# norm of coefs
norm_coefs = linalg.norm(coefs, axis=1)
norm_coefs_line = plt.plot(alphas, norm_coefs)
plt.plot(alphas, coefs)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('Norm of Lasso coefficients');
plt.legend(norm_coefs_line, ['Total norm']);
# In[377]:
lassoCV = LassoCV(alphas=alphas, cv=10, max_iter=10000)
lassoCV.fit(X_train_scaled, y_train.values.ravel())
lassoCV_alpha = lassoCV.alpha_
# In[378]:
lasso = Lasso(alpha=lassoCV_alpha)
lasso.fit(X_train_scaled, y_train)
lasso_MSE = mean_squared_error(y_test, lasso.predict(X_test_scaled))
print(f'MSE for the best lambda = {lassoCV_alpha*2:.0f}: {lasso_MSE:.0f}')
# In[379]:
# all data
lasso = Lasso(alpha=lassoCV_alpha)
lasso.fit(X_scaled, y)
# much larger than in the book
coefs_lasso = pd.Series(np.array((lasso.intercept_, *lasso.coef_)), index=['Intercept', *X.columns])
coefs_lasso = coefs_lasso[coefs_lasso != 0]
coefs_lasso
# ## 6.7 Lab 3: PCR and PLS Regression
# ### 6.7.1 Principal Components Regression
# In[486]:
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
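# A related sketch: a fractional n_components asks PCA for the smallest number of components
# that explains at least that share of the variance (assumes scikit-learn's float n_components
# behaviour with the full SVD solver):
PCA(n_components=0.90, svd_solver='full').fit(X_train_scaled).n_components_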
# In[472]:
components = range(1, X_scaled.shape[1]+1)
scores = []
linear = LinearRegression()
# intercept-only model (zero components)
score = -cross_val_score(linear, np.ones((len(y_train), 1)), y_train,
                         cv=10, scoring='neg_mean_squared_error')
scores.append((score.mean(), score.std()))
for comp in components:
    score = -cross_val_score(linear, X_train_pca[:, :comp], y_train,
                             cv=10, scoring='neg_mean_squared_error')
    scores.append((score.mean(), score.std()))
scores = np.array(scores)
# In[482]:
plt.plot(range(0, X_scaled.shape[1]+1), scores[:, 0], '.-')
plt.xlabel('PCA components')
plt.ylabel('CV MSE');
# In[494]:
X_test_pca = pca.transform(X_test_scaled)
# optimum number of components: index 0 is the intercept-only model,
# so the argmin equals the number of components to keep
opt_comp = scores[:, 0].argmin()
linear = LinearRegression()
# fit only the optimum number of components
linear.fit(X_train_pca[:,:opt_comp], y_train)
# get test sample error
pred = linear.predict(X_test_pca[:,:opt_comp])
mean_squared_error(y_test, pred)
# ### 6.7.2 Partial Least Squares
# In[532]:
components = range(1, X_scaled.shape[1]+1)
scores = []
for comp in components:
    pls = PLSRegression(n_components=comp)
    score = -cross_val_score(pls, X_train_scaled, y_train, cv=10,
                             scoring='neg_mean_squared_error')
    scores.append((score.mean(), score.std()))
scores = np.array(scores)
# In[533]:
plt.plot(range(1, X_scaled.shape[1]+1), scores[:,0], '.-')
plt.xlabel('PLS components')
plt.ylabel('CV MSE');
# In[535]:
# optimum number of components
opt_comp = scores[:,0].argmin()+1
pls = PLSRegression(n_components=opt_comp)
pls.fit(X_train_scaled, y_train)
mean_squared_error(y_test, pls.predict(X_test_scaled))
# In[ ]: