import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, metrics, linear_model, decomposition, cross_decomposition, datasets
from itertools import combinations
import numpy as np
import time
(a) Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ε of length n = 100.
x = np.random.normal(0,1, 100)
eps = np.random.normal(0,1, 100)
(b) Generate a response vector Y of length n = 100 according to the model Y = β0 +β1X +β2X2 +β3X3 +ε, where β0, β1, β2, and β3 are constants of your choice.
coeffs = np.array([1,2,3,4])
design_matrix = pd.DataFrame({ 'c' : np.repeat(1, 100),
'x' : x, \
'x2' : x ** 2, \
'x3' : x ** 3})
y = (design_matrix @ coeffs.reshape(-1, 1)) + eps.reshape(-1, 1)
sns.scatterplot(x=x, y=y.T.values[0])
<matplotlib.axes._subplots.AxesSubplot at 0x11905d630>
(c) Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X2, . . . , X10. What is the best model obtained according to Cp, BIC, and adjusted R2? Show some plots to provide evidence for your answer, and report the coefficients of the best model ob- tained. Note you will need to use the data.frame() function to create a single data set containing both X and Y .
def n_choose_k(n, k):
return int(np.math.factorial(n) / (np.math.factorial(n - k) * np.math.factorial(k)))
def best_subset(n_predictors, target_column, data):
i = 1
predictors = data.drop(target_column, axis=1).columns
top_models = []
while i <= n_predictors:
tick = time.time()
cmbs = list(combinations(predictors, i))
formulae = ['{} ~ '.format(target_column) + ' + '.join(s) for s in cmbs]
models = [smf.ols(formula=f, data=data).fit() for f in formulae]
top_model = sorted(models, key=lambda m: m.rsquared, reverse=True)[0]
top_models.append(top_model)
tock = time.time()
print('{}/{} : {} combinations, {} seconds'.format(i, n_predictors, n_choose_k(n_predictors, i), tock-tick))
i += 1
return pd.DataFrame({'model' : top_models, 'n_predictors' : range(1, n_predictors + 1)})
data = pd.DataFrame({'x' : x, \
'x2' : x ** 2,\
'x3' : x ** 3,\
'x4' : x ** 4,\
'x5' : x ** 5,\
'x6' : x ** 6,\
'x7' : x ** 7,\
'x8' : x ** 8,\
'x9' : x ** 9,\
'x10' : x ** 10,\
'y' : y.T.values[0]})
# 2 ^ 10 = 1024 possible subsets
best = best_subset(10, 'y', data)
1/10 : 10 combinations, 0.03826308250427246 seconds 2/10 : 45 combinations, 0.18390679359436035 seconds 3/10 : 120 combinations, 0.6283471584320068 seconds 4/10 : 210 combinations, 1.3881821632385254 seconds 5/10 : 252 combinations, 1.7857210636138916 seconds 6/10 : 210 combinations, 1.7825541496276855 seconds 7/10 : 120 combinations, 1.0421671867370605 seconds 8/10 : 45 combinations, 0.45599985122680664 seconds 9/10 : 10 combinations, 0.1031639575958252 seconds 10/10 : 1 combinations, 0.01294708251953125 seconds
# Select between models with different numbers of predictors using adjusted training metrics
best['adj_R_sq'] = best['model'].map(lambda m: m.rsquared_adj)
best['bic'] = best['model'].map(lambda m: m.bic)
best['aic'] = best['model'].map(lambda m: m.aic)
# Plot the error metrics
_, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15,15))
sns.barplot(y='n_predictors', x='adj_R_sq', data=best, ax=ax1, orient='h')
sns.barplot(y='n_predictors', x='bic', data=best, ax=ax2, orient='h')
sns.barplot(y='n_predictors', x='aic', data=best, ax=ax3, orient='h')
<matplotlib.axes._subplots.AxesSubplot at 0x119317908>
print(best.sort_values('adj_R_sq', ascending=False)['model'].iloc[0].summary())
print(best.sort_values('bic', ascending=True)['model'].iloc[0].summary())
print(best.sort_values('aic', ascending=True)['model'].iloc[0].summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:28:39 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x 2.2893 0.139 16.474 0.000 2.013 2.565 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x3 3.9099 0.044 88.854 0.000 3.823 3.997 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:28:39 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x 2.2893 0.139 16.474 0.000 2.013 2.565 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x3 3.9099 0.044 88.854 0.000 3.823 3.997 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:28:39 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x 2.2893 0.139 16.474 0.000 2.013 2.565 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x3 3.9099 0.044 88.854 0.000 3.823 3.997 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
(d) Repeat (c), using forward stepwise selection and also using backwards stepwise selection. How does your answer compare to the results in (c)?
# Forward selection:
# There are 1 + 10(1 + 10) / 2 = 56 possible models in this space
def fwd_selection(target_column, data):
predictors = data.drop('y', axis=1).columns
formulae_s1 = [(p, 'y ~ {}'.format(p)) for p in predictors]
models_s1 = [(p, smf.ols(formula=f, data=data).fit()) for p, f in formulae_s1]
predictor_s1, model_s1 = sorted(models_s1, key=lambda tup: tup[1].rsquared, reverse=True)[0]
predictors = predictors.drop(predictor_s1)
formula = 'y ~ {}'.format(predictor_s1)
step_models = [model_s1]
while len(predictors) > 0:
models = []
for p in predictors:
f = formula + ' + ' + p
m = smf.ols(formula=f, data=data).fit()
models.append((p, m))
predictor, model = sorted(models, key=lambda tup: tup[1].rsquared, reverse=True)[0]
step_models.append(model)
formula = formula + ' + ' + predictor
predictors = predictors.drop(predictor)
return pd.DataFrame({'model': step_models, 'n_predictors' : range(1,data.shape[1])})
# Select between models with different numbers of predictors using adjusted training metrics
best = fwd_selection('y', data)
best['adj_R_sq'] = best['model'].map(lambda m: m.rsquared_adj)
best['bic'] = best['model'].map(lambda m: m.bic)
best['aic'] = best['model'].map(lambda m: m.aic)
# Plot the error metrics
_, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15,15))
sns.barplot(y='n_predictors', x='adj_R_sq', data=best, ax=ax1, orient='h')
sns.barplot(y='n_predictors', x='bic', data=best, ax=ax2, orient='h')
sns.barplot(y='n_predictors', x='aic', data=best, ax=ax3, orient='h')
<matplotlib.axes._subplots.AxesSubplot at 0x118d6fda0>
print(best.sort_values('adj_R_sq', ascending=False)['model'].iloc[0].summary())
print(best.sort_values('bic', ascending=True)['model'].iloc[0].summary())
print(best.sort_values('aic', ascending=True)['model'].iloc[0].summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:29:24 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x3 3.9099 0.044 88.854 0.000 3.823 3.997 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x 2.2893 0.139 16.474 0.000 2.013 2.565 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:29:24 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x3 3.9099 0.044 88.854 0.000 3.823 3.997 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x 2.2893 0.139 16.474 0.000 2.013 2.565 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:29:24 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x3 3.9099 0.044 88.854 0.000 3.823 3.997 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x 2.2893 0.139 16.474 0.000 2.013 2.565 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Backward selection
# There are 1 + p(1 + p) / 2 possible models in this space
def bwd_selection(target_column, data):
predictors = data.drop('y', axis=1).columns
formula_s1 = 'y ~ ' + ' + '.join(predictors)
model_s1 = smf.ols(formula=formula_s1, data=data).fit()
step_models = [model_s1]
while len(predictors) > 1:
models = []
for p in predictors:
f = 'y ~ ' + ' + '.join(predictors.drop(p))
m = smf.ols(formula=f, data=data).fit()
models.append((p, m))
p, m = sorted(models, key=lambda tup: tup[1].rsquared, reverse=True)[0]
step_models.append(m)
predictors = predictors.drop(p)
return pd.DataFrame({'model': step_models, 'n_predictors' : range(1, 11)[::-1]})
# Select between models with different numbers of predictors using adjusted training metrics
best = bwd_selection('y', data)
best['adj_R_sq'] = best['model'].map(lambda m: m.rsquared_adj)
best['bic'] = best['model'].map(lambda m: m.bic)
best['aic'] = best['model'].map(lambda m: m.aic)
# Plot the error metrics
_, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15,15))
sns.barplot(y='n_predictors', x='adj_R_sq', data=best, ax=ax1, orient='h')
sns.barplot(y='n_predictors', x='bic', data=best, ax=ax2, orient='h')
sns.barplot(y='n_predictors', x='aic', data=best, ax=ax3, orient='h')
<matplotlib.axes._subplots.AxesSubplot at 0x119863b70>
print(best.sort_values('adj_R_sq', ascending=False)['model'].iloc[0].summary())
print(best.sort_values('bic', ascending=True)['model'].iloc[0].summary())
print(best.sort_values('aic', ascending=True)['model'].iloc[0].summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:29:50 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x 2.2893 0.139 16.474 0.000 2.013 2.565 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x3 3.9099 0.044 88.854 0.000 3.823 3.997 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:29:50 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x 2.2893 0.139 16.474 0.000 2.013 2.565 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x3 3.9099 0.044 88.854 0.000 3.823 3.997 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.998 Model: OLS Adj. R-squared: 0.998 Method: Least Squares F-statistic: 1.680e+04 Date: Mon, 01 Oct 2018 Prob (F-statistic): 1.93e-130 Time: 11:29:50 Log-Likelihood: -126.13 No. Observations: 100 AIC: 260.3 Df Residuals: 96 BIC: 270.7 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.0229 0.117 8.713 0.000 0.790 1.256 x 2.2893 0.139 16.474 0.000 2.013 2.565 x2 3.0189 0.077 38.971 0.000 2.865 3.173 x3 3.9099 0.044 88.854 0.000 3.823 3.997 ============================================================================== Omnibus: 0.465 Durbin-Watson: 1.885 Prob(Omnibus): 0.792 Jarque-Bera (JB): 0.386 Skew: 0.150 Prob(JB): 0.825 Kurtosis: 2.951 Cond. No. 7.10 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
(e) Now fit a lasso model to the simulated data, again using X,X2, . . . , X 10 as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.
# LASSO Regression - Cross-Validation with test MSE
results = pd.DataFrame()
alpha_range = np.linspace(10**-1, 1, num=100)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(data):
train, test = data.iloc[train_idx], data.iloc[test_idx]
errors = []
for alpha in alpha_range:
reg = linear_model.Lasso(alpha=alpha, max_iter=1000000)
reg.fit(train.drop('y', axis=1), train['y'])
error = metrics.mean_squared_error(test['y'], reg.predict(test.drop('y', axis=1)))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda' : alpha_range, 'MSE' : results.mean()})
sns.scatterplot(x='lambda', y='MSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x11999a240>
reg = linear_model.Lasso(alpha=0.15, max_iter=1000000)
reg.fit(data.drop('y', axis=1), data['y']).coef_
# The cubic term has the largest coefficient
# The 5th order and higher terms have very small coefficeints
array([ 1.78982643e+00, 1.52503944e+00, 4.13617586e+00, 5.68803568e-01, 1.11648904e-04, 0.00000000e+00, 0.00000000e+00, -1.94037799e-02, -1.34039474e-03, 1.99471493e-03])
(f) Now generate a response vector Y according to the model Y = β0 + β7X7 + ε, and perform best subset selection and the lasso. Discuss the results obtained.
coeffs = np.array([1,2])
eps = np.random.normal(0,1, 100)
x7 = x ** 7
design_matrix = pd.DataFrame({ 'c' : np.repeat(1, 100),
'x7' : x7})
y = (design_matrix @ coeffs.reshape(-1, 1)) + eps.reshape(-1, 1)
data['y'] = y
best = best_subset(10, 'y', data)
1/10 : 10 combinations, 0.03963327407836914 seconds 2/10 : 45 combinations, 0.18586397171020508 seconds 3/10 : 120 combinations, 0.7698366641998291 seconds 4/10 : 210 combinations, 1.2466669082641602 seconds 5/10 : 252 combinations, 1.5642271041870117 seconds 6/10 : 210 combinations, 1.5838689804077148 seconds 7/10 : 120 combinations, 1.0425281524658203 seconds 8/10 : 45 combinations, 0.4454309940338135 seconds 9/10 : 10 combinations, 0.11003971099853516 seconds 10/10 : 1 combinations, 0.012729883193969727 seconds
# Select between models with different numbers of predictors using adjusted training metrics
best['adj_R_sq'] = best['model'].map(lambda m: m.rsquared_adj)
best['bic'] = best['model'].map(lambda m: m.bic)
best['aic'] = best['model'].map(lambda m: m.aic)
# Plot the error metrics
_, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15,15))
sns.barplot(y='n_predictors', x='adj_R_sq', data=best, ax=ax1, orient='h')
sns.barplot(y='n_predictors', x='bic', data=best, ax=ax2, orient='h')
sns.barplot(y='n_predictors', x='aic', data=best, ax=ax3, orient='h')
<matplotlib.axes._subplots.AxesSubplot at 0x11a1d54a8>
# LASSO Regression - Cross-Validation with test MSE
results = pd.DataFrame()
alpha_range = np.linspace(0.3, 1, num=200)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(data):
train, test = data.iloc[train_idx], data.iloc[test_idx]
errors = []
for alpha in alpha_range:
reg = linear_model.Lasso(alpha=alpha, max_iter=1000000)
reg.fit(train.drop('y', axis=1), train['y'])
error = np.sqrt(metrics.mean_squared_error(test['y'], reg.predict(test.drop('y', axis=1))))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda' : alpha_range, 'RMSE' : results.mean()})
sns.scatterplot(x='lambda', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x11a255978>
reg = linear_model.Lasso(alpha=0.5, max_iter=1000000)
reg.fit(data.drop('y', axis=1), data['y']).coef_
# The 7th order term has the largest coefficient
# The first, second and third order terms have zero coeffecients
array([ 0. , -0. , 0. , -0.62718828, 0.36328617, 0.60771921, 1.62992381, -0.07840308, 0.07485172, -0.01063306])
college_dat = pd.read_csv('college.csv')
college_dat = college_dat.drop(college_dat.columns[0], axis=1)
college_dat['Private'] = college_dat['Private'].map({'Yes' : 1, 'No' : 0})
college_dat = college_dat.apply(pd.to_numeric, errors='coerce')
college_dat.columns = [c.translate({ord(c):None for c in '.'}) for c in college_dat.columns]
college_dat.head()
Private | Apps | Accept | Enroll | Top10perc | Top25perc | FUndergrad | PUndergrad | Outstate | RoomBoard | Books | Personal | PhD | Terminal | SFRatio | percalumni | Expend | GradRate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1660 | 1232 | 721 | 23 | 52 | 2885 | 537 | 7440 | 3300 | 450 | 2200 | 70 | 78 | 18.1 | 12 | 7041 | 60 |
1 | 1 | 2186 | 1924 | 512 | 16 | 29 | 2683 | 1227 | 12280 | 6450 | 750 | 1500 | 29 | 30 | 12.2 | 16 | 10527 | 56 |
2 | 1 | 1428 | 1097 | 336 | 22 | 50 | 1036 | 99 | 11250 | 3750 | 400 | 1165 | 53 | 66 | 12.9 | 30 | 8735 | 54 |
3 | 1 | 417 | 349 | 137 | 60 | 89 | 510 | 63 | 12960 | 5450 | 450 | 875 | 92 | 97 | 7.7 | 37 | 19016 | 59 |
4 | 1 | 193 | 146 | 55 | 16 | 44 | 249 | 869 | 7560 | 4120 | 800 | 1500 | 76 | 72 | 11.9 | 2 | 10922 | 15 |
(a) Split the data set into a training set and a test set.
train, test = model_selection.train_test_split(college_dat, test_size=0.2)
(b) Fit a linear model using least squares on the training set, and report the test error obtained.
f = 'Apps ~ ' + ' + '.join(train.drop('Apps', axis=1).columns)
model = smf.ols(formula=f, data=train).fit()
test_error_ols = np.sqrt(metrics.mean_squared_error(test['Apps'], model.predict(test)))
print('RMSE: {}'.format(test_error_ols))
RMSE: 1097.8135996785918
(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
results = pd.DataFrame()
alpha_range = np.linspace(10**-3, 50, num=1000)
for train_idx, test_idx in model_selection.KFold(n_splits=8).split(college_dat):
train, test = college_dat.iloc[train_idx], college_dat.iloc[test_idx]
errors = []
for alpha in alpha_range:
reg = linear_model.Ridge(alpha=alpha)
reg.fit(train.drop('Apps', axis=1), train['Apps'])
error = np.sqrt(metrics.mean_squared_error(test['Apps'], reg.predict(test.drop('Apps', axis=1))))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda' : alpha_range, 'RMSE' : results.mean()})
sns.scatterplot(x='lambda', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x1196bff98>
test_error_ridge = df.sort_values('RMSE', ascending=True)['RMSE'].iloc[0]
print('RMSE: {}'.format(test_error_ridge))
RMSE: 1085.9207169016076
(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the num- ber of non-zero coefficient estimates.
# LASSO Regression - Cross-Validation with test MSE
results = pd.DataFrame()
alpha_range = np.linspace(0.1, 10, num=200)
for train_idx, test_idx in model_selection.KFold(n_splits=12).split(college_dat):
train, test = college_dat.iloc[train_idx], college_dat.iloc[test_idx]
errors = []
for alpha in alpha_range:
reg = linear_model.Lasso(alpha=alpha, max_iter=1000000)
reg.fit(train.drop('Apps', axis=1), train['Apps'])
error = np.sqrt(metrics.mean_squared_error(test['Apps'], reg.predict(test.drop('Apps', axis=1))))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda' : alpha_range, 'RMSE' : results.mean()})
sns.scatterplot(x='lambda', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x1197b8358>
test_error_lasso = df.sort_values('RMSE', ascending=True)['RMSE'].iloc[0]
print('RMSE: {}'.format(test_error_lasso))
RMSE: 1070.514298450505
reg = linear_model.Lasso(alpha=3, max_iter=1000000)
reg.fit(college_dat.drop('Apps', axis=1), college_dat['Apps']).coef_
# there are no zero coefficients
array([-4.53468031e+02, 1.58612352e+00, -8.80863226e-01, 4.97995026e+01, -1.41633311e+01, 5.86570247e-02, 4.48014731e-02, -8.78149635e-02, 1.49626985e-01, 1.84128994e-02, 3.09317619e-02, -8.49867765e+00, -3.16424017e+00, 1.56502651e+01, 2.40455396e-02, 7.81736691e-02, 8.60426321e+00])
(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
# PCR Regression - Cross-Validation with test MSE
results = pd.DataFrame()
predictors = college_dat.drop('Apps', axis=1).columns
c_range = range(1, len(predictors) + 1)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(college_dat):
train, test = college_dat.iloc[train_idx], college_dat.iloc[test_idx]
errors = []
for c in c_range:
pca = decomposition.PCA(n_components=c)
X = pca.fit_transform(train.drop('Apps', axis=1))
reg = linear_model.LinearRegression()
reg.fit(X, train['Apps'])
test_X = pca.transform(test.drop('Apps', axis=1))
error = np.sqrt(metrics.mean_squared_error(test['Apps'], reg.predict(test_X)))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=c_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'n_components' : c_range, 'RMSE' : results.mean()})
sns.barplot(x='n_components', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x11987e780>
test_error_pcr = df.sort_values('RMSE', ascending=True).iloc[0]['RMSE']
print('RMSE: {}'.format(test_error_pcr))
RMSE: 1098.9615610719898
(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
# PLS Regression - Cross-Validation with test MSE
results = pd.DataFrame()
predictors = college_dat.drop('Apps', axis=1).columns
c_range = range(1, len(predictors) + 1)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(college_dat):
train, test = college_dat.iloc[train_idx], college_dat.iloc[test_idx]
errors = []
for c in c_range:
reg = cross_decomposition.PLSRegression(n_components=c)
reg.fit(train.drop('Apps', axis=1), train['Apps'])
error = np.sqrt(metrics.mean_squared_error(test['Apps'], reg.predict(test.drop('Apps', axis=1))))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=c_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'n_components' : c_range, 'RMSE' : results.mean()})
sns.barplot(x='n_components', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x1199aa898>
test_error_pls = df.sort_values('RMSE', ascending=True).iloc[0]['RMSE']
print('RMSE: {}'.format(test_error_pls))
RMSE: 1098.9586217927474
(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
summary = pd.DataFrame({'rmse' : [test_error_ols, test_error_ridge, test_error_lasso, test_error_pcr, test_error_pls], \
'model': ['ols', 'ridge', 'lasso', 'pcr', 'pls']})
sns.barplot(x='model', y= 'rmse', data=summary)
<matplotlib.axes._subplots.AxesSubplot at 0x119abc518>
summary.sort_values('rmse', ascending=True)
rmse | model | |
---|---|---|
2 | 1070.514298 | lasso |
1 | 1085.920717 | ridge |
0 | 1097.813600 | ols |
4 | 1098.958622 | pls |
3 | 1098.961561 | pcr |
college_dat['Apps'].mean()
3001.6383526383524
(a) Generate a data set with p = 20 features, n = 1,000 observations, and an associated quantitative response vector generated according to the model Y =Xβ+ε, where β has some elements that are exactly equal to zero.
n = 100
p = 20
x = np.random.uniform(0,10,(n, p))
eps = np.random.normal(0, 1, n)
beta = np.random.normal(0, 1, p)
beta[0] = 0
beta[1] = 0
beta[2] = 0
beta[3] = 0
beta[4] = 0
beta[5] = 0
beta[6] = 0
beta[7] = 0
beta[8] = beta[9] ** 2
beta[10] = beta[11] ** 2
y = (x @ beta.reshape(-1, 1)) + eps.reshape(-1, 1)
data = pd.DataFrame(x, columns= ['x' + str(i) for i in range(0, x.shape[1])])
data['y'] = y
data.head()
x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | ... | x11 | x12 | x13 | x14 | x15 | x16 | x17 | x18 | x19 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.783240 | 2.822764 | 2.104245 | 0.085151 | 8.070863 | 9.613367 | 3.890487 | 7.576291 | 5.795171 | 3.736745 | ... | 9.745869 | 1.079770 | 3.823860 | 5.726278 | 0.746411 | 3.317932 | 3.490703 | 2.305381 | 0.182134 | 14.017323 |
1 | 3.872013 | 1.235886 | 5.053738 | 0.075068 | 3.522952 | 8.167472 | 5.532001 | 5.415793 | 8.896883 | 1.439292 | ... | 8.012286 | 1.586732 | 4.754267 | 7.819763 | 3.792867 | 0.562013 | 9.079879 | 0.633988 | 0.269867 | 19.181412 |
2 | 2.868493 | 4.097188 | 5.897901 | 2.579025 | 0.174440 | 2.931412 | 9.650473 | 6.266575 | 1.599330 | 4.434026 | ... | 0.434932 | 7.079660 | 4.791268 | 6.003995 | 5.510795 | 8.790729 | 7.049488 | 2.820926 | 9.893762 | 10.489754 |
3 | 4.747117 | 9.260321 | 7.629206 | 9.956572 | 4.509392 | 8.280505 | 9.809595 | 7.245737 | 2.299474 | 6.779702 | ... | 7.811970 | 1.374622 | 0.747643 | 6.002939 | 0.019969 | 3.797922 | 4.139976 | 1.060459 | 2.778484 | 9.574987 |
4 | 2.116980 | 4.778625 | 0.165573 | 5.720200 | 3.273290 | 9.592775 | 8.044132 | 8.663585 | 9.080542 | 1.654905 | ... | 9.807599 | 6.937180 | 5.899757 | 9.969896 | 2.606421 | 5.331222 | 3.010509 | 1.172996 | 8.545720 | 26.689338 |
5 rows × 21 columns
(b) Split your dataset into a training set containing 100 observations and a test set containing 900 observations.
train, test = model_selection.train_test_split(data, test_size=0.9)
(c) Perform best subset selection on the training set, and plot the training set MSE associated with the best model of each size.
# Best subsets with smf is SLOW.
# Investigating performance of best subsets with sklearn
i = 1
smf_times = []
while i < data.shape[1] - 1 :
cols = ['x' + str(j) for j in range(1, i + 1)]
f = 'y ~ ' + ' + '.join(cols)
tick = time.time()
smf.ols(formula=f, data=data).fit()
tock = time.time()
smf_times.append(tock - tick)
i += 1
sns.regplot(x=[x for x in range(1, data.shape[1] - 1)], y=smf_times)
i = 1
sk_times = []
while i < data.shape[1] - 1 :
cols = ['x' + str(j) for j in range(1, i + 1)]
x = data[cols]
tick = time.time()
model = linear_model.LinearRegression().fit(x, data['y'])
model.score(x, data['y'])
tock = time.time()
sk_times.append(tock - tick)
i += 1
sns.regplot(x=[x for x in range(1, data.shape[1] - 1)], y=sk_times)
# SKlearn kills statsmodels.
np.array(sk_times).mean()
0.0006680865036813836
def n_choose_k(n, k):
return int(np.math.factorial(n) / (np.math.factorial(n - k) * np.math.factorial(k)))
def n_choose_upto_k_sum(n, up_to_k):
return np.array([n_choose_k(n, k) for k in range(1, up_to_k + 1)]).sum()
def best_subset(n_predictors, target_column, data):
i = 1
predictors = data.drop(target_column, axis=1).columns
search_size = n_choose_upto_k_sum(len(predictors), n_predictors)
single_fit_time = 0.0012
print('searching over {} models'.format(search_size))
print('estimated time: {} seconds'.format(single_fit_time * search_size))
top_models = []
top_combs = []
loop_tick = time.time()
while i <= n_predictors:
print('-----------')
print('n choose: {}'.format(i))
cmbs = list(combinations(predictors, i))
print('{} combinations'.format(len(cmbs)))
tick = time.time()
top_model = None
top_score = None
top_comb = None
for s in cmbs:
x = data[list(s)]
model = linear_model.LinearRegression().fit(x, data[target_column])
score = model.score(x, data[target_column])
if top_score == None or score < top_score:
top_model = model
top_score = score
top_comb = s
tock = time.time()
print('done: {} seconds ({} secs/model)'.format(tock - tick, (tock - tick) / len(cmbs)))
top_models.append(top_model)
top_combs.append(top_comb)
tock = time.time()
i += 1
print('--- FINISHED ---')
loop_tock = time.time()
print('total time: {} seconds'.format((loop_tock-loop_tick)))
print('secs/model: {}'.format((loop_tock-loop_tick)/ search_size))
return pd.DataFrame({'model' : top_models, 'n_predictors' : range(1, n_predictors + 1)}), top_combs
df, combs = best_subset(4, 'y', train)
searching over 6195 models estimated time: 7.433999999999999 seconds ----------- n choose: 1 20 combinations done: 0.025238990783691406 seconds (0.0012619495391845703 secs/model) ----------- n choose: 2 190 combinations done: 0.22151517868041992 seconds (0.0011658693614758943 secs/model) ----------- n choose: 3 1140 combinations done: 1.356029987335205 seconds (0.0011894999888905308 secs/model) ----------- n choose: 4 4845 combinations done: 5.599212884902954 seconds (0.001155668294097617 secs/model) --- FINISHED --- total time: 7.204585075378418 seconds secs/model: 0.0011629677280675411
rmses = []
for idx, m in enumerate(df['model']):
comb = combs[idx]
x = test[list(comb)]
preds = m.predict(x)
rmses.append(np.sqrt(metrics.mean_squared_error(test['y'], preds)))
df = pd.DataFrame({'test_rmse' : rmses, 'n_predictors' : np.arange(1, len(rmses) + 1)})
(d) Plot the test set MSE associated with the best model of each size.
sns.barplot(y='n_predictors', x='test_rmse', data=df, orient='h')
<matplotlib.axes._subplots.AxesSubplot at 0x132608320>
(e) For which model size does the test set MSE take on its minimum value? Comment on your results. If it takes on its minimum value for a model containing only an intercept or a model containing all of the features, then play around with the way that you are generating the data in (a) until you come up with a scenario in which the test set MSE is minimized for an intermediate model size.
# Running best subsets for p=20 is going to take ~20 hours!!
# I'm going to do forward stepwise instead.
# Select between models with different numbers of predictors using adjusted training metrics
best = fwd_selection('y', train)
rmses = []
for m in best['model']:
preds = m.predict(test)
rmses.append(np.sqrt(metrics.mean_squared_error(test['y'], preds)))
df = pd.DataFrame({'model' : best['model'] ,'test_rmse' : rmses, 'n_predictors' : np.arange(1, len(rmses) + 1)})
sns.barplot(y='n_predictors', x='test_rmse', data=df, orient='h')
<matplotlib.axes._subplots.AxesSubplot at 0x117a37438>
(f) How does the model at which the test set MSE is minimized compare to the true model used to generate the data? Comment on the coefficient values.
best = df.sort_values('test_rmse', ascending=True).iloc[0]['model']
best.summary()
/usr/local/lib/python3.6/site-packages/scipy/stats/stats.py:1394: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10 "anyway, n=%i" % int(n))
Dep. Variable: | y | R-squared: | 0.889 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.800 |
Method: | Least Squares | F-statistic: | 9.993 |
Date: | Mon, 01 Oct 2018 | Prob (F-statistic): | 0.0133 |
Time: | 19:14:07 | Log-Likelihood: | -19.122 |
No. Observations: | 10 | AIC: | 48.24 |
Df Residuals: | 5 | BIC: | 49.76 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -7.5641 | 3.561 | -2.124 | 0.087 | -16.717 | 1.589 |
x10 | 0.6322 | 0.298 | 2.121 | 0.087 | -0.134 | 1.398 |
x11 | 1.3608 | 0.387 | 3.513 | 0.017 | 0.365 | 2.357 |
x13 | 0.6353 | 0.307 | 2.072 | 0.093 | -0.153 | 1.423 |
x16 | 0.6766 | 0.384 | 1.761 | 0.139 | -0.311 | 1.664 |
Omnibus: | 1.865 | Durbin-Watson: | 1.442 |
---|---|---|---|
Prob(Omnibus): | 0.394 | Jarque-Bera (JB): | 1.269 |
Skew: | -0.768 | Prob(JB): | 0.530 |
Kurtosis: | 2.173 | Cond. No. | 58.1 |
(g) Create a plot displaying: sqrt(sum ((bj - bjr) ^ 2))
for a range of values
of r, where βˆjr is the jth coefficient estimate for the best model containing r coefficients. Comment on what you observe. How does this compare to the test MSE plot from (d)?
beta_lookup = pd.Series(beta, index=['x' + str(i) for i in range(0, len(beta))])
vs = []
i = 0
while i < df.shape[0]:
row = sorted_df.iloc[i]
coeffs = row['model'].params.drop('Intercept')
betas = beta_lookup[coeffs.index]
v = beta_lookup[list(coeffs.index)] - coeffs
vs.append(np.sqrt((v ** 2).sum()))
i += 1
xy = pd.DataFrame({'value': vs, 'n_predictors' : np.arange(1, i + 1)})
_, (ax1, ax2) = plt.subplots(nrows=2, figsize=(10,10))
sns.barplot(y='n_predictors', x='value', data=xy, orient='h', ax=ax1)
sns.barplot(y='n_predictors', x='test_rmse', data=df, orient='h', ax=ax2)
<matplotlib.axes._subplots.AxesSubplot at 0x1215fd2e8>
(a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.
(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross- validation, or some other reasonable alternative, as opposed to using training error.
(c) Does your chosen model involve all of the features in the data set? Why or why not?
boston_data = datasets.load_boston()
df_boston = pd.DataFrame(boston_data.data,columns=boston_data.feature_names)
df_boston['MEDV'] = pd.Series(boston_data.target)
df_boston.head()
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
# Select between models with different numbers of predictors using cross validation
results = pd.DataFrame()
for train_idx, test_idx in model_selection.KFold(n_splits=5).split(df_boston):
train = df_boston.iloc[train_idx]
test = df_boston.iloc[test_idx]
df, combs = best_subset(13, 'CRIM', train)
rmses = []
for idx, m in enumerate(df['model']):
comb = combs[idx]
x = test[list(comb)]
preds = m.predict(x)
rmses.append(np.sqrt(metrics.mean_squared_error(test['CRIM'], preds)))
rmse = pd.DataFrame({'test_rmse' : rmses})
results = results.append(rmse.T, ignore_index=True)
results
searching over 8191 models estimated time: 9.829199999999998 seconds ----------- n choose: 1 13 combinations done: 0.017893075942993164 seconds (0.0013763904571533203 secs/model) ----------- n choose: 2 78 combinations done: 0.08938980102539062 seconds (0.0011460230900691105 secs/model) ----------- n choose: 3 286 combinations done: 0.3350977897644043 seconds (0.0011716705935818333 secs/model) ----------- n choose: 4 715 combinations done: 0.874377965927124 seconds (0.0012229062460519216 secs/model) ----------- n choose: 5 1287 combinations done: 1.4666271209716797 seconds (0.0011395704125654077 secs/model) ----------- n choose: 6 1716 combinations done: 1.9497790336608887 seconds (0.001136234868100751 secs/model) ----------- n choose: 7 1716 combinations done: 1.9710617065429688 seconds (0.0011486373581252732 secs/model) ----------- n choose: 8 1287 combinations done: 1.492173194885254 seconds (0.0011594197318455742 secs/model) ----------- n choose: 9 715 combinations done: 0.8320097923278809 seconds (0.0011636500591998334 secs/model) ----------- n choose: 10 286 combinations done: 0.3377799987792969 seconds (0.0011810489467807583 secs/model) ----------- n choose: 11 78 combinations done: 0.09338688850402832 seconds (0.0011972678013336964 secs/model) ----------- n choose: 12 13 combinations done: 0.018061161041259766 seconds (0.0013893200800969051 secs/model) ----------- n choose: 13 1 combinations done: 0.0015816688537597656 seconds (0.0015816688537597656 secs/model) --- FINISHED --- total time: 9.48312497138977 seconds secs/model: 0.0011577493555597327 searching over 8191 models estimated time: 9.829199999999998 seconds ----------- n choose: 1 13 combinations done: 0.01477503776550293 seconds (0.0011365413665771484 secs/model) ----------- n choose: 2 78 combinations done: 0.09007978439331055 seconds (0.0011548690306834686 secs/model) ----------- n choose: 3 286 combinations done: 0.3248109817504883 seconds (0.0011357027333933156 secs/model) ----------- n choose: 4 715 combinations done: 0.7975690364837646 seconds (0.0011154811699073631 secs/model) ----------- n choose: 5 1287 combinations done: 1.4798541069030762 seconds (0.0011498477909114811 secs/model) ----------- n choose: 6 1716 combinations done: 2.13156795501709 seconds (0.0012421724679586772 secs/model) ----------- n choose: 7 1716 combinations done: 2.1127989292144775 seconds (0.001231234807234544 secs/model) ----------- n choose: 8 1287 combinations done: 1.602759838104248 seconds (0.0012453456395526404 secs/model) ----------- n choose: 9 715 combinations done: 0.9456429481506348 seconds (0.0013225775498610277 secs/model) ----------- n choose: 10 286 combinations done: 0.37534165382385254 seconds (0.0013123834049785055 secs/model) ----------- n choose: 11 78 combinations done: 0.11243391036987305 seconds (0.0014414603893573468 secs/model) ----------- n choose: 12 13 combinations done: 0.018174171447753906 seconds (0.001398013188288762 secs/model) ----------- n choose: 13 1 combinations done: 0.0018270015716552734 seconds (0.0018270015716552734 secs/model) --- FINISHED --- total time: 10.012573003768921 seconds secs/model: 0.0012223871326784179 searching over 8191 models estimated time: 9.829199999999998 seconds ----------- n choose: 1 13 combinations done: 0.015893936157226562 seconds (0.0012226104736328125 secs/model) ----------- n choose: 2 78 combinations done: 0.10020971298217773 seconds (0.0012847399100279196 secs/model) ----------- n choose: 3 286 combinations done: 0.37825489044189453 seconds (0.0013225695469996311 secs/model) ----------- n choose: 4 715 combinations done: 0.8888750076293945 seconds (0.0012431818288523 secs/model) ----------- n choose: 5 1287 combinations done: 1.602207899093628 seconds (0.0012449167825125313 secs/model) ----------- n choose: 6 1716 combinations done: 2.1144068241119385 seconds (0.001232171808923041 secs/model) ----------- n choose: 7 1716 combinations done: 2.12587308883667 seconds (0.0012388537813733508 secs/model) ----------- n choose: 8 1287 combinations done: 1.5516738891601562 seconds (0.001205651817529259 secs/model) ----------- n choose: 9 715 combinations done: 0.883814811706543 seconds (0.0012361046317574027 secs/model) ----------- n choose: 10 286 combinations done: 0.3632950782775879 seconds (0.0012702625114600974 secs/model) ----------- n choose: 11 78 combinations done: 0.09531021118164062 seconds (0.001221925784380008 secs/model) ----------- n choose: 12 13 combinations done: 0.017831802368164062 seconds (0.0013716771052433895 secs/model) ----------- n choose: 13 1 combinations done: 0.0014450550079345703 seconds (0.0014450550079345703 secs/model) --- FINISHED --- total time: 10.142664909362793 seconds secs/model: 0.0012382694310051022 searching over 8191 models estimated time: 9.829199999999998 seconds ----------- n choose: 1 13 combinations done: 0.017348051071166992 seconds (0.0013344654670128455 secs/model) ----------- n choose: 2 78 combinations done: 0.10265707969665527 seconds (0.0013161164063673753 secs/model) ----------- n choose: 3 286 combinations done: 0.3676578998565674 seconds (0.0012855171323656203 secs/model) ----------- n choose: 4 715 combinations done: 0.8679542541503906 seconds (0.00121392203377677 secs/model) ----------- n choose: 5 1287 combinations done: 1.564488172531128 seconds (0.001215608525665212 secs/model) ----------- n choose: 6 1716 combinations done: 2.1087839603424072 seconds (0.0012288950817846197 secs/model) ----------- n choose: 7 1716 combinations done: 2.1522388458251953 seconds (0.0012542184416230741 secs/model) ----------- n choose: 8 1287 combinations done: 1.5691421031951904 seconds (0.0012192246334072963 secs/model) ----------- n choose: 9 715 combinations done: 0.8493759632110596 seconds (0.001187938410085398 secs/model) ----------- n choose: 10 286 combinations done: 0.3537921905517578 seconds (0.0012370356312998525 secs/model) ----------- n choose: 11 78 combinations done: 0.09138798713684082 seconds (0.0011716408607287284 secs/model) ----------- n choose: 12 13 combinations done: 0.01631307601928711 seconds (0.0012548520014836239 secs/model) ----------- n choose: 13 1 combinations done: 0.002048015594482422 seconds (0.002048015594482422 secs/model) --- FINISHED --- total time: 10.066420793533325 seconds secs/model: 0.0012289611516949487 searching over 8191 models estimated time: 9.829199999999998 seconds ----------- n choose: 1 13 combinations done: 0.015763282775878906 seconds (0.0012125602135291467 secs/model) ----------- n choose: 2 78 combinations done: 0.08764505386352539 seconds (0.001123654536711864 secs/model) ----------- n choose: 3 286 combinations done: 0.3282649517059326 seconds (0.001147779551419345 secs/model) ----------- n choose: 4 715 combinations done: 0.8312163352966309 seconds (0.001162540329086197 secs/model) ----------- n choose: 5 1287 combinations done: 1.627681016921997 seconds (0.0012647094148578067 secs/model) ----------- n choose: 6 1716 combinations done: 2.1096181869506836 seconds (0.0012293812278267387 secs/model) ----------- n choose: 7 1716 combinations done: 2.1005899906158447 seconds (0.0012241200411514247 secs/model) ----------- n choose: 8 1287 combinations done: 1.6426849365234375 seconds (0.001276367472046183 secs/model) ----------- n choose: 9 715 combinations done: 0.8941559791564941 seconds (0.0012505678030160757 secs/model) ----------- n choose: 10 286 combinations done: 0.3765528202056885 seconds (0.0013166182524674422 secs/model) ----------- n choose: 11 78 combinations done: 0.0985708236694336 seconds (0.0012637285085824819 secs/model) ----------- n choose: 12 13 combinations done: 0.01629781723022461 seconds (0.0012536782484788161 secs/model) ----------- n choose: 13 1 combinations done: 0.0013277530670166016 seconds (0.0013277530670166016 secs/model) --- FINISHED --- total time: 10.13389277458191 seconds secs/model: 0.0012371984830401549
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.411323 | 4.755900 | 4.784093 | 5.129547 | 2.046046 | 2.147097 | 1.916061 | 2.925837 | 3.284684 | 3.023006 | 2.982842 | 2.266282 | 2.028975 |
1 | 3.782029 | 4.368218 | 4.670622 | 4.736891 | 4.789007 | 4.404211 | 5.677229 | 5.227295 | 7.070136 | 6.761502 | 6.654373 | 3.910064 | 3.350659 |
2 | 4.128744 | 3.383647 | 3.193658 | 3.447774 | 2.880217 | 2.803286 | 2.962172 | 2.837483 | 2.051755 | 1.982576 | 2.524979 | 2.174489 | 2.267941 |
3 | 12.421952 | 12.135320 | 11.946773 | 11.663072 | 11.055813 | 10.782303 | 10.723177 | 10.263207 | 10.244673 | 10.011402 | 11.265268 | 10.567198 | 10.058744 |
4 | 14.559303 | 13.630074 | 13.646259 | 13.835853 | 13.060408 | 12.657820 | 12.651616 | 14.276134 | 13.376914 | 12.917513 | 12.566299 | 11.688774 | 11.411557 |
xy = pd.DataFrame({'test_rmse' : results.mean(), 'n_predictors' : results.columns + 1})
sns.barplot(x='n_predictors', y='test_rmse', data = xy)
<matplotlib.axes._subplots.AxesSubplot at 0x11b4110f0>
# rmse for ols with all predictors (best performing subset)
xy.sort_values('test_rmse', ascending=True)['test_rmse'].iloc[0]
5.823575152341391
# LASSO Regression - Cross-Validation with test MSE
results = pd.DataFrame()
alpha_range = np.linspace(0.1, 10, num=200)
for train_idx, test_idx in model_selection.KFold(n_splits=12).split(df_boston):
train, test = df_boston.iloc[train_idx], df_boston.iloc[test_idx]
errors = []
for alpha in alpha_range:
reg = linear_model.Lasso(alpha=alpha, max_iter=1000000)
reg.fit(train.drop('CRIM', axis=1), train['CRIM'])
error = np.sqrt(metrics.mean_squared_error(test['CRIM'], reg.predict(test.drop('CRIM', axis=1))))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda' : alpha_range, 'RMSE' : results.mean()})
sns.scatterplot(x='lambda', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x119978a58>
lmbda, RMSE = df.sort_values('RMSE', ascending=True).iloc[0]
# best rmse for lasoo
RMSE
4.575267023519401
# RIDGE Regression - Cross-Validation with test MSE
results = pd.DataFrame()
alpha_range = np.linspace(10**-3, 2000, num=1000)
for train_idx, test_idx in model_selection.KFold(n_splits=8).split(df_boston):
train, test = df_boston.iloc[train_idx], df_boston.iloc[test_idx]
errors = []
for alpha in alpha_range:
reg = linear_model.Ridge(alpha=alpha)
reg.fit(train.drop('CRIM', axis=1), train['CRIM'])
error = np.sqrt(metrics.mean_squared_error(test['CRIM'], reg.predict(test.drop('CRIM', axis=1))))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda' : alpha_range, 'RMSE' : results.mean()})
sns.scatterplot(x='lambda', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x11ccc2160>
lmbda, RMSE = df.sort_values('RMSE', ascending=True).iloc[0]
# best rmse for ridge
RMSE
4.897662699449212
# PCR Regression - Cross-Validation with test MSE
results = pd.DataFrame()
predictors = df_boston.drop('CRIM', axis=1).columns
c_range = range(1, len(predictors) + 1)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(df_boston):
train, test = df_boston.iloc[train_idx], df_boston.iloc[test_idx]
errors = []
for c in c_range:
pca = decomposition.PCA(n_components=c)
X = pca.fit_transform(train.drop('CRIM', axis=1))
reg = linear_model.LinearRegression()
reg.fit(X, train['CRIM'])
test_X = pca.transform(test.drop('CRIM', axis=1))
error = np.sqrt(metrics.mean_squared_error(test['CRIM'], reg.predict(test_X)))
errors.append(error)
results = pd.concat([results, pd.DataFrame(errors, index=c_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'n_components' : c_range, 'RMSE' : results.mean()})
sns.barplot(x='n_components', y='RMSE', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x11c530a58>