Chapter 5 - Resampling Methods

Imports and Configuration

In [1]:
# Standard imports
import warnings

# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri

# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd

# scikit-learn
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.utils import resample
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpl.style.use('ggplot')

Lab 5.3.1 The Validation Set Approach

In [2]:
# Auto dataset is in R ISLR package
islr = importr('ISLR')
auto_rdf = rdata(islr).fetch('Auto')['Auto']
auto = pandas2ri.ri2py(auto_rdf)
display(auto)
      mpg  cylinders  displacement  horsepower  weight  acceleration  year  origin  name
  1  18.0        8.0         307.0       130.0  3504.0          12.0  70.0     1.0  chevrolet chevelle malibu
  2  15.0        8.0         350.0       165.0  3693.0          11.5  70.0     1.0  buick skylark 320
  3  18.0        8.0         318.0       150.0  3436.0          11.0  70.0     1.0  plymouth satellite
  4  16.0        8.0         304.0       150.0  3433.0          12.0  70.0     1.0  amc rebel sst
  5  17.0        8.0         302.0       140.0  3449.0          10.5  70.0     1.0  ford torino
  6  15.0        8.0         429.0       198.0  4341.0          10.0  70.0     1.0  ford galaxie 500
  7  14.0        8.0         454.0       220.0  4354.0           9.0  70.0     1.0  chevrolet impala
  8  14.0        8.0         440.0       215.0  4312.0           8.5  70.0     1.0  plymouth fury iii
  9  14.0        8.0         455.0       225.0  4425.0          10.0  70.0     1.0  pontiac catalina
 10  15.0        8.0         390.0       190.0  3850.0           8.5  70.0     1.0  amc ambassador dpl
 11  15.0        8.0         383.0       170.0  3563.0          10.0  70.0     1.0  dodge challenger se
 12  14.0        8.0         340.0       160.0  3609.0           8.0  70.0     1.0  plymouth 'cuda 340
 13  15.0        8.0         400.0       150.0  3761.0           9.5  70.0     1.0  chevrolet monte carlo
 14  14.0        8.0         455.0       225.0  3086.0          10.0  70.0     1.0  buick estate wagon (sw)
 15  24.0        4.0         113.0        95.0  2372.0          15.0  70.0     3.0  toyota corona mark ii
 16  22.0        6.0         198.0        95.0  2833.0          15.5  70.0     1.0  plymouth duster
 17  18.0        6.0         199.0        97.0  2774.0          15.5  70.0     1.0  amc hornet
 18  21.0        6.0         200.0        85.0  2587.0          16.0  70.0     1.0  ford maverick
 19  27.0        4.0          97.0        88.0  2130.0          14.5  70.0     3.0  datsun pl510
 20  26.0        4.0          97.0        46.0  1835.0          20.5  70.0     2.0  volkswagen 1131 deluxe sedan
 21  25.0        4.0         110.0        87.0  2672.0          17.5  70.0     2.0  peugeot 504
 22  24.0        4.0         107.0        90.0  2430.0          14.5  70.0     2.0  audi 100 ls
 23  25.0        4.0         104.0        95.0  2375.0          17.5  70.0     2.0  saab 99e
 24  26.0        4.0         121.0       113.0  2234.0          12.5  70.0     2.0  bmw 2002
 25  21.0        6.0         199.0        90.0  2648.0          15.0  70.0     1.0  amc gremlin
 26  10.0        8.0         360.0       215.0  4615.0          14.0  70.0     1.0  ford f250
 27  10.0        8.0         307.0       200.0  4376.0          15.0  70.0     1.0  chevy c20
 28  11.0        8.0         318.0       210.0  4382.0          13.5  70.0     1.0  dodge d200
 29   9.0        8.0         304.0       193.0  4732.0          18.5  70.0     1.0  hi 1200d
 30  27.0        4.0          97.0        88.0  2130.0          14.5  71.0     3.0  datsun pl510
 ..   ...        ...           ...         ...     ...           ...   ...     ...  ...
368  28.0        4.0         112.0        88.0  2605.0          19.6  82.0     1.0  chevrolet cavalier
369  27.0        4.0         112.0        88.0  2640.0          18.6  82.0     1.0  chevrolet cavalier wagon
370  34.0        4.0         112.0        88.0  2395.0          18.0  82.0     1.0  chevrolet cavalier 2-door
371  31.0        4.0         112.0        85.0  2575.0          16.2  82.0     1.0  pontiac j2000 se hatchback
372  29.0        4.0         135.0        84.0  2525.0          16.0  82.0     1.0  dodge aries se
373  27.0        4.0         151.0        90.0  2735.0          18.0  82.0     1.0  pontiac phoenix
374  24.0        4.0         140.0        92.0  2865.0          16.4  82.0     1.0  ford fairmont futura
375  36.0        4.0         105.0        74.0  1980.0          15.3  82.0     2.0  volkswagen rabbit l
376  37.0        4.0          91.0        68.0  2025.0          18.2  82.0     3.0  mazda glc custom l
377  31.0        4.0          91.0        68.0  1970.0          17.6  82.0     3.0  mazda glc custom
378  38.0        4.0         105.0        63.0  2125.0          14.7  82.0     1.0  plymouth horizon miser
379  36.0        4.0          98.0        70.0  2125.0          17.3  82.0     1.0  mercury lynx l
380  36.0        4.0         120.0        88.0  2160.0          14.5  82.0     3.0  nissan stanza xe
381  36.0        4.0         107.0        75.0  2205.0          14.5  82.0     3.0  honda accord
382  34.0        4.0         108.0        70.0  2245.0          16.9  82.0     3.0  toyota corolla
383  38.0        4.0          91.0        67.0  1965.0          15.0  82.0     3.0  honda civic
384  32.0        4.0          91.0        67.0  1965.0          15.7  82.0     3.0  honda civic (auto)
385  38.0        4.0          91.0        67.0  1995.0          16.2  82.0     3.0  datsun 310 gx
386  25.0        6.0         181.0       110.0  2945.0          16.4  82.0     1.0  buick century limited
387  38.0        6.0         262.0        85.0  3015.0          17.0  82.0     1.0  oldsmobile cutlass ciera (diesel)
388  26.0        4.0         156.0        92.0  2585.0          14.5  82.0     1.0  chrysler lebaron medallion
389  22.0        6.0         232.0       112.0  2835.0          14.7  82.0     1.0  ford granada l
390  32.0        4.0         144.0        96.0  2665.0          13.9  82.0     3.0  toyota celica gt
391  36.0        4.0         135.0        84.0  2370.0          13.0  82.0     1.0  dodge charger 2.2
392  27.0        4.0         151.0        90.0  2950.0          17.3  82.0     1.0  chevrolet camaro
393  27.0        4.0         140.0        86.0  2790.0          15.6  82.0     1.0  ford mustang gl
394  44.0        4.0          97.0        52.0  2130.0          24.6  82.0     2.0  vw pickup
395  32.0        4.0         135.0        84.0  2295.0          11.6  82.0     1.0  dodge rampage
396  28.0        4.0         120.0        79.0  2625.0          18.6  82.0     1.0  ford ranger
397  31.0        4.0         119.0        82.0  2720.0          19.4  82.0     1.0  chevy s-10

392 rows × 9 columns
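
Note: pandas2ri.ri2py is the rpy2 2.x API. It was removed in rpy2 3.x; the sketch below shows the equivalent conversion there, assuming rpy2 3.x is installed (not run in this notebook):

# Sketch: the same R-to-pandas conversion on rpy2 3.x
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

with localconverter(ro.default_converter + pandas2ri.converter):
    auto = ro.conversion.rpy2py(auto_rdf)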

In [3]:
# Simple linear regression features and response
features = ['horsepower']
response = ['mpg']
X = auto[features]
y = auto[response]

# Split Auto data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=196, random_state=47)

# Regression
auto_slr = LinearRegression()
auto_slr.fit(X_train, y_train)

# Prediction and MSE
y_pred = auto_slr.predict(X_test)
print("SLR MSE = ", mean_squared_error(y_test, y_pred))
SLR MSE =  25.273723993
In [4]:
# Polynomial regression features of degree 2
poly2 = PolynomialFeatures(degree=2)
X2 = poly2.fit_transform(X)

# Split Auto data into train and test sets
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=196, random_state=47)

# Regression
auto_poly2 = LinearRegression()
auto_poly2.fit(X2_train, y_train)

# Prediction and MSE
y2_pred = auto_poly2.predict(X2_test)
print("Polynomial regression of degree 2: MSE = ", mean_squared_error(y_test, y2_pred))
Polynomial regression of degree 2: MSE =  18.8690031195
In [5]:
# Polynomial regression features of degree 3
poly3 = PolynomialFeatures(degree=3)
X3 = poly3.fit_transform(X)

# Split Auto data into train and test sets
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=196, random_state=47)

# Regression
auto_poly3 = LinearRegression()
auto_poly3.fit(X3_train, y_train)

# Prediction and MSE
y3_pred = auto_poly3.predict(X3_test)
print("Polynomial regression of degree 3: MSE = ", mean_squared_error(y_test, y3_pred))
Polynomial regression of degree 3: MSE =  18.8333669959
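
The validation estimate of the test MSE depends heavily on which observations land in the training set. A minimal sketch of that variability, reusing X and y from above; the seeds are arbitrary:

# Sketch: the validation-set MSE varies from split to split
for seed in [1, 7, 47, 123]:
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=196, random_state=seed)
    mse = mean_squared_error(y_te, LinearRegression().fit(X_tr, y_tr).predict(X_te))
    print("seed {:>3}: SLR MSE = {:.3f}".format(seed, mse))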

Lab 5.3.2 Leave-One-Out Cross-Validation

In [6]:
# Polynomial regression over degrees from 1 (simple linear) to 5
auto_poly = LinearRegression()
loocv = LeaveOneOut()

for poly_deg in range(1, 6):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    scores = cross_val_score(auto_poly, X_d, y, cv=loocv, scoring='neg_mean_squared_error')
    loocv_mse = -scores.mean()  # flip sign: the scorer returns negative MSE
    print('  MSE = {}\n'.format(loocv_mse))
Polynomial regression of degree 1:
  MSE = 24.231513517929226


Polynomial regression of degree 2:
  MSE = 19.24821312448939


Polynomial regression of degree 3:
  MSE = 19.334984064114092


Polynomial regression of degree 4:
  MSE = 19.42443030854574


Polynomial regression of degree 5:
  MSE = 19.033219754727583
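
For least squares, LOOCV does not actually require n separate fits: ISLR equation (5.2) gives CV(n) = (1/n) * sum_i ((y_i - yhat_i) / (1 - h_i))^2, where h_i is the i-th leverage. A sketch that checks the degree-1 result above via the hat matrix, computed directly with NumPy:

# Sketch: LOOCV via the leverage shortcut (ISLR eq. 5.2) for the degree-1 fit
X_design = np.column_stack([np.ones(len(X)), X['horsepower'].values])
H = X_design @ np.linalg.inv(X_design.T @ X_design) @ X_design.T  # hat matrix
h = np.diag(H)                                                    # leverages
resid = y['mpg'].values - H @ y['mpg'].values                     # in-sample residuals
print("LOOCV MSE via shortcut =", np.mean((resid / (1 - h)) ** 2))  # ~24.2315, as above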

Lab 5.3.3 k-Fold Cross-Validation

In [7]:
# Polynomial regression over degrees from 1 (simple linear) to 10
auto_poly = LinearRegression()
kfold = KFold(n_splits=10)  # contiguous folds by default; a random_state only takes effect with shuffle=True

for poly_deg in range(1, 11):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    scores = cross_val_score(auto_poly, X_d, y, cv=kfold, scoring='neg_mean_squared_error')
    kfold_mse = -scores.mean()  # flip sign: the scorer returns negative MSE
    print('  MSE = {}\n'.format(kfold_mse))
Polynomial regression of degree 1:
  MSE = 27.439933652339857


Polynomial regression of degree 2:
  MSE = 21.235840055802118


Polynomial regression of degree 3:
  MSE = 21.3366061833284


Polynomial regression of degree 4:
  MSE = 21.353886987563506


Polynomial regression of degree 5:
  MSE = 20.905633737044845


Polynomial regression of degree 6:
  MSE = 20.782704427497574


Polynomial regression of degree 7:
  MSE = 20.953103378424892


Polynomial regression of degree 8:
  MSE = 21.07713162886134


Polynomial regression of degree 9:
  MSE = 21.036781313639857


Polynomial regression of degree 10:
  MSE = 20.98095645636944
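
Because the folds above are contiguous blocks of rows, the k-fold estimate here is deterministic. A sketch with shuffled folds, where a random_state actually takes effect (the seed is arbitrary):

# Sketch: shuffled 10-fold CV for the degree-1 model; the result varies with the seed
kfold_shuffled = KFold(n_splits=10, shuffle=True, random_state=47)
X1 = PolynomialFeatures(degree=1).fit_transform(X)
scores = cross_val_score(LinearRegression(), X1, y, cv=kfold_shuffled,
                         scoring='neg_mean_squared_error')
print("Shuffled 10-fold MSE (degree 1) =", -scores.mean())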

Lab 5.3.4 The Bootstrap

In [8]:
# Portfolio dataset is in R ISLR package
islr = importr('ISLR')
portfolio_rdf = rdata(islr).fetch('Portfolio')['Portfolio']
portfolio = pandas2ri.ri2py(portfolio_rdf)
display(portfolio)
             X          Y
  1  -0.895251  -0.234924
  2  -1.562454  -0.885176
  3  -0.417090   0.271888
  4   1.044356  -0.734198
  5  -0.315568   0.841983
  6  -1.737124  -2.037191
  7   1.966413   1.452957
  8   2.152868  -0.434139
  9  -0.081208   1.450809
 10  -0.891782   0.821016
 11  -0.293202  -1.042391
 12   0.505779   0.608478
 13   0.526751  -0.222493
 14   1.066469   1.231357
 15   0.294016   0.628589
 16   0.042549  -1.267574
 17   1.830970  -0.572752
 18  -0.326937  -0.487472
 19   0.521480   2.565985
 20   1.399868  -0.357836
 21  -0.645448  -1.412431
 22  -0.904352  -0.568305
 23  -1.764586  -0.746273
 24  -1.810485   0.493747
 25  -1.169899  -2.725281
 26  -0.685376  -0.457616
 27   1.090918   0.014495
 28  -0.432340  -0.399831
 29   0.268815  -0.201608
 30  -0.851841  -1.741829
 ..        ...        ...
 71  -0.984357  -1.139160
 72  -1.384992   0.702700
 73  -0.358843  -1.694513
 74  -0.226618   0.801939
 75  -0.941077  -0.733189
 76   2.460336  -0.048373
 77   0.716797   0.602337
 78  -0.248087  -1.018490
 79   1.010773   0.052978
 80   2.313049   1.752359
 81   0.835180   0.985715
 82  -1.071903  -1.247298
 83  -1.650526   0.215465
 84  -0.600486  -0.420941
 85  -0.058529   0.127621
 86   0.075727  -0.522149
 87  -1.157832   0.590894
 88   1.673606   0.114623
 89  -1.043988  -0.418944
 90   0.014687  -0.558747
 91   0.675322   1.482630
 92   1.778342   0.942774
 93  -1.295764  -1.085204
 94   0.079602  -0.539101
 95   2.260858   0.673225
 96   0.479091   1.454774
 97  -0.535020  -0.399175
 98  -0.773129  -0.957175
 99   0.403634   1.396038
100  -0.588496  -0.497285

100 rows × 2 columns

In [9]:
# Function to calculate the alpha for portfolio allocation (ISLR eq. 5.7)
def alpha(data):
    """
    data: pandas DataFrame with two columns, X and Y.

    Returns the alpha that minimizes Var(alpha * X + (1 - alpha) * Y).
    """

    sigma = data.cov()  # covariance matrix
    var_x = sigma.X['X']
    var_y = sigma.Y['Y']
    cov_xy = sigma.X['Y']
    return (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)
alpha_original = alpha(portfolio)
print("Portfolio alpha = ", alpha_original)
Portfolio alpha =  0.575832074593
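
alpha() implements the ISLR (5.7) estimate of the alpha that minimizes Var(alpha * X + (1 - alpha) * Y). A quick sketch that sanity-checks it on the Portfolio data by brute-force grid search (the grid resolution is arbitrary):

# Sketch: minimize the sample variance of a*X + (1-a)*Y over a grid of a values
grid = np.linspace(0, 1, 1001)
variances = [np.var(a * portfolio.X + (1 - a) * portfolio.Y, ddof=1) for a in grid]
print("grid-search alpha =", grid[np.argmin(variances)])  # ~0.576, matching alpha_original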
In [10]:
# Bootstrap with B=1000 on portfolio data
N = portfolio.shape[0]
B = 1000
portfolio_b = resample(portfolio, n_samples=N*B, random_state=42)  # draw all B*N rows at once
alphas = [alpha(group) for name, group in portfolio_b.groupby(np.arange(N * B) // N)]  # B samples of size N
alpha_bias = np.mean(alphas) - alpha_original
alpha_se = np.std(alphas)  # population SD; pass ddof=1 for the sample SD
alpha_bootstrap = pd.DataFrame([[alpha_original, alpha_bias, alpha_se],],
                              columns=['original', 'bias', 'std. error'], index=['alpha'])
display(alpha_bootstrap)
       original      bias  std. error
alpha  0.575832  0.001963    0.089929
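
The cell above draws all B*N rows with a single call to resample and then slices them into B groups. An equivalent, more explicit sketch draws one bootstrap sample per iteration; seeding each draw with the loop index is an arbitrary choice:

# Sketch: explicit bootstrap loop, equivalent in spirit to the groupby version
alphas_loop = [alpha(resample(portfolio, n_samples=N, random_state=b)) for b in range(B)]
print("SE(alpha) =", np.std(alphas_loop, ddof=1))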
In [11]:
# Function to get simple linear regression coefficients for the Auto data set
def auto_coef(data, features, response):
    """
    data: pandas DataFrame sampled from the Auto data set
    features: a list of feature-name strings
    response: a string naming the response column
    """

    auto_reg = LinearRegression()
    auto_reg.fit(data[features], data[response])
    return [auto_reg.intercept_] + list(auto_reg.coef_)

features = ['horsepower']
response = 'mpg'
coef_original = pd.DataFrame([auto_coef(auto, features, response)], columns=['Intercept'] + features, index=[''])
print("\nmpg ~ horsepower coefficients:\n\n", coef_original)
mpg ~ horsepower coefficients:

   Intercept  horsepower
  39.935861   -0.157845
In [12]:
# Bootstrap with B=1000 on Auto data
N = auto.shape[0]
B = 1000
auto_b = resample(auto, n_samples=N*B, random_state=42)
coefs = [auto_coef(group, features, response) for name, group in auto_b.groupby(np.arange(N * B) // N)]
coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)
coef_bias = coefs_df.mean() - coef_original
coef_se = coefs_df.std()
coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)
coef_bootstrap.columns = ['original', 'bias', 'std. error']
display(coef_bootstrap)
             original       bias  std. error
Intercept   39.935861   0.033521    0.869087
horsepower  -0.157845  -0.000500    0.007503
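
For comparison, the formula-based standard errors from Chapter 3 can be computed with statsmodels (an extra dependency, not imported at the top of this notebook); ISLR reports 0.717 for the intercept and 0.0064 for the horsepower slope. The bootstrap SEs above are somewhat larger, which ISLR attributes to the formula SEs resting on model assumptions the bootstrap does not make:

# Sketch: formula-based OLS standard errors for mpg ~ horsepower via statsmodels
import statsmodels.api as sm

ols_fit = sm.OLS(auto['mpg'], sm.add_constant(auto[['horsepower']])).fit()
print(ols_fit.bse)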