Chapter 5 - Resampling Methods

Imports and Configuration

In [1]:
# Standard imports
import warnings

# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri

# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd

# scikit-learn
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.utils import resample
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpl.style.use('ggplot')

Lab 5.3.1 The Validation Set Approach

In [2]:
# Auto dataset is in R ISLR package
islr = importr('ISLR')
auto_rdf = rdata(islr).fetch('Auto')['Auto']
auto = pandas2ri.ri2py(auto_rdf)
display(auto)
      mpg  cylinders  displacement  horsepower  weight  acceleration  year  origin  name
  1  18.0        8.0         307.0       130.0  3504.0          12.0  70.0     1.0  chevrolet chevelle malibu
  2  15.0        8.0         350.0       165.0  3693.0          11.5  70.0     1.0  buick skylark 320
  3  18.0        8.0         318.0       150.0  3436.0          11.0  70.0     1.0  plymouth satellite
  4  16.0        8.0         304.0       150.0  3433.0          12.0  70.0     1.0  amc rebel sst
  5  17.0        8.0         302.0       140.0  3449.0          10.5  70.0     1.0  ford torino
  6  15.0        8.0         429.0       198.0  4341.0          10.0  70.0     1.0  ford galaxie 500
  7  14.0        8.0         454.0       220.0  4354.0           9.0  70.0     1.0  chevrolet impala
  8  14.0        8.0         440.0       215.0  4312.0           8.5  70.0     1.0  plymouth fury iii
  9  14.0        8.0         455.0       225.0  4425.0          10.0  70.0     1.0  pontiac catalina
 10  15.0        8.0         390.0       190.0  3850.0           8.5  70.0     1.0  amc ambassador dpl
 11  15.0        8.0         383.0       170.0  3563.0          10.0  70.0     1.0  dodge challenger se
 12  14.0        8.0         340.0       160.0  3609.0           8.0  70.0     1.0  plymouth 'cuda 340
 13  15.0        8.0         400.0       150.0  3761.0           9.5  70.0     1.0  chevrolet monte carlo
 14  14.0        8.0         455.0       225.0  3086.0          10.0  70.0     1.0  buick estate wagon (sw)
 15  24.0        4.0         113.0        95.0  2372.0          15.0  70.0     3.0  toyota corona mark ii
 16  22.0        6.0         198.0        95.0  2833.0          15.5  70.0     1.0  plymouth duster
 17  18.0        6.0         199.0        97.0  2774.0          15.5  70.0     1.0  amc hornet
 18  21.0        6.0         200.0        85.0  2587.0          16.0  70.0     1.0  ford maverick
 19  27.0        4.0          97.0        88.0  2130.0          14.5  70.0     3.0  datsun pl510
 20  26.0        4.0          97.0        46.0  1835.0          20.5  70.0     2.0  volkswagen 1131 deluxe sedan
 21  25.0        4.0         110.0        87.0  2672.0          17.5  70.0     2.0  peugeot 504
 22  24.0        4.0         107.0        90.0  2430.0          14.5  70.0     2.0  audi 100 ls
 23  25.0        4.0         104.0        95.0  2375.0          17.5  70.0     2.0  saab 99e
 24  26.0        4.0         121.0       113.0  2234.0          12.5  70.0     2.0  bmw 2002
 25  21.0        6.0         199.0        90.0  2648.0          15.0  70.0     1.0  amc gremlin
 26  10.0        8.0         360.0       215.0  4615.0          14.0  70.0     1.0  ford f250
 27  10.0        8.0         307.0       200.0  4376.0          15.0  70.0     1.0  chevy c20
 28  11.0        8.0         318.0       210.0  4382.0          13.5  70.0     1.0  dodge d200
 29   9.0        8.0         304.0       193.0  4732.0          18.5  70.0     1.0  hi 1200d
 30  27.0        4.0          97.0        88.0  2130.0          14.5  71.0     3.0  datsun pl510
 ..   ...        ...           ...         ...     ...           ...   ...     ...  ...
368  28.0        4.0         112.0        88.0  2605.0          19.6  82.0     1.0  chevrolet cavalier
369  27.0        4.0         112.0        88.0  2640.0          18.6  82.0     1.0  chevrolet cavalier wagon
370  34.0        4.0         112.0        88.0  2395.0          18.0  82.0     1.0  chevrolet cavalier 2-door
371  31.0        4.0         112.0        85.0  2575.0          16.2  82.0     1.0  pontiac j2000 se hatchback
372  29.0        4.0         135.0        84.0  2525.0          16.0  82.0     1.0  dodge aries se
373  27.0        4.0         151.0        90.0  2735.0          18.0  82.0     1.0  pontiac phoenix
374  24.0        4.0         140.0        92.0  2865.0          16.4  82.0     1.0  ford fairmont futura
375  36.0        4.0         105.0        74.0  1980.0          15.3  82.0     2.0  volkswagen rabbit l
376  37.0        4.0          91.0        68.0  2025.0          18.2  82.0     3.0  mazda glc custom l
377  31.0        4.0          91.0        68.0  1970.0          17.6  82.0     3.0  mazda glc custom
378  38.0        4.0         105.0        63.0  2125.0          14.7  82.0     1.0  plymouth horizon miser
379  36.0        4.0          98.0        70.0  2125.0          17.3  82.0     1.0  mercury lynx l
380  36.0        4.0         120.0        88.0  2160.0          14.5  82.0     3.0  nissan stanza xe
381  36.0        4.0         107.0        75.0  2205.0          14.5  82.0     3.0  honda accord
382  34.0        4.0         108.0        70.0  2245.0          16.9  82.0     3.0  toyota corolla
383  38.0        4.0          91.0        67.0  1965.0          15.0  82.0     3.0  honda civic
384  32.0        4.0          91.0        67.0  1965.0          15.7  82.0     3.0  honda civic (auto)
385  38.0        4.0          91.0        67.0  1995.0          16.2  82.0     3.0  datsun 310 gx
386  25.0        6.0         181.0       110.0  2945.0          16.4  82.0     1.0  buick century limited
387  38.0        6.0         262.0        85.0  3015.0          17.0  82.0     1.0  oldsmobile cutlass ciera (diesel)
388  26.0        4.0         156.0        92.0  2585.0          14.5  82.0     1.0  chrysler lebaron medallion
389  22.0        6.0         232.0       112.0  2835.0          14.7  82.0     1.0  ford granada l
390  32.0        4.0         144.0        96.0  2665.0          13.9  82.0     3.0  toyota celica gt
391  36.0        4.0         135.0        84.0  2370.0          13.0  82.0     1.0  dodge charger 2.2
392  27.0        4.0         151.0        90.0  2950.0          17.3  82.0     1.0  chevrolet camaro
393  27.0        4.0         140.0        86.0  2790.0          15.6  82.0     1.0  ford mustang gl
394  44.0        4.0          97.0        52.0  2130.0          24.6  82.0     2.0  vw pickup
395  32.0        4.0         135.0        84.0  2295.0          11.6  82.0     1.0  dodge rampage
396  28.0        4.0         120.0        79.0  2625.0          18.6  82.0     1.0  ford ranger
397  31.0        4.0         119.0        82.0  2720.0          19.4  82.0     1.0  chevy s-10

392 rows × 9 columns
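
Note: pandas2ri.ri2py is the rpy2 2.x API. It was removed in rpy2 3.x; the sketch below shows the equivalent conversion there, assuming rpy2 3.x is installed (not run in this notebook):

# Sketch: the same R-to-pandas conversion on rpy2 3.x
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

with localconverter(ro.default_converter + pandas2ri.converter):
    auto = ro.conversion.rpy2py(auto_rdf)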

In [3]:
# Simple linear regression features and response
features = ['horsepower']
response = ['mpg']
X = auto[features]
y = auto[response]

# Split Auto data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=196, random_state=47)

# Regression
auto_slr = LinearRegression()
auto_slr.fit(X_train, y_train)

# Prediction and MSE
y_pred = auto_slr.predict(X_test)
print("SLR MSE = ", mean_squared_error(y_test, y_pred))
SLR MSE =  25.273723993
In [4]:
# Polynomial regression features of degree 2
poly2 = PolynomialFeatures(degree=2)
X2 = poly2.fit_transform(X)

# Split Auto data into train and test sets
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=196, random_state=47)

# Regression
auto_poly2 = LinearRegression()
auto_poly2.fit(X2_train, y_train)

# Prediction and MSE
y2_pred = auto_poly2.predict(X2_test)
print("Polynomial regression of degree 2: MSE = ", mean_squared_error(y_test, y2_pred))
Polynomial regression of degree 2: MSE =  18.8690031195
In [5]:
# Polynomial regression features of degree 3
poly3 = PolynomialFeatures(degree=3)
X3 = poly3.fit_transform(X)

# Split Auto data into train and test sets
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=196, random_state=47)

# Regression
auto_poly3 = LinearRegression()
auto_poly3.fit(X3_train, y_train)

# Prediction and MSE
y3_pred = auto_poly3.predict(X3_test)
print("Polynomial regression of degree 3: MSE = ", mean_squared_error(y_test, y3_pred))
Polynomial regression of degree 3: MSE =  18.8333669959
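
The validation estimate of the test MSE depends heavily on which observations land in the training set. A minimal sketch of that variability, reusing X and y from above; the seeds are arbitrary:

# Sketch: the validation-set MSE varies from split to split
for seed in [1, 7, 47, 123]:
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=196, random_state=seed)
    mse = mean_squared_error(y_te, LinearRegression().fit(X_tr, y_tr).predict(X_te))
    print("seed {:>3}: SLR MSE = {:.3f}".format(seed, mse))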

Lab 5.3.2 Leave-One-Out Cross-Validation

In [6]:
# Polynomial regression over degrees from 1 (simple linear) to 5
auto_poly = LinearRegression()
loocv = LeaveOneOut()

for poly_deg in range(1, 6):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    scores = cross_val_score(auto_poly, X_d, y, cv=loocv, scoring='neg_mean_squared_error')
    loocv_mse = -scores.mean()  # flip sign: the scorer returns negative MSE
    print('  MSE = {}\n'.format(loocv_mse))
Polynomial regression of degree 1:
  MSE = 24.231513517929226


Polynomial regression of degree 2:
  MSE = 19.24821312448939


Polynomial regression of degree 3:
  MSE = 19.334984064114092


Polynomial regression of degree 4:
  MSE = 19.42443030854574


Polynomial regression of degree 5:
  MSE = 19.033219754727583
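
For least squares, LOOCV does not actually require n separate fits: ISLR equation (5.2) gives CV(n) = (1/n) * sum_i ((y_i - yhat_i) / (1 - h_i))^2, where h_i is the i-th leverage. A sketch that checks the degree-1 result above via the hat matrix, computed directly with NumPy:

# Sketch: LOOCV via the leverage shortcut (ISLR eq. 5.2) for the degree-1 fit
X_design = np.column_stack([np.ones(len(X)), X['horsepower'].values])
H = X_design @ np.linalg.inv(X_design.T @ X_design) @ X_design.T  # hat matrix
h = np.diag(H)                                                    # leverages
resid = y['mpg'].values - H @ y['mpg'].values                     # in-sample residuals
print("LOOCV MSE via shortcut =", np.mean((resid / (1 - h)) ** 2))  # ~24.2315, as above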

Lab 5.3.3 k-Fold Cross-Validation

In [7]:
# Polynomial regression over degrees from 1 (simple linear) to 10
auto_poly = LinearRegression()
kfold = KFold(n_splits=10)  # contiguous folds by default; a random_state only takes effect with shuffle=True

for poly_deg in range(1, 11):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    scores = cross_val_score(auto_poly, X_d, y, cv=kfold, scoring='neg_mean_squared_error')
    kfold_mse = -scores.mean()  # flip sign: the scorer returns negative MSE
    print('  MSE = {}\n'.format(kfold_mse))
Polynomial regression of degree 1:
  MSE = 27.439933652339857


Polynomial regression of degree 2:
  MSE = 21.235840055802118


Polynomial regression of degree 3:
  MSE = 21.3366061833284


Polynomial regression of degree 4:
  MSE = 21.353886987563506


Polynomial regression of degree 5:
  MSE = 20.905633737044845


Polynomial regression of degree 6:
  MSE = 20.782704427497574


Polynomial regression of degree 7:
  MSE = 20.953103378424892


Polynomial regression of degree 8:
  MSE = 21.07713162886134


Polynomial regression of degree 9:
  MSE = 21.036781313639857


Polynomial regression of degree 10:
  MSE = 20.98095645636944
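
Because the folds above are contiguous blocks of rows, the k-fold estimate here is deterministic. A sketch with shuffled folds, where a random_state actually takes effect (the seed is arbitrary):

# Sketch: shuffled 10-fold CV for the degree-1 model; the result varies with the seed
kfold_shuffled = KFold(n_splits=10, shuffle=True, random_state=47)
X1 = PolynomialFeatures(degree=1).fit_transform(X)
scores = cross_val_score(LinearRegression(), X1, y, cv=kfold_shuffled,
                         scoring='neg_mean_squared_error')
print("Shuffled 10-fold MSE (degree 1) =", -scores.mean())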

Lab 5.3.4 The Bootstrap

In [8]:
# Portfolio dataset is in R ISLR package
islr = importr('ISLR')
portfolio_rdf = rdata(islr).fetch('Portfolio')['Portfolio']
portfolio = pandas2ri.ri2py(portfolio_rdf)
display(portfolio)
             X          Y
  1  -0.895251  -0.234924
  2  -1.562454  -0.885176
  3  -0.417090   0.271888
  4   1.044356  -0.734198
  5  -0.315568   0.841983
  6  -1.737124  -2.037191
  7   1.966413   1.452957
  8   2.152868  -0.434139
  9  -0.081208   1.450809
 10  -0.891782   0.821016
 11  -0.293202  -1.042391
 12   0.505779   0.608478
 13   0.526751  -0.222493
 14   1.066469   1.231357
 15   0.294016   0.628589
 16   0.042549  -1.267574
 17   1.830970  -0.572752
 18  -0.326937  -0.487472
 19   0.521480   2.565985
 20   1.399868  -0.357836
 21  -0.645448  -1.412431
 22  -0.904352  -0.568305
 23  -1.764586  -0.746273
 24  -1.810485   0.493747
 25  -1.169899  -2.725281
 26  -0.685376  -0.457616
 27   1.090918   0.014495
 28  -0.432340  -0.399831
 29   0.268815  -0.201608
 30  -0.851841  -1.741829
 ..        ...        ...
 71  -0.984357  -1.139160
 72  -1.384992   0.702700
 73  -0.358843  -1.694513
 74  -0.226618   0.801939
 75  -0.941077  -0.733189
 76   2.460336  -0.048373
 77   0.716797   0.602337
 78  -0.248087  -1.018490
 79   1.010773   0.052978
 80   2.313049   1.752359
 81   0.835180   0.985715
 82  -1.071903  -1.247298
 83  -1.650526   0.215465
 84  -0.600486  -0.420941
 85  -0.058529   0.127621
 86   0.075727  -0.522149
 87  -1.157832   0.590894
 88   1.673606   0.114623
 89  -1.043988  -0.418944
 90   0.014687  -0.558747
 91   0.675322   1.482630
 92   1.778342   0.942774
 93  -1.295764  -1.085204
 94   0.079602  -0.539101
 95   2.260858   0.673225
 96   0.479091   1.454774
 97  -0.535020  -0.399175
 98  -0.773129  -0.957175
 99   0.403634   1.396038
100  -0.588496  -0.497285

100 rows × 2 columns

In [9]:
# Function to calculate the alpha for portfolio allocation (ISLR eq. 5.7)
def alpha(data):
    """
    data: pandas DataFrame with two columns, X and Y.

    Returns the alpha that minimizes Var(alpha * X + (1 - alpha) * Y).
    """

    sigma = data.cov()  # covariance matrix
    var_x = sigma.X['X']
    var_y = sigma.Y['Y']
    cov_xy = sigma.X['Y']
    return (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)
alpha_original = alpha(portfolio)
print("Portfolio alpha = ", alpha_original)
Portfolio alpha =  0.575832074593
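
alpha() implements the ISLR (5.7) estimate of the alpha that minimizes Var(alpha * X + (1 - alpha) * Y). A quick sketch that sanity-checks it on the Portfolio data by brute-force grid search (the grid resolution is arbitrary):

# Sketch: minimize the sample variance of a*X + (1-a)*Y over a grid of a values
grid = np.linspace(0, 1, 1001)
variances = [np.var(a * portfolio.X + (1 - a) * portfolio.Y, ddof=1) for a in grid]
print("grid-search alpha =", grid[np.argmin(variances)])  # ~0.576, matching alpha_original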
In [10]:
# Bootstrap with B=1000 on portfolio data
N = portfolio.shape[0]
B = 1000
portfolio_b = resample(portfolio, n_samples=N*B, random_state=42)  # draw all B*N rows at once
alphas = [alpha(group) for name, group in portfolio_b.groupby(np.arange(N * B) // N)]  # B samples of size N
alpha_bias = np.mean(alphas) - alpha_original
alpha_se = np.std(alphas)  # population SD; pass ddof=1 for the sample SD
alpha_bootstrap = pd.DataFrame([[alpha_original, alpha_bias, alpha_se],],
                              columns=['original', 'bias', 'std. error'], index=['alpha'])
display(alpha_bootstrap)
       original      bias  std. error
alpha  0.575832  0.001963    0.089929
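
The cell above draws all B*N rows with a single call to resample and then slices them into B groups. An equivalent, more explicit sketch draws one bootstrap sample per iteration; seeding each draw with the loop index is an arbitrary choice:

# Sketch: explicit bootstrap loop, equivalent in spirit to the groupby version
alphas_loop = [alpha(resample(portfolio, n_samples=N, random_state=b)) for b in range(B)]
print("SE(alpha) =", np.std(alphas_loop, ddof=1))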
In [11]:
# Function to get simple linear regression coefficients for the Auto data set
def auto_coef(data, features, response):
    """
    data: pandas DataFrame sampled from the Auto data set
    features: a list of feature-name strings
    response: a string naming the response column
    """

    auto_reg = LinearRegression()
    auto_reg.fit(data[features], data[response])
    return [auto_reg.intercept_] + list(auto_reg.coef_)

features = ['horsepower']
response = 'mpg'
coef_original = pd.DataFrame([auto_coef(auto, features, response)], columns=['Intercept'] + features, index=[''])
print("\nmpg ~ horsepower coefficients:\n\n", coef_original)
mpg ~ horsepower coefficients:

   Intercept  horsepower
  39.935861   -0.157845
In [12]:
# Bootstrap with B=1000 on Auto data
N = auto.shape[0]
B = 1000
auto_b = resample(auto, n_samples=N*B, random_state=42)
coefs = [auto_coef(group, features, response) for name, group in auto_b.groupby(np.arange(N * B) // N)]
coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)
coef_bias = coefs_df.mean() - coef_original
coef_se = coefs_df.std()
coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)
coef_bootstrap.columns = ['original', 'bias', 'std. error']
display(coef_bootstrap)
             original       bias  std. error
Intercept   39.935861   0.033521    0.869087
horsepower  -0.157845  -0.000500    0.007503
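
For comparison, the formula-based standard errors from Chapter 3 can be computed with statsmodels (an extra dependency, not imported at the top of this notebook); ISLR reports 0.717 for the intercept and 0.0064 for the horsepower slope. The bootstrap SEs above are somewhat larger, which ISLR attributes to the formula SEs resting on model assumptions the bootstrap does not make:

# Sketch: formula-based OLS standard errors for mpg ~ horsepower via statsmodels
import statsmodels.api as sm

ols_fit = sm.OLS(auto['mpg'], sm.add_constant(auto[['horsepower']])).fit()
print(ols_fit.bse)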