#!/usr/bin/env python
# coding: utf-8
# # Chapter 5 - Resampling Methods
# - [Lab 5.3.1 The Validation Set Approach](#lab-5.3.1)
# - [Lab 5.3.2 Leave-One-Out Cross-Validation](#lab-5.3.2)
# - [Lab 5.3.3 k-Fold Cross-Validation](#lab-5.3.3)
# - [Lab 5.3.4 The Bootstrap](#lab-5.3.4)
# ### Imports and Configurations
# In[1]:
# Standard imports
import warnings
# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# scikit-learn
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.utils import resample
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')  # render matplotlib figures inline in the notebook
mpl.style.use('ggplot')  # R/ggplot-flavored plot aesthetics, matching the ISLR book's figures
#
# ### Lab 5.3.1 The Validation Set Approach
# In[2]:
# The Auto data set ships with the R ISLR package; fetch it through rpy2
# and convert the R data frame to a pandas DataFrame.
islr = importr('ISLR')
auto = pandas2ri.ri2py(rdata(islr).fetch('Auto')['Auto'])
display(auto)
# In[3]:
# Validation-set estimate of the test MSE for the simple linear model
# mpg ~ horsepower: hold out 196 observations, fit on the remainder.
features = ['horsepower']
response = ['mpg']
X = auto[features]
y = auto[response]
# Fixed seed so the split (and hence the MSE) is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=196, random_state=47)
# Fit on the training half and score on the held-out half
slr_fit = LinearRegression().fit(X_train, y_train)
print("SLR MSE = ", mean_squared_error(y_test, slr_fit.predict(X_test)))
# In[4]:
# Same validation-set experiment with a degree-2 polynomial in horsepower
# (PolynomialFeatures expands X to [1, x, x^2]).
X2 = PolynomialFeatures(degree=2).fit_transform(X)
# Identical split parameters so results are comparable to the SLR fit
X2_train, X2_test, y_train, y_test = train_test_split(
    X2, y, test_size=196, random_state=47)
quad_fit = LinearRegression().fit(X2_train, y_train)
print("Polynomial regression of degree 2: MSE = ",
      mean_squared_error(y_test, quad_fit.predict(X2_test)))
# In[5]:
# And once more with a degree-3 polynomial in horsepower.
X3 = PolynomialFeatures(degree=3).fit_transform(X)
# Identical split parameters so results are comparable across degrees
X3_train, X3_test, y_train, y_test = train_test_split(
    X3, y, test_size=196, random_state=47)
cubic_fit = LinearRegression().fit(X3_train, y_train)
print("Polynomial regression of degree 3: MSE = ",
      mean_squared_error(y_test, cubic_fit.predict(X3_test)))
#
# ### 5.3.2 Leave-One-Out Cross-Validation
# In[6]:
# Polynomial regression over degrees from 1 (simple linear) to 5,
# scored by leave-one-out cross-validation.
# NOTE: the loop body was flush-left in the pasted source, which is a
# SyntaxError in Python -- indentation restored.
auto_poly = LinearRegression()
loocv = LeaveOneOut()
for poly_deg in range(1, 6):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    # sklearn maximizes scores, so MSE is reported negated
    scores = cross_val_score(auto_poly, X_d, y, cv=loocv,
                             scoring='neg_mean_squared_error')
    loocv_mse = scores.mean() * (-1)  # sign-flip to convert score to MSE
    print(' MSE = {}\n'.format(loocv_mse))
#
# ### 5.3.3 k-Fold Cross-Validation
# In[7]:
# Polynomial regression over degrees from 1 (simple linear) to 10,
# scored by 10-fold cross-validation.
auto_poly = LinearRegression()
# shuffle=True is required when random_state is given: scikit-learn
# (>=0.24) raises ValueError for KFold(random_state=...) with
# shuffle=False, and the fixed seed only has an effect on shuffled folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=47)
for poly_deg in range(1, 11):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    # sklearn maximizes scores, so MSE is reported negated
    scores = cross_val_score(auto_poly, X_d, y, cv=kfold,
                             scoring='neg_mean_squared_error')
    kfold_mse = scores.mean() * (-1)  # sign-flip to convert score to MSE
    print(' MSE = {}\n'.format(kfold_mse))
#
# ### 5.3.4 The Bootstrap
# In[8]:
# The Portfolio data set also ships with the R ISLR package.
islr = importr('ISLR')
portfolio = pandas2ri.ri2py(rdata(islr).fetch('Portfolio')['Portfolio'])
display(portfolio)
# In[9]:
# Function to calculate the alpha for portfolio allocation
def alpha(data):
    """
    Estimate the variance-minimizing two-asset allocation weight

        alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2*cov(X, Y))

    (ISLR eq. 5.7).

    data: pandas DataFrame with two columns named X and Y.
    Returns the scalar alpha.

    NOTE: the function body was flush-left in the pasted source (a
    SyntaxError); indentation restored.
    """
    sigma = data.cov()  # sample covariance matrix (pandas default ddof=1)
    var_x = sigma.X['X']
    var_y = sigma.Y['Y']
    cov_xy = sigma.X['Y']
    # return directly instead of binding a local 'alpha' that shadows
    # the function's own name
    return (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)
# Baseline estimate on the original sample
alpha_original = alpha(portfolio)
print("Portfolio alpha = ", alpha_original)
# In[10]:
# Bootstrap with B=1000 on portfolio data
N = portfolio.shape[0]
B = 1000
# Draw N*B rows with replacement in one shot, then carve them into B
# consecutive groups of N rows (np.arange(N*B) // N labels rows 0..B-1):
# each group is one bootstrap replicate of the data set.
portfolio_b = resample(portfolio, n_samples=N * B, random_state=42)
alphas = [alpha(group) for name, group in portfolio_b.groupby(np.arange(N * B) // N)]
alpha_bias = np.mean(alphas) - alpha_original
# Use the sample standard deviation (ddof=1): np.std defaults to the
# population estimator (ddof=0), which slightly understates the bootstrap
# SE and is inconsistent with DataFrame.std (ddof=1) used for the Auto
# coefficient bootstrap; R's sd()/boot also use ddof=1.
alpha_se = np.std(alphas, ddof=1)
alpha_bootstrap = pd.DataFrame([[alpha_original, alpha_bias, alpha_se]],
                               columns=['original', 'bias', 'std. error'], index=['alpha'])
display(alpha_bootstrap)
# In[11]:
# Function to get simple linear regression coefficients for Auto data set
def auto_coef(data, features, response):
    """
    Fit an OLS regression of response on features and return the fitted
    coefficients as [intercept, coef_1, ..., coef_p].

    data: pandas DataFrame sampled from the Auto data set.
    features: list of feature column names.
    response: name (a single string) of the response column.

    NOTE: the function body was flush-left in the pasted source (a
    SyntaxError); indentation restored.
    """
    auto_reg = LinearRegression()
    auto_reg.fit(data[features], data[response])
    # response selects a single column (Series target), so coef_ is 1-D
    return [auto_reg.intercept_] + list(auto_reg.coef_)
# Baseline coefficients from the full (un-resampled) Auto data set.
features = ['horsepower']
response = 'mpg'
coef_names = ['Intercept'] + features
coef_original = pd.DataFrame([auto_coef(auto, features, response)],
                             columns=coef_names, index=[''])
print("\nmpg ~ horsepower coefficients:\n\n", coef_original)
# In[12]:
# Bootstrap with B=1000 on Auto data
# Resample N*B rows with replacement, then split into B consecutive groups
# of N rows each (np.arange(N*B) // N labels rows 0..B-1), so every group
# is one bootstrap replicate; fit the regression on each replicate.
N = auto.shape[0]
B = 1000
auto_b = resample(auto, n_samples=N*B, random_state=42)
coefs = [auto_coef(group, features, response) for name, group in auto_b.groupby(np.arange(N * B) // N)]
coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)
# Series (per-coefficient bootstrap means) minus the 1-row coef_original
# DataFrame aligns on the column labels, giving a DataFrame of biases.
coef_bias = coefs_df.mean() - coef_original
coef_se = coefs_df.std()  # sample std across the B replicates (ddof=1)
# Transpose so coefficients become rows, then lay original/bias/SE side
# by side as columns.
coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)
coef_bootstrap.columns = ['original', 'bias', 'std. error']
display(coef_bootstrap)