#!/usr/bin/env python
# coding: utf-8

# # Chapter 5 - Resampling Methods

# - [Lab 5.3.1 The Validation Set Approach](#lab-5.3.1)
# - [Lab 5.3.2 Leave-One-Out Cross-Validation](#lab-5.3.2)
# - [Lab 5.3.3 k-Fold Cross-Validation](#lab-5.3.3)
# - [Lab 5.3.4 The Bootstrap](#lab-5.3.4)

# ### Imports and Configurations

# In[1]:

# Standard imports
import warnings

# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri

# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd

# scikit-learn
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.utils import resample
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
mpl.style.use('ggplot')


# ### Lab 5.3.1 The Validation Set Approach

# In[2]:

# The Auto dataset is in the R ISLR package
islr = importr('ISLR')
auto_rdf = rdata(islr).fetch('Auto')['Auto']
auto = pandas2ri.ri2py(auto_rdf)
display(auto)


# In[3]:

# Simple linear regression features and response
features = ['horsepower']
response = ['mpg']
X = auto[features]
y = auto[response]

# Split the Auto data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=196, random_state=47)

# Regression
auto_slr = LinearRegression()
auto_slr.fit(X_train, y_train)

# Prediction and MSE
y_pred = auto_slr.predict(X_test)
print("SLR MSE = ", mean_squared_error(y_test, y_pred))


# In[4]:

# Polynomial regression features of degree 2
poly2 = PolynomialFeatures(degree=2)
X2 = poly2.fit_transform(X)

# Split the Auto data into train and test sets
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=196, random_state=47)

# Regression
auto_poly2 = LinearRegression()
auto_poly2.fit(X2_train, y_train)

# Prediction and MSE
y2_pred = auto_poly2.predict(X2_test)
print("Polynomial regression of degree 2: MSE = ", mean_squared_error(y_test, y2_pred))


# In[5]:

# Polynomial regression features of degree 3
poly3 = PolynomialFeatures(degree=3)
X3 = poly3.fit_transform(X)

# Split the Auto data into train and test sets
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=196, random_state=47)

# Regression
auto_poly3 = LinearRegression()
auto_poly3.fit(X3_train, y_train)

# Prediction and MSE
y3_pred = auto_poly3.predict(X3_test)
print("Polynomial regression of degree 3: MSE = ", mean_squared_error(y_test, y3_pred))


# ### Lab 5.3.2 Leave-One-Out Cross-Validation

# In[6]:

# Polynomial regression over degrees from 1 (simple linear) to 5
auto_poly = LinearRegression()
loocv = LeaveOneOut()
for poly_deg in range(1, 6):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    scores = cross_val_score(auto_poly, X_d, y, cv=loocv, scoring='neg_mean_squared_error')
    loocv_mse = scores.mean() * (-1)  # sign-flip to convert score to MSE
    print(' MSE = {}\n'.format(loocv_mse))
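# For linear least squares, LOOCV does not actually require n separate model
# fits: ISLR's shortcut formula (Eq. 5.2), CV(n) = (1/n) * sum_i ((y_i - yhat_i) / (1 - h_i))^2,
# reproduces it from a single fit, where h_i is the leverage of observation i.
# The cell below is a minimal sketch of this shortcut for the degree-1 model;
# it is an addition to the lab, not part of the original ISLR code.

# In[ ]:

# LOOCV MSE via the leverage shortcut, from one least-squares fit
Xd = PolynomialFeatures(degree=1).fit_transform(X)  # includes the intercept column
yv = np.asarray(y).ravel()
beta, *_ = np.linalg.lstsq(Xd, yv, rcond=None)      # single least-squares fit
resid = yv - Xd @ beta
# Leverages are the diagonal of the hat matrix H = X (X'X)^{-1} X'
h = np.einsum('ij,ji->i', Xd, np.linalg.solve(Xd.T @ Xd, Xd.T))
loocv_shortcut = np.mean((resid / (1 - h)) ** 2)
print("LOOCV MSE via shortcut =", loocv_shortcut)   # should match the degree-1 result above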
# ### Lab 5.3.3 k-Fold Cross-Validation

# In[7]:

# Polynomial regression over degrees from 1 (simple linear) to 10
auto_poly = LinearRegression()
kfold = KFold(n_splits=10, shuffle=True, random_state=47)  # random_state only takes effect with shuffle=True
for poly_deg in range(1, 11):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    scores = cross_val_score(auto_poly, X_d, y, cv=kfold, scoring='neg_mean_squared_error')
    kfold_mse = scores.mean() * (-1)  # sign-flip to convert score to MSE
    print(' MSE = {}\n'.format(kfold_mse))


# ### Lab 5.3.4 The Bootstrap

# In[8]:

# The Portfolio dataset is in the R ISLR package
islr = importr('ISLR')
portfolio_rdf = rdata(islr).fetch('Portfolio')['Portfolio']
portfolio = pandas2ri.ri2py(portfolio_rdf)
display(portfolio)


# In[9]:

# Function to calculate the alpha for portfolio allocation
def alpha(data):
    """
    data: pandas dataframe with two columns X and Y.
    """
    sigma = data.cov()  # covariance matrix
    var_x = sigma.X['X']
    var_y = sigma.Y['Y']
    cov_xy = sigma.X['Y']
    alpha = (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)
    return alpha

alpha_original = alpha(portfolio)
print("Portfolio alpha = ", alpha_original)


# In[10]:

# Bootstrap with B=1000 on the Portfolio data
N = portfolio.shape[0]
B = 1000
portfolio_b = resample(portfolio, n_samples=N * B, random_state=42)
alphas = [alpha(group) for name, group in portfolio_b.groupby(np.arange(N * B) // N)]
alpha_bias = np.mean(alphas) - alpha_original
alpha_se = np.std(alphas, ddof=1)  # sample standard deviation, consistent with pandas' .std()
alpha_bootstrap = pd.DataFrame([[alpha_original, alpha_bias, alpha_se]],
                               columns=['original', 'bias', 'std. error'], index=['alpha'])
display(alpha_bootstrap)


# In[11]:

# Function to get simple linear regression coefficients for the Auto data set
def auto_coef(data, features, response):
    """
    data: pandas dataframe sampled from the Auto data set
    features: a string list of feature names
    response: a string with the response name
    """
    auto_reg = LinearRegression()
    auto_reg.fit(data[features], data[response])
    return [auto_reg.intercept_] + list(auto_reg.coef_)

features = ['horsepower']
response = 'mpg'
coef_original = pd.DataFrame([auto_coef(auto, features, response)],
                             columns=['Intercept'] + features, index=[''])
print("\nmpg ~ horsepower coefficients:\n\n", coef_original)


# In[12]:

# Bootstrap with B=1000 on the Auto data
N = auto.shape[0]
B = 1000
auto_b = resample(auto, n_samples=N * B, random_state=42)
coefs = [auto_coef(group, features, response)
         for name, group in auto_b.groupby(np.arange(N * B) // N)]
coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)
coef_bias = coefs_df.mean() - coef_original
coef_se = coefs_df.std()
coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)
coef_bootstrap.columns = ['original', 'bias', 'std. error']
display(coef_bootstrap)
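# The stored bootstrap replicates support more than a bias and standard error
# estimate: the empirical 2.5% and 97.5% quantiles of the B estimates give a
# simple percentile confidence interval. Below is a minimal sketch using the
# `alphas` list from the Portfolio bootstrap above; it is an addition to the
# lab, not part of the original ISLR code.

# In[ ]:

# 95% percentile bootstrap confidence interval for alpha
ci_lo, ci_hi = np.percentile(alphas, [2.5, 97.5])
print("alpha = {:.4f}, 95% percentile CI = [{:.4f}, {:.4f}]".format(alpha_original, ci_lo, ci_hi))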
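# In R, the ISLR lab compares these bootstrap standard errors with the
# formula-based ones reported by summary(lm(mpg ~ horsepower)). The cell below
# sketches the same comparison with statsmodels; statsmodels is an assumed
# extra dependency, not imported in the configuration cell above.

# In[ ]:

import statsmodels.api as sm

# Formula-based coefficient standard errors for mpg ~ horsepower
ols_fit = sm.OLS(auto['mpg'], sm.add_constant(auto['horsepower'])).fit()
print(ols_fit.bse)  # compare with the 'std. error' column of coef_bootstrap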