# Standard imports
import warnings
# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# scikit-learn
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.utils import resample
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpl.style.use('ggplot')
# The Auto data set ships with the R ISLR package; pull it in via rpy2.
islr = importr('ISLR')
auto_r_dataframe = rdata(islr).fetch('Auto')['Auto']
# Convert the R data.frame into a pandas DataFrame.
auto = pandas2ri.ri2py(auto_r_dataframe)
display(auto)
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
1 | 18.0 | 8.0 | 307.0 | 130.0 | 3504.0 | 12.0 | 70.0 | 1.0 | chevrolet chevelle malibu |
2 | 15.0 | 8.0 | 350.0 | 165.0 | 3693.0 | 11.5 | 70.0 | 1.0 | buick skylark 320 |
3 | 18.0 | 8.0 | 318.0 | 150.0 | 3436.0 | 11.0 | 70.0 | 1.0 | plymouth satellite |
4 | 16.0 | 8.0 | 304.0 | 150.0 | 3433.0 | 12.0 | 70.0 | 1.0 | amc rebel sst |
5 | 17.0 | 8.0 | 302.0 | 140.0 | 3449.0 | 10.5 | 70.0 | 1.0 | ford torino |
6 | 15.0 | 8.0 | 429.0 | 198.0 | 4341.0 | 10.0 | 70.0 | 1.0 | ford galaxie 500 |
7 | 14.0 | 8.0 | 454.0 | 220.0 | 4354.0 | 9.0 | 70.0 | 1.0 | chevrolet impala |
8 | 14.0 | 8.0 | 440.0 | 215.0 | 4312.0 | 8.5 | 70.0 | 1.0 | plymouth fury iii |
9 | 14.0 | 8.0 | 455.0 | 225.0 | 4425.0 | 10.0 | 70.0 | 1.0 | pontiac catalina |
10 | 15.0 | 8.0 | 390.0 | 190.0 | 3850.0 | 8.5 | 70.0 | 1.0 | amc ambassador dpl |
11 | 15.0 | 8.0 | 383.0 | 170.0 | 3563.0 | 10.0 | 70.0 | 1.0 | dodge challenger se |
12 | 14.0 | 8.0 | 340.0 | 160.0 | 3609.0 | 8.0 | 70.0 | 1.0 | plymouth 'cuda 340 |
13 | 15.0 | 8.0 | 400.0 | 150.0 | 3761.0 | 9.5 | 70.0 | 1.0 | chevrolet monte carlo |
14 | 14.0 | 8.0 | 455.0 | 225.0 | 3086.0 | 10.0 | 70.0 | 1.0 | buick estate wagon (sw) |
15 | 24.0 | 4.0 | 113.0 | 95.0 | 2372.0 | 15.0 | 70.0 | 3.0 | toyota corona mark ii |
16 | 22.0 | 6.0 | 198.0 | 95.0 | 2833.0 | 15.5 | 70.0 | 1.0 | plymouth duster |
17 | 18.0 | 6.0 | 199.0 | 97.0 | 2774.0 | 15.5 | 70.0 | 1.0 | amc hornet |
18 | 21.0 | 6.0 | 200.0 | 85.0 | 2587.0 | 16.0 | 70.0 | 1.0 | ford maverick |
19 | 27.0 | 4.0 | 97.0 | 88.0 | 2130.0 | 14.5 | 70.0 | 3.0 | datsun pl510 |
20 | 26.0 | 4.0 | 97.0 | 46.0 | 1835.0 | 20.5 | 70.0 | 2.0 | volkswagen 1131 deluxe sedan |
21 | 25.0 | 4.0 | 110.0 | 87.0 | 2672.0 | 17.5 | 70.0 | 2.0 | peugeot 504 |
22 | 24.0 | 4.0 | 107.0 | 90.0 | 2430.0 | 14.5 | 70.0 | 2.0 | audi 100 ls |
23 | 25.0 | 4.0 | 104.0 | 95.0 | 2375.0 | 17.5 | 70.0 | 2.0 | saab 99e |
24 | 26.0 | 4.0 | 121.0 | 113.0 | 2234.0 | 12.5 | 70.0 | 2.0 | bmw 2002 |
25 | 21.0 | 6.0 | 199.0 | 90.0 | 2648.0 | 15.0 | 70.0 | 1.0 | amc gremlin |
26 | 10.0 | 8.0 | 360.0 | 215.0 | 4615.0 | 14.0 | 70.0 | 1.0 | ford f250 |
27 | 10.0 | 8.0 | 307.0 | 200.0 | 4376.0 | 15.0 | 70.0 | 1.0 | chevy c20 |
28 | 11.0 | 8.0 | 318.0 | 210.0 | 4382.0 | 13.5 | 70.0 | 1.0 | dodge d200 |
29 | 9.0 | 8.0 | 304.0 | 193.0 | 4732.0 | 18.5 | 70.0 | 1.0 | hi 1200d |
30 | 27.0 | 4.0 | 97.0 | 88.0 | 2130.0 | 14.5 | 71.0 | 3.0 | datsun pl510 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
368 | 28.0 | 4.0 | 112.0 | 88.0 | 2605.0 | 19.6 | 82.0 | 1.0 | chevrolet cavalier |
369 | 27.0 | 4.0 | 112.0 | 88.0 | 2640.0 | 18.6 | 82.0 | 1.0 | chevrolet cavalier wagon |
370 | 34.0 | 4.0 | 112.0 | 88.0 | 2395.0 | 18.0 | 82.0 | 1.0 | chevrolet cavalier 2-door |
371 | 31.0 | 4.0 | 112.0 | 85.0 | 2575.0 | 16.2 | 82.0 | 1.0 | pontiac j2000 se hatchback |
372 | 29.0 | 4.0 | 135.0 | 84.0 | 2525.0 | 16.0 | 82.0 | 1.0 | dodge aries se |
373 | 27.0 | 4.0 | 151.0 | 90.0 | 2735.0 | 18.0 | 82.0 | 1.0 | pontiac phoenix |
374 | 24.0 | 4.0 | 140.0 | 92.0 | 2865.0 | 16.4 | 82.0 | 1.0 | ford fairmont futura |
375 | 36.0 | 4.0 | 105.0 | 74.0 | 1980.0 | 15.3 | 82.0 | 2.0 | volkswagen rabbit l |
376 | 37.0 | 4.0 | 91.0 | 68.0 | 2025.0 | 18.2 | 82.0 | 3.0 | mazda glc custom l |
377 | 31.0 | 4.0 | 91.0 | 68.0 | 1970.0 | 17.6 | 82.0 | 3.0 | mazda glc custom |
378 | 38.0 | 4.0 | 105.0 | 63.0 | 2125.0 | 14.7 | 82.0 | 1.0 | plymouth horizon miser |
379 | 36.0 | 4.0 | 98.0 | 70.0 | 2125.0 | 17.3 | 82.0 | 1.0 | mercury lynx l |
380 | 36.0 | 4.0 | 120.0 | 88.0 | 2160.0 | 14.5 | 82.0 | 3.0 | nissan stanza xe |
381 | 36.0 | 4.0 | 107.0 | 75.0 | 2205.0 | 14.5 | 82.0 | 3.0 | honda accord |
382 | 34.0 | 4.0 | 108.0 | 70.0 | 2245.0 | 16.9 | 82.0 | 3.0 | toyota corolla |
383 | 38.0 | 4.0 | 91.0 | 67.0 | 1965.0 | 15.0 | 82.0 | 3.0 | honda civic |
384 | 32.0 | 4.0 | 91.0 | 67.0 | 1965.0 | 15.7 | 82.0 | 3.0 | honda civic (auto) |
385 | 38.0 | 4.0 | 91.0 | 67.0 | 1995.0 | 16.2 | 82.0 | 3.0 | datsun 310 gx |
386 | 25.0 | 6.0 | 181.0 | 110.0 | 2945.0 | 16.4 | 82.0 | 1.0 | buick century limited |
387 | 38.0 | 6.0 | 262.0 | 85.0 | 3015.0 | 17.0 | 82.0 | 1.0 | oldsmobile cutlass ciera (diesel) |
388 | 26.0 | 4.0 | 156.0 | 92.0 | 2585.0 | 14.5 | 82.0 | 1.0 | chrysler lebaron medallion |
389 | 22.0 | 6.0 | 232.0 | 112.0 | 2835.0 | 14.7 | 82.0 | 1.0 | ford granada l |
390 | 32.0 | 4.0 | 144.0 | 96.0 | 2665.0 | 13.9 | 82.0 | 3.0 | toyota celica gt |
391 | 36.0 | 4.0 | 135.0 | 84.0 | 2370.0 | 13.0 | 82.0 | 1.0 | dodge charger 2.2 |
392 | 27.0 | 4.0 | 151.0 | 90.0 | 2950.0 | 17.3 | 82.0 | 1.0 | chevrolet camaro |
393 | 27.0 | 4.0 | 140.0 | 86.0 | 2790.0 | 15.6 | 82.0 | 1.0 | ford mustang gl |
394 | 44.0 | 4.0 | 97.0 | 52.0 | 2130.0 | 24.6 | 82.0 | 2.0 | vw pickup |
395 | 32.0 | 4.0 | 135.0 | 84.0 | 2295.0 | 11.6 | 82.0 | 1.0 | dodge rampage |
396 | 28.0 | 4.0 | 120.0 | 79.0 | 2625.0 | 18.6 | 82.0 | 1.0 | ford ranger |
397 | 31.0 | 4.0 | 119.0 | 82.0 | 2720.0 | 19.4 | 82.0 | 1.0 | chevy s-10 |
392 rows × 9 columns
# Simple linear regression: predict mpg from horsepower.
features = ['horsepower']
response = ['mpg']
X = auto[features]
y = auto[response]
# Hold out 196 observations as a test set (validation-set approach).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=196, random_state=47)
# Fit ordinary least squares on the training half.
auto_slr = LinearRegression().fit(X_train, y_train)
# Report the test-set mean squared error.
y_pred = auto_slr.predict(X_test)
print("SLR MSE = ", mean_squared_error(y_test, y_pred))
SLR MSE = 25.273723993
# Quadratic fit: expand horsepower into degree-2 polynomial features.
poly2 = PolynomialFeatures(degree=2)
X2 = poly2.fit_transform(X)
# Same 196-observation hold-out split, same seed as the linear fit.
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=196, random_state=47)
# Fit on the training half and score on the held-out half.
auto_poly2 = LinearRegression().fit(X2_train, y_train)
y2_pred = auto_poly2.predict(X2_test)
print("Polynomial regression of degree 2: MSE = ", mean_squared_error(y_test, y2_pred))
Polynomial regression of degree 2: MSE = 18.8690031195
# Cubic fit: expand horsepower into degree-3 polynomial features.
poly3 = PolynomialFeatures(degree=3)
X3 = poly3.fit_transform(X)
# Same 196-observation hold-out split, same seed as the linear fit.
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=196, random_state=47)
# Fit on the training half and score on the held-out half.
auto_poly3 = LinearRegression().fit(X3_train, y_train)
y3_pred = auto_poly3.predict(X3_test)
print("Polynomial regression of degree 3: MSE = ", mean_squared_error(y_test, y3_pred))
Polynomial regression of degree 3: MSE = 18.8333669959
# Leave-one-out CV test error for polynomial degrees 1 (simple linear) to 5.
auto_poly = LinearRegression()
loocv = LeaveOneOut()
for degree in range(1, 6):
    print("\nPolynomial regression of degree {}:".format(degree))
    design = PolynomialFeatures(degree=degree).fit_transform(X)
    # cross_val_score returns *negative* MSE per fold; negate the mean
    # to recover the LOOCV MSE estimate.
    fold_scores = cross_val_score(auto_poly, design, y, cv=loocv, scoring='neg_mean_squared_error')
    print(' MSE = {}\n'.format(-fold_scores.mean()))
Polynomial regression of degree 1: MSE = 24.231513517929226 Polynomial regression of degree 2: MSE = 19.24821312448939 Polynomial regression of degree 3: MSE = 19.334984064114092 Polynomial regression of degree 4: MSE = 19.42443030854574 Polynomial regression of degree 5: MSE = 19.033219754727583
# 10-fold CV test error for polynomial degrees 1 (simple linear) to 10.
auto_poly = LinearRegression()
# FIX: random_state has no effect unless shuffle=True, and scikit-learn
# (>=0.24) raises a ValueError for this combination.  A seed was supplied,
# so shuffled folds were clearly intended.
kfold = KFold(n_splits=10, shuffle=True, random_state=47)
for poly_deg in range(1, 11):
    print("\nPolynomial regression of degree {}:".format(poly_deg))
    poly = PolynomialFeatures(degree=poly_deg)
    X_d = poly.fit_transform(X)
    scores = cross_val_score(auto_poly, X_d, y, cv=kfold, scoring='neg_mean_squared_error')
    # cross_val_score reports negative MSE; sign-flip to recover MSE.
    # (Renamed from loocv_mse: this loop uses k-fold CV, not LOOCV.)
    kfold_mse = scores.mean() * (-1)
    print(' MSE = {}\n'.format(kfold_mse))
Polynomial regression of degree 1: MSE = 27.439933652339857 Polynomial regression of degree 2: MSE = 21.235840055802118 Polynomial regression of degree 3: MSE = 21.3366061833284 Polynomial regression of degree 4: MSE = 21.353886987563506 Polynomial regression of degree 5: MSE = 20.905633737044845 Polynomial regression of degree 6: MSE = 20.782704427497574 Polynomial regression of degree 7: MSE = 20.953103378424892 Polynomial regression of degree 8: MSE = 21.07713162886134 Polynomial regression of degree 9: MSE = 21.036781313639857 Polynomial regression of degree 10: MSE = 20.98095645636944
# The Portfolio data set also lives in the R ISLR package.
# (Previous comment said "Auto"; this cell actually loads Portfolio.)
islr = importr('ISLR')
portfolio_r_dataframe = rdata(islr).fetch('Portfolio')['Portfolio']
# Convert the R data.frame into a pandas DataFrame.
portfolio = pandas2ri.ri2py(portfolio_r_dataframe)
display(portfolio)
X | Y | |
---|---|---|
1 | -0.895251 | -0.234924 |
2 | -1.562454 | -0.885176 |
3 | -0.417090 | 0.271888 |
4 | 1.044356 | -0.734198 |
5 | -0.315568 | 0.841983 |
6 | -1.737124 | -2.037191 |
7 | 1.966413 | 1.452957 |
8 | 2.152868 | -0.434139 |
9 | -0.081208 | 1.450809 |
10 | -0.891782 | 0.821016 |
11 | -0.293202 | -1.042391 |
12 | 0.505779 | 0.608478 |
13 | 0.526751 | -0.222493 |
14 | 1.066469 | 1.231357 |
15 | 0.294016 | 0.628589 |
16 | 0.042549 | -1.267574 |
17 | 1.830970 | -0.572752 |
18 | -0.326937 | -0.487472 |
19 | 0.521480 | 2.565985 |
20 | 1.399868 | -0.357836 |
21 | -0.645448 | -1.412431 |
22 | -0.904352 | -0.568305 |
23 | -1.764586 | -0.746273 |
24 | -1.810485 | 0.493747 |
25 | -1.169899 | -2.725281 |
26 | -0.685376 | -0.457616 |
27 | 1.090918 | 0.014495 |
28 | -0.432340 | -0.399831 |
29 | 0.268815 | -0.201608 |
30 | -0.851841 | -1.741829 |
... | ... | ... |
71 | -0.984357 | -1.139160 |
72 | -1.384992 | 0.702700 |
73 | -0.358843 | -1.694513 |
74 | -0.226618 | 0.801939 |
75 | -0.941077 | -0.733189 |
76 | 2.460336 | -0.048373 |
77 | 0.716797 | 0.602337 |
78 | -0.248087 | -1.018490 |
79 | 1.010773 | 0.052978 |
80 | 2.313049 | 1.752359 |
81 | 0.835180 | 0.985715 |
82 | -1.071903 | -1.247298 |
83 | -1.650526 | 0.215465 |
84 | -0.600486 | -0.420941 |
85 | -0.058529 | 0.127621 |
86 | 0.075727 | -0.522149 |
87 | -1.157832 | 0.590894 |
88 | 1.673606 | 0.114623 |
89 | -1.043988 | -0.418944 |
90 | 0.014687 | -0.558747 |
91 | 0.675322 | 1.482630 |
92 | 1.778342 | 0.942774 |
93 | -1.295764 | -1.085204 |
94 | 0.079602 | -0.539101 |
95 | 2.260858 | 0.673225 |
96 | 0.479091 | 1.454774 |
97 | -0.535020 | -0.399175 |
98 | -0.773129 | -0.957175 |
99 | 0.403634 | 1.396038 |
100 | -0.588496 | -0.497285 |
100 rows × 2 columns
# Function to compute the minimum-risk portfolio allocation alpha
def alpha(data):
    """
    data: pandas dataframe with two columns X and Y.

    Returns (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2*cov(X, Y)),
    the fraction invested in X that minimizes portfolio variance.
    """
    cov_mat = data.cov()  # 2x2 sample covariance matrix of X and Y
    var_x = cov_mat.loc['X', 'X']
    var_y = cov_mat.loc['Y', 'Y']
    cov_xy = cov_mat.loc['X', 'Y']
    denom = var_x + var_y - 2 * cov_xy
    return (var_y - cov_xy) / denom
# Point estimate of alpha from the full (original) Portfolio sample.
alpha_original = alpha(portfolio)
print("Portfolio alpha = ", alpha_original)
Portfolio alpha = 0.575832074593
# Bootstrap the alpha estimate with B=1000 resamples of the Portfolio data.
N = portfolio.shape[0]  # observations per bootstrap sample
B = 1000                # number of bootstrap replicates
# Draw all B*N rows with replacement at once, then slice into B
# contiguous samples of size N via an integer group key.
portfolio_b = resample(portfolio, n_samples=N*B, random_state=42)
alphas = [alpha(group) for _, group in portfolio_b.groupby(np.arange(N * B) // N)]
# Bootstrap estimates of bias and standard error.
alpha_bias = np.mean(alphas) - alpha_original
# FIX: use the sample standard deviation (ddof=1) — the bootstrap SE
# formula divides by B-1, and the Auto bootstrap below uses pandas
# .std(), which already defaults to ddof=1.  np.std defaults to ddof=0.
alpha_se = np.std(alphas, ddof=1)
alpha_bootstrap = pd.DataFrame([[alpha_original, alpha_bias, alpha_se],],
                               columns=['original', 'bias', 'std. error'], index=['alpha'])
display(alpha_bootstrap)
original | bias | std. error | |
---|---|---|---|
alpha | 0.575832 | 0.001963 | 0.089929 |
# Function that fits response ~ features by OLS and returns the coefficients
def auto_coef(data, features, response):
    """
    data: pandas dataframe sampled from the Auto data set
    features: a string list of feature names
    response: a string of response names

    Returns [intercept, coef_1, coef_2, ...] as a flat list.
    """
    model = LinearRegression()
    model.fit(data[features], data[response])
    return [model.intercept_, *model.coef_]
# Coefficients of mpg ~ horsepower on the full (original) Auto sample.
features = ['horsepower']
response = 'mpg'
coef_original = pd.DataFrame([auto_coef(auto, features, response)],
                             columns=['Intercept'] + features, index=[''])
print("\nmpg ~ horsepower coefficients:\n\n", coef_original)
mpg ~ horsepower coefficients: Intercept horsepower 39.935861 -0.157845
# Bootstrap the regression coefficients with B=1000 resamples of Auto.
N = auto.shape[0]  # observations per bootstrap sample
B = 1000           # number of bootstrap replicates
# One big resample of B*N rows, split into B samples of size N.
auto_b = resample(auto, n_samples=N*B, random_state=42)
samples = auto_b.groupby(np.arange(N * B) // N)
coefs = [auto_coef(sample, features, response) for _, sample in samples]
coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)
# Bootstrap estimates of bias and standard error for each coefficient.
coef_bias = coefs_df.mean() - coef_original
coef_se = coefs_df.std()
coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)
coef_bootstrap.columns = ['original', 'bias', 'std. error']
display(coef_bootstrap)
original | bias | std. error | |
---|---|---|---|
Intercept | 39.935861 | 0.033521 | 0.869087 |
horsepower | -0.157845 | -0.000500 | 0.007503 |