In [1]:
library(ISLR)
# Inspect the Hitters data, then discard every row that has a missing value.
names(Hitters)
dim(Hitters)
# Number of players whose Salary is missing.
sum(is.na(Hitters$Salary))
Hitters <- na.omit(Hitters)
# Check the reduced dimensions and that no NA remains anywhere in the table.
dim(Hitters)
sum(is.na(Hitters))
  1. 'AtBat'
  2. 'Hits'
  3. 'HmRun'
  4. 'Runs'
  5. 'RBI'
  6. 'Walks'
  7. 'Years'
  8. 'CAtBat'
  9. 'CHits'
  10. 'CHmRun'
  11. 'CRuns'
  12. 'CRBI'
  13. 'CWalks'
  14. 'League'
  15. 'Division'
  16. 'PutOuts'
  17. 'Assists'
  18. 'Errors'
  19. 'Salary'
  20. 'NewLeague'
  1. 322
  2. 20
59
  1. 263
  2. 20
0
In [2]:
library(leaps)
# Subset selection of Salary on all remaining columns with regsubsets()
# defaults; the summary marks the variables chosen for each model size.
regfit.full <- regsubsets(Salary ~ ., Hitters)
summary(regfit.full)
Subset selection object
Call: regsubsets.formula(Salary ~ ., Hitters)
19 Variables  (and intercept)
           Forced in Forced out
AtBat          FALSE      FALSE
Hits           FALSE      FALSE
HmRun          FALSE      FALSE
Runs           FALSE      FALSE
RBI            FALSE      FALSE
Walks          FALSE      FALSE
Years          FALSE      FALSE
CAtBat         FALSE      FALSE
CHits          FALSE      FALSE
CHmRun         FALSE      FALSE
CRuns          FALSE      FALSE
CRBI           FALSE      FALSE
CWalks         FALSE      FALSE
LeagueN        FALSE      FALSE
DivisionW      FALSE      FALSE
PutOuts        FALSE      FALSE
Assists        FALSE      FALSE
Errors         FALSE      FALSE
NewLeagueN     FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: exhaustive
         AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
         CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
4  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
5  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
7  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "       
In [3]:
# Refit with nvmax = 19 so that models of every size are considered, keep the
# summary object, list its components and show R^2 for each model size.
regfit.full <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary <- summary(regfit.full)
names(reg.summary)
reg.summary$rsq
  1. 'which'
  2. 'rsq'
  3. 'rss'
  4. 'adjr2'
  5. 'cp'
  6. 'bic'
  7. 'outmat'
  8. 'obj'
  1. 0.321450088667894
  2. 0.425223746466778
  3. 0.451429415717957
  4. 0.475406653394787
  5. 0.490803615963518
  6. 0.508714557358996
  7. 0.514122682361273
  8. 0.528556860302574
  9. 0.534612447792431
  10. 0.540494950940798
  11. 0.54261532542542
  12. 0.543630208632163
  13. 0.544457014231977
  14. 0.545216356301489
  15. 0.545469230681164
  16. 0.54576555748528
  17. 0.545951808076699
  18. 0.546094522341906
  19. 0.546115861912532
In [4]:
# Plot RSS, adjusted R^2, Cp and BIC against the number of variables in a
# 2x2 grid, marking the best model size under each criterion with a red dot.
#
# Each optimum is computed once and reused for the marker, so the highlighted
# point always agrees with the data (the positions used to be hard-coded as
# 11, 10 and 6). Printing the index keeps the cell output unchanged.
par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq",
     type = "l")
best.adjr2 <- which.max(reg.summary$adjr2)
best.adjr2
points(best.adjr2, reg.summary$adjr2[best.adjr2], col = "red", cex = 2,
       pch = 20)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
best.cp <- which.min(reg.summary$cp)
best.cp
points(best.cp, reg.summary$cp[best.cp], col = "red", cex = 2, pch = 20)
best.bic <- which.min(reg.summary$bic)
best.bic
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
points(best.bic, reg.summary$bic[best.bic], col = "red", cex = 2, pch = 20)
11
10
6
In [5]:
# plot() method of the regsubsets fit, once per scale: R^2, adjusted R^2,
# Cp and BIC.
plot(regfit.full, scale = "r2")
In [6]:
plot(regfit.full, scale = "adjr2")
In [7]:
plot(regfit.full, scale = "Cp")
In [8]:
plot(regfit.full, scale = "bic")
In [9]:
# Coefficients of the 6-variable model (the size that minimises BIC above).
coef(regfit.full, 6)
(Intercept)
91.5117981171539
AtBat
-1.86858923135561
Hits
7.60439763117204
Walks
3.69764677424139
CRBI
0.643016935097332
DivisionW
-122.95153377292
PutOuts
0.264307605461511
In [10]:
# Forward and Backward Stepwise Selection

# Forward stepwise search over models with up to 19 variables.
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19,
                         method = "forward")
summary(regfit.fwd)
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
19 Variables  (and intercept)
           Forced in Forced out
AtBat          FALSE      FALSE
Hits           FALSE      FALSE
HmRun          FALSE      FALSE
Runs           FALSE      FALSE
RBI            FALSE      FALSE
Walks          FALSE      FALSE
Years          FALSE      FALSE
CAtBat         FALSE      FALSE
CHits          FALSE      FALSE
CHmRun         FALSE      FALSE
CRuns          FALSE      FALSE
CRBI           FALSE      FALSE
CWalks         FALSE      FALSE
LeagueN        FALSE      FALSE
DivisionW      FALSE      FALSE
PutOuts        FALSE      FALSE
Assists        FALSE      FALSE
Errors         FALSE      FALSE
NewLeagueN     FALSE      FALSE
1 subsets of each size up to 19
Selection Algorithm: forward
          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
4  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
5  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
In [11]:
# Backward stepwise search over models with up to 19 variables.
regfit.bwd <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19,
                         method = "backward")
summary(regfit.bwd)
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
19 Variables  (and intercept)
           Forced in Forced out
AtBat          FALSE      FALSE
Hits           FALSE      FALSE
HmRun          FALSE      FALSE
Runs           FALSE      FALSE
RBI            FALSE      FALSE
Walks          FALSE      FALSE
Years          FALSE      FALSE
CAtBat         FALSE      FALSE
CHits          FALSE      FALSE
CHmRun         FALSE      FALSE
CRuns          FALSE      FALSE
CRBI           FALSE      FALSE
CWalks         FALSE      FALSE
LeagueN        FALSE      FALSE
DivisionW      FALSE      FALSE
PutOuts        FALSE      FALSE
Assists        FALSE      FALSE
Errors         FALSE      FALSE
NewLeagueN     FALSE      FALSE
1 subsets of each size up to 19
Selection Algorithm: backward
          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
4  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
5  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
4  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
5  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
In [12]:
# Seven-variable model from the exhaustive search.
coef(regfit.full, 7)
(Intercept)
79.4509472435319
Hits
1.28335125463328
Walks
3.22742638972714
CAtBat
-0.375235019565453
CHits
1.49570730989869
CHmRun
1.44205381889666
DivisionW
-129.986643165116
PutOuts
0.236681317525621
In [13]:
# Seven-variable model from forward stepwise selection.
coef(regfit.fwd, 7)
(Intercept)
109.787306240078
AtBat
-1.95888512315655
Hits
7.44987721841218
Walks
4.91314009272066
CRBI
0.853762209061578
CWalks
-0.305307000381738
DivisionW
-127.122392777488
PutOuts
0.253340431147386
In [14]:
# Seven-variable model from backward stepwise selection.
coef(regfit.bwd, 7)
(Intercept)
105.648748778828
AtBat
-1.97628381026618
Hits
6.75749144281392
Walks
6.05586912154598
CRuns
1.12930946903271
CWalks
-0.71633458662198
DivisionW
-116.16921690431
PutOuts
0.302884743257576
In [15]:
# Choosing Among Models

# Validation-set approach: split the rows at random into a training and a
# test part, run subset selection on the training rows only, and compute the
# test-set mean squared error of the selected model of each size.
set.seed(1)
# `replace` is written out in full; `rep=` only worked through partial
# argument matching.
train <- sample(c(TRUE, FALSE), nrow(Hitters), replace = TRUE)
test <- !train
regfit.best <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19)
# Design matrix of the held-out rows, used to form predictions by hand.
test.mat <- model.matrix(Salary ~ ., data = Hitters[test, ])
val.errors <- rep(NA, 19)
# The loop bound follows the length of val.errors so the two cannot disagree.
for (i in seq_along(val.errors)) {
  coefi <- coef(regfit.best, id = i)
  # Pick the design-matrix columns that belong to this model's coefficients.
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Hitters$Salary[test] - pred)^2)
}
val.errors
  1. 220968.010570781
  2. 169157.080938694
  3. 178518.155945132
  4. 163426.079195481
  5. 168418.096758255
  6. 171270.594721366
  7. 162377.082699745
  8. 157909.280553932
  9. 154055.716825013
  10. 148162.052539399
  11. 151156.401317955
  12. 151742.451908185
  13. 152214.450874715
  14. 157358.654080678
  15. 158541.407297311
  16. 158743.3209425
  17. 159972.673927589
  18. 159859.793366072
  19. 160105.640513861
In [16]:
# Model size with the smallest validation-set error.
which.min( val.errors )
10
In [17]:
# Ten-variable model as fitted on the training rows.
coef(regfit.best, 10)
(Intercept)
-80.2751498545584
AtBat
-1.46838155352597
Hits
7.16253138932867
Walks
3.64303449365111
CAtBat
-0.185569803428967
CHits
1.10532384927797
CHmRun
1.38448634401015
CWalks
-0.74831695417675
LeagueN
84.5576102976496
DivisionW
-53.0289658320235
PutOuts
0.238166218963617
In [18]:
# predict() method for regsubsets fits.
#
# Arguments:
#   object  - fitted object whose stored call has the model formula as its
#             first argument (read from object$call[[2]]).
#   newdata - data frame with the predictor columns. The response column is
#             no longer required, so genuinely new observations can be scored.
#   id      - which model to use; passed on to coef().
#   ...     - accepted for compatibility with the predict() generic; unused.
#
# Returns a one-column matrix of predictions, one row per row of the design
# matrix built from newdata (a single-row newdata still yields a 1x1 matrix).
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  # Expand any `.` against newdata and drop the response, so model.matrix()
  # only looks up predictor columns.
  rhs <- delete.response(terms(form, data = newdata))
  mat <- model.matrix(rhs, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  # drop = FALSE keeps the matrix shape even when newdata has one row.
  mat[, xvars, drop = FALSE] %*% coefi
}
# Refit the subset search on all rows of Hitters rather than the training
# split, then show the ten-variable model from that fit.
regfit.best <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
In [19]:
coef(regfit.best, 10)
(Intercept)
162.535442040545
AtBat
-2.16865005340437
Hits
6.91801749476659
Walks
5.7732246470264
CAtBat
-0.130079783913646
CRuns
1.40824900877044
CRBI
0.774312153250204
CWalks
-0.830826351501743
DivisionW
-112.380057493543
PutOuts
0.297372596603942
Assists
0.283168028635278
In [20]:
# k-fold cross-validation of the subset-selection procedure.
# cv.errors[j, i] is the test MSE on fold j of the i-variable model that was
# selected using every other fold.
k <- 10
set.seed(1)
# Assign each row a fold label drawn from 1..k.
folds <- sample(1:k, nrow(Hitters), replace = TRUE)
cv.errors <- matrix(NA, k, 19, dimnames = list(NULL, paste(1:19)))
for (j in 1:k) {
  best.fit <- regsubsets(Salary ~ ., data = Hitters[folds != j, ], nvmax = 19)
  for (i in 1:19) {
    # Uses the predict.regsubsets() method defined earlier in this file.
    pred <- predict(best.fit, Hitters[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2)
  }
}
# Average over the folds to get one CV error per model size.
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
1
160093.486398318
2
140196.773091529
3
153116.968353284
4
151159.338649923
5
146841.301803275
6
138302.632366087
7
144346.225347738
8
130207.693625525
9
129459.613710927
10
125334.660569683
11
125153.816649585
12
128273.547905646
13
133461.030824061
14
133974.608481698
15
131825.723993328
16
131882.769468209
17
132750.868578319
18
133096.230787088
19
132804.718368856
In [21]:
# Back to a single plotting panel, plot the CV error curve, then run the
# subset search on the full data.
par(mfrow = c(1, 1))
plot(mean.cv.errors, type = "b")
reg.best <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
In [22]:
# Eleven-variable model from the full-data fit.
coef(reg.best, 11)
(Intercept)
135.751219457458
AtBat
-2.12774819603869
Hits
6.92369939666552
Walks
5.62027551957861
CAtBat
-0.138991434861849
CRuns
1.45533103037942
CRBI
0.785252782049429
CWalks
-0.822855923951907
LeagueN
43.1116151638512
DivisionW
-111.14602518374
PutOuts
0.289408680295373
Assists
0.26882769003494
In [23]:
# Chapter 6 Lab 2: Ridge Regression and the Lasso

# glmnet() takes a numeric predictor matrix and a response vector rather
# than a formula; [, -1] removes the intercept column that model.matrix()
# puts first.
x <- model.matrix(Salary ~ ., Hitters)[, -1]
y <- Hitters$Salary
In [24]:
# Ridge Regression

library(glmnet)
# 100 lambda values from 10^10 down to 10^-2. `length.out` is written out in
# full; `length=` only worked through partial argument matching.
grid <- 10^seq(10, -2, length.out = 100)
# alpha = 0 selects the ridge penalty.
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid)
Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-5

In [25]:
# Coefficient matrix: one row per coefficient, one column per lambda.
dim(coef(ridge.mod))
# The 50th lambda value stored in the fit.
ridge.mod$lambda[50]
  1. 20
  2. 100
11497.5699539774
In [26]:
# Coefficient column for the 50th lambda.
coef(ridge.mod)[, 50]
(Intercept)
407.356050200416
AtBat
0.0369571817501359
Hits
0.138180343807892
HmRun
0.524629975886911
Runs
0.230701522621179
RBI
0.239841458504058
Walks
0.289618741049884
Years
1.10770292908555
CAtBat
0.00313181522151328
CHits
0.0116536373557531
CHmRun
0.0875456697555949
CRuns
0.0233798823693758
CRBI
0.0241383203685686
CWalks
0.0250154205993732
LeagueN
0.0850281135625444
DivisionW
-6.21544097273146
PutOuts
0.0164825767604547
Assists
0.00261298804528183
Errors
-0.0205026903654579
NewLeagueN
0.301433531372699
In [27]:
# Euclidean norm of the 50th coefficient column, intercept row excluded.
sqrt(sum(coef(ridge.mod)[-1, 50]^2))
6.36061242142791
In [28]:
# The 60th lambda value stored in the fit.
ridge.mod$lambda[60]
705.480231071865
In [29]:
# Coefficient column for the 60th lambda.
coef(ridge.mod)[, 60]
(Intercept)
54.3251995018372
AtBat
0.112111145878249
Hits
0.656224085323628
HmRun
1.17980909638777
Runs
0.937697128927054
RBI
0.847185458771521
Walks
1.31987948048781
Years
2.59640424574253
CAtBat
0.0108341254432856
CHits
0.0467455700054452
CHmRun
0.337773183143353
CRuns
0.0935552830000676
CRBI
0.0978040232271687
CWalks
0.0718961166304866
LeagueN
13.6837019095343
DivisionW
-54.658777504592
PutOuts
0.118522894134745
Assists
0.01606037317599
Errors
-0.703586547290985
NewLeagueN
8.61181213448926
In [30]:
# Euclidean norm of the 60th coefficient column, intercept row excluded.
sqrt(sum(coef(ridge.mod)[-1, 60]^2))
57.110014262533
In [31]:
# Ridge coefficients requested at penalty value s = 50 (first 20 rows).
predict(ridge.mod, s = 50, type = "coefficients")[1:20, ]
(Intercept)
48.7661032921608
AtBat
-0.358099859376738
Hits
1.96935928646357
HmRun
-1.27824798145678
Runs
1.14589163211962
RBI
0.803829228437672
Walks
2.71618579623371
Years
-6.21831921727865
CAtBat
0.00544783719814918
CHits
0.10648951402342
CHmRun
0.624485956082661
CRuns
0.221498463760022
CRBI
0.218691380321248
CWalks
-0.150024548516927
LeagueN
45.9258855144158
DivisionW
-118.201136816368
PutOuts
0.250232154092559
Assists
0.121566461346767
Errors
-3.27859954463555
NewLeagueN
-9.4966803100264
In [32]:
# Split the rows in half at random: `train` holds row indices, `test` their
# negation, so x[test, ] selects every row not in train.
set.seed(1)
train <- sample(1:nrow(x), nrow(x) / 2)
test <- -train
y.test <- y[test]
# Ridge fit on the training half, evaluated on the test half at lambda = 4.
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid,
                    thresh = 1e-12)
ridge.pred <- predict(ridge.mod, s = 4, newx = x[test, ])
mean((ridge.pred - y.test)^2)
# Baseline: test MSE when predicting the training mean for every test row.
mean((mean(y[train]) - y.test)^2)
101036.832669597
193253.113067991
In [33]:
# Test MSE of the ridge fit at a very large penalty (lambda = 1e10).
ridge.pred <- predict(ridge.mod, s = 1e10, newx = x[test, ])
mean((ridge.pred - y.test)^2)
193253.05679545
In [34]:
# Test MSE of the ridge fit at lambda = 0, i.e. with no penalty.
# s = 0 is not one of the values in `grid`, so exact = TRUE is used to get
# the solution at exactly that lambda instead of an interpolated one. Current
# glmnet releases need the original training data (x, y) for this refit, and
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
ridge.pred <- predict(ridge.mod, s = 0, newx = x[test, ], exact = TRUE,
                      x = x[train, ], y = y[train])
mean((ridge.pred - y.test)^2)
114783.077172766
In [35]:
# Least squares on the training rows, printed for comparison with the ridge
# coefficients at lambda = 0 that follow.
lm(y ~ x, subset = train)
# exact = TRUE (spelled out rather than `T`) needs the training x and y in
# current glmnet releases, because the model is refit at the requested lambda.
predict(ridge.mod, s = 0, exact = TRUE, type = "coefficients",
        x = x[train, ], y = y[train])[1:20, ]
Call:
lm(formula = y ~ x, subset = train)

Coefficients:
(Intercept)       xAtBat        xHits       xHmRun        xRuns         xRBI  
  299.42849     -2.54027      8.36682     11.64512     -9.09923      2.44105  
     xWalks       xYears      xCAtBat       xCHits      xCHmRun       xCRuns  
    9.23440    -22.93673     -0.18154     -0.11598     -1.33888      3.32838  
      xCRBI      xCWalks     xLeagueN   xDivisionW     xPutOuts     xAssists  
    0.07536     -1.07841     59.76065    -98.86233      0.34087      0.34165  
    xErrors  xNewLeagueN  
   -0.64207     -0.67442  
(Intercept)
299.428835955511
AtBat
-2.54014665070144
Hits
8.36611719216035
HmRun
11.6440071973906
Runs
-9.09877718514622
RBI
2.44152119069374
Walks
9.23403909103472
Years
-22.9358444150819
CAtBat
-0.181608426793676
CHits
-0.115614963792392
CHmRun
-1.33836534074527
CRuns
3.32817776511616
CRBI
0.0751177103859102
CWalks
-1.07828646608209
LeagueN
59.7652905857257
DivisionW
-98.8599658984281
PutOuts
0.340864001788611
Assists
0.341656051135429
Errors
-0.642058388019889
NewLeagueN
-0.676063138728381
In [36]:
# Cross-validate the ridge penalty on the training rows and plot the result.
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)
In [37]:
# Lambda with the smallest cross-validated error.
bestlam <- cv.out$lambda.min
bestlam
211.741584781282
In [38]:
# Test MSE of the training-set ridge fit at the cross-validated lambda.
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[test, ])
mean((ridge.pred - y.test)^2)
# Refit ridge on all rows and report its coefficients at that lambda.
out <- glmnet(x, y, alpha = 0)
predict(out, type = "coefficients", s = bestlam)[1:20, ]
96015.5127255327
(Intercept)
9.88487156523819
AtBat
0.0314399123075993
Hits
1.00882875071523
HmRun
0.139276236015288
Runs
1.11320780992504
RBI
0.873189900643772
Walks
1.80410229199678
Years
0.130743811114436
CAtBat
0.0111397797786966
CHits
0.0648984331610168
CHmRun
0.451585462080598
CRuns
0.12900049045775
CRBI
0.13737711633305
CWalks
0.0290857160383186
LeagueN
27.1822753486268
DivisionW
-91.6341129943135
PutOuts
0.191492519898057
Assists
0.0425453623726451
Errors
-1.81244470270312
NewLeagueN
7.2120838996523
In [39]:
# The Lasso

# Same workflow as for ridge, with alpha = 1 selecting the lasso penalty.
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)
In [40]:
# Cross-validate the lasso penalty on the training rows.
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
In [41]:
# Test MSE at the cross-validated lambda.
bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
mean((lasso.pred - y.test)^2)
# Refit on all rows; print every coefficient, then only the non-zero ones.
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)[1:20, ]
lasso.coef
lasso.coef[lasso.coef != 0]
100743.446221539
(Intercept)
18.5394843700404
AtBat
0
Hits
1.87353897868797
HmRun
0
Runs
0
RBI
0
Walks
2.21784439442808
Years
0
CAtBat
0
CHits
0
CHmRun
0
CRuns
0.207125172948793
CRBI
0.413013208899728
CWalks
0
LeagueN
3.26666772880497
DivisionW
-103.484545814138
PutOuts
0.220428413476128
Assists
0
Errors
0
NewLeagueN
0
(Intercept)
18.5394843700404
Hits
1.87353897868797
Walks
2.21784439442808
CRuns
0.207125172948793
CRBI
0.413013208899728
LeagueN
3.26666772880497
DivisionW
-103.484545814138
PutOuts
0.220428413476128
In [42]:
# Chapter 6 Lab 3: PCR and PLS Regression

# Principal Components Regression

library(pls)
set.seed(2)
# Fit on all rows with scale = TRUE and cross-validation switched on.
pcr.fit <- pcr(Salary ~ ., data = Hitters, scale = TRUE, validation = "CV")
summary(pcr.fit)
Attaching package: ‘pls’

The following object is masked from ‘package:stats’:

    loadings

Data: 	X dimension: 263 19 
	Y dimension: 263 1
Fit method: svdpc
Number of components considered: 19

VALIDATION: RMSEP
Cross-validated using 10 random segments.
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
CV             452    348.9    352.2    353.5    352.8    350.1    349.1
adjCV          452    348.7    351.8    352.9    352.1    349.3    348.0
       7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
CV       349.6    350.9    352.9     353.8     355.0     356.2     363.5
adjCV    348.5    349.8    351.6     352.3     353.4     354.5     361.6
       14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
CV        355.2     357.4     347.6     350.1     349.2     352.6
adjCV     352.8     355.2     345.5     347.6     346.7     349.8

TRAINING: % variance explained
        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
X         38.31    60.16    70.84    79.03    84.29    88.63    92.26    94.96
Salary    40.63    41.58    42.17    43.22    44.90    46.48    46.69    46.75
        9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
X         96.28     97.26     97.98     98.65     99.15     99.47     99.75
Salary    46.86     47.76     47.82     47.85     48.10     50.40     50.55
        16 comps  17 comps  18 comps  19 comps
X          99.89     99.97     99.99    100.00
Salary     53.01     53.85     54.61     54.61
In [43]:
# Cross-validated MSEP curve of the full-data PCR fit.
validationplot(pcr.fit, val.type = "MSEP")
In [44]:
# Repeat the fit on the training rows only and plot its MSEP curve.
set.seed(1)
pcr.fit <- pcr(Salary ~ ., data = Hitters, subset = train, scale = TRUE,
               validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")
In [45]:
# Test MSE of the training-set fit when 7 components are used.
pcr.pred <- predict(pcr.fit, x[test, ], ncomp = 7)
mean((pcr.pred - y.test)^2)
96556.2191395342
In [46]:
# Final PCR fit on all rows with 7 components.
pcr.fit <- pcr(y ~ x, scale = TRUE, ncomp = 7)
summary(pcr.fit)
Data: 	X dimension: 263 19 
	Y dimension: 263 1
Fit method: svdpc
Number of components considered: 7
TRAINING: % variance explained
   1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
X    38.31    60.16    70.84    79.03    84.29    88.63    92.26
y    40.63    41.58    42.17    43.22    44.90    46.48    46.69