library(ISLR)
names(Hitters)
dim(Hitters)
sum(is.na(Hitters$Salary))
Hitters=na.omit(Hitters)
dim(Hitters)
sum(is.na(Hitters))
library(leaps)
regfit.full=regsubsets(Salary~.,Hitters)
summary(regfit.full)
Subset selection object Call: regsubsets.formula(Salary ~ ., Hitters) 19 Variables (and intercept) Forced in Forced out AtBat FALSE FALSE Hits FALSE FALSE HmRun FALSE FALSE Runs FALSE FALSE RBI FALSE FALSE Walks FALSE FALSE Years FALSE FALSE CAtBat FALSE FALSE CHits FALSE FALSE CHmRun FALSE FALSE CRuns FALSE FALSE CRBI FALSE FALSE CWalks FALSE FALSE LeagueN FALSE FALSE DivisionW FALSE FALSE PutOuts FALSE FALSE Assists FALSE FALSE Errors FALSE FALSE NewLeagueN FALSE FALSE 1 subsets of each size up to 8 Selection Algorithm: exhaustive AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " " 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " " CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN 1 ( 1 ) " " " " " " " " " " " " " " 2 ( 1 ) " " " " " " " " " " " " " " 3 ( 1 ) " " " " " " "*" " " " " " " 4 ( 1 ) " " " " "*" "*" " " " " " " 5 ( 1 ) " " " " "*" "*" " " " " " " 6 ( 1 ) " " " " "*" "*" " " " " " " 7 ( 1 ) " " " " "*" "*" " " " " " " 8 ( 1 ) "*" " " "*" "*" " " " " " "
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
reg.summary$rsq
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")
coef(regfit.full,6)
# Forward and Backward Stepwise Selection
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
Subset selection object Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward") 19 Variables (and intercept) Forced in Forced out AtBat FALSE FALSE Hits FALSE FALSE HmRun FALSE FALSE Runs FALSE FALSE RBI FALSE FALSE Walks FALSE FALSE Years FALSE FALSE CAtBat FALSE FALSE CHits FALSE FALSE CHmRun FALSE FALSE CRuns FALSE FALSE CRBI FALSE FALSE CWalks FALSE FALSE LeagueN FALSE FALSE DivisionW FALSE FALSE PutOuts FALSE FALSE Assists FALSE FALSE Errors FALSE FALSE NewLeagueN FALSE FALSE 1 subsets of each size up to 19 Selection Algorithm: forward AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*" 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*" 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*" 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*" 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*" 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN 1 ( 1 ) " " " " " " " " " " " " " " 2 ( 1 ) " " " " " " " " " " " " " " 3 ( 1 ) " " " " " " "*" " " " " " " 4 ( 1 ) " " " " "*" "*" " " " " " " 5 ( 1 ) " " " " "*" "*" " " " " " " 6 ( 1 ) " " " " "*" "*" " " " " " " 7 ( 1 ) "*" " " "*" "*" " " " " " " 8 ( 1 ) "*" " " "*" "*" " " " " " " 9 ( 1 ) "*" " " "*" "*" " " " " " " 10 ( 1 ) "*" " " "*" "*" "*" " " " " 11 ( 1 ) "*" "*" "*" "*" "*" " " " " 12 ( 1 ) "*" "*" "*" "*" "*" " " " " 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " 15 ( 1 ) "*" "*" "*" "*" "*" "*" " " 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
summary(regfit.bwd)
Subset selection object Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward") 19 Variables (and intercept) Forced in Forced out AtBat FALSE FALSE Hits FALSE FALSE HmRun FALSE FALSE Runs FALSE FALSE RBI FALSE FALSE Walks FALSE FALSE Years FALSE FALSE CAtBat FALSE FALSE CHits FALSE FALSE CHmRun FALSE FALSE CRuns FALSE FALSE CRBI FALSE FALSE CWalks FALSE FALSE LeagueN FALSE FALSE DivisionW FALSE FALSE PutOuts FALSE FALSE Assists FALSE FALSE Errors FALSE FALSE NewLeagueN FALSE FALSE 1 subsets of each size up to 19 Selection Algorithm: backward AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " " 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " " 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " " 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " " 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*" 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*" 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*" 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*" 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*" 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN 1 ( 1 ) " " " " " " " " " " " " " " 2 ( 1 ) " " " " " " " " " " " " " " 3 ( 1 ) " " " " " " "*" " " " " " " 4 ( 1 ) " " " " " " "*" " " " " " " 5 ( 1 ) " " " " " " "*" " " " " " " 6 ( 1 ) " " " " "*" "*" " " " " " " 7 ( 1 ) "*" " " "*" "*" " " " " " " 8 ( 1 ) "*" " " "*" "*" " " " " " " 9 ( 1 ) "*" " " "*" "*" " " " " " " 10 ( 1 ) "*" " " "*" "*" "*" " " " " 11 ( 1 ) "*" "*" "*" "*" "*" " " " " 12 ( 1 ) "*" "*" "*" "*" "*" " " " " 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " 15 ( 1 ) "*" "*" "*" "*" "*" "*" " " 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
coef(regfit.full,7)
coef(regfit.fwd,7)
coef(regfit.bwd,7)
# Choosing Among Models
set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for(i in 1:19){
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}
val.errors
which.min(val.errors)
coef(regfit.best,10)
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,10)
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=TRUE)
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
for(j in 1:k){
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19){
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
}
}
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
reg.best=regsubsets(Salary~.,data=Hitters, nvmax=19)
coef(reg.best,11)
# Chapter 6 Lab 2: Ridge Regression and the Lasso
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary
# Ridge Regression
library(glmnet)
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
Loading required package: Matrix Loading required package: foreach Loaded glmnet 2.0-5
dim(coef(ridge.mod))
ridge.mod$lambda[50]
coef(ridge.mod)[,50]
sqrt(sum(coef(ridge.mod)[-1,50]^2))
ridge.mod$lambda[60]
coef(ridge.mod)[,60]
sqrt(sum(coef(ridge.mod)[-1,60]^2))
predict(ridge.mod,s=50,type="coefficients")[1:20,]
set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)
mean((mean(y[train])-y.test)^2)
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2)
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T)
mean((ridge.pred-y.test)^2)
lm(y~x, subset=train)
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:20,]
Call: lm(formula = y ~ x, subset = train) Coefficients: (Intercept) xAtBat xHits xHmRun xRuns xRBI 299.42849 -2.54027 8.36682 11.64512 -9.09923 2.44105 xWalks xYears xCAtBat xCHits xCHmRun xCRuns 9.23440 -22.93673 -0.18154 -0.11598 -1.33888 3.32838 xCRBI xCWalks xLeagueN xDivisionW xPutOuts xAssists 0.07536 -1.07841 59.76065 -98.86233 0.34087 0.34165 xErrors xNewLeagueN -0.64207 -0.67442
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]
# The Lasso
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
lasso.coef[lasso.coef!=0]
# Chapter 6 Lab 3: PCR and PLS Regression
# Principal Components Regression
library(pls)
set.seed(2)
pcr.fit=pcr(Salary~., data=Hitters,scale=TRUE,validation="CV")
summary(pcr.fit)
Attaching package: ‘pls’ The following object is masked from ‘package:stats’: loadings
Data: X dimension: 263 19 Y dimension: 263 1 Fit method: svdpc Number of components considered: 19 VALIDATION: RMSEP Cross-validated using 10 random segments. (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps CV 452 348.9 352.2 353.5 352.8 350.1 349.1 adjCV 452 348.7 351.8 352.9 352.1 349.3 348.0 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps CV 349.6 350.9 352.9 353.8 355.0 356.2 363.5 adjCV 348.5 349.8 351.6 352.3 353.4 354.5 361.6 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps CV 355.2 357.4 347.6 350.1 349.2 352.6 adjCV 352.8 355.2 345.5 347.6 346.7 349.8 TRAINING: % variance explained 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps X 38.31 60.16 70.84 79.03 84.29 88.63 92.26 94.96 Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69 46.75 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps X 96.28 97.26 97.98 98.65 99.15 99.47 99.75 Salary 46.86 47.76 47.82 47.85 48.10 50.40 50.55 16 comps 17 comps 18 comps 19 comps X 99.89 99.97 99.99 100.00 Salary 53.01 53.85 54.61 54.61
validationplot(pcr.fit,val.type="MSEP")
set.seed(1)
pcr.fit=pcr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.pred=predict(pcr.fit,x[test,],ncomp=7)
mean((pcr.pred-y.test)^2)
pcr.fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr.fit)
Data: X dimension: 263 19 Y dimension: 263 1 Fit method: svdpc Number of components considered: 7 TRAINING: % variance explained 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps X 38.31 60.16 70.84 79.03 84.29 88.63 92.26 y 40.63 41.58 42.17 43.22 44.90 46.48 46.69