In [1]:
# Chapter 7 Lab: Non-linear Modeling

library(ISLR)
attach(Wage)

# Polynomial Regression and Step Functions

fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
EstimateStd. Errort valuePr(>|t|)
(Intercept)111.7036082 0.7287409153.2830150 0.0000000
poly(age, 4)14.470679e+023.991479e+011.120056e+011.484604e-28
poly(age, 4)2-4.783158e+02 3.991479e+01-1.198342e+01 2.355831e-32
poly(age, 4)31.255217e+023.991479e+013.144742e+001.678622e-03
poly(age, 4)4-77.91118099 39.91478506 -1.95193788 0.05103865
In [2]:
fit2=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit2))
EstimateStd. Errort valuePr(>|t|)
(Intercept)-1.841542e+02 6.004038e+01-3.067172e+00 2.180254e-03
poly(age, 4, raw = T)12.124552e+015.886748e+003.609042e+003.123618e-04
poly(age, 4, raw = T)2-0.563859313 0.206108256-2.735743451 0.006260645
poly(age, 4, raw = T)30.0068106880.0030659312.2214092200.026397752
poly(age, 4, raw = T)4-3.203830e-05 1.641359e-05-1.951938e+00 5.103865e-02
In [3]:
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
(Intercept)
-184.154179774634
age
21.2455205321355
I(age^2)
-0.56385931256058
I(age^3)
0.00681068771418498
I(age^4)
-3.20383037493206e-05
In [4]:
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
In [5]:
preds2=predict(fit2,newdata=list(age=age.grid),se=TRUE)
max(abs(preds$fit-preds2$fit))
1.83320025826106e-12
In [6]:
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
In [7]:
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
Res.DfRSSDfSum of SqFPr(>F)
1 29985022216 NA NA NA NA
22.997000e+034.793430e+061.000000e+002.287860e+051.435931e+022.367734e-32
32.996000e+034.777674e+061.000000e+001.575569e+049.888756e+001.679213e-03
42.995000e+034.771604e+061.000000e+006.070152e+033.809813e+005.104623e-02
52.994000e+034.770322e+061.000000e+001.282563e+038.049758e-013.696820e-01
In [8]:
coef(summary(fit.5))
EstimateStd. Errort valuePr(>|t|)
(Intercept)111.7036082 0.7287647153.2780243 0.0000000
poly(age, 5)14.470679e+023.991608e+011.120019e+011.491111e-28
poly(age, 5)2-4.783158e+02 3.991608e+01-1.198303e+01 2.367734e-32
poly(age, 5)31.255217e+023.991608e+013.144639e+001.679213e-03
poly(age, 5)4-77.91118099 39.91608468 -1.95187433 0.05104623
poly(age, 5)5-35.8128890 39.9160847 -0.8972045 0.3696820
In [9]:
(-11.983)^2
143.592289
In [10]:
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3)
Res.DfRSSDfSum of SqFPr(>F)
1 29943867992 NA NA NA NA
22.993000e+033.725395e+061.000000e+001.425971e+051.146969e+022.728971e-26
32.992000e+033.719809e+061.000000e+005.586660e+034.493588e+003.410431e-02
In [11]:
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial)
preds=predict(fit,newdata=list(age=age.grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit = cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))
preds=predict(fit,newdata=list(age=age.grid),type="response",se=T)
plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
points(jitter(age), I((wage>250)/5),cex=.5,pch="|",col="darkgrey")
lines(age.grid,pfit,lwd=2, col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
In [12]:
table(cut(age,4))
(17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
        750        1399         779          72 
In [13]:
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
EstimateStd. Errort valuePr(>|t|)
(Intercept)94.158392 1.47606963.789970 0.000000
cut(age, 4)(33.5,49]2.405349e+011.829431e+001.314807e+011.982315e-38
cut(age, 4)(49,64.5]2.366456e+012.067958e+001.144344e+011.040750e-29
cut(age, 4)(64.5,80.1]7.6405924.9874241.5319720.125635
In [14]:
# Splines

library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
In [15]:
dim(bs(age,knots=c(25,40,60)))
dim(bs(age,df=6))
attr(bs(age,df=6),"knots")
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid, pred2$fit,col="red",lwd=2)
  1. 3000
  2. 6
  1. 3000
  2. 6
25%
33.75
50%
42
75%
51
Error in plot.xy(xy.coords(x, y), type = type, ...): plot.new has not been called yet
Traceback:

1. lines(age.grid, pred2$fit, col = "red", lwd = 2)
2. lines.default(age.grid, pred2$fit, col = "red", lwd = 2)
3. plot.xy(xy.coords(x, y), type = type, ...)
In [16]:
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
In [17]:
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
fit2$df
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
Warning message:
In smooth.spline(age, wage, cv = TRUE): cross-validation with non-unique 'x' values seems doubtful
6.79459570272203
Error in plot.xy(xy.coords(x, y), type = type, ...): plot.new has not been called yet
Traceback:

1. lines(fit, col = "red", lwd = 2)
2. lines.default(fit, col = "red", lwd = 2)
3. plot.xy(xy.coords(x, y), type = type, ...)
In [18]:
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit2=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)