In [1]:
# The Stock Market Data

library(ISLR)
names(Smarket)
dim(Smarket)
summary(Smarket)
pairs(Smarket)
cor(Smarket)
'Year' 'Lag1' 'Lag2' 'Lag3' 'Lag4' 'Lag5' 'Volume' 'Today' 'Direction'
1250 9
      Year           Lag1                Lag2                Lag3          
 Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
 1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
 Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
 Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
 3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
 Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
      Lag4                Lag5              Volume           Today          
 Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
 1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
 Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
 Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
 3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
 Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
 Direction 
 Down:602  
 Up  :648  
           
           
           
           
Error in cor(Smarket): 'x' must be numeric
Traceback:

1. cor(Smarket)
2. stop("'x' must be numeric")
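The error is expected: cor() accepts only numeric input, and the Direction column is qualitative. A minimal sketch for confirming which column is the culprit, using only base R (the name num_cols is illustrative):

# Flag the numeric columns of Smarket; Direction is the lone non-numeric
# (factor) column, which is why it is dropped below via Smarket[,-9].
num_cols=sapply(Smarket,is.numeric)
num_cols
cor(Smarket[,num_cols])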
In [2]:
cor(Smarket[,-9])
plot(Smarket$Volume)
               Year         Lag1         Lag2         Lag3         Lag4         Lag5       Volume        Today
Year     1.00000000   0.02969965   0.03059642   0.03319458   0.03568872   0.02978799   0.53900647   0.03009523
Lag1    0.029699649  1.000000000 -0.026294328 -0.010803402 -0.002985911 -0.005674606  0.040909908 -0.026155045
Lag2    0.030596422 -0.026294328  1.000000000 -0.025896670 -0.010853533 -0.003557949 -0.043383215 -0.010250033
Lag3    0.033194581 -0.010803402 -0.025896670  1.000000000 -0.024051036 -0.018808338 -0.041823686 -0.002447647
Lag4    0.035688718 -0.002985911 -0.010853533 -0.024051036  1.000000000 -0.027083641 -0.048414246 -0.006899527
Lag5    0.029787995 -0.005674606 -0.003557949 -0.018808338 -0.027083641  1.000000000 -0.022002315 -0.034860083
Volume   0.53900647   0.04090991  -0.04338321  -0.04182369  -0.04841425  -0.02200231   1.00000000   0.01459182
Today   0.030095229 -0.026155045 -0.010250033 -0.002447647 -0.006899527 -0.034860083  0.014591823  1.000000000
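The only substantial correlation is between Year and Volume, which is why the cell above also plots Volume. A quick way to see the same pattern numerically (a sketch; tapply is base R):

# Average daily trading volume per year: it increases from 2001 to 2005,
# which is what the Year-Volume correlation of about 0.54 reflects.
tapply(Smarket$Volume,Smarket$Year,mean)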
In [3]:
# Logistic Regression
attach(Smarket)
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
summary(glm.fit)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
    Volume, family = binomial, data = Smarket)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.446  -1.203   1.065   1.145   1.326  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000   0.240736  -0.523    0.601
Lag1        -0.073074   0.050167  -1.457    0.145
Lag2        -0.042301   0.050086  -0.845    0.398
Lag3         0.011085   0.049939   0.222    0.824
Lag4         0.009359   0.049974   0.187    0.851
Lag5         0.010313   0.049511   0.208    0.835
Volume       0.135441   0.158360   0.855    0.392

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1731.2  on 1249  degrees of freedom
Residual deviance: 1727.6  on 1243  degrees of freedom
AIC: 1741.6

Number of Fisher Scoring iterations: 3
In [4]:
coef(glm.fit)
(Intercept)  -0.126000256559266
Lag1         -0.0730737458900261
Lag2         -0.0423013440073083
Lag3          0.0110851083796763
Lag4          0.00935893837027878
Lag5          0.0103130684758179
Volume        0.13544065885916
In [5]:
summary(glm.fit)$coef
summary(glm.fit)$coef[,4]
                Estimate   Std. Error      z value     Pr(>|z|)
(Intercept)   -0.1260003    0.2407357   -0.5233966    0.6006983
Lag1         -0.07307375   0.05016739  -1.45659861   0.14522721
Lag2         -0.04230134   0.05008605  -0.84457335   0.39834910
Lag3          0.01108511   0.04993854   0.22197500   0.82433335
Lag4         0.009358938  0.049974131  0.187275659  0.851444507
Lag5          0.01031307   0.04951146   0.20829659   0.83499739
Volume         0.1354407    0.1583597    0.8552723    0.3924004
(Intercept)  0.600698319413355
Lag1         0.145227211568647
Lag2         0.398349095427021
Lag3         0.824333346101536
Lag4         0.851444506926454
Lag5         0.834997390499829
Volume       0.392400433202429
In [6]:
glm.probs=predict(glm.fit,type="response")
glm.probs[1:10]
contrasts(Smarket$Direction)
 1  0.507084133395402
 2  0.481467878454591
 3  0.481138835214201
 4  0.515222355813022
 5  0.510781162691538
 6  0.506956460534911
 7  0.492650874187038
 8  0.509229158207377
 9  0.517613526170958
10  0.488837779771376
     Up
Down  0
Up    1
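Because contrasts() shows that R has coded Up as 1, these fitted values are probabilities of the market going up. The rep()/indexing conversion in the next cell could equally be written in one line (an alternative sketch, not part of the lab code):

# Label a day "Up" whenever its fitted probability exceeds 0.5,
# otherwise "Down"; equivalent to the two-step version below.
glm.pred=ifelse(glm.probs>.5,"Up","Down")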
In [7]:
glm.pred=rep("Down",1250)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Smarket$Direction)
(507+145)/1250
        
glm.pred Down  Up
    Down  145 141
    Up    457 507
0.5216
In [8]:
mean(glm.pred==Direction)
0.5216
In [9]:
train=(Year<2005)
Smarket.2005=Smarket[!train,]
dim(Smarket.2005)
252 9
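train is a logical vector, so Smarket[!train,] keeps only the 252 trading days in 2005. A quick sanity check on the split (base R only):

# Number of training observations (2001-2004) versus held-out 2005 days;
# the two counts sum to the 1250 rows of Smarket.
table(train)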
In [10]:
Direction.2005=Direction[!train]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
        Direction.2005
glm.pred Down Up
    Down   77 97
    Up     34 44
In [11]:
mean(glm.pred==Direction.2005)
mean(glm.pred!=Direction.2005)
0.48015873015873
0.51984126984127
In [12]:
glm.fit=glm(Direction~Lag1+Lag2,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
106/(106+76)
predict(glm.fit,newdata=data.frame(Lag1=c(1.2,1.5),Lag2=c(1.1,-0.8)),type="response")
        Direction.2005
glm.pred Down  Up
    Down   35  35
    Up     76 106
0.55952380952381
0.582417582417582
1
0.479146239171912
2
0.496093872956532
In [13]:
# Linear Discriminant Analysis

library(MASS)
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
Call:
lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)

Prior probabilities of groups:
    Down       Up 
0.491984 0.508016 

Group means:
            Lag1        Lag2
Down  0.04279022  0.03389409
Up   -0.03954635 -0.03132544

Coefficients of linear discriminants:
            LD1
Lag1 -0.6420190
Lag2 -0.5135293
In [14]:
plot(lda.fit)
In [15]:
lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
'class' 'posterior' 'x'
In [16]:
lda.class=lda.pred$class
table(lda.class,Direction.2005)
         Direction.2005
lda.class Down  Up
     Down   35  35
     Up     76 106
In [17]:
mean(lda.class==Direction.2005)
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
0.55952380952381
70
182
In [18]:
lda.pred$posterior[1:20,1]
lda.class[1:20]
sum(lda.pred$posterior[,1]>.9)
 999  0.490179249818258
1000  0.479218499099683
1001  0.466818479852065
1002  0.474001069455248
1003  0.492787663967445
1004  0.493856154997504
1005  0.495101564646223
1006  0.487286099421815
1007  0.490701348960405
1008  0.484402624071869
1009  0.490696276120968
1010  0.511998846261919
1011  0.489515226936648
1012  0.470676122211879
1013  0.474459285611829
1014  0.479958339148108
1015  0.493577529465861
1016  0.503089377118306
1017  0.497880612141404
1018  0.488633086516518
Up Up Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up Down Up Up
0
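No day in 2005 clears a 90% posterior threshold for a market decrease. One way to see how far short the posteriors fall (a one-line sketch):

# Largest posterior probability of "Down" assigned to any 2005 day;
# it stays well below the 0.9 cutoff used above.
max(lda.pred$posterior[,1])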
In [19]:
# Quadratic Discriminant Analysis

qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit
qda.class=predict(qda.fit,Smarket.2005)$class
table(qda.class,Direction.2005)
mean(qda.class==Direction.2005)
Call:
qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)

Prior probabilities of groups:
    Down       Up 
0.491984 0.508016 

Group means:
            Lag1        Lag2
Down  0.04279022  0.03389409
Up   -0.03954635 -0.03132544
         Direction.2005
qda.class Down  Up
     Down   30  20
     Up     81 121
0.599206349206349
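predict() on a qda fit returns the same class and posterior components as it does for lda, so the QDA posteriors can be inspected in exactly the same way (a sketch; qda.pred is just an illustrative name):

# Posterior probabilities from the QDA fit for the first few 2005 days,
# analogous to lda.pred$posterior above.
qda.pred=predict(qda.fit,Smarket.2005)
qda.pred$posterior[1:5,]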
In [20]:
# K-Nearest Neighbors

library(class)
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
(83+43)/252
knn.pred=knn(train.X,test.X,train.Direction,k=3)
table(knn.pred,Direction.2005)
mean(knn.pred==Direction.2005)
        Direction.2005
knn.pred Down Up
    Down   43 58
    Up     68 83
0.5
        Direction.2005
knn.pred Down Up
    Down   48 54
    Up     63 87
0.535714285714286
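K = 1 and K = 3 are tried by hand above; a small loop makes it easy to compare test accuracy over several values of K (a sketch reusing the same knn() call; the set of K values is arbitrary):

# Test-set accuracy of KNN on the 2005 data for a few choices of K.
set.seed(1)
for(k in c(1,3,5,10)){
  pred=knn(train.X,test.X,train.Direction,k=k)
  cat("K =",k,"accuracy =",mean(pred==Direction.2005),"\n")
}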
In [21]:
# An Application to Caravan Insurance Data

dim(Caravan)
attach(Caravan)
summary(Purchase)
348/5822
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
var(Caravan[,2])
var(standardized.X[,1])
var(standardized.X[,2])
5822 86
  No  Yes 
5474  348 
0.0597732737890759
165.037847395189
0.164707781931954
1
1
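scale() gives every predictor mean zero and standard deviation one, so variables measured on large scales (such as the second column, with variance around 165) no longer dominate the KNN distance. A quick check (apply() is base R):

# All columns of standardized.X should now have standard deviation 1.
apply(standardized.X[,1:2],2,sd)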
In [22]:
test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
mean(test.Y!="No")
table(knn.pred,test.Y)
9/(68+9)
0.118
0.059
        test.Y
knn.pred  No Yes
     No  873  50
     Yes  68   9
0.116883116883117
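The 9/(68+9) above is the fraction of predicted buyers who actually purchase insurance. The same quantity can be read straight off the confusion matrix (a sketch; knn.tab is just an illustrative name):

# Precision among customers that KNN with K=1 predicts will buy:
# correct "Yes" predictions divided by all "Yes" predictions.
knn.tab=table(knn.pred,test.Y)
knn.tab["Yes","Yes"]/sum(knn.tab["Yes",])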
In [23]:
knn.pred=knn(train.X,test.X,train.Y,k=3)
table(knn.pred,test.Y)
5/26
        test.Y
knn.pred  No Yes
     No  920  54
     Yes  21   5
0.192307692307692
In [24]:
knn.pred=knn(train.X,test.X,train.Y,k=5)
table(knn.pred,test.Y)
4/15
        test.Y
knn.pred  No Yes
     No  930  55
     Yes  11   4
0.266666666666667
In [25]:
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.5]="Yes"
table(glm.pred,test.Y)
Warning message:
glm.fit: fitted probabilities numerically 0 or 1 occurred
        test.Y
glm.pred  No Yes
     No  934  59
     Yes   7   0
In [26]:
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.Y)
11/(22+11)
        test.Y
glm.pred  No Yes
     No  919  48
     Yes  22  11
0.333333333333333
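With the cutoff lowered to 0.25, 11 of 33 predicted buyers actually purchase, well above the roughly 6% base rate of purchases. The comparison extends naturally to a grid of cutoffs (a sketch; the cutoff values and variable names are illustrative):

# Fraction of predicted buyers who actually purchase, for several cutoffs
# applied to the logistic regression probabilities.
for(cut in c(.5,.25,.1)){
  pred=ifelse(glm.probs>cut,"Yes","No")
  tab=table(pred,test.Y)
  cat("cutoff =",cut,"precision =",tab["Yes","Yes"]/sum(tab["Yes",]),"\n")
}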