In [1]:

# Chapter 10 Lab 1: Principal Components Analysis

states=row.names(USArrests)
states

'Alabama'
'Alaska'
'Arizona'
'Arkansas'
'California'
'Colorado'
'Connecticut'
'Delaware'
'Florida'
'Georgia'
'Hawaii'
'Idaho'
'Illinois'
'Indiana'
'Iowa'
'Kansas'
'Kentucky'
'Louisiana'
'Maine'
'Maryland'
'Massachusetts'
'Michigan'
'Minnesota'
'Mississippi'
'Missouri'
'Montana'
'Nebraska'
'Nevada'
'New Hampshire'
'New Jersey'
'New Mexico'
'New York'
'North Carolina'
'North Dakota'
'Ohio'
'Oklahoma'
'Oregon'
'Pennsylvania'
'Rhode Island'
'South Carolina'
'South Dakota'
'Tennessee'
'Texas'
'Utah'
'Vermont'
'Virginia'
'Washington'
'West Virginia'
'Wisconsin'
'Wyoming'

In [2]:

names(USArrests)

'Murder'
'Assault'
'UrbanPop'
'Rape'

In [3]:

apply(USArrests, 2, mean)

Murder: 7.788
Assault: 170.76
UrbanPop: 65.54
Rape: 21.232

In [4]:

apply(USArrests, 2, var)

Murder: 18.9704653061224
Assault: 6945.16571428571
UrbanPop: 209.518775510204
Rape: 87.7291591836735

In [5]:

pr.out=prcomp(USArrests, scale=TRUE)
names(pr.out)

'sdev'
'rotation'
'center'
'scale'
'x'

In [6]:

pr.out$center
pr.out$scale
pr.out$rotation

Murder: 7.788
Assault: 170.76
UrbanPop: 65.54
Rape: 21.232

Murder: 4.35550976420929
Assault: 83.3376608400171
UrbanPop: 14.4747634008368
Rape: 9.36638453105965

	PC1	PC2	PC3	PC4
Murder	-0.5358995	0.4181809	-0.3412327	0.6492278
Assault	-0.5831836	0.1879856	-0.2681484	-0.7434075
UrbanPop	-0.2781909	-0.8728062	-0.3780158	0.1338777
Rape	-0.54343209	-0.16731864	0.81777791	0.08902432

In [7]:

dim(pr.out$x)

50
4

In [8]:

biplot(pr.out, scale=0)

In [9]:

pr.out$rotation=-pr.out$rotation
pr.out$x=-pr.out$x
biplot(pr.out, scale=0)

In [10]:

pr.out$sdev

1.57487827439123
0.994869414817764
0.597129115502527
0.41644938195396

In [11]:

pr.var=pr.out$sdev^2
pr.var
pve=pr.var/sum(pr.var)
pve

2.48024157914949
0.989765152539841
0.35656318058083
0.173430087729835

0.620060394787373
0.24744128813496
0.0891407951452075
0.0433575219324588

In [12]:

plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')

In [13]:

plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')

In [14]:

a=c(1,2,8,-3)
cumsum(a)

1
3
11
8

In [15]:

# Chapter 10 Lab 2: Clustering

# K-Means Clustering

set.seed(2)
x=matrix(rnorm(50*2), ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4
km.out=kmeans(x,2,nstart=20)
km.out$cluster

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

In [16]:

plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2", xlab="", ylab="", pch=20, cex=2)

In [17]:

set.seed(4)
km.out=kmeans(x,3,nstart=20)
km.out

K-means clustering with 3 clusters of sizes 10, 23, 17

Cluster means:
        [,1]        [,2]
1  2.3001545 -2.69622023
2 -0.3820397 -0.08740753
3  3.7789567 -4.56200798

Clustering vector:
 [1] 3 1 3 1 3 3 3 1 3 1 3 1 3 1 3 1 3 3 3 3 3 1 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 1 2 1 2 2 2 2

Within cluster sum of squares by cluster:
[1] 19.56137 52.67700 25.74089
 (between_SS / total_SS =  79.3 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"

In [18]:

plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3", xlab="", ylab="", pch=20, cex=2)

In [19]:

set.seed(3)
km.out=kmeans(x,3,nstart=1)
km.out$tot.withinss
km.out=kmeans(x,3,nstart=20)
km.out$tot.withinss

104.331921973392

97.9792674793981

In [20]:

# Hierarchical Clustering

hc.complete=hclust(dist(x), method="complete")
hc.average=hclust(dist(x), method="average")
hc.single=hclust(dist(x), method="single")

In [21]:

par(mfrow=c(1,3))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=.9)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=.9)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=.9)

In [22]:

cutree(hc.complete, 2)
cutree(hc.average, 2)
cutree(hc.single, 2)
cutree(hc.single, 4)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
1
2
2
2
2
2
2
2
2
2
2
1
2
1
2
2
2
2

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
4
3
3
3
3
3
3
3
3

In [23]:

xsc=scale(x)
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")

In [24]:

x=matrix(rnorm(30*3), ncol=3)
dd=as.dist(1-cor(t(x)))
plot(hclust(dd, method="complete"), main="Complete Linkage with Correlation-Based Distance", xlab="", sub="")

In [25]:

# Chapter 10 Lab 3: NCI60 Data Example

# The NCI60 data

library(ISLR)
nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
nci.labs[1:4]
table(nci.labs)

64
6830

'CNS'
'CNS'
'CNS'
'RENAL'

nci.labs
     BREAST         CNS       COLON K562A-repro K562B-repro    LEUKEMIA 
          7           5           7           1           1           6 
MCF7A-repro MCF7D-repro    MELANOMA       NSCLC     OVARIAN    PROSTATE 
          1           1           8           9           6           2 
      RENAL     UNKNOWN 
          9           1

In [26]:

# PCA on the NCI60 Data

pr.out=prcomp(nci.data, scale=TRUE)
Cols=function(vec){
    cols=rainbow(length(unique(vec)))
    return(cols[as.numeric(as.factor(vec))])
  }

In [27]:

par(mfrow=c(1,2))
plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z3")

In [28]:

summary(pr.out)

Importance of components:
                           PC1      PC2      PC3      PC4      PC5      PC6
Standard deviation     27.8535 21.48136 19.82046 17.03256 15.97181 15.72108
Proportion of Variance  0.1136  0.06756  0.05752  0.04248  0.03735  0.03619
Cumulative Proportion   0.1136  0.18115  0.23867  0.28115  0.31850  0.35468
                            PC7      PC8      PC9     PC10     PC11     PC12
Standard deviation     14.47145 13.54427 13.14400 12.73860 12.68672 12.15769
Proportion of Variance  0.03066  0.02686  0.02529  0.02376  0.02357  0.02164
Cumulative Proportion   0.38534  0.41220  0.43750  0.46126  0.48482  0.50646
                           PC13     PC14     PC15     PC16     PC17     PC18
Standard deviation     11.83019 11.62554 11.43779 11.00051 10.65666 10.48880
Proportion of Variance  0.02049  0.01979  0.01915  0.01772  0.01663  0.01611
Cumulative Proportion   0.52695  0.54674  0.56590  0.58361  0.60024  0.61635
                           PC19    PC20     PC21    PC22    PC23    PC24
Standard deviation     10.43518 10.3219 10.14608 10.0544 9.90265 9.64766
Proportion of Variance  0.01594  0.0156  0.01507  0.0148 0.01436 0.01363
Cumulative Proportion   0.63229  0.6479  0.66296  0.6778 0.69212 0.70575
                          PC25    PC26    PC27   PC28    PC29    PC30    PC31
Standard deviation     9.50764 9.33253 9.27320 9.0900 8.98117 8.75003 8.59962
Proportion of Variance 0.01324 0.01275 0.01259 0.0121 0.01181 0.01121 0.01083
Cumulative Proportion  0.71899 0.73174 0.74433 0.7564 0.76824 0.77945 0.79027
                          PC32    PC33    PC34    PC35    PC36    PC37    PC38
Standard deviation     8.44738 8.37305 8.21579 8.15731 7.97465 7.90446 7.82127
Proportion of Variance 0.01045 0.01026 0.00988 0.00974 0.00931 0.00915 0.00896
Cumulative Proportion  0.80072 0.81099 0.82087 0.83061 0.83992 0.84907 0.85803
                          PC39    PC40    PC41   PC42    PC43   PC44    PC45
Standard deviation     7.72156 7.58603 7.45619 7.3444 7.10449 7.0131 6.95839
Proportion of Variance 0.00873 0.00843 0.00814 0.0079 0.00739 0.0072 0.00709
Cumulative Proportion  0.86676 0.87518 0.88332 0.8912 0.89861 0.9058 0.91290
                         PC46    PC47    PC48    PC49    PC50    PC51    PC52
Standard deviation     6.8663 6.80744 6.64763 6.61607 6.40793 6.21984 6.20326
Proportion of Variance 0.0069 0.00678 0.00647 0.00641 0.00601 0.00566 0.00563
Cumulative Proportion  0.9198 0.92659 0.93306 0.93947 0.94548 0.95114 0.95678
                          PC53    PC54    PC55    PC56    PC57   PC58    PC59
Standard deviation     6.06706 5.91805 5.91233 5.73539 5.47261 5.2921 5.02117
Proportion of Variance 0.00539 0.00513 0.00512 0.00482 0.00438 0.0041 0.00369
Cumulative Proportion  0.96216 0.96729 0.97241 0.97723 0.98161 0.9857 0.98940
                          PC60    PC61    PC62    PC63      PC64
Standard deviation     4.68398 4.17567 4.08212 4.04124 1.956e-14
Proportion of Variance 0.00321 0.00255 0.00244 0.00239 0.000e+00
Cumulative Proportion  0.99262 0.99517 0.99761 1.00000 1.000e+00

In [29]:

plot(pr.out)

In [30]:

pve=100*pr.out$sdev^2/sum(pr.out$sdev^2)
par(mfrow=c(1,2))
plot(pve,  type="o", ylab="PVE", xlab="Principal Component", col="blue")
plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="brown3")

In [31]:

# Clustering the Observations of the NCI60 Data

sd.data=scale(nci.data)
par(mfrow=c(1,3))
data.dist=dist(sd.data)
plot(hclust(data.dist), labels=nci.labs, main="Complete Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="average"), labels=nci.labs, main="Average Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="single"), labels=nci.labs,  main="Single Linkage", xlab="", sub="",ylab="")

In [32]:

hc.out=hclust(dist(sd.data))
hc.clusters=cutree(hc.out,4)
table(hc.clusters,nci.labs)

           nci.labs
hc.clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
          1      2   3     2           0           0        0           0
          2      3   2     0           0           0        0           0
          3      0   0     0           1           1        6           0
          4      2   0     5           0           0        0           1
           nci.labs
hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
          1           0        8     8       6        2     8       1
          2           0        0     1       0        0     1       0
          3           0        0     0       0        0     0       0
          4           1        0     0       0        0     0       0

In [33]:

par(mfrow=c(1,1))
plot(hc.out, labels=nci.labs)
abline(h=139, col="red")
hc.out

Call:
hclust(d = dist(sd.data))

Cluster method   : complete 
Distance         : euclidean 
Number of objects: 64

In [34]:

set.seed(2)
km.out=kmeans(sd.data, 4, nstart=20)
km.clusters=km.out$cluster
table(km.clusters,hc.clusters)
hc.out=hclust(dist(pr.out$x[,1:5]))

           hc.clusters
km.clusters  1  2  3  4
          1 11  0  0  9
          2  0  0  8  0
          3  9  0  0  0
          4 20  7  0  0

In [35]:

plot(hc.out, labels=nci.labs, main="Hier. Clust. on First Five Score Vectors")

In [36]:

table(cutree(hc.out,4), nci.labs)

   nci.labs
    BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro MCF7D-repro
  1      0   2     7           0           0        2           0           0
  2      5   3     0           0           0        0           0           0
  3      0   0     0           1           1        4           0           0
  4      2   0     0           0           0        0           1           1
   nci.labs
    MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
  1        1     8       5        2     7       0
  2        7     1       1        0     2       1
  3        0     0       0        0     0       0
  4        0     0       0        0     0       0