## Hierarchical clustering¶

In [1]:
import pandas as pd
import numpy as np


We'll use rpy2 to create the same sample data as in video.

In [3]:
%load_ext rmagic

In [4]:
%%R -o x,y
set.seed(1234); par(mar=c(0,0,0,0))
x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)
y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)
plot(x,y,col='blue',pch=19,cex=2)
text(x+0.05,y+0.05,labels=as.character(0:11))

In [5]:
dataFrame = pd.DataFrame(zip(x, y), columns=['x', 'y'])


Performing hierarchical clustering and plotting dendrogram can be done with scipy.

In [6]:
# compute distance matrix
from scipy.spatial.distance import pdist, squareform

# not printed as pretty, but the values are correct
distxy = squareform(pdist(dataFrame, metric='euclidean'))
print distxy

[[ 0.          0.34120511  0.57493739  0.26381786  1.694247    1.65812902
1.49823399  1.99149025  2.13629539  2.06419586  2.14702468  2.05664233]
[ 0.34120511  0.          0.2410275   0.52578819  1.35818182  1.31960442
1.16620981  1.69093111  1.83167669  1.76999236  1.85183204  1.74662555]
[ 0.57493739  0.2410275   0.          0.71861759  1.11952883  1.08338841
0.92568723  1.45648906  1.67835968  1.6310979   1.71074417  1.58658782]
[ 0.26381786  0.52578819  0.71861759  0.          1.80666768  1.78081321
1.60131659  2.0284949   2.35675598  2.2923948   2.37461984  2.27232243]
[ 1.694247    1.35818182  1.11952883  1.80666768  0.          0.08150268
0.21110433  0.617042    1.18349654  1.23847877  1.28153948  1.07700974]
[ 1.65812902  1.31960442  1.08338841  1.78081321  0.08150268  0.
0.21666557  0.69791931  1.11500116  1.16550201  1.21077373  1.00777231]
[ 1.49823399  1.16620981  0.92568723  1.60131659  0.21110433  0.21666557
0.          0.65062566  1.28582631  1.32063059  1.37369662  1.17740375]
[ 1.99149025  1.69093111  1.45648906  2.0284949   0.617042    0.69791931
0.65062566  0.          1.76460709  1.83517785  1.86999431  1.66223814]
[ 2.13629539  1.83167669  1.67835968  2.35675598  1.18349654  1.11500116
1.28582631  1.76460709  0.          0.14090406  0.11624471  0.10848966]
[ 2.06419586  1.76999236  1.6310979   2.2923948   1.23847877  1.16550201
1.32063059  1.83517785  0.14090406  0.          0.0831757   0.19128645]
[ 2.14702468  1.85183204  1.71074417  2.37461984  1.28153948  1.21077373
1.37369662  1.86999431  0.11624471  0.0831757   0.          0.20802789]
[ 2.05664233  1.74662555  1.58658782  2.27232243  1.07700974  1.00777231
1.17740375  1.66223814  0.10848966  0.19128645  0.20802789  0.        ]]

In [7]:
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram

R = dendrogram(linkage(distxy, method='complete'))

xlabel('points')
ylabel('Height')
suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);


Heatmap

We'll again use rpy2 to create the data:

In [9]:
%%R -i x,y -o dataMatrix
dataFrame <- data.frame(x=x,y=y)
set.seed(143)
dataMatrix <- as.matrix(dataFrame)[sample(1:12),]


To plot dendrograms with heatmap as in the video, we'll need to create our own function.

In [12]:
# a simple function to compute hierarchical cluster on both rows and columns, and plot heatmaps
def heatmap(dm):
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform

D1 = squareform(pdist(dm, metric='euclidean'))
D2 = squareform(pdist(dm.T, metric='euclidean'))

f = figure(figsize=(8, 8))

# add first dendrogram
ax1 = f.add_axes([0.09, 0.1, 0.2, 0.6])
Y = linkage(D1, method='complete')
Z1 = dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])

# add second dendrogram
ax2 = f.add_axes([0.3, 0.71, 0.6, 0.2])
Y = linkage(D2, method='complete')
Z2 = dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

# add matrix plot
axmatrix = f.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D1[idx1, :]
D = D[:, idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap='hot')
axmatrix.set_xticks([])
axmatrix.set_yticks([])

return {'ordered' : D, 'rorder' : Z1['leaves'], 'corder' : Z2['leaves']}

In [13]:
heatmap(dataMatrix)

Out[13]:
{'corder': [0, 1],
'ordered': array([[ 2.05664233,  0.        ],
[ 2.27232243,  0.26381786],
[ 1.58658782,  0.57493739],
[ 1.74662555,  0.34120511],
[ 0.        ,  2.05664233],
[ 0.10848966,  2.13629539],
[ 0.20802789,  2.14702468],
[ 0.19128645,  2.06419586],
[ 1.66223814,  1.99149025],
[ 1.17740375,  1.49823399],
[ 1.07700974,  1.694247  ],
[ 1.00777231,  1.65812902]]),
'rorder': [1, 3, 7, 9, 0, 6, 4, 10, 2, 5, 8, 11]}