Hierarchical clustering

In [1]:
import pandas as pd
import numpy as np

We'll use rpy2 to create the same sample data as in video.

In [3]:
%load_ext rmagic
In [4]:
%%R -o x,y
set.seed(1234); par(mar=c(0,0,0,0))
x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)
y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)
plot(x,y,col='blue',pch=19,cex=2)
text(x+0.05,y+0.05,labels=as.character(0:11))
In [5]:
dataFrame = pd.DataFrame(zip(x, y), columns=['x', 'y'])

Performing hierarchical clustering and plotting dendrogram can be done with scipy.

In [6]:
# compute distance matrix
from scipy.spatial.distance import pdist, squareform

# not printed as pretty, but the values are correct
distxy = squareform(pdist(dataFrame, metric='euclidean'))
print distxy
[[ 0.          0.34120511  0.57493739  0.26381786  1.694247    1.65812902
   1.49823399  1.99149025  2.13629539  2.06419586  2.14702468  2.05664233]
 [ 0.34120511  0.          0.2410275   0.52578819  1.35818182  1.31960442
   1.16620981  1.69093111  1.83167669  1.76999236  1.85183204  1.74662555]
 [ 0.57493739  0.2410275   0.          0.71861759  1.11952883  1.08338841
   0.92568723  1.45648906  1.67835968  1.6310979   1.71074417  1.58658782]
 [ 0.26381786  0.52578819  0.71861759  0.          1.80666768  1.78081321
   1.60131659  2.0284949   2.35675598  2.2923948   2.37461984  2.27232243]
 [ 1.694247    1.35818182  1.11952883  1.80666768  0.          0.08150268
   0.21110433  0.617042    1.18349654  1.23847877  1.28153948  1.07700974]
 [ 1.65812902  1.31960442  1.08338841  1.78081321  0.08150268  0.
   0.21666557  0.69791931  1.11500116  1.16550201  1.21077373  1.00777231]
 [ 1.49823399  1.16620981  0.92568723  1.60131659  0.21110433  0.21666557
   0.          0.65062566  1.28582631  1.32063059  1.37369662  1.17740375]
 [ 1.99149025  1.69093111  1.45648906  2.0284949   0.617042    0.69791931
   0.65062566  0.          1.76460709  1.83517785  1.86999431  1.66223814]
 [ 2.13629539  1.83167669  1.67835968  2.35675598  1.18349654  1.11500116
   1.28582631  1.76460709  0.          0.14090406  0.11624471  0.10848966]
 [ 2.06419586  1.76999236  1.6310979   2.2923948   1.23847877  1.16550201
   1.32063059  1.83517785  0.14090406  0.          0.0831757   0.19128645]
 [ 2.14702468  1.85183204  1.71074417  2.37461984  1.28153948  1.21077373
   1.37369662  1.86999431  0.11624471  0.0831757   0.          0.20802789]
 [ 2.05664233  1.74662555  1.58658782  2.27232243  1.07700974  1.00777231
   1.17740375  1.66223814  0.10848966  0.19128645  0.20802789  0.        ]]
In [7]:
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram

R = dendrogram(linkage(distxy, method='complete'))

xlabel('points')
ylabel('Height')
suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);


Heatmap

We'll again use rpy2 to create the data:

In [9]:
%%R -i x,y -o dataMatrix
dataFrame <- data.frame(x=x,y=y)
set.seed(143)
dataMatrix <- as.matrix(dataFrame)[sample(1:12),]

To plot dendrograms with heatmap as in the video, we'll need to create our own function.

In [12]:
# a simple function to compute hierarchical cluster on both rows and columns, and plot heatmaps
def heatmap(dm):
    from scipy.cluster.hierarchy import linkage, dendrogram
    from scipy.spatial.distance import pdist, squareform
    
    D1 = squareform(pdist(dm, metric='euclidean'))
    D2 = squareform(pdist(dm.T, metric='euclidean'))
    
    f = figure(figsize=(8, 8))

    # add first dendrogram
    ax1 = f.add_axes([0.09, 0.1, 0.2, 0.6])
    Y = linkage(D1, method='complete')
    Z1 = dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    # add second dendrogram
    ax2 = f.add_axes([0.3, 0.71, 0.6, 0.2])
    Y = linkage(D2, method='complete')
    Z2 = dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # add matrix plot
    axmatrix = f.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = D1[idx1, :]
    D = D[:, idx2]
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap='hot')
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    
    return {'ordered' : D, 'rorder' : Z1['leaves'], 'corder' : Z2['leaves']}
In [13]:
heatmap(dataMatrix)
Out[13]:
{'corder': [0, 1],
 'ordered': array([[ 2.05664233,  0.        ],
       [ 2.27232243,  0.26381786],
       [ 1.58658782,  0.57493739],
       [ 1.74662555,  0.34120511],
       [ 0.        ,  2.05664233],
       [ 0.10848966,  2.13629539],
       [ 0.20802789,  2.14702468],
       [ 0.19128645,  2.06419586],
       [ 1.66223814,  1.99149025],
       [ 1.17740375,  1.49823399],
       [ 1.07700974,  1.694247  ],
       [ 1.00777231,  1.65812902]]),
 'rorder': [1, 3, 7, 9, 0, 6, 4, 10, 2, 5, 8, 11]}