import pandas as pd
import numpy as np
We'll use rpy2 to create the same sample data as in the video.
%load_ext rmagic
%%R -o x,y
# R cell: generate 12 sample points in three groups of four and plot them.
# x and y are listed after -o so they are available on the Python side
# (the following Python line zips them into a DataFrame).
# Fix the RNG seed so the draws are reproducible; set all plot margins to 0.
set.seed(1234); par(mar=c(0,0,0,0))
# 12 normal draws, sd 0.2, with means 1,1,1,1,2,2,2,2,3,3,3,3
x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)
# 12 normal draws, sd 0.2, with means 1,1,1,1,2,2,2,2,1,1,1,1
y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)
# scatter plot: blue solid circles (pch=19) at twice the default size
plot(x,y,col='blue',pch=19,cex=2)
# label each point with its 0-based index, nudged up and to the right
text(x+0.05,y+0.05,labels=as.character(0:11))
# Pair up the x and y samples, one (x, y) row per point. zip() is wrapped in
# list() because on Python 3 it returns a one-shot iterator, which some pandas
# versions refuse as DataFrame input; on Python 2 the result is unchanged.
dataFrame = pd.DataFrame(list(zip(x, y)), columns=['x', 'y'])
Hierarchical clustering and dendrogram plotting can both be done with scipy.
# compute the pairwise Euclidean distance matrix of the sample points
from scipy.spatial.distance import pdist, squareform
# pdist() returns the condensed (one entry per pair) distances; squareform()
# expands them into the full symmetric matrix with a zero diagonal.
# The matrix is not printed as prettily as in the video, but the values are
# correct.
distxy = squareform(pdist(dataFrame, metric='euclidean'))
# The parenthesised form prints the same text on Python 2 and is also valid
# on Python 3, where the bare print statement is a syntax error.
print(distxy)
[[ 0. 0.34120511 0.57493739 0.26381786 1.694247 1.65812902 1.49823399 1.99149025 2.13629539 2.06419586 2.14702468 2.05664233] [ 0.34120511 0. 0.2410275 0.52578819 1.35818182 1.31960442 1.16620981 1.69093111 1.83167669 1.76999236 1.85183204 1.74662555] [ 0.57493739 0.2410275 0. 0.71861759 1.11952883 1.08338841 0.92568723 1.45648906 1.67835968 1.6310979 1.71074417 1.58658782] [ 0.26381786 0.52578819 0.71861759 0. 1.80666768 1.78081321 1.60131659 2.0284949 2.35675598 2.2923948 2.37461984 2.27232243] [ 1.694247 1.35818182 1.11952883 1.80666768 0. 0.08150268 0.21110433 0.617042 1.18349654 1.23847877 1.28153948 1.07700974] [ 1.65812902 1.31960442 1.08338841 1.78081321 0.08150268 0. 0.21666557 0.69791931 1.11500116 1.16550201 1.21077373 1.00777231] [ 1.49823399 1.16620981 0.92568723 1.60131659 0.21110433 0.21666557 0. 0.65062566 1.28582631 1.32063059 1.37369662 1.17740375] [ 1.99149025 1.69093111 1.45648906 2.0284949 0.617042 0.69791931 0.65062566 0. 1.76460709 1.83517785 1.86999431 1.66223814] [ 2.13629539 1.83167669 1.67835968 2.35675598 1.18349654 1.11500116 1.28582631 1.76460709 0. 0.14090406 0.11624471 0.10848966] [ 2.06419586 1.76999236 1.6310979 2.2923948 1.23847877 1.16550201 1.32063059 1.83517785 0.14090406 0. 0.0831757 0.19128645] [ 2.14702468 1.85183204 1.71074417 2.37461984 1.28153948 1.21077373 1.37369662 1.86999431 0.11624471 0.0831757 0. 0.20802789] [ 2.05664233 1.74662555 1.58658782 2.27232243 1.07700974 1.00777231 1.17740375 1.66223814 0.10848966 0.19128645 0.20802789 0. ]]
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
# linkage() treats any 2-D input as raw observation vectors, so passing the
# square matrix directly would cluster the *rows of the distance matrix*
# rather than the points themselves. squareform() converts the square matrix
# back to the condensed vector that linkage() accepts as precomputed distances.
R = dendrogram(linkage(squareform(distxy), method='complete'))
xlabel('points')
ylabel('Height')
# trailing semicolon keeps the notebook from echoing the returned Text object
suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);
Heatmap
We'll again use rpy2 to create the data:
%%R -i x,y -o dataMatrix
# R cell: x and y are listed after -i (inputs from Python) and dataMatrix
# after -o (handed back to Python, where heatmap() is called on it).
# Rebuild the two-column data frame from the same x and y samples.
dataFrame <- data.frame(x=x,y=y)
# Fix the RNG seed so the row shuffle below is reproducible.
set.seed(143)
# Convert to a matrix and put its 12 rows in a random order.
dataMatrix <- as.matrix(dataFrame)[sample(1:12),]
To plot a heatmap with dendrograms, as in the video, we'll need to write our own function.
# Cluster both the rows and the columns of a matrix hierarchically and draw
# the reordered matrix as a heatmap with a dendrogram along each side.
def heatmap(dm):
    """Plot ``dm`` as a heatmap flanked by row and column dendrograms.

    Rows and columns are each clustered with complete linkage on Euclidean
    distances. The matrix is then permuted into the dendrogram leaf orders
    and drawn on a new 8x8 inch figure.

    Parameters
    ----------
    dm : array-like, 2-D
        Data matrix with one observation per row. It is converted with
        ``np.asarray`` before use.

    Returns
    -------
    dict
        ``'ordered'`` : ``dm`` with rows permuted by ``'rorder'`` and
        columns permuted by ``'corder'`` (the matrix that is plotted).
        ``'rorder'`` : leaf order of the row dendrogram.
        ``'corder'`` : leaf order of the column dendrogram.
    """
    from scipy.cluster.hierarchy import linkage, dendrogram
    from scipy.spatial.distance import pdist

    data = np.asarray(dm)

    # linkage() only treats its input as precomputed distances when it is the
    # condensed 1-D vector returned by pdist(); a square matrix would be read
    # as raw observations and clustered on the wrong distances.
    row_dist = pdist(data, metric='euclidean')
    col_dist = pdist(data.T, metric='euclidean')

    # `figure` is not imported in this function; it is expected to be the
    # pylab-style global already available in the session.
    f = figure(figsize=(8, 8))

    # add first dendrogram (rows), to the left of the matrix
    ax1 = f.add_axes([0.09, 0.1, 0.2, 0.6])
    row_linkage = linkage(row_dist, method='complete')
    Z1 = dendrogram(row_linkage, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    # add second dendrogram (columns), above the matrix
    ax2 = f.add_axes([0.3, 0.71, 0.6, 0.2])
    col_linkage = linkage(col_dist, method='complete')
    Z2 = dendrogram(col_linkage)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # add matrix plot: the data itself (not a distance matrix), with rows and
    # columns rearranged to line up with the dendrogram leaves
    axmatrix = f.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = data[idx1, :]
    D = D[:, idx2]
    axmatrix.matshow(D, aspect='auto', origin='lower', cmap='hot')
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    return {'ordered' : D, 'rorder' : idx1, 'corder' : idx2}
# Draw the clustered heatmap for the row-shuffled sample matrix; the returned
# dict (ordered matrix plus row/column leaf orders) is echoed as cell output.
heatmap(dataMatrix)
{'corder': [0, 1], 'ordered': array([[ 2.05664233, 0. ], [ 2.27232243, 0.26381786], [ 1.58658782, 0.57493739], [ 1.74662555, 0.34120511], [ 0. , 2.05664233], [ 0.10848966, 2.13629539], [ 0.20802789, 2.14702468], [ 0.19128645, 2.06419586], [ 1.66223814, 1.99149025], [ 1.17740375, 1.49823399], [ 1.07700974, 1.694247 ], [ 1.00777231, 1.65812902]]), 'rorder': [1, 3, 7, 9, 0, 6, 4, 10, 2, 5, 8, 11]}