Unsupervised Methods

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as ss
import matplotlib.pyplot as plt
import pylab as pl
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import Ward
from sklearn.preprocessing import Binarizer
%matplotlib inline
In [2]:
# Dataset imported from R using write.csv(USArrests, "/tmp/USArrests.csv", row.names=FALSE)
# Each row in this dataset corresponds to one of the 50 US states.
usa_df = pd.read_csv("../data/USArrests.csv")
usa_df.head()
Out[2]:
State Murder Assault UrbanPop Rape
0 Alabama 13.2 236 58 21.2
1 Alaska 10.0 263 48 44.5
2 Arizona 8.1 294 80 31.0
3 Arkansas 8.8 190 50 19.5
4 California 9.0 276 91 40.6

5 rows × 5 columns

In [3]:
usa_df.describe()
Out[3]:
Murder Assault UrbanPop Rape
count 50.00000 50.000000 50.000000 50.000000
mean 7.78800 170.760000 65.540000 21.232000
std 4.35551 83.337661 14.474763 9.366385
min 0.80000 45.000000 32.000000 7.300000
25% 4.07500 109.000000 54.500000 15.075000
50% 7.25000 159.000000 66.000000 20.100000
75% 11.25000 249.000000 77.750000 26.175000
max 17.40000 337.000000 91.000000 46.000000

8 rows × 4 columns

Principal Components

From the output of describe(), Assault has a much larger variance than the other features. Left unscaled, it would dominate the principal components, so we should scale the data first. Note that the cell below normalizes each row (state) to unit L2 norm rather than standardizing each column; this also turns the cosine similarity computation later on into a simple matrix product.
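
For comparison, here is a minimal sketch of the more common column-wise standardization (each feature rescaled to zero mean and unit variance). It is shown only as an illustration and is not used in the cells below, which normalize rows instead.

# Hypothetical alternative: standardize each column (feature) to mean 0, variance 1.
# Not used below; shown only to illustrate the usual column-wise scaling.
collist = usa_df.columns[1:]
X_raw = usa_df[collist].values
X_colstd = (X_raw - X_raw.mean(axis=0)) / X_raw.std(axis=0)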

In [4]:
collist = usa_df.columns[1:]
X = usa_df[collist].values
Xdiv = np.linalg.norm(X, ord=2, axis=1)
usa_df[collist] = X / Xdiv[:, None]
usa_df.head()
Out[4]:
State Murder Assault UrbanPop Rape
0 Alabama 0.054031 0.966016 0.237411 0.086778
1 Alaska 0.036872 0.969739 0.176987 0.164081
2 Arizona 0.026439 0.959624 0.261122 0.101185
3 Arkansas 0.044528 0.961392 0.252998 0.098669
4 California 0.030657 0.940134 0.309972 0.138295

5 rows × 5 columns

In [5]:
usa_df.describe()
Out[5]:
Murder Assault UrbanPop Rape
count 50.000000 50.000000 50.000000 50.000000
mean 0.040677 0.881704 0.409630 0.119341
std 0.015018 0.101645 0.166944 0.035642
min 0.012626 0.473419 0.132112 0.042620
25% 0.027485 0.850453 0.279234 0.092741
50% 0.039406 0.907323 0.395010 0.117441
75% 0.049811 0.952031 0.508929 0.139552
max 0.079346 0.989371 0.854213 0.207893

8 rows × 4 columns

The variances are now comparable. Before doing PCA, we compute similarities between the states and represent them as a heatmap. Since each row is already normalized to unit length, the cosine similarity reduces to the product of X and its transpose (the ||x|| and ||y|| denominators are already accounted for by the normalization).
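
As a cross-check, the same matrix can be computed with scikit-learn's pairwise helper. A minimal sketch, assuming a scikit-learn version that ships sklearn.metrics.pairwise.cosine_similarity:

from sklearn.metrics.pairwise import cosine_similarity
# should match the dot-product version in the next cell (up to floating point noise)
cosim_check = cosine_similarity(usa_df[collist].values)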

In [6]:
X = usa_df[collist].values
cosim = np.matrix(X) * np.matrix(X.T)
cosim
Out[6]:
matrix([[ 1.        ,  0.99503239,  0.99921399, ...,  0.97860009,
          0.7958759 ,  0.99326328],
        [ 0.99503239,  1.        ,  0.99437703, ...,  0.96276683,
          0.76073097,  0.9820459 ],
        [ 0.99921399,  0.99437703,  1.        , ...,  0.98283719,
          0.81122097,  0.99575786],
        ..., 
        [ 0.97860009,  0.96276683,  0.98283719, ...,  1.        ,
          0.90312329,  0.99546599],
        [ 0.7958759 ,  0.76073097,  0.81122097, ...,  0.90312329,
          1.        ,  0.86017937],
        [ 0.99326328,  0.9820459 ,  0.99575786, ...,  0.99546599,
          0.86017937,  1.        ]])
In [7]:
def heatmap(X, labels):
    """Plot X as a heatmap with the given row/column labels.

    Based on this Stack Overflow discussion:
    http://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor
    """
    fig, ax = plt.subplots()
    heatmap = ax.pcolor(X, cmap=plt.cm.Blues, alpha=0.8)
    # Format
    fig = plt.gcf()
    fig.set_size_inches(8, 11)
    # turn off the frame
    ax.set_frame_on(False)
    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(X.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(X.shape[1]) + 0.5, minor=False)
    # want a more natural, table-like display
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    # Set the labels
    ax.set_xticklabels(labels, minor=False)
    ax.set_yticklabels(labels, minor=False)
    # rotate the xticks
    plt.xticks(rotation=90)
    ax.grid(False)
    # Turn off all the ticks
    ax = plt.gca()
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False

heatmap(np.array(cosim), usa_df["State"].values)
In [8]:
# scikit-learn's PCA treats rows as samples and columns as features, so pca.fit(X) on our
# 50x4 matrix would yield components_ of shape 2x4. Here we instead fit the transpose of X
# (4x50) and transpose the resulting 2x50 components_ to get a 50x2 matrix with one row per
# state. (The more conventional fit_transform usage is sketched after this cell.)
pca = PCA(n_components=2)
pca.fit(X.T)
print "Explained variance:", pca.explained_variance_ratio_
X_pr = pca.components_.T
X_pr[0:5, :]
Explained variance: [ 0.93412764  0.06465589]
Out[8]:
array([[ 0.15422962, -0.12628851],
       [ 0.15000496, -0.17607116],
       [ 0.1550715 , -0.10258781],
       [ 0.15364218, -0.11332485],
       [ 0.14892486, -0.06991918]])
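
For reference, the more conventional scikit-learn usage treats the 50 states as samples and projects them directly with fit_transform. A minimal sketch; the numbers will not match the transposed fit above exactly, because centering is then applied across the four features rather than across the states:

# Hypothetical alternative: project the 50 states onto the first two principal
# components of the four (row-normalized) features.
pca2 = PCA(n_components=2)
X_scores = pca2.fit_transform(X)  # shape (50, 2)
print "Explained variance:", pca2.explained_variance_ratio_
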
In [9]:
# Calculate cosine similarity using X_pr and plot a heat map
X_pdiv = np.linalg.norm(X_pr, ord=2, axis=1)
X_prn = X_pr / X_pdiv[:, None]
cosim_p = np.matrix(X_prn) * np.matrix(X_prn.T)
cosim_p
Out[9]:
matrix([[ 1.        ,  0.98401382,  0.99483635, ...,  0.66265268,
         -0.39006555,  0.90240887],
        [ 0.98401382,  1.        ,  0.96085777, ...,  0.51868141,
         -0.54781484,  0.81124632],
        [ 0.99483635,  0.96085777,  1.        , ...,  0.73524112,
         -0.29459879,  0.94148013],
        ..., 
        [ 0.66265268,  0.51868141,  0.73524112, ...,  1.        ,
          0.43112418,  0.92068183],
        [-0.39006555, -0.54781484, -0.29459879, ...,  0.43112418,
          1.        ,  0.04475082],
        [ 0.90240887,  0.81124632,  0.94148013, ...,  0.92068183,
          0.04475082,  1.        ]])
In [10]:
heatmap(np.array(cosim_p), usa_df["State"].values)

The heatmap after PCA shows much sharper contrasts between states, which is expected, since PCA tends to accentuate the differences. The R example shows a biplot. I couldn't find a ready-made biplot in Matplotlib, and in any case I didn't find the biplot as compelling a visual as a heatmap, so I went with a heatmap instead; a rough biplot-like approximation is sketched below.
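
If a biplot is wanted anyway, something biplot-like can be pieced together in Matplotlib from the projected states and the feature loadings. A minimal sketch, assuming the conventional fit_transform projection (X_scores and pca2) from the earlier sketch rather than the transposed fit used above; arrow scaling is left crude:

# rough biplot: states as labelled points, feature loadings as arrows from the origin
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X_scores[:, 0], X_scores[:, 1], s=10, c='gray')
for name, (x, y) in zip(usa_df["State"].values, X_scores):
    ax.annotate(name, (x, y), fontsize=8)
for feature, (lx, ly) in zip(collist, pca2.components_.T):
    ax.arrow(0, 0, lx, ly, color='red', head_width=0.01)
    ax.annotate(feature, (lx, ly), color='red')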

K-Means Clustering

For illustration and visualization purposes, the example is restricted to 2 dimensions.

In [12]:
X = np.random.rand(100, 2)
X_mean = 8 + (4 * np.random.rand(4, 2))  # 4 cluster centers, drawn uniformly from [8, 12)
which = np.random.choice(np.array([0,1,2,3]), size=100, replace=True)
for i in range(0, X.shape[0]):
    X[i] = X[i] + X_mean[which[i], :]

# Plot the points
fig, ax = plt.subplots()
ax.scatter(X[which == 0][:, 0], X[which == 0][:, 1], c='blue')
ax.scatter(X[which == 1][:, 0], X[which == 1][:, 1], c='green')
ax.scatter(X[which == 2][:, 0], X[which == 2][:, 1], c='red')
ax.scatter(X[which == 3][:, 0], X[which == 3][:, 1], c='cyan')
Out[12]:
<matplotlib.collections.PathCollection at 0x4861050>

We now cluster these points using K-Means (we use the which values to evaluate the performance later).

In [13]:
kmeans = KMeans(n_clusters=4, n_init=15)
kmeans.fit(X)
ypred = kmeans.predict(X)

# Print the confusion matrix. Its rows and columns are not aligned because the cluster
# numbers assigned by K-Means bear no fixed correspondence to the generating clusters,
# but a good clustering should still show one dominant value per row and column (a
# permutation-invariant score is sketched after this cell).
confusion_matrix = np.zeros((4, 4))
for i in range(0, which.shape[0]):
    actual = which[i]
    predicted = ypred[i]
    confusion_matrix[actual, predicted] = confusion_matrix[actual, predicted] + 1
print confusion_matrix

# Plot points with cluster centers (marked with +)
fig, ax = plt.subplots()
ax.scatter(X[which == 0][:, 0], X[which == 0][:, 1], c='blue')
ax.scatter(X[which == 1][:, 0], X[which == 1][:, 1], c='green')
ax.scatter(X[which == 2][:, 0], X[which == 2][:, 1], c='red')
ax.scatter(X[which == 3][:, 0], X[which == 3][:, 1], c='cyan')
for cc in kmeans.cluster_centers_:
    ax.plot(cc[0], cc[1], marker='+', color='black', markersize=20)
[[  0.   0.  16.   0.]
 [ 28.   0.   0.   2.]
 [  0.   0.   1.  25.]
 [  0.  28.   0.   0.]]
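
Since the cluster numbers assigned by K-Means are arbitrary, a permutation-invariant score is an easier way to read the agreement than eyeballing the confusion matrix. A minimal sketch using the adjusted Rand index:

from sklearn.metrics import adjusted_rand_score
# 1.0 means the K-Means partition matches the generating clusters exactly,
# regardless of how the cluster labels happen to be numbered
print "Adjusted Rand index:", adjusted_rand_score(which, ypred)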

Hierarchical Clustering

We reuse the same data to demonstrate hierarchical clustering. R has many more options in this area than Scikit-Learn: the only algorithm available here is the Ward algorithm, one of the 7 agglomeration methods offered by R's hclust() function.

R also has a built-in plot() method which draws the dendrogram for the clustering. It is possible to draw a dendrogram from the clustering output in Python as well, as sketched below.
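
A minimal sketch using scipy.cluster.hierarchy, which performs the Ward linkage itself and draws the dendrogram (independent of the scikit-learn Ward object used in the next cell):

from scipy.cluster.hierarchy import linkage, dendrogram
# Ward linkage on the same 100x2 points, drawn as a dendrogram
Z = linkage(X, method='ward')
dendrogram(Z)
plt.show()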

In [14]:
# produce a connectivity matrix based on cosine similarity
norms = np.linalg.norm(X, ord=2, axis=1)
X_n = X / norms[:, None]
cosim = np.matrix(X_n) * np.matrix(X_n.T)
# binarize the similarities (1 where similarity > 0.5, else 0) and use the result
# as the sparse connectivity matrix
cosim_bin = Binarizer(threshold=0.5).fit_transform(cosim)
cosim_sparse = ss.csr_matrix(cosim_bin)

# run the clustering
ward = Ward(n_clusters=4, connectivity=cosim_sparse)
ypred = ward.fit_predict(X)

# compute the confusion matrix for hierarchical clustering
confusion_matrix = np.zeros((4, 4))
for i in range(0, which.shape[0]):
    actual = which[i]
    predicted = ypred[i]
    confusion_matrix[actual, predicted] = confusion_matrix[actual, predicted] + 1
confusion_matrix
Out[14]:
array([[ 16.,   0.,   0.,   0.],
       [  0.,   0.,  24.,   6.],
       [  1.,   0.,   0.,  25.],
       [  0.,  28.,   0.,   0.]])