- Let's import the libraries.

In [ ]:

```
from itertools import permutations
import numpy as np
import sklearn
import sklearn.decomposition as dec
import sklearn.cluster as clu
import sklearn.datasets as ds
import sklearn.model_selection as gs
import matplotlib.pyplot as plt
%matplotlib inline
```

- Let's generate a random dataset with three clusters.

In [ ]:

```
X, y = ds.make_blobs(n_samples=200, n_features=2, centers=3)
```

- We will need a couple of functions to relabel and display the results of the clustering algorithms.

In [ ]:

```
def relabel(cl):
    """Relabel a clustering with three clusters
    to match the original classes."""
    if np.max(cl) != 2:
        return cl
    perms = np.array(list(permutations((0, 1, 2))))
    i = np.argmin([np.sum(np.abs(perm[cl] - y))
                   for perm in perms])
    p = perms[i]
    return p[cl]
```
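To see how the permutation matching works, here is a standalone toy example (with its own `y_true` and `cl` arrays, not the recipe's data) that finds the label permutation best matching the ground truth; it counts mismatches rather than summing absolute differences, a slight variant of the function above:

```python
import numpy as np
from itertools import permutations

# Toy ground truth, and a clustering whose labels are permuted
# (cluster 0 <-> class 1, cluster 1 <-> class 0, cluster 2 unchanged).
y_true = np.array([0, 0, 1, 1, 2, 2])
cl = np.array([1, 1, 0, 0, 2, 2])

perms = np.array(list(permutations((0, 1, 2))))
# For each permutation of the labels, count how many points
# disagree with the ground truth, and keep the best one.
i = np.argmin([np.sum(perm[cl] != y_true) for perm in perms])
print(perms[i][cl])  # [0 0 1 1 2 2], identical to y_true
```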

In [ ]:

```
def display_clustering(labels, title):
    """Plot the data points with the cluster colors."""
    # We relabel the classes when there are 3 clusters.
    labels = relabel(labels)
    plt.figure(figsize=(8,3));
    # Display the points with the true labels on the left,
    # and with the clustering labels on the right.
    for i, (c, title) in enumerate(zip(
            [y, labels], ["True labels", title])):
        plt.subplot(121 + i);
        plt.scatter(X[:,0], X[:,1], c=c, s=30,
                    linewidths=0, cmap=plt.cm.rainbow);
        plt.xticks([]); plt.yticks([]);
        plt.title(title);
```

- Now, we cluster the dataset with the **K-means** algorithm, a classic and simple clustering algorithm.

In [ ]:

```
km = clu.KMeans()
km.fit(X);
display_clustering(km.labels_, "KMeans")
```

- This algorithm requires the number of clusters at initialization time. In general, however, we do not necessarily know the number of clusters in the dataset. Here, let's try with `n_clusters=3` (that's cheating, because we happen to know that there are 3 clusters!).

In [ ]:

```
km = clu.KMeans(n_clusters=3)
km.fit(X);
display_clustering(km.labels_, "KMeans(3)")
```
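When the number of clusters is genuinely unknown, one common heuristic (not part of this recipe) is to fit K-means for several candidate values of `n_clusters` and keep the one maximizing the silhouette score from `sklearn.metrics`. A minimal, self-contained sketch on a fresh blobs dataset:

```python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# A standalone dataset (random_state fixed for reproducibility).
X, _ = make_blobs(n_samples=200, n_features=2, centers=3,
                  random_state=0)

# Fit K-means for each candidate k and record the silhouette score
# (higher means better-separated, more cohesive clusters).
scores = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10,
                    random_state=0).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

best_k = max(scores, key=scores.get)
print(best_k)
```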

In [ ]:

```
plt.figure(figsize=(8,5));
plt.subplot(231);
plt.scatter(X[:,0], X[:,1], c=y, s=30,
            linewidths=0, cmap=plt.cm.rainbow);
plt.xticks([]); plt.yticks([]);
plt.title("True labels");
for i, est in enumerate([
        clu.SpectralClustering(3),
        clu.AgglomerativeClustering(3),
        clu.MeanShift(),
        clu.AffinityPropagation(),
        clu.DBSCAN(),
    ]):
    est.fit(X);
    c = relabel(est.labels_)
    plt.subplot(232 + i);
    plt.scatter(X[:,0], X[:,1], c=c, s=30,
                linewidths=0, cmap=plt.cm.rainbow);
    plt.xticks([]); plt.yticks([]);
    plt.title(est.__class__.__name__);
```
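Beyond visual inspection, these clusterings can also be compared quantitatively against the true labels. The adjusted Rand index from `sklearn.metrics` is invariant to label permutations, so no relabeling step is needed. A minimal, self-contained sketch comparing two of the estimators above:

```python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import adjusted_rand_score

# A standalone dataset (random_state fixed for reproducibility).
X, y = make_blobs(n_samples=200, n_features=2, centers=3,
                  random_state=0)

# Score each clustering against the true labels:
# 1.0 means perfect agreement, values near 0 mean random labeling.
scores = {}
for est in [KMeans(n_clusters=3, n_init=10, random_state=0),
            DBSCAN()]:
    labels = est.fit_predict(X)
    scores[est.__class__.__name__] = adjusted_rand_score(y, labels)

print(scores)
```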

- Unlike K-means, several of these algorithms (MeanShift, AffinityPropagation, DBSCAN) do not require the number of clusters to be specified in advance; however, they may find more clusters than there actually are (*overclustering*).

You'll find all the explanations, figures, references, and much more in the book (to be released later this summer).

IPython Cookbook, by Cyrille Rossant, Packt Publishing, 2014 (500 pages).