#!/usr/bin/env python
# coding: utf-8

# #### In this notebook we will use two data sets, the Boston Housing data and the Iris Plants data, to illustrate the use of the KMeans clustering technique. We will use both the KMeans clustering module from scikit-learn as well as a modified version of the KMeans implementation from the Machine Learning in Action book.

# In[1]:

import numpy as np
import pylab as pl
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import load_boston  # note: load_boston is only available in scikit-learn versions prior to 1.2

boston = load_boston()


# ### I. Clustering with Boston Housing Data

# In[2]:

np.set_printoptions(suppress=True, precision=2, linewidth=120)


# In[3]:

print(boston.feature_names)


# In[4]:

print(boston.data[:5])


# In[5]:

data = pd.DataFrame(boston.data, columns=boston.feature_names)
data.head(10)


# #### Now we use the KMeans algorithm from scikit-learn to perform the clustering.

# In[6]:

kmeans = KMeans(n_clusters=5, max_iter=500, verbose=1)  # initialization


# In[7]:

kmeans.fit(data)


# In[8]:

clusters = kmeans.predict(data)


# In[9]:

pd.DataFrame(clusters, columns=["Cluster"])


# #### The centroids provide an aggregate representation and a characterization of each cluster.

# In[10]:

pd.options.display.float_format = '{:,.2f}'.format
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=boston.feature_names)
centroids


# In[11]:

def cluster_sizes(clusters):
    # clusters is an array of cluster labels for each instance in the data
    size = {}
    cluster_labels = np.unique(clusters)
    n_clusters = cluster_labels.shape[0]

    for c in cluster_labels:
        size[c] = len(data[clusters == c])

    return size


# In[12]:

size = cluster_sizes(clusters)

for c in size.keys():
    print("Size of Cluster", c, "= ", size[c])


# #### One way to measure the quality of a clustering is to compute the silhouette value for each instance in the data. The silhouette value measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation). For each point it is the difference between the mean distance to the nearest other cluster (b) and the mean distance to the point's own cluster (a), divided by the larger of the two: s = (b - a) / max(a, b). The silhouette ranges from -1 to +1, where a high value indicates that the object is well matched to its own cluster and well separated from other clusters. If most objects have a high value, the clustering configuration is appropriate; if many points have a low or negative value, the configuration may have too many or too few clusters.
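# #### As a quick, made-up illustration of the formula (not part of the Boston data), the following cell computes the silhouette value of one point by hand and compares it with scikit-learn's silhouette_samples; the tiny array toy_X and its labels toy_labels exist only for this example.

# In[ ]:

from sklearn.metrics import silhouette_samples

toy_X = np.array([[0.0, 0.0], [0.0, 1.0], [4.0, 0.0], [4.0, 1.0]])  # illustrative points only
toy_labels = np.array([0, 0, 1, 1])

point = 0  # silhouette of the first point, which belongs to cluster 0

# a = mean distance to the other members of the point's own cluster
a = np.mean([np.linalg.norm(toy_X[point] - toy_X[i])
             for i in range(len(toy_X)) if toy_labels[i] == 0 and i != point])

# b = mean distance to the members of the nearest other cluster
b = np.mean([np.linalg.norm(toy_X[point] - toy_X[i])
             for i in range(len(toy_X)) if toy_labels[i] == 1])

print("manual silhouette :", (b - a) / max(a, b))
print("sklearn silhouette:", silhouette_samples(toy_X, toy_labels)[point])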
# In[13]:

from sklearn import metrics


# In[14]:

silhouettes = metrics.silhouette_samples(data, clusters)
print(silhouettes[:20])


# In[15]:

print(silhouettes.mean())


# In[16]:

def plot_silhouettes(data, clusters, metric='euclidean'):
    from matplotlib import cm
    from sklearn.metrics import silhouette_samples

    cluster_labels = np.unique(clusters)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(data, clusters, metric=metric)
    c_ax_lower, c_ax_upper = 0, 0
    cticks = []
    for i, k in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[clusters == k]
        c_silhouette_vals.sort()
        c_ax_upper += len(c_silhouette_vals)
        color = cm.jet(float(i) / n_clusters)
        pl.barh(range(c_ax_lower, c_ax_upper), c_silhouette_vals, height=1.0,
                edgecolor='none', color=color)
        cticks.append((c_ax_lower + c_ax_upper) / 2)
        c_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    pl.axvline(silhouette_avg, color="red", linestyle="--")
    pl.yticks(cticks, cluster_labels)
    pl.ylabel('Cluster')
    pl.xlabel('Silhouette coefficient')
    pl.tight_layout()
    #pl.savefig('images/11_04.png', dpi=300)
    pl.show()
    return


# In[17]:

plot_silhouettes(data, clusters)


# ### II. Clustering with the Iris Plants Database

# In[18]:

from sklearn.datasets import load_iris
iris = load_iris()


# In[19]:

print(iris.DESCR)


# In[20]:

data = iris.data
target = iris.target


# In[21]:

print(iris.feature_names)


# In[22]:

irisDF = pd.DataFrame(data, columns=iris.feature_names)
irisDF.head(10)


# In[23]:

print(iris.target)


# #### This snippet plots the first dimension against the third (sepal length vs. petal length); the result is shown in the following figure.

# #### Classes: 0 = Iris-Setosa, 1 = Iris-Versicolour, 2 = Iris-Virginica

# In[24]:

pl.plot(data[target==0,0], data[target==0,2], 'bo')
pl.plot(data[target==1,0], data[target==1,2], 'ro')
pl.plot(data[target==2,0], data[target==2,2], 'go')
pl.legend(('Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'), loc=4)
pl.show()


# #### In the graph we have 150 points, and their color represents the class: the blue points are the samples of the species setosa, the red ones versicolor, and the green ones virginica. Next, let's see whether clustering can recover the correct classes.

# In[25]:

iris_kmeans = KMeans(n_clusters=3, max_iter=500, verbose=1, n_init=5)  # initialization
iris_kmeans.fit(irisDF)


# In[26]:

c = iris_kmeans.predict(data)


# In[27]:

c.shape


# In[28]:

size = cluster_sizes(c)

for i in size.keys():
    print("Size of Cluster", i, "= ", size[i])


# In[29]:

iris_centroids = pd.DataFrame(iris_kmeans.cluster_centers_, columns=iris.feature_names)
iris_centroids


# #### Since we know the actual classes (in the target attribute), we can evaluate the clustering by using metrics that compare the discovered cluster labels to the actual classes:

# In[30]:

print(c)


# In[31]:

print(target)


# #### Homogeneity: each cluster contains only members of a single class. Completeness: all members of a given class are assigned to the same cluster.

# In[32]:

from sklearn.metrics import completeness_score, homogeneity_score


# In[33]:

print(completeness_score(target, c))


# In[34]:

print(homogeneity_score(target, c))


# #### The completeness score approaches 1 when most data points that are members of a given class end up in the same cluster, while the homogeneity score approaches 1 when each cluster contains almost only data points that are members of a single class.
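# #### To make the distinction concrete, here is a small made-up example (unrelated to the Iris data): splitting one true class across two clusters keeps homogeneity perfect but lowers completeness, while merging two classes into a single cluster does the opposite.

# In[ ]:

from sklearn.metrics import completeness_score, homogeneity_score

true_labels = [0, 0, 0, 0, 1, 1, 1, 1]  # illustrative class labels only

# class 0 is split across clusters 0 and 1: every cluster is pure (homogeneous),
# but class 0 is no longer contained in a single cluster (incomplete)
split_clusters = [0, 0, 1, 1, 2, 2, 2, 2]
print("split : homogeneity =", homogeneity_score(true_labels, split_clusters),
      " completeness =", completeness_score(true_labels, split_clusters))

# both classes are merged into one cluster: each class stays together (complete),
# but the single cluster mixes the two classes (not homogeneous)
merged_clusters = [0, 0, 0, 0, 0, 0, 0, 0]
print("merged: homogeneity =", homogeneity_score(true_labels, merged_clusters),
      " completeness =", completeness_score(true_labels, merged_clusters))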
# #### Let's again plot sepal length against petal length, but this time we'll use our cluster labels instead of the actual class labels from the target attribute.

# In[35]:

pl.plot(data[c==0,0], data[c==0,2], 'ro')
pl.plot(data[c==1,0], data[c==1,2], 'bo')
pl.plot(data[c==2,0], data[c==2,2], 'go')
pl.show()


# #### Let's also do some silhouette analysis on the Iris clusters:

# In[36]:

from sklearn.metrics import silhouette_samples

iris_silhouettes = metrics.silhouette_samples(iris.data, c)
print(iris_silhouettes[:20])
print("\n Mean Silhouette Value: ", iris_silhouettes.mean())


# In[37]:

plot_silhouettes(data, c)


# #### Let's now use the kMeans clustering implementation from Machine Learning in Action, Ch. 10:

# In[38]:

import kMeans

data = np.array(data)


# #### Note: the MLA kMeans module provides only a Euclidean distance function, "distEuclid", which is passed to the kMeans function. For this example, we have added another distance function, based on the cosine similarity measure, to the kMeans module, and that function is used below (a hypothetical sketch of such a function appears at the end of this notebook).

# In[39]:

centroids, clusters = kMeans.kMeans(data, 3, kMeans.distCosine, kMeans.randCent)


# In[40]:

pd.DataFrame(centroids, columns=iris.feature_names)


# In[41]:

print(clusters[:10,:])


# In[42]:

iris_clusters = pd.DataFrame(clusters, columns=["Cluster", "MinDistance**2"])
iris_clusters.head(10)


# In[43]:

newC = iris_clusters["Cluster"].astype(int)
print(newC)


# In[44]:

print(completeness_score(target, newC))


# In[45]:

print(homogeneity_score(target, newC))


# #### Let's now try the bisecting kMeans algorithm, also provided in the MLA kMeans module.

# In[46]:

centroids_bk, clusters_bk = kMeans.biKmeans(data, 3, kMeans.distEuclid)


# In[47]:

print(centroids_bk)


# In[48]:

bkC = clusters_bk.T[0]
bkC = bkC.astype(int)


# In[49]:

print(bkC)


# In[50]:

bkC = np.ravel(bkC)
print(bkC)


# In[51]:

print(completeness_score(target, bkC))


# In[52]:

print(homogeneity_score(target, bkC))
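# #### The modified kMeans module itself is not included with this notebook, so the following is only an assumption about its interface: a distance function that, like distEuclid, takes two 1-D feature vectors and returns a scalar distance. A cosine-similarity-based distance along the lines of distCosine could look roughly like this sketch.

# In[ ]:

def cosine_distance(vecA, vecB):
    # illustrative sketch only; the actual distCosine in the modified kMeans module may differ
    vecA = np.asarray(vecA).ravel()
    vecB = np.asarray(vecB).ravel()
    sim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
    return 1.0 - sim  # identical direction -> distance 0; orthogonal vectors -> distance 1

print(cosine_distance(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # prints 1.0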