#!/usr/bin/env python # coding: utf-8 # In[2]: get_ipython().run_line_magic('run', '../../../common_functions/import_all.py') from common_functions.setup_notebook import set_css_style, setup_matplotlib, config_ipython from sklearn.cluster import KMeans from sklearn.preprocessing import scale from common_functions.kmeans_helpers import (retrieve_cluster_points, compute_wcss, compute_gap_statistic, evaluate_gap_statistic_best_k) dataset = '../../datasets/oldfaithful.txt' config_ipython() setup_matplotlib() set_css_style() # # k-means: trying it! # ### Let's use the "old faithful" dataset # # We will use the Old Faithful classic dataset, a copy of which is available in this repo under folder `datasets/` for convenience. # In[2]: # Let's read the Old Faithful dataset into a Pandas Dataframe, choosing the columns data = pd.read_csv(dataset, delimiter=' ')[['eruptions', 'waiting']] data.head() # ### Scale the data so that each column is in the same scale # In[3]: scaled_data = scale(data) # ### First, see how it looks like # # It looks that 2 groups are naturally emerging from the data. # In[4]: data.plot.scatter('eruptions', 'waiting') plt.title('Scatter of the data in the "old faithful" dataset') plt.show(); # ### What does the rule of thumb say? # In[5]: thumb_k = np.sqrt(data.shape[0]/2) print(' * Rule of thumb asserts optimal k =', round(thumb_k, 2)) # ### Using the elbow method # # You'll see the elbow seems to appear quite soon. # In[6]: k_range = range(1, 21) # range of K (decide arbitrarily how many Ks to test) inertia_k = dict() # inertia for each k wcss_k = dict() # WCSS for each k p_k = dict() # percentage of variance explained for each k # Loop over the value of K for k in k_range: print('k = ', k) # Fit the model fit = KMeans(n_clusters=k).fit(scaled_data) # Retrieve centroids for fitted model # A centroid is the cluster center, given as the vector of coordinates over the n_features dimensions # So there will be K number of n_features-dimensional centroids centroids = fit.cluster_centers_ # Retrieve cluster labels for fitted model labels = fit.labels_ # Retrieve the inertia of the fitted model inertia_k[k] = fit.inertia_ # Compute the WCSS of the fitted model wcss_k[k] = compute_wcss(centroids, labels, scaled_data) plt.title('WCSS vs. K') plt.plot(k_range, list(wcss_k.values()), marker='o') plt.xlabel('K') plt.xticks(k_range) plt.ylabel('WCSS') plt.savefig('elbow.png', dpi=200) plt.show(); # ### Run a clustering # In[7]: k = 5 # choose K fit = KMeans(n_clusters=k).fit(scaled_data) centroids = fit.cluster_centers_ labels = fit.labels_ labels # ### Compute the gap statistic # # Note that this takes a bit to run. # In[8]: k_gap = {} for k in k_range: gap = compute_gap_statistic(scaled_data, k, scale_data=False) k_gap[k] = gap k_gapdiff = {} for k in range(min(k_range), max(k_range)-1): k_gapdiff[k] = k_gap[k][0] - k_gap[k+1][0] - k_gap[k + 1][2] 'The best k according to the gap statistic is: ', evaluate_gap_statistic_best_k(scaled_data) # In[9]: plt.plot(list(k_gap.keys()), [value[0] for value in k_gap.values()]) plt.title('Gap statistic') plt.ylabel('$G$') plt.xlabel('$K$') #plt.grid() plt.savefig('gap.png', dpi=200) plt.show(); # In[11]: plt.bar(list(k_gapdiff.keys()), k_gapdiff.values()) plt.title('Gap diff') plt.ylabel('$G_K - G_{K+1} + s_{K+1}$') plt.xlabel('$K$') plt.ylim(-0.8, 0.1) plt.savefig('gapdiff.png', dpi=200) plt.show(); # ### Scatter plot of two given features with the respective cluster colour and centroid # In[15]: # Choose feature indices from the header f1 = 1 f2 = 2 k = 4 # List of colours: if len is < k, add some colours = { 'red': 'r', 'black': 'k', 'blue': 'b', 'green': 'g', 'grey': '#929090', 'pink': '#FFB6C1', 'light_blue': '#00BFFF', 'light_green': '#29E191' } plt.title('Scatter plot of two features, k clusters', fontweight='bold', fontsize=16) plt.xlabel('eruptions') plt.ylabel('waiting') plt.grid() # Note: plot displays the scaled data for cluster_index in range(k): cluster_color = colours[list(colours)[cluster_index]] cluster_points = retrieve_cluster_points(cluster_index, labels, scaled_data) plt.scatter([point[f1-1] for point in cluster_points], [point[f2-1] for point in cluster_points], color=cluster_color) plt.scatter(centroids[cluster_index,f1-1], centroids[cluster_index, f2-1], marker='x', s=200, color=cluster_color) plt.savefig('cluster.png', dpi=200) plt.show();