#!/usr/bin/env python
# coding: utf-8

# In[2]:


get_ipython().run_line_magic('run', '../../../common_functions/import_all.py')

from common_functions.setup_notebook import set_css_style, setup_matplotlib, config_ipython

from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

from common_functions.kmeans_helpers import (retrieve_cluster_points, 
                                             compute_wcss, 
                                             compute_gap_statistic, 
                                             evaluate_gap_statistic_best_k)

dataset = '../../datasets/oldfaithful.txt'

config_ipython()
setup_matplotlib()
set_css_style()


# # k-means: trying it!

# ### Let's use the "old faithful" dataset
# 
# We will use the Old Faithful classic dataset, a copy of which is available in this repo under folder `datasets/` for convenience. 

# In[2]:


# Let's read the Old Faithful dataset into a Pandas Dataframe, choosing the columns

data = pd.read_csv(dataset, delimiter=' ')[['eruptions', 'waiting']]

data.head()


# ### Scale the data so that each column is in the same scale

# In[3]:


scaled_data = scale(data)


# ### First, see how it looks like
# 
# It looks that 2 groups are naturally emerging from the data.

# In[4]:


data.plot.scatter('eruptions', 'waiting')
plt.title('Scatter of the data in the "old faithful" dataset')
plt.show();


# ### What does the rule of thumb say?

# In[5]:


thumb_k = np.sqrt(data.shape[0]/2)
print(' * Rule of thumb asserts optimal k =', round(thumb_k, 2))


# ### Using the elbow method
# 
# You'll see the elbow seems to appear quite soon.

# In[6]:


k_range = range(1, 21)            # range of K (decide arbitrarily how many Ks to test)
inertia_k = dict()                # inertia for each k
wcss_k = dict()                   # WCSS for each k
p_k = dict()                      # percentage of variance explained for each k

# Loop over the value of K
for k in k_range:
    print('k = ', k)
    
    # Fit the model
    fit = KMeans(n_clusters=k).fit(scaled_data)

    # Retrieve centroids for fitted model
    # A centroid is the cluster center, given as the vector of coordinates over the n_features dimensions
    # So there will be K number of n_features-dimensional centroids
    centroids = fit.cluster_centers_
    
    # Retrieve cluster labels for fitted model
    labels = fit.labels_

    # Retrieve the inertia of the fitted model
    inertia_k[k] = fit.inertia_
    
    # Compute the WCSS of the fitted model
    wcss_k[k] = compute_wcss(centroids, labels, scaled_data)
    
plt.title('WCSS vs. K')
plt.plot(k_range, list(wcss_k.values()), marker='o')
plt.xlabel('K')
plt.xticks(k_range)
plt.ylabel('WCSS')
plt.savefig('elbow.png', dpi=200)
plt.show();


# ### Run a clustering

# In[7]:


k = 5                          # choose K

fit = KMeans(n_clusters=k).fit(scaled_data)
centroids = fit.cluster_centers_
labels = fit.labels_

labels


# ### Compute the gap statistic
# 
# Note that this takes a bit to run.

# In[8]:


k_gap = {}
for k in k_range:
    gap = compute_gap_statistic(scaled_data, k, scale_data=False)
    k_gap[k] = gap 
    
k_gapdiff = {}
for k in range(min(k_range), max(k_range)-1):
    k_gapdiff[k] = k_gap[k][0] - k_gap[k+1][0] - k_gap[k + 1][2]
        
'The best k according to the gap statistic is: ', evaluate_gap_statistic_best_k(scaled_data)


# In[9]:


plt.plot(list(k_gap.keys()), [value[0] for value in k_gap.values()])
plt.title('Gap statistic')
plt.ylabel('$G$')
plt.xlabel('$K$')
#plt.grid()
plt.savefig('gap.png', dpi=200)
plt.show();


# In[11]:


plt.bar(list(k_gapdiff.keys()), k_gapdiff.values())
plt.title('Gap diff')
plt.ylabel('$G_K - G_{K+1} + s_{K+1}$')
plt.xlabel('$K$')
plt.ylim(-0.8, 0.1)
plt.savefig('gapdiff.png', dpi=200)
plt.show();


# ### Scatter plot of two given features with the respective cluster colour and centroid

# In[15]:


# Choose feature indices from the header
f1 = 1
f2 = 2

k = 4

# List of colours: if len is < k, add some
colours = {
    'red': 'r',
    'black': 'k',
    'blue': 'b',
    'green': 'g',
    'grey': '#929090',
    'pink': '#FFB6C1',
    'light_blue': '#00BFFF',
    'light_green': '#29E191'
}

plt.title('Scatter plot of two features, k clusters', fontweight='bold', fontsize=16)
plt.xlabel('eruptions')
plt.ylabel('waiting')
plt.grid()

# Note: plot displays the scaled data
for cluster_index in range(k):
    cluster_color = colours[list(colours)[cluster_index]]
    cluster_points = retrieve_cluster_points(cluster_index, labels, scaled_data)
    plt.scatter([point[f1-1] for point in cluster_points], 
                [point[f2-1] for point in cluster_points], color=cluster_color)
    plt.scatter(centroids[cluster_index,f1-1], centroids[cluster_index, f2-1], marker='x', s=200, color=cluster_color)

plt.savefig('cluster.png', dpi=200)
plt.show();