K Means Iterations¶

For each iteration, display the classifier label for each point, the region (convex hull) for each label, and where the next centroid will be.

In [1]:

import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from scipy.spatial import ConvexHull
import seaborn as sns
%matplotlib inline

sns.set_style("ticks")

def kmeans_iter_graph (point_list=[], *arg, k, out_file):
    # combine the lists
    points = np.concatenate([p for p in point_list])
    colors = ['b', 'g', 'r', 'y', 'm', 'k', 'c']
    
    # select k random points
    centroids = np.random.uniform(np.min(points[:,0]),np.max(points[:,1]),(k,2))
    
    plt.figure (figsize=(8,8))
    plt.plot(points[:,0], points[:,1], 'bo')
    plt.plot(centroids[:,0],centroids[:,1],'r*',markersize=20)
    plt.title('Initial Random Centroids')
    plt.savefig(out_file+'000.png',dpi=96)

    old_centroids = []
    counter = 0
    
    while not np.array_equal(centroids,old_centroids):
        # for each point in points, get the distance from each centroid and find
        # the nearest centroid.
        counter+=1
        old_centroids = centroids

        distances = [[linalg.norm(point-centroids[j]) for j in range(k)] for point in points]
        labels = np.asarray([np.argmin(distances[i]) for i in range(len(points))])

        # now get the new centroids from each label group
        centroids = []
        
        plt.figure(figsize=(8,8))
        plt.title('Iteration #'+str(counter))
        for i in range(k):
            current_points = points[np.where(labels==i)]
            hull=ConvexHull(current_points)
            centroids.append(np.mean(current_points,axis=0))
            plt.plot(current_points[:,0],current_points[:,1], '%so'%colors[i],alpha=0.2)
            plt.plot(centroids[i][0],centroids[i][1], '%s*'%colors[i], markersize=20)
            plt.plot(old_centroids[i][0],old_centroids[i][1], '%s*'%colors[i], markersize=20, alpha=0.3)
            plt.plot([centroids[i][0],old_centroids[i][0]],[centroids[i][1],old_centroids[i][1]], '%s-'%colors[i], alpha=1.0)
            for simplex in hull.simplices:
                plt.plot(current_points[simplex,0],current_points[simplex,1], '%s--'%colors[i],alpha=0.3)
            plt.title('Iteration ' + str(counter))
        plt.savefig(out_file+'%03d'%counter+'.png',dpi=96)

    print('Final Centroid Coordinates:')
    print(centroids)

Try it with 3 randomly generated regions centered around (0,0), (5,5) and (10,10)

In [2]:

points1 = np.random.standard_normal((100,2))
points2 = np.random.normal(10,2,(100,2))
points3 = np.random.normal(5,2,(100,2))
point_list = [points1,points2,points3]

In [3]:

kmeans_iter_graph(point_list,k=3,out_file='example1_')

Final Centroid Coordinates:
[array([ 0.24796396,  0.11090377]), array([ 5.03920599,  5.86426901]), array([ 10.12017354,  10.14535921])]

Four regions around (5,5), (10,10), (15,15), and (20,20) with a bunch of extra noise centered on (12.5,12.5). Let's see if KMeans converges towards the four regions.

In [4]:

p1 = np.random.normal(5,2,(100,2))
p2 = np.random.normal(10,2,(100,2))
p3 = np.random.normal(15,2,(100,2))
p4 = np.random.normal(20,2,(100,2))
p5 = np.random.normal(12.5,5,(50,2))
pl2 = [p1,p2,p3,p4,p5]

In [7]:

kmeans_iter_graph(pl2,k=4,out_file='example2_')

Final Centroid Coordinates:
[array([ 5.20803973,  5.00411462]), array([ 20.04790979,  19.79497467]), array([ 14.75365572,  14.6691234 ]), array([ 9.51401797,  9.70890713])]

Now for centroids not on the 45 degree line...

In [8]:

pp1 = np.concatenate((np.random.normal(5,2,(100,1)),np.random.normal(8,2,(100,1))),axis=1)
pp2 = np.concatenate((np.random.normal(7,2,(100,1)),np.random.normal(15,2,(100,1))),axis=1)
pp3 = np.concatenate((np.random.normal(15,2,(100,1)),np.random.normal(5,2,(100,1))),axis=1)
pp4 = np.concatenate((np.random.normal(14,2,(100,1)),np.random.normal(13,2,(100,1))),axis=1)
pp5 = np.random.normal(10,5,(250,2))
pl3 = [pp1,pp2,pp3,pp4,pp5]

In [9]:

kmeans_iter_graph(pl3,k=4,out_file='example3_')

Final Centroid Coordinates:
[array([  7.25428578,  14.85833453]), array([ 5.32840506,  7.64823214]), array([ 14.67325263,   4.67971284]), array([ 14.35333394,  13.23570452])]