For each iteration, display the classifier label for each point, the region (convex hull) for each label, and where the next centroid will be.
import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from scipy.spatial import ConvexHull
import seaborn as sns
%matplotlib inline
sns.set_style("ticks")
def kmeans_iter_graph (point_list=[], *arg, k, out_file):
# combine the lists
points = np.concatenate([p for p in point_list])
colors = ['b', 'g', 'r', 'y', 'm', 'k', 'c']
# select k random points
centroids = np.random.uniform(np.min(points[:,0]),np.max(points[:,1]),(k,2))
plt.figure (figsize=(8,8))
plt.plot(points[:,0], points[:,1], 'bo')
plt.plot(centroids[:,0],centroids[:,1],'r*',markersize=20)
plt.title('Initial Random Centroids')
plt.savefig(out_file+'000.png',dpi=96)
old_centroids = []
counter = 0
while not np.array_equal(centroids,old_centroids):
# for each point in points, get the distance from each centroid and find
# the nearest centroid.
counter+=1
old_centroids = centroids
distances = [[linalg.norm(point-centroids[j]) for j in range(k)] for point in points]
labels = np.asarray([np.argmin(distances[i]) for i in range(len(points))])
# now get the new centroids from each label group
centroids = []
plt.figure(figsize=(8,8))
plt.title('Iteration #'+str(counter))
for i in range(k):
current_points = points[np.where(labels==i)]
hull=ConvexHull(current_points)
centroids.append(np.mean(current_points,axis=0))
plt.plot(current_points[:,0],current_points[:,1], '%so'%colors[i],alpha=0.2)
plt.plot(centroids[i][0],centroids[i][1], '%s*'%colors[i], markersize=20)
plt.plot(old_centroids[i][0],old_centroids[i][1], '%s*'%colors[i], markersize=20, alpha=0.3)
plt.plot([centroids[i][0],old_centroids[i][0]],[centroids[i][1],old_centroids[i][1]], '%s-'%colors[i], alpha=1.0)
for simplex in hull.simplices:
plt.plot(current_points[simplex,0],current_points[simplex,1], '%s--'%colors[i],alpha=0.3)
plt.title('Iteration ' + str(counter))
plt.savefig(out_file+'%03d'%counter+'.png',dpi=96)
print('Final Centroid Coordinates:')
print(centroids)
Try it with 3 randomly generated regions centered around (0,0), (5,5) and (10,10)
points1 = np.random.standard_normal((100,2))
points2 = np.random.normal(10,2,(100,2))
points3 = np.random.normal(5,2,(100,2))
point_list = [points1,points2,points3]
kmeans_iter_graph(point_list,k=3,out_file='example1_')
Final Centroid Coordinates: [array([ 0.24796396, 0.11090377]), array([ 5.03920599, 5.86426901]), array([ 10.12017354, 10.14535921])]
Four regions around (5,5), (10,10), (15,15), and (20,20) with a bunch of extra noise centered on (12.5,12.5). Let's see if KMeans converges towards the four regions.
p1 = np.random.normal(5,2,(100,2))
p2 = np.random.normal(10,2,(100,2))
p3 = np.random.normal(15,2,(100,2))
p4 = np.random.normal(20,2,(100,2))
p5 = np.random.normal(12.5,5,(50,2))
pl2 = [p1,p2,p3,p4,p5]
kmeans_iter_graph(pl2,k=4,out_file='example2_')
Final Centroid Coordinates: [array([ 5.20803973, 5.00411462]), array([ 20.04790979, 19.79497467]), array([ 14.75365572, 14.6691234 ]), array([ 9.51401797, 9.70890713])]
Now for centroids not on the 45 degree line...
pp1 = np.concatenate((np.random.normal(5,2,(100,1)),np.random.normal(8,2,(100,1))),axis=1)
pp2 = np.concatenate((np.random.normal(7,2,(100,1)),np.random.normal(15,2,(100,1))),axis=1)
pp3 = np.concatenate((np.random.normal(15,2,(100,1)),np.random.normal(5,2,(100,1))),axis=1)
pp4 = np.concatenate((np.random.normal(14,2,(100,1)),np.random.normal(13,2,(100,1))),axis=1)
pp5 = np.random.normal(10,5,(250,2))
pl3 = [pp1,pp2,pp3,pp4,pp5]
kmeans_iter_graph(pl3,k=4,out_file='example3_')
Final Centroid Coordinates: [array([ 7.25428578, 14.85833453]), array([ 5.32840506, 7.64823214]), array([ 14.67325263, 4.67971284]), array([ 14.35333394, 13.23570452])]