from pylab import * import random as pyrandom from scipy.spatial.distance import cdist data = r_[10*randn(1000,2)+array([70,30]), 10*randn(1000,2)+array([10,10]), 10*randn(1000,2)+array([50,80])] data = data[pyrandom.sample(xrange(len(data)),len(data))] figsize(10,10) plot(data[:,0],data[:,1],'b+') # starting guess protos = array([[30,30],[40,20],[0,90],[50,50]]) start = protos.copy() # plotting the starting guess figsize(10,10) plot(data[:,0],data[:,1],'b+') plot(protos[:,0],protos[:,1],'ro',markersize=10) # k-means update figsize(10,10) dists = cdist(protos,data) closest = argmin(dists,axis=0) for i in range(len(protos)): plot(data[closest==i,0],data[closest==i,1],['c+','g+','b+','y+'][i]) plot(protos[:,0],protos[:,1],'ro',markersize=10) history = [protos.copy()] for i in range(len(protos)): protos[i,:] = average(data[closest==i],axis=0) history.append(protos.copy()) figsize(10,10) plot(data[:,0],data[:,1],'b+') harray = array(history) for i in range(len(protos)): plot(harray[:,i,0],harray[:,i,1],'r') plot(harray[0,:,0],harray[0,:,1],'ko',markersize=10) plot(protos[:,0],protos[:,1],'ro',markersize=10) for round in range(1000): if round%100==0: sys.stderr.write("%d "%round) dists = cdist(protos,data) closest = argmin(dists,axis=0) for i in range(len(protos)): protos[i,:] = average(data[closest==i],axis=0) history.append(protos.copy()) figsize(10,10) plot(data[:,0],data[:,1],'b+') history = array(history) for i in range(len(protos)): plot(history[:,i,0],history[:,i,1],'r') plot(history[0,:,0],history[0,:,1],'ko',markersize=10) plot(protos[:,0],protos[:,1],'ro',markersize=10) figsize(10,10) dists = cdist(protos,data) closest = argmin(dists,axis=0) for i in range(len(protos)): plot(data[closest==i,0],data[closest==i,1],['c+','g+','b+','y+'][i]) plot(protos[:,0],protos[:,1],'ro',markersize=10)