Generate a set of data from three spherical gaussian distributions, set the spacing so the clouds mix a little. How might we identify the three clusters?
%pylab
%matplotlib inline
import random
def make_data(n_points, n_clusters=2, dim=2, sigma=1):
x = [[] for i in range(dim)]
for i in range(n_clusters):
for d in range(dim):
x[d].extend([random.gauss(i*3,sigma) for j in range(n_points)])
return x
#[x,y] = make_data(100, 3)
scatter(*make_data(75, 3))
Using matplotlib backend: TkAgg Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['random'] `%matplotlib` prevents importing * from pylab and numpy
<matplotlib.collections.PathCollection at 0x7f2601213b90>
def centroid(x):
return [[sum(col)/float(len(x[0]))] for col in x]
ex1 = make_data(100, 1)
#print ex1
scatter(*ex1, color="green")
print centroid(ex1)
scatter(*centroid(ex1), color="red", marker="x")
[[0.044536365610677767], [0.19467733699277656]]
<matplotlib.collections.PathCollection at 0x7f26014bfc10>
co = ["red", "orange", "yellow", "green", "blue", "black","brown"]
def guess_clusters(x, n_clusters):
# req co list of identifiers
for i in range(len(x[0])):
return [ co[random.choice(range(n_clusters))] for i in range(len(x[0]))]
ex2 = make_data(50, 2)
scatter(*ex2)
mem_ex2 = guess_clusters(ex2,2)
#print mem_ex2
scatter(*ex2, color=mem_ex2)
<matplotlib.collections.PathCollection at 0x7f26011e7350>
def select_members(x, membership, cluster):
return [ [i for i,label in zip(dim, membership) if label == cluster] for dim in x ]
print mem_ex2.count("red")
scatter(*select_members(ex2, mem_ex2, "red"), color="red")
scatter(*centroid(select_members(ex2, mem_ex2, "red")), color="red", marker="x")
58
<matplotlib.collections.PathCollection at 0x7f25fa180c50>
print mem_ex2.count("orange")
scatter(*select_members(ex2, mem_ex2, "orange"), color="orange")
scatter(*centroid(select_members(ex2, mem_ex2, "orange")), color="orange", marker="x")
42
<matplotlib.collections.PathCollection at 0x7f25fa0bc210>
import math
def distance(p1, p2):
# odd... vectors are lists of lists with only 1 element in each dim
return math.sqrt(sum([(i[0]-j[0])**2 for i,j in zip(p1, p2)]))
print distance([[-1],[-1]],[[2],[3]])
5.0
import sys
def reassign(x, centriods):
membership = []
for idx in range(len(x[0])):
min_d = sys.maxint
cluster = ""
for c, vc in centriods.items():
dist = distance(vc, [[t[idx]] for t in x])
if dist < min_d:
min_d = dist
cluster = c
membership.append(cluster)
return membership
cent_ex2 = {i:centroid(select_members(ex2, mem_ex2, i)) for i in co[:2]}
mem_ex2 = reassign(ex2, cent_ex2)
scatter(*ex2, color=mem_ex2)
scatter(*cent_ex2["red"], color="red", marker="x")
scatter(*cent_ex2["orange"], color="orange", marker="x")
<matplotlib.collections.PathCollection at 0x7f25f9ff2c90>
# function
def get_centroids(x, membership):
return {i:centroid(select_members(x, membership, i)) for i in set(membership)}
# redifine with total distance measure
def reassign(x, centroids):
membership, scores = [], {}
# step through all the vectors
for idx in range(len(x[0])):
min_d, cluster = sys.maxint, None
for c, vc in centroids.items():
dist = distance(vc, [[t[idx]] for t in x])
if dist < min_d:
min_d = dist
cluster = c
scores[cluster] = min_d + scores.get(cluster, 0)
membership.append(cluster)
return membership, sum(scores.values())/float(len(x[0]))
def k_means(data, k):
# start with random distribution
membership = guess_clusters(data, k)
score, last_score = 0.0, sys.maxint
while abs(last_score - score) > 1e-7:
last_score = score
c = get_centroids(data, membership)
membership, score = reassign(data, c)
#print last_score - score
return membership, c, score
ex3 = make_data(100, 7, sigma=1)
mem_ex3, cl_ex3, s_ex3 = k_means(ex3, 7)
scatter(*ex3, color = mem_ex3)
for i, pt in cl_ex3.items():
scatter(*pt, color=i, marker="x")
ex4 = make_data(200, 4)
err = []
trial_ks = range(1,8)
for k in trial_ks:
mem_ex4, cl_ex4, s_ex4 = k_means(ex4, k)
err.append(s_ex4)
scatter(*ex4, color = mem_ex4)
<matplotlib.collections.PathCollection at 0x7f25f9ed6a50>
plot(trial_ks, err)
[<matplotlib.lines.Line2D at 0x7f25f9ec8e90>]