In [1]:
import pandas, numpy
from matplotlib import pyplot
import scipy.stats
from ipywidgets import interact, interactive, fixed, interact_manual, Button
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs
In [2]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names; prefer the new name when the
# installed Matplotlib offers it and fall back to the legacy one otherwise.
pyplot.style.use(
    "seaborn-v0_8" if "seaborn-v0_8" in pyplot.style.available else "seaborn"
)
In [3]:
def length_euclid(x):
    """Return the Euclidean (l2) length of a vector.

    Parameters
    ----------
    x : array_like
        Vector components. Plain lists and tuples are accepted as well as
        numpy arrays.

    Returns
    -------
    numpy scalar
        Square root of the sum of the squared component magnitudes.
    """
    # Coerce first: ``[3, 4] ** 2`` is a TypeError for a plain Python list.
    components = numpy.asarray(x)
    # Squaring the magnitude (not the raw value) matches length_p(x, 2) and
    # keeps the result real for complex components.
    return numpy.sqrt(numpy.sum(numpy.abs(components) ** 2))

def length_manhattan(x):
    """Return the Manhattan (l1) length of ``x``.

    The result is the sum of the absolute values of all components.
    """
    magnitudes = numpy.abs(x)
    total = numpy.sum(magnitudes)
    return total

def length_p(x, p):
    """Return the p-"length" of a vector: ``(sum(|x_i| ** p)) ** (1 / p)``.

    Parameters
    ----------
    x : array_like
        Vector components.
    p : number
        Order of the length. ``p = 1`` gives the Manhattan length, ``p = 2``
        the Euclidean length, and positive infinity the maximum norm
        ``max(|x_i|)``. ``p = 0`` is not defined and raises from the
        division ``1 / p``.

    Returns
    -------
    numpy scalar
        The computed length.
    """
    magnitudes = numpy.abs(x)
    if p == numpy.inf:
        # The generic formula degenerates to ``something ** 0`` here, so use
        # the limit of the p-length directly. ``initial=0`` is harmless
        # because magnitudes are non-negative, and gives 0 for empty input.
        return numpy.max(magnitudes, initial=0)
    return numpy.sum(magnitudes ** p) ** (1 / p)
In [4]:
def show_length(x, y):
    """Draw the vector from the origin to (x, y) and label its l1 and l2 lengths.

    The plot is a fixed 4x4 inch figure spanning [-2, 2] on both axes. The
    horizontal and vertical legs of the vector are drawn in grey, the vector
    itself in the default colour, and both lengths are written into the
    lower-left area of the axes.
    """
    vector = numpy.array([x, y])
    leg_colour = "#aaaaaa"

    pyplot.figure(figsize=(4, 4))
    pyplot.xlim(-2, 2)
    pyplot.ylim(-2, 2)

    # Horizontal leg along the x axis, then the vertical leg at x.
    pyplot.plot([0, x], [0, 0], c=leg_colour)
    pyplot.plot([x, x], [y, 0], c=leg_colour)
    # The vector itself.
    pyplot.plot([0, x], [0, y])

    l2_label = "l2: {:.3f}".format(length_euclid(vector))
    l1_label = "l1: {:.3f}".format(length_manhattan(vector))
    pyplot.text(-1.7, -1.7, l2_label)
    pyplot.text(-1.7, -1, l1_label)

    pyplot.show()

# Each (min, max, step) tuple becomes a slider feeding one argument of
# show_length: both coordinates range over -2..2 in steps of 0.1.
interact(
    show_length,
    x=(-2, 2, 0.1),
    y=(-2, 2, 0.1),
)
Out[4]:
<function __main__.show_length(x, y)>
In [10]:
# Filled contour plot of the p-"length" of every point on a 30x30 grid
# covering [-1, 1] x [-1, 1].
p = 0.1
x = numpy.linspace(-1, 1, 30)
y = numpy.linspace(-1, 1, 30)

# With sparse=True, meshgrid returns one row-shaped and one column-shaped
# array; they broadcast against each other to the full grid below.
xx, yy = numpy.meshgrid(x, y, sparse=True)
z = numpy.power(numpy.abs(xx) ** p + numpy.abs(yy) ** p, 1 / p)

pyplot.figure(figsize=(4, 4))
pyplot.contourf(x, y, z, cmap="viridis")
pyplot.show()
In [15]:
# Synthetic data set, taken from
# https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html

# 300 samples around 4 blob centers; random_state fixes the generated data.
points, real_clusters = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.6,
    random_state=0,
)
pyplot.scatter(points[:, 0], points[:, 1])
pyplot.show()
In [20]:
def assign_points(centers, data=None, p=2):
    """Assign every sample to its closest cluster center.

    Parameters
    ----------
    centers : array_like, shape (n_clusters, n_dims)
        Current cluster centers.
    data : array_like, shape (n_samples, n_dims), optional
        Samples to assign. Defaults to the notebook-level ``points`` array.
    p : number, optional
        Order of the p-length used as distance (same formula as
        ``length_p``). Defaults to 2, the Euclidean distance.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        Index of the closest center for every sample, stored as floats.
        Ties go to the lowest cluster index.
    """
    if data is None:
        data = points
    data = numpy.asarray(data, dtype=float)
    centers = numpy.asarray(centers, dtype=float)

    # differences[i, k, :] == centers[k] - data[i]; the sizes come from the
    # arrays themselves instead of being fixed to 300 samples / 4 clusters.
    differences = centers[numpy.newaxis, :, :] - data[:, numpy.newaxis, :]
    distances = numpy.sum(numpy.abs(differences) ** p, axis=2) ** (1 / p)

    # Float dtype keeps the return value compatible with existing callers.
    return numpy.argmin(distances, axis=1).astype(float)

def move_centers(assignments, data=None, n_clusters=4):
    """Recompute every cluster center as the mean of its assigned samples.

    Parameters
    ----------
    assignments : array_like, shape (n_samples,)
        Cluster index of every sample, as returned by ``assign_points``.
    data : array_like, shape (n_samples, n_dims), optional
        The samples. Defaults to the notebook-level ``points`` array.
    n_clusters : int, optional
        Number of centers to compute. Defaults to 4.

    Returns
    -------
    numpy.ndarray, shape (n_clusters, n_dims)
        The new centers. A cluster with no assigned samples is re-seeded at
        a randomly drawn sample (via ``numpy.random.randint``) so that it
        can take part in the next iteration.
    """
    if data is None:
        data = points
    data = numpy.asarray(data, dtype=float)
    assignments = numpy.asarray(assignments)

    n_samples, n_dims = data.shape
    new_centers = numpy.zeros((n_clusters, n_dims))

    for cluster_id in range(n_clusters):
        cluster_points = data[assignments == cluster_id]
        if cluster_points.shape[0] == 0:
            # The mean of an empty selection would be NaN; restart the
            # cluster at a random sample instead.
            new_centers[cluster_id, :] = data[numpy.random.randint(n_samples), :]
        else:
            new_centers[cluster_id, :] = numpy.mean(cluster_points, axis=0)

    return new_centers

def show(panel, assignments, cents):
    """Draw one panel of a 1x3 subplot row: samples coloured by cluster, centers in red.

    ``panel`` is the subplot index (1 to 3), ``assignments`` supplies the
    colour value of every sample in the notebook-level ``points`` array, and
    ``cents`` holds the center coordinates in its first two columns.
    """
    pyplot.subplot(1, 3, panel)

    sample_x, sample_y = points[:, 0], points[:, 1]
    pyplot.scatter(sample_x, sample_y, c=assignments, cmap="viridis")

    center_x, center_y = cents[:, 0], cents[:, 1]
    pyplot.scatter(center_x, center_y, c="red")
In [39]:
# Starting state for the iteration: four 2-D centers drawn from a standard
# normal distribution, and every one of the 300 samples labelled cluster 0.
centers = numpy.random.normal(loc=0, scale=1, size=(4, 2))
current_clusters = numpy.zeros(shape=300)
In [43]:
# One clustering iteration, drawn as three panels in a single row.
# current_clusters and centers are both overwritten below, so running this
# cell again continues from the state the previous run left behind.
# Panel 1: labels and centers as they are before this iteration.
show(1, current_clusters, centers)

# Panel 2: samples re-labelled by assign_points; centers not moved yet.
current_clusters = assign_points(centers)
show(2, current_clusters, centers)

# Panel 3: same labels; centers replaced by the result of move_centers.
centers = move_centers(current_clusters)
show(3, current_clusters, centers)

pyplot.show()
# Bare last expression: the notebook displays the updated centers array.
centers
Out[43]:
array([[ 0.94973532,  4.41906906],
       [-1.37324398,  7.75368871],
       [-1.58438467,  2.83081263],
       [ 1.98258281,  0.86771314]])
In [ ]: