Comparison of sklearn k-means to a hand-coded implementation. k-means clustering is one of the oldest clustering techniques. However, it is still actively used and researched, partly because it is simple and scales well to large data sets. A few recent works include:
import numpy as np
from sklearn import datasets
## Load the Iris data set that ships with scikit-learn.
iris = datasets.load_iris()
## Feature matrix used for clustering (its shape is displayed further down).
X = iris.data
## Display the labels supplied with the data set (values 0, 1 and 2 in the output below).
iris.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
X.shape
(150, 4)
def _nearest_center(X, centers):
    """Return, for each row of ``X``, the index of the closest row of ``centers``.

    Closeness is squared Euclidean distance. Ties go to the lowest center
    index, because ``np.argmin`` returns the first minimum.
    """
    ## (1, K, p) - (n, 1, p) broadcasts to (n, K, p): every center against every point.
    diff = centers[np.newaxis, :, :] - X[:, np.newaxis, :]
    return np.argmin(np.sum(np.power(diff, 2), axis=2), axis=1)


def lloyd_kmeans(X, K, init=None, max_iter=1000):
    """Cluster the rows of ``X`` into ``K`` groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    X : array-like of shape (n, p)
        Data matrix, one observation per row.
    K : int
        Number of clusters.
    init : array-like of shape (K, p), optional
        Initial cluster centers. When omitted, every point is given a random
        label in ``0..K-1`` (via ``np.random.randint``) and the first centers
        are the means of those random groups.
    max_iter : int, optional
        Upper bound on the number of update passes.

    Returns
    -------
    centers : ndarray of shape (K, p)
        Final cluster centers.
    labels : ndarray of shape (n,)
        Index of the cluster each observation belongs to.

    Raises
    ------
    ValueError
        If ``init`` is given but does not have shape ``(K, p)``.
    """
    X = np.asarray(X, dtype=float)
    n, p = X.shape

    if init is None:
        centers = np.zeros((K, p))
        labels = np.random.randint(low=0, high=K, size=n)
    else:
        centers = np.array(init, dtype=float)
        if centers.shape != (K, p):
            raise ValueError(
                "init must have shape (%d, %d), got %s" % (K, p, centers.shape)
            )
        labels = _nearest_center(X, centers)

    for _ in range(max_iter):
        ## update cluster centers
        for k in range(K):
            members = X[labels == k]
            ## An empty cluster keeps its previous center: the mean of an
            ## empty slice is NaN, which would corrupt every later distance
            ## comparison.
            if members.shape[0] > 0:
                centers[k, :] = members.mean(axis=0)

        ## update cluster memberships
        new_labels = _nearest_center(X, centers)

        ## Unchanged labels mean the next pass would reproduce the same
        ## centers and labels, so further iterations cannot change anything.
        if np.array_equal(new_labels, labels):
            break
        labels = new_labels

    return centers, labels


K = 3
p = X.shape[1]
n = X.shape[0]
N = 1000 ## maximum number of iterations
## run algorithm
## mu: (K, p) array of cluster centers; C: length-n array of cluster labels
mu, C = lloyd_kmeans(X, K, max_iter=N)
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
from matplotlib import pylab
pylab.rcParams['figure.figsize'] = (9, 4)
plt.subplot(121)
plt.scatter(X[:,0],X[:,1],c=iris.target, cmap='viridis',edgecolor='white');
plt.subplot(122)
plt.scatter(X[:,0],X[:,1],c=C,cmap='viridis',edgecolor='white');
Note: Group labels are arbitrary, so the colors in the two panels need not correspond even where the groupings agree.
from sklearn.cluster import KMeans
## Reference clustering from scikit-learn, seeded so the result is repeatable.
kmeans = KMeans(random_state=0, n_clusters=3)
kmeans.fit(X)
## Cluster index assigned to each observation by the fitted model.
sk_labels = kmeans.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=sk_labels, cmap='viridis', edgecolor='white');
import numpy as np
## Small demonstration of NumPy broadcasting between a 2-D and a 1-D array.
## a is a 2x2 matrix holding 0..3.
a = np.arange(4).reshape(2,2)
a
array([[0, 1], [2, 3]])
## b is a 2x3 matrix holding 10..15.
b = np.arange(6).reshape(2,3) + 10
b
array([[10, 11, 12], [13, 14, 15]])
## Selecting a single column yields a 1-D array of length 2.
b[:,0]
array([10, 13])
## The 1-D array is lined up with the columns of a, so it is subtracted
## from every row of a (see the output below).
a - b[:,0]
array([[-10, -12], [ -8, -10]])