K-means examples


In [1]:
# Numerical arrays.
import numpy as np

# Machine learning - KMeans.
import sklearn.cluster as skcl

# Plotting.
import matplotlib.pyplot as plt
In [2]:
# Data set.
X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
In [3]:
# Plot the data set.
plt.plot(X[:,0], X[:,1], 'x')
# Set reasonable limits.
plt.xlim([-2,14])
plt.ylim([-2,6])
Out[3]:
(-2.0, 6.0)
In [4]:
# Perform kmeans fitting.
kmeans = skcl.KMeans(n_clusters=2, random_state=0).fit(X)
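The fitted object exposes a couple of attributes that are handy for a quick sanity check: inertia_ is the final within-cluster sum of squared distances and n_iter_ is the number of iterations the best run took. Depending on the scikit-learn version, a warning about the n_init default may also appear here; passing n_init explicitly silences it. A minimal sketch:

# Within-cluster sum of squared distances to the nearest centre.
print(kmeans.inertia_)
# Number of iterations of the best run.
print(kmeans.n_iter_)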
In [5]:
# See the labels of the points.
kmeans.labels_
Out[5]:
array([1, 1, 1, 0, 0, 0])
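labels_ holds the cluster index of each training point, in the same order as the rows of X, so the points per cluster can be counted directly; a minimal sketch:

# Count how many points ended up in each cluster.
np.bincount(kmeans.labels_)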
In [6]:
# Plot the data set.
plt.plot(X[kmeans.labels_ == 0][:,0], X[kmeans.labels_ == 0][:,1], 'gx')
plt.plot(X[kmeans.labels_ == 1][:,0], X[kmeans.labels_ == 1][:,1], 'rx')
# Set reasonable limits.
plt.xlim([-2,14])
plt.ylim([-2,6])
Out[6]:
(-2.0, 6.0)
In [7]:
# Predict the cluster for two points.
newvals = np.array([[0, 0], [12, 3]])
predictions = kmeans.predict(newvals)
predictions
Out[7]:
array([1, 0])
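predict assigns each new point to its nearest cluster centre. The same result can be reproduced by hand with numpy, which makes the rule explicit; a minimal sketch:

# Euclidean distance from each new point to each cluster centre.
dists = np.linalg.norm(newvals[:, None, :] - kmeans.cluster_centers_[None, :, :], axis=2)
# The label of each point is the index of its nearest centre.
dists.argmin(axis=1)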
In [8]:
# Plot the data set.
plt.plot(X[kmeans.labels_ == 0][:,0], X[kmeans.labels_ == 0][:,1], 'gx')
plt.plot(X[kmeans.labels_ == 1][:,0], X[kmeans.labels_ == 1][:,1], 'rx')
plt.plot(newvals[:,0], newvals[:,1], 'bo')
# Set reasonable limits.
plt.xlim([-2,14])
plt.ylim([-2,6])
Out[8]:
(-2.0, 6.0)
In [9]:
# Plot the data set.
plt.plot(X[kmeans.labels_ == 0][:,0], X[kmeans.labels_ == 0][:,1], 'gx')
plt.plot(X[kmeans.labels_ == 1][:,0], X[kmeans.labels_ == 1][:,1], 'rx')
plt.plot(newvals[predictions == 0][:,0], newvals[predictions == 0][:,1], 'go')
plt.plot(newvals[predictions == 1][:,0], newvals[predictions == 1][:,1], 'ro')
# Set reasonable limits.
plt.xlim([-2,14])
plt.ylim([-2,6])
Out[9]:
(-2.0, 6.0)
In [10]:
# The centres of the two clusters found by the fit.
cent = kmeans.cluster_centers_
cent
Out[10]:
array([[10.,  2.],
       [ 1.,  2.]])
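At convergence each cluster centre is simply the mean of the points assigned to that cluster, which can be checked directly; a minimal sketch:

# The centre of cluster 0 should equal the mean of its points.
print(X[kmeans.labels_ == 0].mean(axis=0))
# Likewise for cluster 1.
print(X[kmeans.labels_ == 1].mean(axis=0))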
In [11]:
# Plot the data set.
plt.plot(X[kmeans.labels_ == 0][:,0], X[kmeans.labels_ == 0][:,1], 'gx')
plt.plot(X[kmeans.labels_ == 1][:,0], X[kmeans.labels_ == 1][:,1], 'rx')
plt.plot(newvals[predictions == 0][:,0], newvals[predictions == 0][:,1], 'go')
plt.plot(newvals[predictions == 1][:,0], newvals[predictions == 1][:,1], 'ro')
plt.plot(cent[:,0], cent[:,1], 'k.')
# Set reasonable limits.
plt.xlim([-2,14])
plt.ylim([-2,6])
Out[11]:
(-2.0, 6.0)


My own dataset


In [12]:
# Data set.

# Two centre points.
c1 = np.array([1.0,  2.0])
c2 = np.array([5.0, 12.0])

# Create points randomly around the centre points.
c1x = np.random.normal(c1[0], 2.0, 10)
c1y = np.random.normal(c1[1], 2.0, 10)
c1p = np.vstack([c1x, c1y]).T
c2x = np.random.normal(c2[0], 2.0, 10)
c2y = np.random.normal(c2[1], 2.0, 10)
c2p = np.vstack([c2x, c2y]).T

# Merge the two arrays of points.
myX = np.concatenate([c1p, c2p])
# Shuffle the points.
np.random.shuffle(myX)

myX
Out[12]:
array([[-0.60350908,  0.80600677],
       [ 5.79766518, 10.527845  ],
       [ 2.93118138, -2.27208718],
       [ 6.25648216, 12.45203601],
       [ 5.79633889,  8.5543678 ],
       [ 6.59360945,  9.0939594 ],
       [ 0.10517133,  2.36381443],
       [ 2.30112082,  0.29770193],
       [ 2.48206805, 11.06734928],
       [ 4.02188676, 16.24798429],
       [ 2.92692029, -0.29308455],
       [ 2.70377258,  4.6306797 ],
       [-0.25069006, -0.30494735],
       [-0.4425126 ,  3.44875223],
       [ 2.27559788,  5.89615065],
       [ 5.24168866, 14.26724582],
       [-4.02183793,  1.87040957],
       [-1.48560208,  2.56505284],
       [ 2.3880615 , 13.0276387 ],
       [ 4.94471303, 11.85252002]])
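Because np.random.normal draws from numpy's global random state, the exact values above change on every run. If repeatable points are wanted, one option (assuming numpy 1.17 or later for default_rng) is a seeded generator; a minimal sketch:

# Seeded generator: the same points are produced on every run.
rng = np.random.default_rng(42)
c1p = np.vstack([rng.normal(c1[0], 2.0, 10), rng.normal(c1[1], 2.0, 10)]).T
c2p = np.vstack([rng.normal(c2[0], 2.0, 10), rng.normal(c2[1], 2.0, 10)]).T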
In [13]:
# Plot the data set.
plt.plot(myX[:,0], myX[:,1], 'x')
Out[13]:
[<matplotlib.lines.Line2D at 0x25f83b60fd0>]
In [14]:
# Perform kmeans fitting.
mykmeans = skcl.KMeans(n_clusters=2, random_state=0).fit(myX)
In [15]:
# Plot the data set.
plt.plot(myX[mykmeans.labels_ == 0][:,0], myX[mykmeans.labels_ == 0][:,1], 'gx')
plt.plot(myX[mykmeans.labels_ == 1][:,0], myX[mykmeans.labels_ == 1][:,1], 'rx')
Out[15]:
[<matplotlib.lines.Line2D at 0x25f83bcc1f0>]
In [16]:
# Create new dummy points for cluster prediction.
# mynewvals = np.array([[0, 0], [6, 10]])
myxvals = np.linspace(-1.0, 7.0, 10)
myyvals = np.linspace(-1.0, 15.0, 10)
mynewvals = np.vstack([myxvals, myyvals]).T
mynewvals
Out[16]:
array([[-1.        , -1.        ],
       [-0.11111111,  0.77777778],
       [ 0.77777778,  2.55555556],
       [ 1.66666667,  4.33333333],
       [ 2.55555556,  6.11111111],
       [ 3.44444444,  7.88888889],
       [ 4.33333333,  9.66666667],
       [ 5.22222222, 11.44444444],
       [ 6.11111111, 13.22222222],
       [ 7.        , 15.        ]])
In [17]:
# Predict the cluster for each of the new points.
mypredictions = mykmeans.predict(mynewvals)
mypredictions
Out[17]:
array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
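With two clusters, predict splits the plane along the perpendicular bisector of the segment joining the two centres, so the dummy points along the line change label where they cross that boundary. A minimal check of the same rule, comparing distances to the two centres:

# A point is labelled 1 when it is closer to centre 1 than to centre 0.
ctr0, ctr1 = mykmeans.cluster_centers_
closer_to_1 = np.linalg.norm(mynewvals - ctr1, axis=1) < np.linalg.norm(mynewvals - ctr0, axis=1)
closer_to_1.astype(int)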
In [18]:
# Plot the data set.
plt.plot(myX[mykmeans.labels_ == 0][:,0], myX[mykmeans.labels_ == 0][:,1], 'gx')
plt.plot(myX[mykmeans.labels_ == 1][:,0], myX[mykmeans.labels_ == 1][:,1], 'rx')
plt.plot(mynewvals[mypredictions == 0][:,0], mynewvals[mypredictions == 0][:,1], 'go')
plt.plot(mynewvals[mypredictions == 1][:,0], mynewvals[mypredictions == 1][:,1], 'ro')
Out[18]:
[<matplotlib.lines.Line2D at 0x25f83c37100>]
In [19]:
# The centres of the clusters.
mycent = mykmeans.cluster_centers_
mycent
Out[19]:
array([[ 0.58541932,  1.72804082],
       [ 4.83583485, 11.89899404]])
In [20]:
# Plot the data set.
plt.plot(myX[mykmeans.labels_ == 0][:,0], myX[mykmeans.labels_ == 0][:,1], 'gx')
plt.plot(myX[mykmeans.labels_ == 1][:,0], myX[mykmeans.labels_ == 1][:,1], 'rx')
plt.plot(mynewvals[mypredictions == 0][:,0], mynewvals[mypredictions == 0][:,1], 'go')
plt.plot(mynewvals[mypredictions == 1][:,0], mynewvals[mypredictions == 1][:,1], 'ro')
plt.plot(mycent[:,0], mycent[:,1], 'k.')
Out[20]:
[<matplotlib.lines.Line2D at 0x25f83c97af0>]
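Here n_clusters=2 was the obvious choice because the data were generated from two centres. When the number of clusters is not known in advance, a common heuristic is the elbow method: fit k-means for a range of k and look for the point where inertia_ (the within-cluster sum of squares) stops dropping sharply. A minimal sketch:

# Fit k-means for k = 1..6 and plot the inertia for each k.
ks = range(1, 7)
inertias = [skcl.KMeans(n_clusters=k, random_state=0).fit(myX).inertia_ for k in ks]
plt.plot(ks, inertias, 'o-')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')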

End