import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.cluster import vq
# Simulate three years of battle locations as 2-D points drawn with
# np.random.randn; each year gets its own constant offset.
# year 1: 100 coordinate pairs, with 5 added to every value
year_1 = 5 + np.random.randn(100, 2)
# year 2: 30 coordinate pairs, with 5 subtracted from every value
year_2 = -5 + np.random.randn(30, 2)
# year 3: 50 coordinate pairs, left unshifted
year_3 = np.random.randn(50, 2)
# preview the first three coordinate pairs of every year
print('year 1 battles:', year_1[:3])
print('year 2 battles:', year_2[:3])
print('year 3 battles:', year_3[:3])
year 1 battles: [[ 3.87032104 4.93418141] [ 4.47603646 3.23230121] [ 6.15905943 4.55274026]] year 2 battles: [[-3.55642932 -3.13125097] [-5.83295449 -5.75787649] [-5.12144789 -5.00466761]] year 3 battles: [[-0.27557365 -0.65002898] [ 0.94593878 -0.46056352] [ 0.91003511 0.27888337]]
# join the three yearly samples row-wise into one observation matrix
battles = np.concatenate((year_1, year_2, year_3), axis=0)
# run k-means asking for 3 clusters; vq.kmeans returns the centroid
# coordinates and, as its second value, the distortion (SciPy documents it
# as the mean distance from the observations to their nearest centroid --
# it is not a variance, although it is stored under the name `variance`)
centroids, variance = vq.kmeans(battles, 3)
# display the centroid coordinates
centroids
array([[ 4.89478443, 5.00806609], [ 0.16770004, 0.01639683], [-5.06447231, -4.99956259]])
# display the second value returned by vq.kmeans; SciPy documents it as the
# distortion (mean distance to the nearest centroid), not a true variance
variance
1.2382236882037887
# assign every battle to a centroid: `identified` holds the centroid index
# for each observation and `distance` the distance to that centroid
identified, distance = vq.vq(battles, centroids)
identified
distance
# split the observations by their assigned centroid index (0, 1, 2)
cluster_1, cluster_2, cluster_3 = (
    battles[identified == index] for index in range(3)
)
# preview the first three members of each cluster
print(cluster_1[:3])
print(cluster_2[:3])
print(cluster_3[:3])
[[ 3.87032104 4.93418141] [ 4.47603646 3.23230121] [ 6.15905943 4.55274026]] [[-0.27557365 -0.65002898] [ 0.94593878 -0.46056352] [ 0.91003511 0.27888337]] [[-3.55642932 -3.13125097] [-5.83295449 -5.75787649] [-5.12144789 -5.00466761]]
# create a scatter plot where the x-axis is the first column of battles,
# the y-axis is the second column of battles, the marker size is 100, and
# the color of each point is determined by the `identified` variable
plt.scatter(
    x=battles[:, 0],
    y=battles[:, 1],
    s=100,
    c=identified,
)
<matplotlib.collections.PathCollection at 0x10771b890>