import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.cluster import vq

# --- Synthetic battle locations ---------------------------------------------
# Every year is a cloud of (x, y) pairs sampled from a standard normal
# distribution; years 1 and 2 are shifted so the three clouds sit apart.

# year 1: 100 coordinate pairs, with 5 added to every value
year_1 = 5 + np.random.standard_normal(size=(100, 2))

# year 2: 30 coordinate pairs, with 5 subtracted from every value
year_2 = np.random.standard_normal(size=(30, 2)) - 5

# year 3: 50 coordinate pairs, left unshifted
year_3 = np.random.standard_normal(size=(50, 2))

# preview the first three rows of each year
print('year 1 battles:', year_1[:3])
print('year 2 battles:', year_2[:3])
print('year 3 battles:', year_3[:3])

# --- Clustering --------------------------------------------------------------
# Stack the three years row-wise into a single (180, 2) observation matrix.
battles = np.concatenate((year_1, year_2, year_3), axis=0)

# Run k-means asking for 3 clusters. The first result holds the centroid
# coordinates. The second is the *distortion* (mean Euclidean distance
# between the observations and their centroids) -- it is stored under the
# name `variance`, but it is not a statistical variance.
centroids, variance = vq.kmeans(battles, 3)

# bare expressions: shown as cell output when run in a notebook
centroids

variance

# Label every battle with the index of its closest centroid, and record the
# distance from the battle to that centroid.
identified, distance = vq.vq(battles, centroids)

identified

distance

# Split the battles into one array per cluster label (0, 1, 2).
cluster_1, cluster_2, cluster_3 = (
    battles[identified == label] for label in range(3)
)

# preview the first three members of each cluster, one array per line
print(cluster_1[:3], cluster_2[:3], cluster_3[:3], sep='\n')

# Scatter plot of every battle: the x-axis is the first column of battles,
# the y-axis is the second column, every marker has size 100, and the colour
# of each point is taken from its cluster label in `identified`.
plt.scatter(x=battles[:, 0], y=battles[:, 1], c=identified, s=100)