"""Simulate three years of battle locations, cluster them, and plot the result.

Three Gaussian point clouds (one per year) are stacked into a single array,
grouped into three clusters with ``scipy.cluster.vq``, and drawn as a scatter
plot coloured by the cluster each point was assigned to.
"""

import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster import vq

# ``%matplotlib inline`` is IPython magic, not Python syntax, so it is issued
# through the IPython API. ``get_ipython`` only exists inside an IPython /
# Jupyter session; under a plain interpreter the NameError is ignored.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')  # noqa: F821
except NameError:
    pass

# No seed is set here, so the samples (and the resulting clusters) differ
# from run to run.

# create 100 coordinate pairs (i.e. two values), then add 5 to all of them
year_1 = np.random.randn(100, 2) + 5

# create 30 coordinate pairs (i.e. two values), then subtract 5 from all of them
year_2 = np.random.randn(30, 2) - 5

# create 50 coordinate pairs (i.e. two values), left centred on the origin
year_3 = np.random.randn(50, 2)

# preview the first three rows of each year
print('year 1 battles:', year_1[:3])
print('year 2 battles:', year_2[:3])
print('year 3 battles:', year_3[:3])

# vertically stack year_1, year_2, and year_3 into one (180, 2) array
battles = np.vstack([year_1, year_2, year_3])

# Fit three centroids. The second return value is the distortion: the mean
# Euclidean distance between each observation and its nearest centroid. It is
# not a variance; the name ``variance`` is kept so existing references work.
centroids, variance = vq.kmeans(battles, 3)
print(centroids)
print(variance)

# Assign every battle to its nearest centroid: ``identified`` holds the
# centroid index per row, ``distance`` the distance to that centroid.
identified, distance = vq.vq(battles, centroids)
print(identified)
print(distance)

# Split the battles by assigned centroid index. The numbering comes from
# k-means, not from the order the years were stacked, so cluster_1 is not
# necessarily year_1.
cluster_1 = battles[identified == 0]
cluster_2 = battles[identified == 1]
cluster_3 = battles[identified == 2]

print(cluster_1[:3])
print(cluster_2[:3])
print(cluster_3[:3])

# create a scatter plot where the x-axis is the first column of battles,
# the y-axis is the second column of battles, the marker size is 100, and
# the color of each point is determined by the identified variable
plt.scatter(battles[:, 0], battles[:, 1], s=100, c=identified)