import numpy as np
from fastcluster import linkage
from scipy.spatial.distance import squareform
distance_matrix = np.array([[0.0, .25, .25, .25],
[.25, 0.0, .25, .25],
[.25, .25, 0.0, .25],
[.25, .25, .25, 0.0]])
distance_matrix
array([[ 0. , 0.25, 0.25, 0.25], [ 0.25, 0. , 0.25, 0.25], [ 0.25, 0.25, 0. , 0.25], [ 0.25, 0.25, 0.25, 0. ]])
compressed_distance_matrix = squareform(distance_matrix)
Z = linkage(compressed_distance_matrix, method="complete", metric="cosine")
Z
array([[ 0. , 1. , 0.25, 2. ], [ 2. , 4. , 0.25, 3. ], [ 3. , 5. , 0.25, 4. ]])
# Num desired clusters
k = 2
# Count entries in the matrix
initial_count = len(distance_matrix[0])
# Fill an array with indices from the matrix
c = [[i] for i in range(initial_count)]
# Now extend with empty entries for merging into
c.extend([[]]*(initial_count))
# Loop the linkage matrix, making merges
# into new clusters until we have k clusters.
for i, block in enumerate(Z):
j = i + initial_count
a = int(block[0])
b = int(block[1])
# don't cluster randomly.
if block[2] >= 1:
break
# merge a and b into j
c[j] = c[a]
c[j].extend(c[b])
# empty a and b
c[a] = c[b] = []
# count clusters and see if we have
# reached our target number
cluster_count = sum([min(len(x),1) for x in c])
if cluster_count <= k:
break
clusters = [x for x in c if x]
clusters
[[3], [2, 0, 1]]
# I have 2 clusters of size 1 and 3, but I'd like 2 and 2
# More generally I want to cluster evenly where possible for larger arrays.
# If I have a small k it will then start combining those clusters (also evenly).