This notebook is at: https://ageo.co/YywklD
# Five rows of four categorical attributes for the k-modes demo.
X = [
    [1, 1, 2, 3],
    [1, 1, 1, 1],
    [2, 2, 2, 2],
    [2, 2, 3, 3],
    [1, 3, 3, 3],
]
from kmodes.kmodes import KModes

# Fit k-modes with 5 random restarts of Huang's initialisation; verbose=1
# prints the per-run cost below, and the best (lowest-cost) run is kept.
km = KModes(n_clusters=3, n_init=5, init='Huang', verbose=1)
clusters = km.fit(X)
# Cluster assignment for each of the five rows.
km.labels_
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 1, cost: 4.0
Run 1, iteration: 2/100, moves: 0, cost: 4.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 4.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 4.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 4.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 4.0
Best run was number 1
array([0, 0, 2, 1, 1], dtype=uint16)
That's... something: k-modes did partition the five rows into three clusters, though with a dataset this small it's hard to tell how meaningful the grouping is.
The same idea works with any function that provides a 'distance' between two sequences. The sequences can even be of different lengths (as with Levenshtein).
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between *seq1* and *seq2*.

    The distance is the minimum number of single-element insertions,
    deletions, and substitutions needed to turn one sequence into the
    other.  Works on any indexable sequences (strings, lists, ...),
    including sequences of different lengths and empty sequences.

    Standard two-row Wagner–Fischer dynamic programming, O(len(seq1) *
    len(seq2)) time and O(len(seq2)) space.  The original version kept an
    unused `twoago` row (a leftover from a Damerau–Levenshtein variant)
    and relied on a fragile negative-index trick; both are removed here.
    """
    # prev[j] = distance between the empty prefix of seq1 and seq2[:j].
    prev = list(range(len(seq2) + 1))
    for i, a in enumerate(seq1, 1):
        # cur[0]: turning seq1[:i] into the empty sequence takes i deletions.
        cur = [i]
        for j, b in enumerate(seq2, 1):
            delete = prev[j] + 1
            insert = cur[j - 1] + 1
            substitute = prev[j - 1] + (a != b)
            cur.append(min(delete, insert, substitute))
        prev = cur
    return prev[-1]
# Sanity check: "hi" -> "pie" is one substitution (h->p) plus one insertion (e).
levenshtein('hi','pie')
2
# A handful of words to cluster by edit distance.
data = [
    'hello',
    'halo',
    'hullo',
    'hi',
    'pie',
    'py',
    'my',
]
import numpy as np
from sklearn.cluster import dbscan
def lev_metric(x, y):
    """sklearn-compatible metric: each point is a 1-element index vector.

    Looks up the two words in the module-level `data` list and returns
    their Levenshtein distance.
    """
    first = data[int(x[0])]
    second = data[int(y[0])]
    return levenshtein(first, second)
# One "sample" per word: its single feature is the word's index into `data`,
# which lev_metric dereferences back to the actual string.
X = [[index] for index, _ in enumerate(data)]
# Density-based clustering with the custom edit-distance metric.
clustering = dbscan(X, metric=lev_metric, eps=2, min_samples=2)
# Result tuple: (indices of core samples, cluster label per word).
clustering
(array([0, 1, 2, 3, 4, 5, 6]), array([0, 0, 0, 1, 1, 1, 1]))