%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
rng = np.random.RandomState(42)
signal, labels = make_blobs(n_samples=int(1e6), n_features=2,
cluster_std=.1, centers=500,
random_state=rng)
noise = rng.uniform(low=-15, high=15, size=(len(signal) // 10, 2))
data = np.vstack([signal, noise])
labels = np.concatenate([labels, [-1] * noise.shape[0]])
data, labels = shuffle(data, labels, random_state=rng)
def plot_data_with_labels(data, labels, alpha=0.3, cm=plt.cm.prism,
                          subsample=int(5e4), random_state=0):
    """Scatter-plot 2D points colored by cluster label.

    Points labeled -1 (noise) are always drawn in blue; every other
    label gets a color from the colormap ``cm``. To keep rendering
    fast, at most ``subsample`` points are drawn — a reproducible
    random subset is taken when the input is larger.
    """
    n_points = data.shape[0]
    if n_points > subsample:
        keep = shuffle(np.arange(n_points),
                       random_state=random_state)[:subsample]
        data, labels = data[keep], labels[keep]
    plt.figure(figsize=(10, 10))
    plt.xlim(-12, 12)
    plt.ylim(-12, 12)
    distinct_labels = np.unique(labels)
    palette = cm(np.linspace(0, 1, len(distinct_labels)))
    for label, color in zip(distinct_labels, palette):
        if label == -1:
            color = 'b'  # noise is always blue, regardless of colormap
        points = data[labels == label]
        plt.scatter(points[:, 0], points[:, 1], alpha=alpha,
                    facecolors=color, edgecolors='none')
# Visualize the raw dataset colored by ground-truth cluster label.
plot_data_with_labels(data, labels)
%%time
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.05, algorithm='kd_tree')
dbscan_labels = dbscan.fit_predict(data)
CPU times: user 2min 25s, sys: 1.4 s, total: 2min 26s Wall time: 2min 26s
# Count the distinct cluster ids DBSCAN found (includes -1 for noise).
unique_dbscan_labels = np.unique(dbscan_labels)
print(len(unique_dbscan_labels))
355
# Plot the DBSCAN partition — not the ground truth again — so the
# clustering result can be compared visually with the ground-truth plot.
# (Every other algorithm section plots the algorithm's own labels.)
plot_data_with_labels(data, dbscan_labels)
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score

# Agreement between the ground-truth labeling and the DBSCAN partition.
nmi = normalized_mutual_info_score(labels, dbscan_labels)
ari = adjusted_rand_score(labels, dbscan_labels)
print("NMI DBSCAN: %0.3f" % nmi)
print("ARI DBSCAN: %0.3f" % ari)
NMI DBSCAN: 0.879 ARI DBSCAN: 0.617
%%time
from sklearn.cluster import MiniBatchKMeans
mbkmeans = MiniBatchKMeans(n_clusters=500, init_size=int(1e4), batch_size=int(1e3))
mbkmeans_labels = mbkmeans.fit_predict(data)
CPU times: user 42.3 s, sys: 2.77 s, total: 45.1 s Wall time: 42 s
# Visualize and score the mini-batch k-means partition.
plot_data_with_labels(data, mbkmeans_labels)
nmi = normalized_mutual_info_score(labels, mbkmeans_labels)
ari = adjusted_rand_score(labels, mbkmeans_labels)
print("NMI MB K-Means: %0.3f" % nmi)
print("ARI MB K-Means: %0.3f" % ari)
NMI MB K-Means: 0.898 ARI MB K-Means: 0.248
%%time
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=500, n_jobs=10)
kmeans_labels = kmeans.fit_predict(data)
CPU times: user 1.61 s, sys: 910 ms, total: 2.52 s Wall time: 18min 53s
# Visualize and score the full-batch k-means partition.
plot_data_with_labels(data, kmeans_labels)
nmi = normalized_mutual_info_score(labels, kmeans_labels)
ari = adjusted_rand_score(labels, kmeans_labels)
print("NMI K-Means: %0.3f" % nmi)
print("ARI K-Means: %0.3f" % ari)
NMI K-Means: 0.899 ARI K-Means: 0.249
%%time
from sklearn.cluster import Birch

# Birch builds a CF-tree; `threshold` bounds the subcluster radius and
# `n_clusters` drives the final global clustering of the subclusters.
# NOTE(review): as the pasted traceback below shows, the final labeling
# step (a dense dot product of X against all subcluster centers) runs
# out of memory at this data size on this machine.
birch = Birch(threshold=0.1, n_clusters=500)
birch_labels = birch.fit_predict(data)
--------------------------------------------------------------------------- MemoryError Traceback (most recent call last) <ipython-input-26-7926669e0c96> in <module>() ----> 1 get_ipython().run_cell_magic('time', '', 'from sklearn.cluster import Birch\n\nbirch = Birch(threshold=0.1, n_clusters=500)\nbirch_labels = birch.fit_predict(data)') /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2259 magic_arg_s = self.var_expand(line, stack_depth) 2260 with self.builtin_trap: -> 2261 result = fn(magic_arg_s, cell) 2262 return result 2263 /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns) /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k) 191 # but it's overkill for just that one bit of state. 192 def magic_deco(arg): --> 193 call = lambda f, *a, **k: f(*a, **k) 194 195 if callable(arg): /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns) 1164 else: 1165 st = clock2() -> 1166 exec(code, glob, local_ns) 1167 end = clock2() 1168 out = None <timed exec> in <module>() /volatile/ogrisel/code/scikit-learn/sklearn/base.py in fit_predict(self, X, y) 347 # non-optimized default implementation; override when a better 348 # method is possible for a given clustering algorithm --> 349 self.fit(X) 350 return self.labels_ 351 /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in fit(self, X, y) 420 """ 421 self.fit_, self.partial_fit_ = True, False --> 422 return self._fit(X) 423 424 def _fit(self, X): /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in _fit(self, X) 470 self.subcluster_centers_ = centroids 471 --> 472 self._global_clustering(X) 473 return self 474 /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in _global_clustering(self, X) 606 607 if compute_labels: --> 
608 self.labels_ = self.predict(X) /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in predict(self, X) 541 X = check_array(X, accept_sparse='csr') 542 self._check_fit(X) --> 543 reduced_distance = safe_sparse_dot(X, self.subcluster_centers_.T) 544 reduced_distance *= -2 545 reduced_distance += self._subcluster_norms /volatile/ogrisel/code/scikit-learn/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output) 181 return ret 182 else: --> 183 return fast_dot(a, b) 184 185 MemoryError:
# NOTE(review): `birch_labels` is only bound if the fit cell completed;
# after the MemoryError captured above, these calls raise NameError.
plot_data_with_labels(data, birch_labels)
print("NMI Birch: %0.3f"
% normalized_mutual_info_score(labels, birch_labels))
print("ARI Birch: %0.3f"
% adjusted_rand_score(labels, birch_labels))
%%time
# Reuse the CF-tree already built above with a much coarser target.
birch.set_params(n_clusters=10)
# NOTE(review): calling partial_fit() with no data presumably reruns
# only the global clustering step on the existing subclusters, without
# rebuilding the tree — confirm against the Birch.partial_fit docs.
birch.partial_fit()
birch_labels_small = birch.predict(data)
plot_data_with_labels(data, birch_labels_small)