%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
rng = np.random.RandomState(42)
signal, labels = make_blobs(n_samples=int(1e6), n_features=2,
cluster_std=.1, centers=500,
random_state=rng)
noise = rng.uniform(low=-15, high=15, size=(len(signal) // 10, 2))
data = np.vstack([signal, noise])
labels = np.concatenate([labels, [-1] * noise.shape[0]])
data, labels = shuffle(data, labels, random_state=rng)
def plot_data_with_labels(data, labels, alpha=0.3, cm=plt.cm.prism,
                          subsample=int(5e4), random_state=0):
    """Scatter-plot 2D points colored by cluster label.

    Points labeled -1 (noise) are always drawn in blue; every other
    label gets a color from the colormap ``cm``. To keep rendering
    fast, at most ``subsample`` points are drawn — a reproducible
    random subset is taken when the input is larger.
    """
    n_points = data.shape[0]
    if n_points > subsample:
        keep = shuffle(np.arange(n_points),
                       random_state=random_state)[:subsample]
        data, labels = data[keep], labels[keep]
    plt.figure(figsize=(10, 10))
    plt.xlim(-12, 12)
    plt.ylim(-12, 12)
    distinct_labels = np.unique(labels)
    palette = cm(np.linspace(0, 1, len(distinct_labels)))
    for label, color in zip(distinct_labels, palette):
        if label == -1:
            color = 'b'  # noise is always blue, regardless of colormap
        points = data[labels == label]
        plt.scatter(points[:, 0], points[:, 1], alpha=alpha,
                    facecolors=color, edgecolors='none')
# Visualize the raw dataset colored by ground-truth cluster label.
plot_data_with_labels(data, labels)
%%time
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.05, algorithm='kd_tree')
dbscan_labels = dbscan.fit_predict(data)
CPU times: user 2min 25s, sys: 1.4 s, total: 2min 26s Wall time: 2min 26s
# Count the distinct cluster ids DBSCAN found (includes -1 for noise).
unique_dbscan_labels = np.unique(dbscan_labels)
print(len(unique_dbscan_labels))
355
# Plot the DBSCAN partition — not the ground truth again — so the
# clustering result can be compared visually with the ground-truth plot.
# (Every other algorithm section plots the algorithm's own labels.)
plot_data_with_labels(data, dbscan_labels)
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score

# Agreement between the ground-truth labeling and the DBSCAN partition.
nmi = normalized_mutual_info_score(labels, dbscan_labels)
ari = adjusted_rand_score(labels, dbscan_labels)
print("NMI DBSCAN: %0.3f" % nmi)
print("ARI DBSCAN: %0.3f" % ari)
NMI DBSCAN: 0.879 ARI DBSCAN: 0.617
%%time
from sklearn.cluster import MiniBatchKMeans
mbkmeans = MiniBatchKMeans(n_clusters=500, init_size=int(1e4), batch_size=int(1e3))
mbkmeans_labels = mbkmeans.fit_predict(data)
CPU times: user 42.3 s, sys: 2.77 s, total: 45.1 s Wall time: 42 s
# Visualize and score the mini-batch k-means partition.
plot_data_with_labels(data, mbkmeans_labels)
nmi = normalized_mutual_info_score(labels, mbkmeans_labels)
ari = adjusted_rand_score(labels, mbkmeans_labels)
print("NMI MB K-Means: %0.3f" % nmi)
print("ARI MB K-Means: %0.3f" % ari)
NMI MB K-Means: 0.898 ARI MB K-Means: 0.248
%%time
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=500, n_jobs=10)
kmeans_labels = kmeans.fit_predict(data)
CPU times: user 1.61 s, sys: 910 ms, total: 2.52 s Wall time: 18min 53s
# Visualize and score the full-batch k-means partition.
plot_data_with_labels(data, kmeans_labels)
nmi = normalized_mutual_info_score(labels, kmeans_labels)
ari = adjusted_rand_score(labels, kmeans_labels)
print("NMI K-Means: %0.3f" % nmi)
print("ARI K-Means: %0.3f" % ari)
NMI K-Means: 0.899 ARI K-Means: 0.249
%%time
from sklearn.cluster import Birch

# Birch builds a CF-tree; `threshold` bounds the subcluster radius and
# `n_clusters` drives the final global clustering of the subclusters.
# NOTE(review): as the pasted traceback below shows, the final labeling
# step (a dense dot product of X against all subcluster centers) runs
# out of memory at this data size on this machine.
birch = Birch(threshold=0.1, n_clusters=500)
birch_labels = birch.fit_predict(data)
--------------------------------------------------------------------------- MemoryError Traceback (most recent call last) <ipython-input-26-7926669e0c96> in <module>() ----> 1 get_ipython().run_cell_magic('time', '', 'from sklearn.cluster import Birch\n\nbirch = Birch(threshold=0.1, n_clusters=500)\nbirch_labels = birch.fit_predict(data)') /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2259 magic_arg_s = self.var_expand(line, stack_depth) 2260 with self.builtin_trap: -> 2261 result = fn(magic_arg_s, cell) 2262 return result 2263 /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns) /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k) 191 # but it's overkill for just that one bit of state. 192 def magic_deco(arg): --> 193 call = lambda f, *a, **k: f(*a, **k) 194 195 if callable(arg): /volatile/ogrisel/envs/py35/lib/python3.5/site-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns) 1164 else: 1165 st = clock2() -> 1166 exec(code, glob, local_ns) 1167 end = clock2() 1168 out = None <timed exec> in <module>() /volatile/ogrisel/code/scikit-learn/sklearn/base.py in fit_predict(self, X, y) 347 # non-optimized default implementation; override when a better 348 # method is possible for a given clustering algorithm --> 349 self.fit(X) 350 return self.labels_ 351 /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in fit(self, X, y) 420 """ 421 self.fit_, self.partial_fit_ = True, False --> 422 return self._fit(X) 423 424 def _fit(self, X): /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in _fit(self, X) 470 self.subcluster_centers_ = centroids 471 --> 472 self._global_clustering(X) 473 return self 474 /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in _global_clustering(self, X) 606 607 if compute_labels: --> 
608 self.labels_ = self.predict(X) /volatile/ogrisel/code/scikit-learn/sklearn/cluster/birch.py in predict(self, X) 541 X = check_array(X, accept_sparse='csr') 542 self._check_fit(X) --> 543 reduced_distance = safe_sparse_dot(X, self.subcluster_centers_.T) 544 reduced_distance *= -2 545 reduced_distance += self._subcluster_norms /volatile/ogrisel/code/scikit-learn/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output) 181 return ret 182 else: --> 183 return fast_dot(a, b) 184 185 MemoryError:
# NOTE(review): `birch_labels` is only bound if the fit cell completed;
# after the MemoryError captured above, these calls raise NameError.
plot_data_with_labels(data, birch_labels)
print("NMI Birch: %0.3f"
% normalized_mutual_info_score(labels, birch_labels))
print("ARI Birch: %0.3f"
% adjusted_rand_score(labels, birch_labels))
%%time
# Reuse the CF-tree already built above with a much coarser target.
birch.set_params(n_clusters=10)
# NOTE(review): calling partial_fit() with no data presumably reruns
# only the global clustering step on the existing subclusters, without
# rebuilding the tree — confirm against the Birch.partial_fit docs.
birch.partial_fit()
birch_labels_small = birch.predict(data)
plot_data_with_labels(data, birch_labels_small)