This notebook presents the Barnes-Hut implementation of t-SNE. t-SNE is used to visualize high-dimensional data in a low dimensional space that attempts preserve the pairwise high-dimensional similarities in a low-dimensional embedding. The Barnes-Hut algorithm, which is used by astrophysicists to perform N-body simulations, allows the calculation of the t-SNE embedding in $O(N log N)$ time instead of $O(N^{2})$. This effectively allows us to learn embeddings of data sets with millions of elements instead of tens of thousands.
To try out out the BH version of t-SNE, do the following:
Checkout out scikit-learn
git clone https://github.com/cemoody/scikit-learn.git
Change to my branch
git checkout cemoody/bhtsne
Rebuild the sources, especially the new Cython file sklearn/manifold/bhtsne.pyx
python setup.py build_ext -i
Download this notebook
curl https://gist.githubusercontent.com/cemoody/01135ef2f26837548360/raw/57288dd4ebd97a1d6447b14c78b8fb743147b91e/Barnes-Hut%20t-SNE%20Demo.ipynb > bhtsne_demo.ipynb
Start the iPython Notebook, and then open the notebook
ipython notebook
Alternatively, you can download the script version of this and run it:
curl https://gist.githubusercontent.com/cemoody/64ec3934e5b540e8f6e8/raw/f66809381d1baf005a2e2cd6c733e027fbf2e5eb/demo_bhtsne.py > demo_bhtsne.py
%matplotlib inline
from sklearn.manifold import bhtsne
import sys
import numpy as np
import matplotlib.pyplot as plt
from time import time
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda, random_projection)
from sklearn.datasets import fetch_mldata
digits = datasets.load_digits(n_class=6)
X = digits.data
y = digits.target
n_samples, n_features = X.shape
n_neighbors = 30
def plot_embedding(X, title=None):
x_min, x_max = np.min(X, 0), np.max(X, 0)
X = (X - x_min) / (x_max - x_min)
plt.figure(figsize=(15,15))
ax = plt.subplot(111)
for i in range(X.shape[0]):
plt.text(X[i, 0], X[i, 1], str(digits.target[i]),
color=plt.cm.Set1(y[i] / 10.),
fontdict={'weight': 'bold', 'size': 9})
if hasattr(offsetbox, 'AnnotationBbox'):
# only print thumbnails with matplotlib > 1.0
shown_images = np.array([[1., 1.]]) # just something big
for i in range(digits.data.shape[0]):
dist = np.sum((X[i] - shown_images) ** 2, 1)
if np.min(dist) < 4e-3:
# don't show points that are too close
continue
shown_images = np.r_[shown_images, [X[i]]]
imagebox = offsetbox.AnnotationBbox(
offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
X[i])
ax.add_artist(imagebox)
plt.xticks([]), plt.yticks([])
if title is not None:
plt.title(title)
tsne = manifold.TSNE(n_components=2, init='pca', random_state=1, method='barnes_hut', n_iter=1000, verbose=20)
t0 = time()
Xtbh = tsne.fit_transform(X)
t1 = time()
dtbh = t1 - t0
plot_embedding(Xtbh, title="Barnes-Hut t-SNE visualization of MNIST digits in %1.1f sec" % dtbh)
tsne = manifold.TSNE(n_components=2, init='pca', random_state=1, method='standard', n_iter=1000, verbose=20)
t0 = time()
Xts = tsne.fit_transform(X)
t1 = time()
dts = t1 - t0
plot_embedding(Xts, title="Standard t-SNE visualization of MNIST digits in %1.1f sec" % dts)