import addutils.toc ; addutils.toc.js(ipy_notebook=True)
import scipy.io
import numpy as np
import pandas as pd
from addutils import css_notebook
css_notebook()
Before trying any ML technique it is always a good idea to visualize the available data from different points of view, but when the dimensionality of the problem is high it becomes difficult to use those kinds of plots, and it is necessary to perform dimensionality reduction.
We have already seen some simple visualization examples made with scatter plots or scatter matrices in the tutorial ml01. In this lesson we're going further by working with some of the many data projection techniques available in scikit-learn.
In the first example we will consider the digits dataset, in which we have many 8x8 grayscale images of handwritten digits. In other words, this dataset is made of samples with 64 features.
What we want to do is to obtain a descriptive 2D scatter plot.
import bokeh.plotting as bk
bk.output_notebook()
from sklearn import datasets
digits = datasets.load_digits()
import addutils.imagegrid as ig
from bokeh.palettes import Greys9

# Display every handwritten-digit image in a labelled grid.  Each image is
# flipped vertically ([::-1, :]) so it renders right-side up in bokeh.
n_samples = digits.images.shape[0]
flipped_images = [digits.images[idx][::-1, :] for idx in range(n_samples)]
labels = [str(digits.target[idx]) for idx in range(n_samples)]
fig = ig.imagegrid_figure(images=flipped_images,
                          text=labels,
                          figure_title=None, palette=Greys9[::-1],
                          figure_plot_width=760, figure_plot_height=600,
                          text_color='red', padding=0.1,
                          grid_size=(10, 8))
bk.show(fig)
We'll start our analysis with Principal Component Analysis (PCA): PCA seeks orthogonal linear combinations of the features which show the greatest variance. In this example we'll use RandomizedPCA, because it's faster for large datasets.
import addutils.palette as pal
import seaborn as sns
from sklearn import decomposition
from bokeh.models.sources import ColumnDataSource
from bokeh.models.tools import HoverTool

# Project the 64-dimensional digits data onto its first two principal
# components using a randomized SVD solver (fast for large datasets).
# NOTE(review): RandomizedPCA is deprecated in recent scikit-learn releases;
# the replacement is decomposition.PCA(svd_solver='randomized') -- confirm
# the installed version before upgrading.
pca = decomposition.RandomizedPCA(copy=True, iterated_power=3,
                                  n_components=2, whiten=False)
pca_proj = pca.fit_transform(digits.data)

# BUG FIX: in Python 3 `map` returns a one-shot iterator.  The original code
# consumed it here and then reused `colors` in the LDA, Isomap and t-SNE
# cells below, where it would already be exhausted (empty).  Materialize it
# as a list so it can be iterated any number of times.
colors = list(map(pal.to_hex, sns.color_palette('Paired', 10)))

# One row per sample: 2-D coordinates, a per-class color, and the class label
# (exposed to the hover tool via the ColumnDataSource).
pca_df = pd.DataFrame({'x': pca_proj[:, 0],
                       'y': pca_proj[:, 1],
                       'color': pal.linear_map(digits.target, colors),
                       'target': digits.target})
pca_src = ColumnDataSource(data=pca_df)
fig = bk.figure(title='PCA Projection - digits dataset',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=560)
fig.scatter(source=pca_src, x='x', y='y',
            size=10, fill_alpha=0.9, line_alpha=0.5, line_color='black',
            fill_color='color')
hover_tool = HoverTool(tooltips=[("target", "@target"),
                                 ("color", "$color[swatch]:color")],
                       snap_to_data=True)
fig.add_tools(hover_tool)
bk.show(fig)
We can notice some structure in this data but it's still difficult to understand if it would be possible to effectively apply some classification algorithm since the different data clusters do not show enough separation.
Principal Component Analysis (PCA) identifies the combination of attributes that account for the most variance in the data.
Linear Discriminant Analysis (LDA) instead tries to identify attributes that account for the most variance between classes. In particular, LDA, in contrast to PCA, is a supervised method, and can be used just when the class labels are available.
Let's see an LDA on the same Dataset:
TODO - Remove the warning
import warnings
# LDA on the digits data emits a harmless collinearity warning (several of
# the 64 pixel features are linearly dependent); silence just that message.
warnings.filterwarnings('ignore', message='Variables are collinear')

# NOTE(review): `sklearn.lda` is the historical module path; recent
# scikit-learn moved this class to
# sklearn.discriminant_analysis.LinearDiscriminantAnalysis -- confirm the
# installed version before upgrading.
from sklearn import lda

# BUG FIX: the original rebound the name `lda` from the imported module to
# the estimator instance, shadowing the module.  Use a distinct name so the
# module stays importable/usable later in the session.
lda_model = lda.LDA(n_components=2)
# LDA is supervised: it needs the class labels to find the projection that
# best separates the classes.
lda_proj = lda_model.fit(digits.data, digits.target).transform(digits.data)
lda_df = pd.DataFrame({'x': lda_proj[:, 0],
                       'y': lda_proj[:, 1],
                       'color': pal.linear_map(digits.target, colors),
                       'target': digits.target})
lda_src = ColumnDataSource(data=lda_df)
fig = bk.figure(title='LDA Projection - digits dataset',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=500)
fig.scatter(source=lda_src, x='x', y='y',
            size=8, line_color='black', line_alpha=0.5,
            fill_color='color')
hover_tool = HoverTool(tooltips=[("target", "@target"),
                                 ("color", "$color[swatch]:color")],
                       snap_to_data=True)
fig.add_tools(hover_tool)
bk.show(fig)
As in the previous example we don't see a clear separation of the different clusters. This is because this specific dataset presents some non-linear features that cannot be separated by linear projections.
The main weakness of the linear techniques seen so far is that they cannot detect non-linear features. A set of algorithms known as Manifold Learning have been developed to address this deficiency.
Isomap
is a global, nonlinear, nonparametric dimensionality reduction algorithm:
warnings.filterwarnings('ignore', message='kneighbors*')
from sklearn import manifold
from bokeh.models import GlyphRenderer, Quad, Legend

# Build a 2x2 grid of Isomap projections of the digits data.
# NOTE(review): n_neighbors depends only on the row index, so the two plots
# within each row are identical -- confirm whether the column index was
# meant to vary the parameter as well.
fig_grid = []
for row_idx in range(2):
    figs_in_row = []
    for col_idx in range(2):
        n_nb = row_idx + 2
        embedding = manifold.Isomap(n_neighbors=n_nb, n_components=2)
        proj = embedding.fit_transform(digits.data)
        frame = pd.DataFrame({'x': proj[:, 0], 'y': proj[:, 1],
                              'color': pal.linear_map(digits.target, colors),
                              'target': digits.target})
        source = ColumnDataSource(data=frame)
        fig = bk.figure(title="n_neighbors = %d" % n_nb,
                        title_text_font_size='12pt',
                        plot_width=340, plot_height=300)
        fig.scatter(source=source, x='x', y='y', fill_color='color',
                    size=8, line_alpha=0.5, line_color='black')
        fig.add_tools(HoverTool(tooltips=[("target", "@target")],
                                snap_to_data=True))
        figs_in_row.append(fig)
    fig_grid.append(figs_in_row)
bk.show(bk.gridplot(fig_grid))
These visualizations show us that there is hope: even a simple classifier should be able to adequately identify the members of the various classes.
A canonical dataset used in Manifold learning is the S-curve, which we briefly saw in an earlier section:
%matplotlib inline
import matplotlib.pyplot as plt
X, y = datasets.make_s_curve(n_samples=1000)
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = plt.axes(projection='3d')
s = ax.scatter3D(X[:, 0], X[:, 1], X[:, 2], c=y, s=50, marker="o", cmap='GnBu')
fig.colorbar(s)
ax.view_init(10, -60)
This is a 2-dimensional dataset embedded in three dimensions, but it is embedded in such a way that PCA cannot discover the underlying data orientation:
# BUG FIX: materialize the palette -- in Python 3 `map` is a one-shot
# iterator, and `seqcolors` is reused by the Isomap and LLE cells below,
# where it would otherwise already be exhausted (empty).
seqcolors = list(map(pal.to_hex, sns.color_palette('GnBu', 50)))
# A linear PCA projection of the S-curve cannot "unroll" the manifold.
X_pca = decomposition.PCA(n_components=2).fit_transform(X)
fig = bk.figure(title='PCA Projection - S-Curve',
                x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=500)
fig.scatter(X_pca[:, 0], X_pca[:, 1],
            size=10, line_color='black', line_alpha=0.5,
            fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)
Manifold learning algorithms, however, available in the sklearn.manifold
submodule, are able to recover the underlying 2-dimensional manifold:
# Isomap recovers the underlying 2-D parametrisation of the S-curve.
isomap_embedder = manifold.Isomap(n_neighbors=25, n_components=2)
X_iso = isomap_embedder.fit_transform(X)
fig = bk.figure(title='Isomap - S-Curve', x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=400)
fig.scatter(X_iso[:, 0], X_iso[:, 1], size=8, alpha=0.8, line_color='black',
            fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)
LocallyLinearEmbedding
(LLE) is a local, nonlinear, nonparametric algorithm. The main idea behind the Local Algorithms is to make the local configurations of points in the low-dimensional space resemble the local configurations in the high-dimensional space.
# Modified LLE: preserve each point's local neighbourhood geometry while
# flattening the manifold into two dimensions.
lle_embedder = manifold.LocallyLinearEmbedding(n_neighbors=15, n_components=2,
                                               method='modified')
X_lle = lle_embedder.fit_transform(X)
fig = bk.figure(title='LocallyLinearEmbedding - S-Curve', x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=400)
fig.scatter(X_lle[:, 0], X_lle[:, 1],
            size=10, line_color='black', line_alpha=0.75,
            fill_color=pal.linear_map(y, seqcolors))
bk.show(fig)
t-SNE
gives more importance to local distances and less importance to non-local distances. In other words, it tries to keep points that are close in the original space close in the projected space, while neglecting the others.
Moreover t-SNE
has a probabilistic way to find pairwise local distances: it converts each high-dimensional similarity into the probability that one data point will pick another data point as its neighbor. This makes t-SNE
almost insensitive to bad feature scaling.
On the other hand, the local nature of t-SNE
makes it sensitive to the curse of dimensionality of the data.
# t-SNE embedding of the 64-dimensional digits data into 2-D.
tsne_model = manifold.TSNE(n_components=2, n_iter=500)
tsne_proj = tsne_model.fit_transform(digits.data)
tsne_df = pd.DataFrame({'x': tsne_proj[:, 0],
                        'y': tsne_proj[:, 1],
                        'color': pal.linear_map(digits.target, colors),
                        'target': digits.target})
tsne_src = ColumnDataSource(data=tsne_df)
fig = bk.figure(title='t-SNE - digits dataset', x_axis_label='c1', y_axis_label='c2',
                plot_width=750, plot_height=450)
fig.scatter(source=tsne_src, x='x', y='y', fill_color='color',
            size=8, line_color='black', line_alpha=0.50)
fig.add_tools(HoverTool(tooltips=[('target', '@target')], snap_to_data=True))
bk.show(fig)
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.