Notebook

In [ ]:

from os import mkdir, path
from math import cos, sin, pi, acos
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns

from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn import preprocessing
%matplotlib inline

In [ ]:

###############################################################################
# Plot initialization

dirname = "../prebuiltimages/"
if not path.exists(dirname):
    mkdir(dirname)

imageformat = '.pdf'
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Computer Modern Roman']})
params = {'axes.labelsize': 16,
          'font.size': 23,
          'legend.fontsize': 12,
          'xtick.labelsize': 10,
          'ytick.labelsize': 10,
          'text.usetex': True
          }
plt.rcParams.update(params)
plt.close("all")

In [ ]:

###############################################################################
# display function:

saving = False


def my_saving_display(fig, dirname, filename, imageformat, bbox_inches='tight'):
    """"Saving with personal function."""
    filename = filename.replace('.', 'pt')  # remove "." to avoid floats issues
    if saving is True:
        dirname + filename + imageformat
        image_name = dirname + filename + imageformat
        # BEWARE: bbox_inches='tight' helps for legend outside of bbox
        fig.savefig(image_name, bbox_inches='tight')

In [ ]:

###############################################################################
# PCA interpretation

color_blind_list = sns.color_palette("colorblind", 8)

centers = [(-5, -5), (0, 0), (5, 5)]
n_samples = 50
n_features = 2
Z, _ = make_blobs(n_samples=n_samples, n_features=n_features, cluster_std=4.0,
                  centers=centers, shuffle=False, random_state=42)

scaler = StandardScaler(with_mean=True, with_std=True).fit(Z)
X = scaler.transform(Z)


my_orange = color_blind_list[2]

sns.set_context("poster")
sns.set_palette("colorblind")
sns.axes_style()
sns.set_style("white")

s_large = 200
s_small = 50

In [ ]:

# Data scatterplot
fig = plt.figure(00, figsize=(10, 5))
sub1 = fig.add_subplot(121)
sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange,
             edgecolors='k', zorder=1)
sub1.get_yaxis().set_ticks([])
sub1.get_xaxis().set_ticks([])
sub1.set_ylim([-3., 3.])
sub1.set_xlim([-3., 3.])
sub1.set_aspect('equal')
sub1.set_xlabel(u'Data')

sub2 = fig.add_subplot(122)
# sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1)
sub2.set_aspect('equal')
sub2.set_ylim([0, 2.])
sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2])
sub2.set_xlabel('')
sub2.set_ylabel('')
sub2.get_yaxis().set_ticks([])
sub2.get_xaxis().set_ticks([])
plt.show()

filename = 'fig_pca_axis_raw'
my_saving_display(fig, dirname, filename, imageformat)

In [ ]:

# with mean
fig = plt.figure(00, figsize=(10, 5))
sub1 = fig.add_subplot(121)
sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange,
             edgecolors='k', zorder=1)
sub1.get_yaxis().set_ticks([])
sub1.get_xaxis().set_ticks([])
sub1.set_ylim([-3., 3.])
sub1.set_xlim([-3., 3.])
sub1.set_aspect('equal')
sub1.set_xlabel(u'Data')
sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=2, edgecolors='k')

sub2 = fig.add_subplot(122)
# sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1)
sub2.set_aspect('equal')
sub2.set_ylim([0, 2.])
sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2])
sub2.set_xlabel('')
sub2.set_ylabel('')
sub2.get_yaxis().set_ticks([])
sub2.get_xaxis().set_ticks([])
plt.show()
plt.show()
sub1.set_aspect('equal')
sub1.set_xlabel(u'Data and mean')
filename = 'fig_pca_axis_raw_mean'
my_saving_display(fig, dirname, filename, imageformat)
plt.show()

In [ ]:

x_range = np.linspace(-2, 2, num=100)
X_mean = X.mean(axis=0)

thetas = np.linspace(- pi / 2, pi / 2, num=10)
theta_grid = np.linspace(- pi / 2 - 0.2, pi / 2 + 0.2, num=200)
var_grid = np.zeros(theta_grid.shape)


f, (sub1, sub2) = plt.subplots(1, 2,figsize=(10, 5))
sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange,
             edgecolors='k', zorder=1)
sub1.get_yaxis().set_ticks([])
sub1.get_xaxis().set_ticks([])
sub1.set_ylim([-3., 3.])
sub1.set_xlim([-3., 3.])
sub1.set_aspect('equal')
sub1.set_xlabel(u'Data')
filename = 'fig_pca_axis_raw'
my_saving_display(fig, dirname, filename, imageformat)

# with mean
sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=2, edgecolors='k')
sub1.set_aspect('equal')
sub1.set_xlabel(u'Data and mean')
filename = 'fig_pca_axis_raw_mean'
my_saving_display(fig, dirname, filename, imageformat)

sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1)
sub2.set_aspect('equal')
sub2.set_ylim([0, 2.])
sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2])
sub2.set_xlabel('Angle')
sub2.set_ylabel('Variance')
sub2.get_yaxis().set_ticks([])
sub2.get_xaxis().set_ticks([])
plt.show()


for image_nb, theta in enumerate(theta_grid):
    rotation = np.asarray([cos(theta), sin(theta)])
    var_grid[image_nb] = np.var(X.dot(rotation))

for image_nb, theta in enumerate(thetas):
    fig = plt.figure(image_nb, figsize=(10, 5))
    sub1 = fig.add_subplot(121)
    sub1.set_aspect('equal')
    sub1.set_ylim([-3., 3.])
    sub1.set_xlim([-3., 3.])
    cos_var = np.asarray([cos(theta), -cos(theta)])
    sin_var = np.asarray([sin(theta), -sin(theta)])
    rotation = np.asarray([cos(theta), sin(theta)])
    points_projected = np.outer(X.dot(rotation), rotation)

    for i in range(X.shape[0]):
        point_projected = points_projected[i]
        point_ini = X[i, :]
        sub1.plot([point_ini[0], point_projected[0]],
                  [point_ini[1], point_projected[1]],
                  '--k', linewidth=1, zorder=1)
        sub1.scatter(point_projected[0], point_projected[1],
                     s=s_small, alpha=1, c='k', zorder=2)
        amplitude = 4
        sub1.plot(amplitude * cos_var, amplitude * sin_var, 'k', linewidth=1,
                  zorder=2)

    sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange, zorder=3,
                 edgecolors='k')
    sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=4, edgecolors='k')
    sub1.get_yaxis().set_ticks([])
    sub1.get_xaxis().set_ticks([])
    sub1.set_xlabel(u'Data, mean and projection')

    var = np.var(X.dot(rotation))

    sub2 = fig.add_subplot(122)
    sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1)
    sub2.set_aspect('equal')
    sub2.set_ylim([0, 2.])
    sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2])
    sub2.scatter(theta, var, c=my_orange, s=s_large, zorder=2, edgecolors='k')
    sub2.set_xlabel('Angle')
    sub2.set_ylabel('Variance')
    sub2.get_yaxis().set_ticks([])
    sub2.get_xaxis().set_ticks([])
    plt.show()
    filename = 'fig_pca_axis' + str(image_nb)
    my_saving_display(fig, dirname, filename, imageformat)


# PCA itself

pca = PCA(n_components=2)
pca.fit(X)
rotation = np.asarray([pca.components_[0][0], pca.components_[0][0]])
X_new = np.outer(X.dot(rotation), rotation)
theta_opt = acos(pca.components_[0][0])

In [ ]:

fig = plt.figure(30, figsize=(10, 5))
sub1 = fig.add_subplot(121)
sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange, zorder=3,
             edgecolors='k')
sub1.get_yaxis().set_ticks([])
sub1.get_xaxis().set_ticks([])
sub1.set_ylim([-3., 3.])
sub1.set_xlim([-3., 3.])
sub1.set_xlabel(u'Principal direction (main axis)')
sub1.set_aspect('equal')

for i in range(X.shape[0]):
    point_projected = X_new[i]
    point_ini = X[i, :]
    sub1.plot([point_ini[0], point_projected[0]],
              [point_ini[1], point_projected[1]],
              '--k', linewidth=1, zorder=2)
    sub1.scatter(point_projected[0], point_projected[1],
                 s=s_small, alpha=1, c='k', zorder=3)
    amplitude = 4
    cos_var = np.asarray([pca.components_[0][0], -pca.components_[0][0]])
    sin_var = np.asarray([pca.components_[0][1], -pca.components_[0][0]])
    sub1.plot(amplitude * cos_var, amplitude * sin_var, 'k', linewidth=1,
              zorder=2)


sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=3, edgecolors='k')

# BEWARE: normalisation is by 1 / (n_samples - 1) in sklearn:
var_opt = pca.explained_variance_[0] * (n_samples - 1) / (n_samples)

sub2 = fig.add_subplot(122)
sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1)
sub2.set_aspect('equal')
sub2.set_ylim([0, 2.])
sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2])
sub2.scatter(theta_opt, var_opt, c=my_orange, s=s_large, zorder=2,
             edgecolors='k')
sub2.set_xlabel('Angle')
sub2.set_ylabel('Variance')
sub2.get_yaxis().set_ticks([])
sub2.get_xaxis().set_ticks([])
filename = 'fig_pca_axis_opt'
my_saving_display(fig, dirname, filename, imageformat)
plt.show()

Exploration avec la PCA de la base de données Digits¶

In [ ]:

digits = load_digits()
X = digits.data
y = digits.target

n_samples, n_features = X.shape
n_digits = len(np.unique(y))

print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))
sns.set(style="white")

Visualisation d'un element de la base sous forme d'image:¶

In [ ]:

fig = plt.figure(figsize=(8, 3))
for i in range(10):
    ax = fig.add_subplot(2,5,i + 1)
    digit = X[i]
    ax.imshow(digit.reshape(8, 8), cmap=plt.cm.gray, interpolation="nearest")
    ax.axis('off')

filename = 'fig_digits'
my_saving_display(fig, dirname, filename, imageformat)    
    
transfo = PCA(n_components=2)
X_2d = transfo.fit_transform(X)
X_2d.shape

Visualisation en 2D des données (projection sur les deux axes principaux)¶

In [ ]:

sns.set_context("poster")
sns.set_palette("Paired", n_colors=10)
color_paired_list = sns.set_palette("Paired", n_colors=10)
fig = plt.figure(figsize=(10, 8))
plt.plot(X_2d[:, 0], X_2d[:, 1], 'k.', markersize=8)
filename = 'fig_pca_digits_wo_classes'
my_saving_display(fig, dirname, filename, imageformat)

Visualisation en 2D des données (projection sur les deux axes principaux) + coleur correspondant aux diverses classse¶

In [ ]:

fig = plt.figure(figsize=(10, 8))
for k in range(10):
    Xk_2d = X_2d[y == k]
    plt.plot(Xk_2d[:, 0], Xk_2d[:, 1], '.', markersize=8, label=k)
    plt.legend(numpoints=1, loc=1, bbox_to_anchor=(1.2, 0.7), markerscale=3)
plt.tight_layout()
plt.show()
filename = 'fig_pca_digits_w_classes'
my_saving_display(fig, dirname, filename, imageformat)

Une alternative non linéaire: t-distributed Stochastic Neighbor Embedding (t-SNE)¶

In [ ]:

from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
X_tsne = tsne.fit_transform(X)

In [ ]:

fig = plt.figure(figsize=(10, 8))
for k in range(10):
    X_tsne_int = X_tsne[y == k]
    plt.plot(X_tsne_int[:, 0], X_tsne_int[:, 1], '.', markersize=8, label=k)
    plt.legend(numpoints=1,loc=1, bbox_to_anchor=(1.2, 0.7), markerscale=3)
plt.tight_layout()
plt.show()
filename = 'fig_tsne_digits_w_classes'
my_saving_display(fig, dirname, filename, imageformat)

In [ ]:

from PIL import Image

data = np.random.random((100,100))

#Rescale to 0-255 and convert to uint8


fig = plt.figure(figsize=(8, 3))
for i in range(X.shape[0]):
    digit = X[i]    
    rescaled = (255.0 / digit.max() * (digit - digit.min())).astype(np.uint8)
    rescaled = rescaled.reshape(8, 8)
    new_p = Image.fromarray(np.asarray(rescaled,dtype='uint8'),mode='L')
    new_p.save("images/digit"+str(i)+".png")
    

In [ ]:

imgs_names = []
for i in range(X.shape[0]):
    imgs_names.append('images/digit'+str(i)+".png")
print(imgs_names[0])

In [ ]:

import warnings
warnings.filterwarnings('ignore')

from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook

output_notebook()

In [ ]:

color_paired_list = sns.color_palette("Paired", 10).as_hex()
colors = {i : color_paired_list[i] for i in range(10) }
df = pd.DataFrame(dict(y=y))
df['colors'] = df['y'].apply(lambda x: colors[x])

In [ ]:

source = ColumnDataSource(
        data=dict(
            x=X_tsne[:, 0],
            y=X_tsne[:, 1],
            imgs=imgs_names,
            desc=y,
            color =df['colors']
        )
    )
hover = HoverTool(
        tooltips="""
        <div>
            <div>
                <img
                    src="@imgs" height="80" alt="@imgs" width="80"
                    style="float: left; margin: 0px 15px 15px 0px;"
                    border="2"
                ></img>
            </div>
            <div>
                <span style="font-size: 17px; font-weight: bold;">@desc</span>
                <span style="font-size: 15px; color: #966;">[$index]</span>
            </div>
        </div>
        """
    )

p = figure(plot_width=400, plot_height=400, tools=[hover,'wheel_zoom, box_zoom'],
           title="Mouse over the dots to see the digits",)


p.circle('x', 'y', size=5, source=source, color='color')

show(p)

Timing comparisons for various SVD solvers¶

Adapted from http://blog.explainmydata.com/2016/01/how-much-faster-is-truncated-svd.html

In [ ]:

RANK = 99
N_COLS = 2000
min_rows = 100
max_rows = 2000

Define useful functions for SVD solvers comparisons¶

In [ ]:

def evaluate_svd(svd_fn, reconstruct_fn, min_rows=100, max_rows=1000,
                 n_samples=10, n_cols=N_COLS, rank=RANK, random_seed=0):
    """ SVD evaluation:
    Return n_rows, time_elimes, errors
    """
    np.random.seed(random_seed)
    time_elimes = []
    errors = []
    n_rows_array = (np.linspace(min_rows, max_rows, num=n_samples)).astype(int)

    for n_rows in n_rows_array:
        # construct a low-rank matrix
        left = np.random.randn(n_rows, rank)
        right = np.random.randn(rank, n_cols)
        full = np.dot(left, right)

        # how long does it take to perform the SVD?
        start_t = time.time()
        svd_outputs = svd_fn(full)
        end_t = time.time()
        time_el = end_t - start_t
        time_elimes.append(time_el)

        # compute mean absolte error of reconstruction
        reconstructed = reconstruct_fn(svd_outputs)
        diff = full - reconstructed
        mae = np.mean(np.abs(diff))
        errors.append(mae)
        #print("n_rows=%d , time = %0.4f, MAE = %0.8f" % (n_rows, time_el, mae))
        print("n_rows=%d , time = %0.4f" % (n_rows, time_el))
    max_error = np.max(errors)
    print("Max Error=%f" % max_error)
    assert max_error < 0.0000001
    return n_rows_array, time_elimes, errors


# Full SVD with NumPy
def np_svd(X):
    """
    Compute SVD with numpy method
    """
    return np.linalg.svd(X, full_matrices=False, compute_uv=True)


def np_inv_svd(svd_outputs):
    """
    Compute reconstruction from SVD with numpy method
    """
    U, s, V = svd_outputs
    return np.dot(U, np.dot(np.diag(s), V))


# Truncated SVD with scikit-learn
def skl_svd(X, rank=RANK):
    """
    Compute SVD with skl method
    """
    tsvd = decomposition.TruncatedSVD(rank)
    X_reduced = tsvd.fit_transform(X)
    return (tsvd, X_reduced)


def skl_inv_svd(svd_outputs):
    """
    Compute reconstruction from SVD with skl method
    """
    tsvd, X_reduced = svd_outputs
    return tsvd.inverse_transform(X_reduced)


def skl_rand_svd(X, rank=RANK):
    """
    Compute approximated SVD with skl method (randomized algorithm)
    """
    tsvd = decomposition.TruncatedSVD(rank, algorithm="randomized",
                                      n_iter=1)
    X_reduced = tsvd.fit_transform(X)
    return (tsvd, X_reduced)


def skl_arpack_svd(X, rank=RANK):
    """
    Compute approximated SVD with skl method (Arpack algorithm)
    """
    tsvd = decomposition.TruncatedSVD(rank, algorithm="arpack")
    X_reduced = tsvd.fit_transform(X)
    return (tsvd, X_reduced)

Perform timings:¶

In [ ]:

n_rows, np_times, np_errors = evaluate_svd(np_svd, np_inv_svd,
                                           min_rows=min_rows, max_rows=max_rows,
                                           n_samples=10, n_cols=N_COLS, rank=RANK)

In [ ]:

n_rows, skl_rand_times, skl_rand_err = evaluate_svd(skl_rand_svd, skl_inv_svd,
                                           min_rows=min_rows, max_rows=max_rows,
                                           n_samples=10, n_cols=N_COLS, rank=RANK)

In [ ]:

n_rows, skl_arpack_times, skl_arpack_err = evaluate_svd(skl_arpack_svd, skl_inv_svd,
                                           min_rows=min_rows, max_rows=max_rows,
                                           n_samples=10, n_cols=N_COLS, rank=RANK)

Display¶

In [ ]:

sns.set_palette("colorblind")
sns.axes_style()
sns.set_style({'legend.frameon': True})

fig = plt.figure(figsize=(10, 10))
plt.xlim(min_rows, max_rows)
plt.ylim(0, np.max([np_times, skl_rand_times, skl_arpack_times]))

x_axis = pd.Series(n_rows, name="$n$: number of rows")

sns.regplot(x=x_axis, y=pd.Series(np_times, name="elapsed time (s)"))
sns.regplot(x=x_axis, y=pd.Series(skl_rand_times, name="elapsed time (s)"))
sns.regplot(x=x_axis, y=pd.Series(skl_arpack_times, name="elapsed time (s)"))


plt.legend(("numpy.linalg.svd", "TruncatedSVD (rand)",
            "TruncatedSVD (arpack)"), loc='upper left')
plt.title("Time to perform SVD:" +
          " rank = {0} with $p=${1} columns".format(RANK, N_COLS))
plt.show()
filename = 'timinng_svd'
my_saving_display(fig, dirname, filename, imageformat)