#!/usr/bin/env python # coding: utf-8 # In[ ]: from os import mkdir, path from math import cos, sin, pi, acos import time import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib import rc import seaborn as sns from sklearn.datasets import make_blobs from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn import metrics from sklearn.datasets import load_digits from sklearn import preprocessing get_ipython().run_line_magic('matplotlib', 'inline') # In[ ]: ############################################################################### # Plot initialization dirname = "../prebuiltimages/" if not path.exists(dirname): mkdir(dirname) imageformat = '.pdf' rc('font', **{'family': 'sans-serif', 'sans-serif': ['Computer Modern Roman']}) params = {'axes.labelsize': 16, 'font.size': 23, 'legend.fontsize': 12, 'xtick.labelsize': 10, 'ytick.labelsize': 10, 'text.usetex': True } plt.rcParams.update(params) plt.close("all") # In[ ]: ############################################################################### # display function: saving = False def my_saving_display(fig, dirname, filename, imageformat, bbox_inches='tight'): """"Saving with personal function.""" filename = filename.replace('.', 'pt') # remove "." to avoid floats issues if saving is True: dirname + filename + imageformat image_name = dirname + filename + imageformat # BEWARE: bbox_inches='tight' helps for legend outside of bbox fig.savefig(image_name, bbox_inches='tight') # In[ ]: ############################################################################### # PCA interpretation color_blind_list = sns.color_palette("colorblind", 8) centers = [(-5, -5), (0, 0), (5, 5)] n_samples = 50 n_features = 2 Z, _ = make_blobs(n_samples=n_samples, n_features=n_features, cluster_std=4.0, centers=centers, shuffle=False, random_state=42) scaler = StandardScaler(with_mean=True, with_std=True).fit(Z) X = scaler.transform(Z) my_orange = color_blind_list[2] sns.set_context("poster") sns.set_palette("colorblind") sns.axes_style() sns.set_style("white") s_large = 200 s_small = 50 # In[ ]: # Data scatterplot fig = plt.figure(00, figsize=(10, 5)) sub1 = fig.add_subplot(121) sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange, edgecolors='k', zorder=1) sub1.get_yaxis().set_ticks([]) sub1.get_xaxis().set_ticks([]) sub1.set_ylim([-3., 3.]) sub1.set_xlim([-3., 3.]) sub1.set_aspect('equal') sub1.set_xlabel(u'Data') sub2 = fig.add_subplot(122) # sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1) sub2.set_aspect('equal') sub2.set_ylim([0, 2.]) sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2]) sub2.set_xlabel('') sub2.set_ylabel('') sub2.get_yaxis().set_ticks([]) sub2.get_xaxis().set_ticks([]) plt.show() filename = 'fig_pca_axis_raw' my_saving_display(fig, dirname, filename, imageformat) # In[ ]: # with mean fig = plt.figure(00, figsize=(10, 5)) sub1 = fig.add_subplot(121) sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange, edgecolors='k', zorder=1) sub1.get_yaxis().set_ticks([]) sub1.get_xaxis().set_ticks([]) sub1.set_ylim([-3., 3.]) sub1.set_xlim([-3., 3.]) sub1.set_aspect('equal') sub1.set_xlabel(u'Data') sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=2, edgecolors='k') sub2 = fig.add_subplot(122) # sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1) sub2.set_aspect('equal') sub2.set_ylim([0, 2.]) sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2]) sub2.set_xlabel('') sub2.set_ylabel('') sub2.get_yaxis().set_ticks([]) sub2.get_xaxis().set_ticks([]) plt.show() plt.show() sub1.set_aspect('equal') sub1.set_xlabel(u'Data and mean') filename = 'fig_pca_axis_raw_mean' my_saving_display(fig, dirname, filename, imageformat) plt.show() # In[ ]: x_range = np.linspace(-2, 2, num=100) X_mean = X.mean(axis=0) thetas = np.linspace(- pi / 2, pi / 2, num=10) theta_grid = np.linspace(- pi / 2 - 0.2, pi / 2 + 0.2, num=200) var_grid = np.zeros(theta_grid.shape) f, (sub1, sub2) = plt.subplots(1, 2,figsize=(10, 5)) sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange, edgecolors='k', zorder=1) sub1.get_yaxis().set_ticks([]) sub1.get_xaxis().set_ticks([]) sub1.set_ylim([-3., 3.]) sub1.set_xlim([-3., 3.]) sub1.set_aspect('equal') sub1.set_xlabel(u'Data') filename = 'fig_pca_axis_raw' my_saving_display(fig, dirname, filename, imageformat) # with mean sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=2, edgecolors='k') sub1.set_aspect('equal') sub1.set_xlabel(u'Data and mean') filename = 'fig_pca_axis_raw_mean' my_saving_display(fig, dirname, filename, imageformat) sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1) sub2.set_aspect('equal') sub2.set_ylim([0, 2.]) sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2]) sub2.set_xlabel('Angle') sub2.set_ylabel('Variance') sub2.get_yaxis().set_ticks([]) sub2.get_xaxis().set_ticks([]) plt.show() for image_nb, theta in enumerate(theta_grid): rotation = np.asarray([cos(theta), sin(theta)]) var_grid[image_nb] = np.var(X.dot(rotation)) for image_nb, theta in enumerate(thetas): fig = plt.figure(image_nb, figsize=(10, 5)) sub1 = fig.add_subplot(121) sub1.set_aspect('equal') sub1.set_ylim([-3., 3.]) sub1.set_xlim([-3., 3.]) cos_var = np.asarray([cos(theta), -cos(theta)]) sin_var = np.asarray([sin(theta), -sin(theta)]) rotation = np.asarray([cos(theta), sin(theta)]) points_projected = np.outer(X.dot(rotation), rotation) for i in range(X.shape[0]): point_projected = points_projected[i] point_ini = X[i, :] sub1.plot([point_ini[0], point_projected[0]], [point_ini[1], point_projected[1]], '--k', linewidth=1, zorder=1) sub1.scatter(point_projected[0], point_projected[1], s=s_small, alpha=1, c='k', zorder=2) amplitude = 4 sub1.plot(amplitude * cos_var, amplitude * sin_var, 'k', linewidth=1, zorder=2) sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange, zorder=3, edgecolors='k') sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=4, edgecolors='k') sub1.get_yaxis().set_ticks([]) sub1.get_xaxis().set_ticks([]) sub1.set_xlabel(u'Data, mean and projection') var = np.var(X.dot(rotation)) sub2 = fig.add_subplot(122) sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1) sub2.set_aspect('equal') sub2.set_ylim([0, 2.]) sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2]) sub2.scatter(theta, var, c=my_orange, s=s_large, zorder=2, edgecolors='k') sub2.set_xlabel('Angle') sub2.set_ylabel('Variance') sub2.get_yaxis().set_ticks([]) sub2.get_xaxis().set_ticks([]) plt.show() filename = 'fig_pca_axis' + str(image_nb) my_saving_display(fig, dirname, filename, imageformat) # PCA itself pca = PCA(n_components=2) pca.fit(X) rotation = np.asarray([pca.components_[0][0], pca.components_[0][0]]) X_new = np.outer(X.dot(rotation), rotation) theta_opt = acos(pca.components_[0][0]) # In[ ]: fig = plt.figure(30, figsize=(10, 5)) sub1 = fig.add_subplot(121) sub1.scatter(X[:, 0], X[:, 1], s=s_small, alpha=1, c=my_orange, zorder=3, edgecolors='k') sub1.get_yaxis().set_ticks([]) sub1.get_xaxis().set_ticks([]) sub1.set_ylim([-3., 3.]) sub1.set_xlim([-3., 3.]) sub1.set_xlabel(u'Principal direction (main axis)') sub1.set_aspect('equal') for i in range(X.shape[0]): point_projected = X_new[i] point_ini = X[i, :] sub1.plot([point_ini[0], point_projected[0]], [point_ini[1], point_projected[1]], '--k', linewidth=1, zorder=2) sub1.scatter(point_projected[0], point_projected[1], s=s_small, alpha=1, c='k', zorder=3) amplitude = 4 cos_var = np.asarray([pca.components_[0][0], -pca.components_[0][0]]) sin_var = np.asarray([pca.components_[0][1], -pca.components_[0][0]]) sub1.plot(amplitude * cos_var, amplitude * sin_var, 'k', linewidth=1, zorder=2) sub1.scatter(0, 0, s=s_large, alpha=1, c='r', zorder=3, edgecolors='k') # BEWARE: normalisation is by 1 / (n_samples - 1) in sklearn: var_opt = pca.explained_variance_[0] * (n_samples - 1) / (n_samples) sub2 = fig.add_subplot(122) sub2.plot(theta_grid, var_grid, c='k', linewidth=3, zorder=1) sub2.set_aspect('equal') sub2.set_ylim([0, 2.]) sub2.set_xlim([- pi / 2 - 0.2, pi / 2 + 0.2]) sub2.scatter(theta_opt, var_opt, c=my_orange, s=s_large, zorder=2, edgecolors='k') sub2.set_xlabel('Angle') sub2.set_ylabel('Variance') sub2.get_yaxis().set_ticks([]) sub2.get_xaxis().set_ticks([]) filename = 'fig_pca_axis_opt' my_saving_display(fig, dirname, filename, imageformat) plt.show() # # Exploration avec la PCA de la base de données *Digits* # In[ ]: digits = load_digits() X = digits.data y = digits.target n_samples, n_features = X.shape n_digits = len(np.unique(y)) print("n_digits: %d, \t n_samples %d, \t n_features %d" % (n_digits, n_samples, n_features)) sns.set(style="white") # # Visualisation d'un element de la base sous forme d'image: # In[ ]: fig = plt.figure(figsize=(8, 3)) for i in range(10): ax = fig.add_subplot(2,5,i + 1) digit = X[i] ax.imshow(digit.reshape(8, 8), cmap=plt.cm.gray, interpolation="nearest") ax.axis('off') filename = 'fig_digits' my_saving_display(fig, dirname, filename, imageformat) transfo = PCA(n_components=2) X_2d = transfo.fit_transform(X) X_2d.shape # # Visualisation en 2D des données (projection sur les deux axes principaux) # In[ ]: sns.set_context("poster") sns.set_palette("Paired", n_colors=10) color_paired_list = sns.set_palette("Paired", n_colors=10) fig = plt.figure(figsize=(10, 8)) plt.plot(X_2d[:, 0], X_2d[:, 1], 'k.', markersize=8) filename = 'fig_pca_digits_wo_classes' my_saving_display(fig, dirname, filename, imageformat) # # Visualisation en 2D des données (projection sur les deux axes principaux) + coleur correspondant aux diverses classse # In[ ]: fig = plt.figure(figsize=(10, 8)) for k in range(10): Xk_2d = X_2d[y == k] plt.plot(Xk_2d[:, 0], Xk_2d[:, 1], '.', markersize=8, label=k) plt.legend(numpoints=1, loc=1, bbox_to_anchor=(1.2, 0.7), markerscale=3) plt.tight_layout() plt.show() filename = 'fig_pca_digits_w_classes' my_saving_display(fig, dirname, filename, imageformat) # # Une alternative non linéaire: t-distributed Stochastic Neighbor Embedding (t-SNE) # In[ ]: from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection) tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) X_tsne = tsne.fit_transform(X) # In[ ]: fig = plt.figure(figsize=(10, 8)) for k in range(10): X_tsne_int = X_tsne[y == k] plt.plot(X_tsne_int[:, 0], X_tsne_int[:, 1], '.', markersize=8, label=k) plt.legend(numpoints=1,loc=1, bbox_to_anchor=(1.2, 0.7), markerscale=3) plt.tight_layout() plt.show() filename = 'fig_tsne_digits_w_classes' my_saving_display(fig, dirname, filename, imageformat) # In[ ]: from PIL import Image data = np.random.random((100,100)) #Rescale to 0-255 and convert to uint8 fig = plt.figure(figsize=(8, 3)) for i in range(X.shape[0]): digit = X[i] rescaled = (255.0 / digit.max() * (digit - digit.min())).astype(np.uint8) rescaled = rescaled.reshape(8, 8) new_p = Image.fromarray(np.asarray(rescaled,dtype='uint8'),mode='L') new_p.save("images/digit"+str(i)+".png") # In[ ]: imgs_names = [] for i in range(X.shape[0]): imgs_names.append('images/digit'+str(i)+".png") print(imgs_names[0]) # In[ ]: import warnings warnings.filterwarnings('ignore') from bokeh.plotting import figure, output_file, show, ColumnDataSource from bokeh.models import HoverTool from bokeh.io import output_notebook output_notebook() # In[ ]: color_paired_list = sns.color_palette("Paired", 10).as_hex() colors = {i : color_paired_list[i] for i in range(10) } df = pd.DataFrame(dict(y=y)) df['colors'] = df['y'].apply(lambda x: colors[x]) # In[ ]: source = ColumnDataSource( data=dict( x=X_tsne[:, 0], y=X_tsne[:, 1], imgs=imgs_names, desc=y, color =df['colors'] ) ) hover = HoverTool( tooltips="""