#!/usr/bin/env python # coding: utf-8 # # MDI 720 : Statistiques # ## SVD # ### *Joseph Salmon* # # This notebook reproduces the pictures for the course "SVD" # In[ ]: import numpy as np import pandas as pd import statsmodels.api as sm import matplotlib.pyplot as plt from matplotlib import rc import seaborn as sns from os import mkdir, path from sklearn import decomposition import time get_ipython().run_line_magic('matplotlib', 'notebook') # # Plot initialization # # In[ ]: dirname = "../prebuiltimages/" if not path.exists(dirname): mkdir(dirname) imageformat = '.pdf' rc('font', **{'family': 'sans-serif', 'sans-serif': ['Computer Modern Roman']}) params = {'axes.labelsize': 12, 'font.size': 12, 'legend.fontsize': 12, 'xtick.labelsize': 10, 'ytick.labelsize': 10, 'text.usetex': True, 'figure.figsize': (8, 6)} plt.rcParams.update(params) sns.set_context("poster") sns.set_palette("colorblind") sns.axes_style() sns.set_style({'legend.frameon': True}) plt.close("all") # # Saving display function # In[ ]: saving = True def my_saving_display(fig, dirname, filename, imageformat): """"Saving with personal function.""" filename = filename.replace('.', 'pt') # remove "." to avoid floats issues if saving is True: dirname + filename + imageformat image_name = dirname + filename + imageformat fig.savefig(image_name) # # Compare # In[ ]: X = np.random.randn(9, 6) # Full SVD U, s, V = np.linalg.svd(X, full_matrices=True) U.shape, V.shape, s.shape S = np.zeros((9, 6), dtype=float) S[:6, :6] = np.diag(s) # test to numerical precision if 2 arguments are equal print(np.allclose(X, U.dot(S.dot(V)))) # Partial SVD U, s, V = np.linalg.svd(X, full_matrices=False) U.shape, V.shape, s.shape S = np.diag(s) # reshape to get a diagonal matrix # test to numerical precision if 2 arguments are equal print(np.allclose(X, U.dot(S.dot(V)))) # # SVD/OLS (un)stability # In[ ]: n_features = 6 n_samples = 10 x = 10. ** (-np.arange(n_samples,)) X = (np.column_stack([x ** (i) for i in range(n_features)])) U, s, V = np.linalg.svd(X, full_matrices=False) theta_true = np.ones([n_features, ]) y_true = np.dot(X, theta_true) err = np.zeros(n_samples,) err_delta = np.zeros(n_samples,) for i in range(1, n_samples): delta = 10. ** (-i) * (0.5 - np.random.rand(n_samples, )) * y_true y = y_true + delta w = np.dot(np.transpose(U), y) theta_hat = np.dot(V, w / s) err[i] = np.sqrt(np.sum((theta_hat - theta_true) ** 2)) err_delta[i] = np.sqrt(np.sum(delta ** 2)) sns.set_context("poster", font_scale=1.5) sns.set_style("ticks") fig1 = plt.figure(figsize=(14, 8)) ax1 = fig1.add_subplot(111) ax1.plot(err_delta, err, '*', markersize=20) ax1.set_yscale('log') ax1.set_xscale('log') sns.despine() ax1.set_xlabel(r"$\|\Delta\|_2$") ax1.set_ylabel(r"$\|\hat\theta^{\Delta}-\hat\theta\|_2$") plt.title(r"$s_1={0: .1e}, s_2={1:.1e}, s_3={2:.1e}, s_4={3: .1e}, s_5={4: .1e}, s_6={5: .1e}$".format(s[0], s[1], s[2], s[3], s[4], s[5]), fontsize=22) plt.tight_layout() plt.show() filename = "amplification_erreur" image_name = dirname + filename + imageformat fig1.savefig(image_name)