#!/usr/bin/env python
# coding: utf-8

# Linear Regression: Ridge and Lasso
# ====================================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This tour studies the linear regression method in conjunction with
# regularization. It contrasts ridge regression and the Lasso.
#
# We recommend that after doing this Numerical Tour, you apply it to your
# own data, for instance using a dataset from [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/) or [Kaggle](https://www.kaggle.com/).
#
# _Disclaimer:_ these machine learning tours are intended to be
# overly-simplistic implementations and applications of baseline machine learning methods.
# For more advanced uses and implementations, we recommend
# using a state-of-the-art library, the most well known being
# [Scikit-Learn](http://scikit-learn.org/).

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import matplotlib.pyplot as plt

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[58]:

# Use this code to read from a CSV file.
# import pandas as pd
# U = pd.read_csv('myfile.csv')
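# As a minimal sketch (not part of the original tour) of how a CSV table could be
# turned into the feature/value arrays used below, the cell constructs a tiny
# DataFrame in memory as a stand-in for `pd.read_csv`; the column name 'target'
# and the feature names are hypothetical placeholders, to be adapted to your own data.

# In[ ]:

# import pandas as pd
# df = pd.DataFrame({'feat1': [1.0, 2.0, 3.0],
#                    'feat2': [0.5, 0.1, 0.7],
#                    'target': [1.2, 2.3, 3.1]})   # stands in for pd.read_csv('myfile.csv')
# y_csv = df['target'].values.reshape(-1, 1)        # column vector of values to predict
# A_csv = df.drop(columns=['target']).values        # feature matrix, one row per sample
# print(A_csv.shape, y_csv.shape)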
# Useful functions to convert to column/row vectors.

# In[2]:

# convert to a column vector
def MakeCol(y): return y.reshape(-1,1)
# convert to a row vector
def MakeRow(y): return y.reshape(1,-1)
# find the indices of non-zero/true elements
def find(x): return np.nonzero(x)[0]

# Dataset Loading
# ---------------
# We test the method on the prostate dataset of $n=97$ samples with
# features $x_i \in \RR^p$ in dimension $p=8$. The goal is to predict the
# value $y_i \in \RR$.

# Load the dataset.

# In[3]:

from scipy import io
name = 'prostate'
Data = io.loadmat('nt_toolbox/data/ml-' + name)
Ay = Data['A']
class_names = Data['class_names']

# Randomly permute it.

# In[4]:

Ay = Ay[np.random.permutation(Ay.shape[0]),:]

# Separate the features $A$ from the values $y$ to predict, and the train/test indicator $c$.

# In[5]:

A_full = Ay[:,0:-2]
y_full = MakeCol( Ay[:,-2] )
c = MakeCol( Ay[:,-1] )

# Split into training and testing sets.

# In[6]:

I0 = find(c==1) # train
I1 = find(c==0) # test
n = I0.size
n1 = I1.size
A = A_full[I0,:]
y = y_full[I0]
A1 = A_full[I1,:]
y1 = y_full[I1]

# $n$ is the number of training samples, $p$ is the dimensionality of the features.

# In[7]:

[n,p] = A.shape
print(n,p)

# Normalize the features by the mean and standard deviation of the *training* set.
# This is optional.

# In[8]:

mA = A.mean(axis=0)
sA = A.std(axis=0)
A = (A-mA)/sA
A1 = (A1-mA)/sA

# Remove the mean (computed from the *training* set) to avoid introducing a bias term and a constant regressor.
# This is optional.

# In[9]:

m = y.mean()
y = y-m
y1 = y1-m

# Dimensionality Reduction and PCA
# --------------------------------
# In order to display the data in 2-D or 3-D, dimensionality reduction is needed.
# The simplest method is principal component analysis (PCA), which performs an
# orthogonal linear projection on the principal axes (eigenvectors) of the
# covariance matrix.
#
# Display the covariance matrix of the training set.

# In[10]:

C = A.transpose().dot(A)
plt.imshow(C);

# Display the inner product of each feature with the value $y$ to predict.

# In[11]:

u = A.transpose().dot(y)
plt.clf()
plt.bar(np.arange(1,p+1),u.flatten())
plt.axis('tight');

# Compute the PCA ortho-basis and
# the features expressed in the PCA basis.

# In[12]:

U, s, V = np.linalg.svd(A)
Ar = A.dot( V.transpose() )

# Plot the singular values (square roots of the covariance eigenvalues).

# In[13]:

plt.plot(s, '.-');

# Display the features pairwise.

# In[14]:

pmax = min(p,8)
k = 0
plt.clf()
for i in np.arange(0,pmax):
    for j in np.arange(0,pmax):
        k = k+1
        plt.subplot(pmax,pmax,k)
        if i==j:
            plt.hist(A[:,i],6)
            plt.axis('tight')
        else:
            plt.plot(A[:,j],A[:,i], '.')
            plt.axis('tight')
        if i==0:  # label the top row with the feature names
            plt.title(class_names[0][j][0])
        plt.tick_params(axis='x', labelbottom=False)
        plt.tick_params(axis='y', labelleft=False)

# Display the point cloud of feature vectors in the 2-D PCA space.

# In[15]:

plt.plot(Ar[:,0], Ar[:,1], '.')
plt.axis('equal');

# 1-D plot of the value to regress along the main eigenvector axes.

# In[16]:

plt.clf()
for i in np.arange(0,3):
    plt.subplot(3,1,i+1)
    plt.plot(Ar[:,i], y, '.')
    plt.axis('tight')

# Linear Regression
# -----------------
# We look for a linear relationship
# $ y_i \approx \dotp{x}{a_i} $
# written in matrix form as
# $ y = A x $
# where the rows of $A \in \RR^{n \times p}$ store the features $a_i \in \RR^p$.
#
# Since here $n > p$, this is an over-determined system, which can be
# solved in the least squares sense
# $$ \umin{ x } \norm{Ax-y}^2 $$
# whose solution is given using the Moore-Penrose pseudo-inverse
# $$ x = (A^\top A)^{-1} A^\top y. $$
#
# Compute the least squares solution.

# In[17]:

x = np.linalg.solve( A.transpose().dot(A), A.transpose().dot(y) )
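# As a quick sanity check (not part of the original tour), the normal-equation
# solution can be compared against NumPy's built-in least squares solver on
# synthetic data; the matrix, weights and noise level below are made up for
# illustration only.

# In[ ]:

rng = np.random.default_rng(0)
A_syn = rng.standard_normal((50, 4))                       # 50 samples, 4 features
x_true = np.array([[1.0], [-2.0], [0.5], [3.0]])           # ground-truth weights
y_syn = A_syn.dot(x_true) + 0.01*rng.standard_normal((50, 1))
x_ne = np.linalg.solve(A_syn.T.dot(A_syn), A_syn.T.dot(y_syn))  # normal equations
x_ls = np.linalg.lstsq(A_syn, y_syn, rcond=None)[0]             # built-in solver
print(np.allclose(x_ne, x_ls))                             # both give the same solution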
# Display the prediction $A_1 x$ on the test set, compared with the true values $y_1$.

# In[18]:

plt.clf()
plt.plot( A1.dot(x), '.-' )
plt.plot( y1, '.-' )
plt.axis('tight')
plt.legend(('$A_1 x$', '$y_1$'));

# Relative prediction error on the testing set.

# In[19]:

E = np.linalg.norm(A1.dot(x)-y1) / np.linalg.norm(y1)
print( 'Relative prediction error: ' + str(E) );

# Although this is not an efficient way to solve this problem (a better approach is, for instance, the conjugate gradient method), one can also perform a gradient descent to minimize the function
# $$
# \umin{x} f(x) \eqdef \frac{1}{2}\norm{A x-y}^2.
# $$

# In[20]:

def f(x):
    return 1/2*np.linalg.norm(A.dot(x)-y)**2

# The gradient of $f$ is
# $$
# \nabla f(x) = A^\top (Ax - y).
# $$

# In[21]:

def Gradf(x):
    return A.transpose().dot(A.dot(x)-y)

# The maximum step size allowed for gradient descent is
# $$
# \tau \leq \tau_{\max} \eqdef \frac{2}{\norm{AA^\top}_{op}}
# $$
# where $\norm{\cdot}_{op}$ is the operator norm (the largest singular value).

# In[22]:

tau = 1/np.linalg.norm(A,2)**2

# Initialize the algorithm to $x=0$.

# In[23]:

x = np.zeros((p,1))

# One step of gradient descent reads:
# $$ x \leftarrow x - \tau \nabla f(x). $$

# In[24]:

x = x - tau*Gradf(x)

# In[25]:

tau_mult = [.1, .5, 1, 1.8]  # step size multipliers tested in the exercise below

# __Exercise 0:__ Display the evolution of the training error $f(x)$ as a function of the number of iterations. Test for different values of $\tau$.

# In[26]:

run -i nt_solutions/ml_2_regression/exo0

# The optimal step size to minimize a quadratic function $\dotp{C x}{x}$ is
# $$
# \tau_{\text{opt}} = \frac{2}{\sigma_{\min}(C) + \sigma_{\max}(C)}.
# $$

# In[27]:

C = A.transpose().dot(A)
tau_opt = 2 / ( np.linalg.norm(C,2) + np.linalg.norm(C,-2) )
print( 'Optimal tau = ' + str( tau_opt * np.linalg.norm(A,2)**2 ) + ' / |AA^T|' );

# Stochastic Gradient Method
# ==========================
#
# We use SGD (which is *not* actually a descent algorithm) to minimize the quadratic risk
# $$ \umin{x} f(x) \eqdef \frac{1}{n} \sum_{i=1}^n f_i(x) = \frac{1}{n} \sum_{i=1}^n \frac{1}{2} ( \dotp{x}{a_i}-y_i )^2 $$
# where we used
# $$ f_i(x) \eqdef \frac{1}{2} ( \dotp{x}{a_i}-y_i )^2. $$
# The algorithm reads
# $$ x_{k+1} = x_k - \tau_k \nabla f_{i_k}(x_k) $$
# where at each iteration $i_k$ is drawn in $\{1,\ldots,n\}$ uniformly at random.

# In[28]:

x = np.zeros((p,1))

# Draw $i_k$ at random.

# In[29]:

ik = int( np.floor(np.random.rand()*n) )

# Compute $\nabla f_{i_k}(x)$.

# In[30]:

gk = (A[ik,:].dot(x)-y[ik]) * MakeCol( A[ik,:] )  # MakeCol gives gk the same (p,1) shape as x

# Set the step size $\tau_k$ (for convergence it should tend to 0) and perform the update.

# In[31]:

tauk = 1/np.linalg.norm(A,2)**2
x = x - tauk * gk

# __Exercise SGD1:__ Test different *fixed* step sizes $\tau_k=\tau$.

# In[32]:

run -i nt_solutions/ml_2_regression/exo_sgd1.py

# __Exercise SGD2:__ Average over different runs to see the impact of the step size.

# In[33]:

run -i nt_solutions/ml_2_regression/exo_sgd2.py

# __Exercise SGD3:__ Use a decaying step size $\tau_k=\frac{\tau_0}{1+k/k_0}$.

# In[34]:

run -i nt_solutions/ml_2_regression/exo_sgd3.py

# __Exercise SGD4:__ Use a decaying step size $\tau_k=\frac{\tau_0}{1+\sqrt{k/k_0}}$ and average the iterates $\frac{1}{K}\sum_{k<K} x_k$.

# In[35]:

run -i nt_solutions/ml_2_regression/exo_sgd4.py

# Ridge Regression
# ----------------
# Regularization is obtained by penalizing the $\ell^2$ norm of the weights (ridge regression, also known as Tikhonov regularization)
# $$ \umin{x} \norm{Ax-y}^2 + \la \norm{x}^2 $$
# where $\lambda>0$ is the regularization parameter.
#
#
# The solution is given by the following equivalent formulas
# $$ x = (A^\top A + \lambda \text{Id}_p )^{-1} A^\top y, $$
# $$ x = A^\top ( AA^\top + \lambda \text{Id}_n)^{-1} y. $$
# When $p < n$ (as is the case here) the first formula is cheaper, since it only requires inverting a $p \times p$ matrix; when $n < p$ the second one should be preferred.
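# A minimal sketch (not from the original code) of how the ridge solution could be
# computed from the two formulas above, reusing the training/test arrays defined
# earlier; the value of $\lambda$ is an arbitrary example, not a tuned choice.

# In[ ]:

lambda_demo = 1.0                                            # example regularization strength
x_ridge = np.linalg.solve( A.transpose().dot(A) + lambda_demo*np.eye(p),
                           A.transpose().dot(y) )            # (A'A + lambda Id_p)^{-1} A'y
x_ridge2 = A.transpose().dot( np.linalg.solve( A.dot(A.transpose()) + lambda_demo*np.eye(n), y ) )
print( np.allclose(x_ridge, x_ridge2) )                      # the two formulas agree
E_ridge = np.linalg.norm(A1.dot(x_ridge)-y1) / np.linalg.norm(y1)
print( 'Relative prediction error (ridge): ' + str(E_ridge) )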