#!/usr/bin/env python
# coding: utf-8

# Linear Regression
# ====================================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This tour studies the linear regression method in conjunction with
# regularization.
# It contrasts ridge regression and the Lasso.
# We recommend that, after completing this Numerical Tour, you apply it to your
# own data, for instance using a publicly available dataset.
#
# _Disclaimer:_ these machine learning tours are intended to be
# overly-simplistic implementations and applications of baseline machine learning methods.
# For more advanced uses and implementations, we recommend
# using a state-of-the-art library such as scikit-learn.

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Useful functions to convert to column/row vectors.

# In[3]:

# convert to a column vector
def MakeCol(y): return y.reshape(-1,1)
# convert to a row vector
def MakeRow(y): return y.reshape(1,-1)
# find non-zero/true elements
def find(x): return np.nonzero(x)[0]
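# As a quick illustration of these helpers (the array `v` below is a made-up
# example, not part of the dataset): `MakeCol`/`MakeRow` only reshape, and
# `find` returns the indices of the non-zero (or `True`) entries; `find` is
# used later to build the train/test split.

# In[ ]:

v = np.array([1., 2., 3.])
print(MakeCol(v).shape)              # (3, 1)
print(MakeRow(v).shape)              # (1, 3)
print(find(np.array([0, 1, 0, 2])))  # indices of non-zero entries: [1 3]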
# Dataset Loading
# ---------------
# We test the method on the prostate dataset of $n=97$ samples, with
# features $x_i \in \RR^p$ of dimension $p=8$. The goal is to predict a scalar
# value $y_i \in \RR$.
#
# Load the dataset.

# In[4]:

from scipy import io
name = 'prostate';
U = io.loadmat('nt_toolbox/data/ml-' + name)
A = U['A']
class_names = U['class_names']

# Randomly permute it.

# In[5]:

A = A[np.random.permutation(A.shape[0]),:]

# Separate the features $X$ from the values $y$ to predict, and the
# train/test indicator $c$.

# In[6]:

X = A[:,0:-2];
y = MakeCol( A[:,-2] )
c = MakeCol( A[:,-1] )

# $n$ is the number of samples, $p$ is the dimensionality of the features.

# In[7]:

[n,p] = X.shape
print(n,p)

# Split into training and testing sets.

# In[8]:

I0 = find(c==1) # train
I1 = find(c==0) # test
n0 = I0.size
n1 = n-n0
X0 = X[I0,:]
y0 = y[I0]
X1 = X[I1,:]
y1 = y[I1]

# Normalize the features by the mean and std of the *training* set.
# This is optional.

# In[9]:

mX0 = X0.mean(axis=0)
sX0 = X0.std(axis=0)
X0 = (X0-mX0)/sX0
X1 = (X1-mX0)/sX0

# Remove the mean (computed from the *training* set) to avoid introducing a bias term and a constant regressor.
# This is optional.

# In[10]:

m0 = y0.mean()
y0 = y0-m0
y1 = y1-m0

# Dimensionality Reduction and PCA
# --------------------------------
# In order to display the data in 2-D or 3-D, dimensionality reduction is needed.
# The simplest method is principal component analysis (PCA), which performs an
# orthogonal linear projection on the principal axes (eigenvectors) of the
# covariance matrix.
#
#
# Display the covariance matrix of the training set.

# In[10]:

C = X0.transpose().dot(X0)
plt.imshow(C);

# Display the correlation (up to a scaling) between each feature and the value $y$ to predict.

# In[11]:

u = X0.transpose().dot(y0)
plt.clf()
plt.bar(np.arange(1,p+1),u.flatten())
plt.axis('tight');

# Compute the PCA ortho-basis and the features in this basis.

# In[12]:

# note: np.linalg.svd returns V transposed as its third output,
# so X0.dot(V.transpose()) projects the samples on the principal axes
U, s, V = np.linalg.svd(X0)
X0r = X0.dot( V.transpose() )

# Plot the singular values of $X_0$ (the square roots of the covariance eigenvalues).

# In[13]:

plt.plot(s, '.-');

# Display the features.

# In[14]:

pmax = min(p,8)
k = 0
plt.clf()
for i in np.arange(0,pmax):
    for j in np.arange(0,pmax):
        k = k+1
        plt.subplot(pmax,pmax,k)
        if i==j:
            plt.hist(X0[:,i],6)
            plt.axis('tight')
        else:
            plt.plot(X0[:,j],X0[:,i], '.')
            plt.axis('tight')
        if i==0:  # put the feature names on the top row
            plt.title(class_names[0][j][0])
        plt.tick_params(axis='x', labelbottom=False)
        plt.tick_params(axis='y', labelleft=False)

# Display the point cloud of feature vectors in the 2-D PCA space.

# In[15]:

plt.plot(X0r[:,0], X0r[:,1], '.')
plt.axis('equal');

# 1-D plots of the value to regress along the main eigenvector axes.

# In[16]:

plt.clf()
for i in np.arange(0,3):
    plt.subplot(3,1,i+1)
    plt.plot(X0r[:,i], y0, '.')
    plt.axis('tight')

# Linear Regression
# -----------------
# We look for a linear relationship
# $ y_i = \dotp{w}{x_i} $,
# written in matrix format
# $ y = X w $,
# where the rows of $X \in \RR^{n \times p}$ store the features $x_i \in \RR^p$.
#
#
# Since here $ n > p $, this is an over-determined system, which can
# be solved in the least squares sense
# $$ \umin{ w } \norm{Xw-y}^2, $$
# whose solution is given using the Moore-Penrose pseudo-inverse
# $$ w = (X^\top X)^{-1} X^\top y. $$
#
#
# Least squares solution.

# In[11]:

w = np.linalg.solve( X0.transpose().dot(X0), X0.transpose().dot(y0) )

# Display the prediction $X_1 w$ on the test set together with the true values $y_1$.

# In[12]:

plt.clf()
plt.plot( X1.dot(w), '.-' )
plt.plot( y1, '.-' )
plt.axis('tight')
plt.legend(('$X_1 w$', '$y_1$'));

# Relative prediction error on the test set.

# In[13]:

E = np.linalg.norm(X1.dot(w)-y1) / np.linalg.norm(y1)
print(( 'Relative prediction error: ' + str(E) ) );
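# As a quick cross-check (not part of the original tour), the closed-form
# solution above can be compared with NumPy's built-in least-squares solver,
# which should return the same $w$ up to numerical precision.

# In[ ]:

# np.linalg.lstsq directly solves min_w ||X0 w - y0||^2
w_lstsq, _, _, _ = np.linalg.lstsq(X0, y0, rcond=None)
print(np.allclose(w, w_lstsq))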
# Although this is not an efficient method to solve this problem (a better
# approach is, for instance, the conjugate gradient method), one can use a
# gradient descent to minimize the function
# $$ \umin{w} J(w) = \frac{1}{2}\norm{X w-y}^2. $$

# In[14]:

def J(w): return 1/2*np.linalg.norm(X0.dot(w)-y0)**2

# The gradient of $J$ is
# $$ \nabla J(w) = X^\top (X w - y). $$

# In[15]:

def GradJ(w): return X0.transpose().dot(X0.dot(w)-y0)

# The maximum step size allowed by gradient descent is
# $$ \tau \leq \tau_{\max} \eqdef \frac{2}{\norm{X^\top X}_{op}} $$
# where $\norm{\cdot}_{op}$ is the operator norm (largest eigenvalue).

# In[16]:

tau = 1/np.linalg.norm(X0,2)**2  # = 1/||X0^T X0||_op, i.e. tau_max/2

# Initialize the algorithm to $w=0$.

# In[17]:

w = np.zeros((p,1))

# One step of gradient descent reads:
# $$ w \leftarrow w - \tau \nabla J(w). $$

# In[18]:

w = w - tau*GradJ(w)

# Step size multipliers tested in the exercise below.

# In[19]:

tau_mult = [.1, .5, 1, 1.8]

# __Exercise 0__
#
# Display the evolution of the training error $J(w)$ as a function of the number of iterations.
# Test for different values of $\tau$.

# In[26]:

get_ipython().run_line_magic('run', '-i nt_solutions/ml_2_regression/exo0')

# The optimal step size to minimize a quadratic function $\dotp{C w}{w}$ is
# $$ \tau_{\text{opt}} = \frac{2}{\sigma_{\min}(C) + \sigma_{\max}(C)}. $$

# In[27]:

C = X0.transpose().dot(X0)
tau_opt = 2 / ( np.linalg.norm(C,2) + np.linalg.norm(C,-2) )
print(( 'Optimal tau = ' + str( tau_opt * np.linalg.norm(X0,2)**2 ) ) + ' / |X_0 X_0^T|' );

# Stochastic Gradient Method
# ==========================
#
# We use SGD (which is *not* actually a descent algorithm) to minimize the quadratic risk
# $$ \umin{w} J(w) \eqdef \frac{1}{n} \sum_{i=1}^n f_i(w) = \frac{1}{n} \sum_{i=1}^n \frac{1}{2} ( \dotp{w}{x_i}-y_i )^2 $$
# where we define
# $$ f_i(w) \eqdef \frac{1}{2} ( \dotp{w}{x_i}-y_i )^2. $$
# The algorithm reads
# $$ w_{k+1} = w_k - \tau_k \nabla f_{i_k}(w_k) $$
# where at each iteration $i_k$ is drawn in $\{1,\ldots,n\}$ uniformly at random.

# In[20]:

w = np.zeros((p,1))

# Draw $i_k$ at random.

# In[21]:

ik = int( np.floor(np.random.rand()*n0) )

# Compute $\nabla f_{i_k}(w_k)$.

# In[22]:

gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )  # stochastic gradient, as a column vector

# Set the step size $\tau_k$ (for convergence it should decay to 0) and perform the update.

# In[31]:

tauk = 1/np.linalg.norm(X0,2)**2
w = w - tauk * gk

# Test different *fixed* step sizes $\tau_k=\tau$.

# In[56]:

wopt = np.linalg.solve( X0.transpose().dot(X0), X0.transpose().dot(y0) ) # least squares solution
niter = 5000
Jlist = np.zeros((niter,1))
tau_mult = [.05, .3, .8];
for itau in np.arange(0,len(tau_mult)):
    tauk = tau_mult[itau]/np.linalg.norm(X0,2)**2
    w = np.zeros((p,1))
    for i in np.arange(0,niter):
        ik = int( np.floor(np.random.rand()*n0) )
        gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )  # stochastic gradient
        # gk = X0.transpose().dot( X0.dot(w)-y0 )  # batch gradient
        w = w - tauk * gk
        Jlist[i] = J(w)
    plt.plot(Jlist/J(wopt)-1)

# Average over several runs to see the impact of the step size.

# In[67]:

niter = 8000
tau_mult = [.05, .8];
nruns = 10 # number of runs to compute the average performance
for itau in np.arange(0,len(tau_mult)):
    tauk = tau_mult[itau]/np.linalg.norm(X0,2)**2
    Jlist = np.zeros((niter,1))
    for iruns in np.arange(0,nruns):
        w = np.zeros((p,1))
        for i in np.arange(0,niter):
            ik = int( np.floor(np.random.rand()*n0) )
            gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )
            w = w - tauk * gk
            Jlist[i] = Jlist[i] + J(w)
    plt.plot( (Jlist/nruns)/J(wopt)-1 )

# Use a decaying step size $\tau_k=\frac{\tau_0}{1+k/k_0}$.

# In[68]:

niter = 20000
Jlist = np.zeros((niter,1))
w = np.zeros((p,1))
for i in np.arange(0,niter):
    tauk = 1/np.linalg.norm(X0,2)**2 * 1/(1+i/10)
    ik = int( np.floor(np.random.rand()*n0) )
    gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )
    w = w - tauk * gk
    Jlist[i] = J(w)
plt.plot(Jlist/J(wopt)-1 );
print( Jlist[-1]/J(wopt)-1 )

# Use a decaying step size $\tau_k=\frac{\tau_0}{1+\sqrt{k/k_0}}$ and average the
# iterates $\frac{1}{K}\sum_{k<K} w_k$.
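# A minimal sketch of this averaging scheme (an illustrative implementation:
# the values of $\tau_0$ and $k_0$ below are arbitrary choices, not taken from
# the tour).

# In[ ]:

niter = 20000
w = np.zeros((p,1))
w_avg = np.zeros((p,1))   # running average (1/K) sum_{k<K} w_k
tau0 = 1/np.linalg.norm(X0,2)**2
k0 = 10
for i in np.arange(0,niter):
    tauk = tau0 / (1+np.sqrt(i/k0))
    ik = int( np.floor(np.random.rand()*n0) )
    gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )
    w = w - tauk * gk
    w_avg = w_avg + (w - w_avg)/(i+1)   # incremental mean of the iterates
print( J(w_avg)/J(wopt)-1 )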
# Ridge Regression
# ----------------
# Regularization is obtained by introducing a penalty. When the penalty is the squared
# $\ell^2$ norm, the method is called ridge regression, and is defined as
# $$ \umin{w} \norm{Xw-y}^2 + \lambda \norm{w}^2 $$
# where $\lambda>0$ is the regularization parameter.
#
#
# The solution is given using the following equivalent formulas
# $$ w = (X^\top X + \lambda \text{Id}_p )^{-1} X^\top y, $$
# $$ w = X^\top ( XX^\top + \lambda \text{Id}_n)^{-1} y. $$
# When $p < n$ (as is the case here), the first formula should be preferred, since it only
# requires inverting a $p \times p$ matrix; otherwise the second one, which involves an
# $n \times n$ matrix, is cheaper.
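# A minimal sketch of the ridge estimator using the first ($p \times p$) formula
# above (the value of $\lambda$ is an arbitrary illustrative choice, not a tuned one).

# In[ ]:

Lambda = 1.0  # illustrative regularization parameter
w_ridge = np.linalg.solve( X0.transpose().dot(X0) + Lambda*np.eye(p), X0.transpose().dot(y0) )
E_ridge = np.linalg.norm(X1.dot(w_ridge)-y1) / np.linalg.norm(y1)
print( 'Relative prediction error (ridge): ' + str(E_ridge) )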