#!/usr/bin/env python
# coding: utf-8

# Linear Regression: Ridge and Lasso
# ====================================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This tour studies the linear regression method in conjunction with
# regularization. It contrasts ridge regression and the Lasso.
#
# We recommend that after doing this Numerical Tour, you apply it to your
# own data, for instance using a dataset from [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/) or [Kaggle](https://www.kaggle.com/).
#
# _Disclaimer:_ these machine learning tours are intended to be
# overly-simplistic implementations and applications of baseline machine learning methods.
# For more advanced uses and implementations, we recommend
# using a state-of-the-art library, the most well known being
# [Scikit-Learn](http://scikit-learn.org/).

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import matplotlib.pyplot as plt

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[58]:

# Use this code to read from a CSV file.
# import pandas as pd
# U = pd.read_csv('myfile.csv')
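# As a minimal sketch (not part of the original tour) of how a CSV table could be
# turned into the feature/value arrays used below, the cell constructs a tiny
# DataFrame in memory as a stand-in for `pd.read_csv`; the column name 'target'
# and the feature names are hypothetical placeholders, to be adapted to your own data.

# In[ ]:

# import pandas as pd
# df = pd.DataFrame({'feat1': [1.0, 2.0, 3.0],
#                    'feat2': [0.5, 0.1, 0.7],
#                    'target': [1.2, 2.3, 3.1]})   # stands in for pd.read_csv('myfile.csv')
# y_csv = df['target'].values.reshape(-1, 1)        # column vector of values to predict
# A_csv = df.drop(columns=['target']).values        # feature matrix, one row per sample
# print(A_csv.shape, y_csv.shape)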
# Useful functions to convert to column/row vectors.

# In[2]:

# convert to a column vector
def MakeCol(y): return y.reshape(-1,1)
# convert to a row vector
def MakeRow(y): return y.reshape(1,-1)
# find the indices of non-zero/true elements
def find(x): return np.nonzero(x)[0]

# Dataset Loading
# ---------------
# We test the method on the prostate dataset of $n=97$ samples with
# features $x_i \in \RR^p$ in dimension $p=8$. The goal is to predict the
# value $y_i \in \RR$.

# Load the dataset.

# In[3]:

from scipy import io
name = 'prostate'
Data = io.loadmat('nt_toolbox/data/ml-' + name)
Ay = Data['A']
class_names = Data['class_names']

# Randomly permute it.

# In[4]:

Ay = Ay[np.random.permutation(Ay.shape[0]),:]

# Separate the features $A$ from the values $y$ to predict, and the train/test indicator $c$.

# In[5]:

A_full = Ay[:,0:-2]
y_full = MakeCol( Ay[:,-2] )
c = MakeCol( Ay[:,-1] )

# Split into training and testing sets.

# In[6]:

I0 = find(c==1) # train
I1 = find(c==0) # test
n = I0.size
n1 = I1.size
A = A_full[I0,:]
y = y_full[I0]
A1 = A_full[I1,:]
y1 = y_full[I1]

# $n$ is the number of training samples, $p$ is the dimensionality of the features.

# In[7]:

[n,p] = A.shape
print(n,p)

# Normalize the features by the mean and standard deviation of the *training* set.
# This is optional.

# In[8]:

mA = A.mean(axis=0)
sA = A.std(axis=0)
A = (A-mA)/sA
A1 = (A1-mA)/sA

# Remove the mean (computed from the *training* set) to avoid introducing a bias term and a constant regressor.
# This is optional.

# In[9]:

m = y.mean()
y = y-m
y1 = y1-m

# Dimensionality Reduction and PCA
# --------------------------------
# In order to display the data in 2-D or 3-D, dimensionality reduction is needed.
# The simplest method is principal component analysis (PCA), which performs an
# orthogonal linear projection on the principal axes (eigenvectors) of the
# covariance matrix.
#
# Display the covariance matrix of the training set.

# In[10]:

C = A.transpose().dot(A)
plt.imshow(C);

# Display the inner product of each feature with the value $y$ to predict.

# In[11]:

u = A.transpose().dot(y)
plt.clf()
plt.bar(np.arange(1,p+1),u.flatten())
plt.axis('tight');

# Compute the PCA ortho-basis and
# the features expressed in the PCA basis.

# In[12]:

U, s, V = np.linalg.svd(A)
Ar = A.dot( V.transpose() )

# Plot the singular values (square roots of the covariance eigenvalues).

# In[13]:

plt.plot(s, '.-');

# Display the features pairwise.

# In[14]:

pmax = min(p,8)
k = 0
plt.clf()
for i in np.arange(0,pmax):
    for j in np.arange(0,pmax):
        k = k+1
        plt.subplot(pmax,pmax,k)
        if i==j:
            plt.hist(A[:,i],6)
            plt.axis('tight')
        else:
            plt.plot(A[:,j],A[:,i], '.')
            plt.axis('tight')
        if i==0:  # label the top row with the feature names
            plt.title(class_names[0][j][0])
        plt.tick_params(axis='x', labelbottom=False)
        plt.tick_params(axis='y', labelleft=False)

# Display the point cloud of feature vectors in the 2-D PCA space.

# In[15]:

plt.plot(Ar[:,0], Ar[:,1], '.')
plt.axis('equal');

# 1-D plot of the value to regress along the main eigenvector axes.

# In[16]:

plt.clf()
for i in np.arange(0,3):
    plt.subplot(3,1,i+1)
    plt.plot(Ar[:,i], y, '.')
    plt.axis('tight')

# Linear Regression
# -----------------
# We look for a linear relationship
# $ y_i \approx \dotp{x}{a_i} $
# written in matrix form as
# $ y = A x $
# where the rows of $A \in \RR^{n \times p}$ store the features $a_i \in \RR^p$.
#
# Since here $n > p$, this is an over-determined system, which can be
# solved in the least squares sense
# $$ \umin{ x } \norm{Ax-y}^2 $$
# whose solution is given using the Moore-Penrose pseudo-inverse
# $$ x = (A^\top A)^{-1} A^\top y. $$
#
# Compute the least squares solution.

# In[17]:

x = np.linalg.solve( A.transpose().dot(A), A.transpose().dot(y) )
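# As a quick sanity check (not part of the original tour), the normal-equation
# solution can be compared against NumPy's built-in least squares solver on
# synthetic data; the matrix, weights and noise level below are made up for
# illustration only.

# In[ ]:

rng = np.random.default_rng(0)
A_syn = rng.standard_normal((50, 4))                       # 50 samples, 4 features
x_true = np.array([[1.0], [-2.0], [0.5], [3.0]])           # ground-truth weights
y_syn = A_syn.dot(x_true) + 0.01*rng.standard_normal((50, 1))
x_ne = np.linalg.solve(A_syn.T.dot(A_syn), A_syn.T.dot(y_syn))  # normal equations
x_ls = np.linalg.lstsq(A_syn, y_syn, rcond=None)[0]             # built-in solver
print(np.allclose(x_ne, x_ls))                             # both give the same solution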
# Display the prediction $A_1 x$ on the test set, compared with the true values $y_1$.

# In[18]:

plt.clf()
plt.plot( A1.dot(x), '.-' )
plt.plot( y1, '.-' )
plt.axis('tight')
plt.legend(('$A_1 x$', '$y_1$'));

# Relative prediction error on the testing set.

# In[19]:

E = np.linalg.norm(A1.dot(x)-y1) / np.linalg.norm(y1)
print( 'Relative prediction error: ' + str(E) );

# Although this is not an efficient way to solve this problem (a better approach is, for instance, the conjugate gradient method), one can also perform a gradient descent to minimize the function
# $$
# \umin{x} f(x) \eqdef \frac{1}{2}\norm{A x-y}^2.
# $$

# In[20]:

def f(x):
    return 1/2*np.linalg.norm(A.dot(x)-y)**2

# The gradient of $f$ is
# $$
# \nabla f(x) = A^\top (Ax - y).
# $$

# In[21]:

def Gradf(x):
    return A.transpose().dot(A.dot(x)-y)

# The maximum step size allowed for gradient descent is
# $$
# \tau \leq \tau_{\max} \eqdef \frac{2}{\norm{AA^\top}_{op}}
# $$
# where $\norm{\cdot}_{op}$ is the operator norm (the largest singular value).

# In[22]:

tau = 1/np.linalg.norm(A,2)**2

# Initialize the algorithm to $x=0$.

# In[23]:

x = np.zeros((p,1))

# One step of gradient descent reads:
# $$ x \leftarrow x - \tau \nabla f(x). $$

# In[24]:

x = x - tau*Gradf(x)

# In[25]:

tau_mult = [.1, .5, 1, 1.8]  # step size multipliers tested in the exercise below

# __Exercise 0:__ Display the evolution of the training error $f(x)$ as a function of the number of iterations. Test for different values of $\tau$.

# In[26]:

run -i nt_solutions/ml_2_regression/exo0

# The optimal step size to minimize a quadratic function $\dotp{C x}{x}$ is
# $$
# \tau_{\text{opt}} = \frac{2}{\sigma_{\min}(C) + \sigma_{\max}(C)}.
# $$

# In[27]:

C = A.transpose().dot(A)
tau_opt = 2 / ( np.linalg.norm(C,2) + np.linalg.norm(C,-2) )
print( 'Optimal tau = ' + str( tau_opt * np.linalg.norm(A,2)**2 ) + ' / |AA^T|' );

# Stochastic Gradient Method
# ==========================
#
# We use SGD (which is *not* actually a descent algorithm) to minimize the quadratic risk
# $$ \umin{x} f(x) \eqdef \frac{1}{n} \sum_{i=1}^n f_i(x) = \frac{1}{n} \sum_{i=1}^n \frac{1}{2} ( \dotp{x}{a_i}-y_i )^2 $$
# where we used
# $$ f_i(x) \eqdef \frac{1}{2} ( \dotp{x}{a_i}-y_i )^2. $$
# The algorithm reads
# $$ x_{k+1} = x_k - \tau_k \nabla f_{i_k}(x_k) $$
# where at each iteration $i_k$ is drawn in $\{1,\ldots,n\}$ uniformly at random.

# In[28]:

x = np.zeros((p,1))

# Draw $i_k$ at random.

# In[29]:

ik = int( np.floor(np.random.rand()*n) )

# Compute $\nabla f_{i_k}(x)$.

# In[30]:

gk = (A[ik,:].dot(x)-y[ik]) * MakeCol( A[ik,:] )  # MakeCol gives gk the same (p,1) shape as x

# Set the step size $\tau_k$ (for convergence it should tend to 0) and perform the update.

# In[31]:

tauk = 1/np.linalg.norm(A,2)**2
x = x - tauk * gk

# __Exercise SGD1:__ Test different *fixed* step sizes $\tau_k=\tau$.

# In[32]:

run -i nt_solutions/ml_2_regression/exo_sgd1.py

# __Exercise SGD2:__ Average over different runs to see the impact of the step size.

# In[33]:

run -i nt_solutions/ml_2_regression/exo_sgd2.py

# __Exercise SGD3:__ Use a decaying step size $\tau_k=\frac{\tau_0}{1+k/k_0}$.

# In[34]:

run -i nt_solutions/ml_2_regression/exo_sgd3.py

# __Exercise SGD4:__ Use a decaying step size $\tau_k=\frac{\tau_0}{1+\sqrt{k/k_0}}$ and average the iterates $\frac{1}{K}\sum_{k<K} x_k$.

# In[35]:

run -i nt_solutions/ml_2_regression/exo_sgd4.py

# Ridge Regression
# ----------------
# Regularization is obtained by penalizing the $\ell^2$ norm of the weights (ridge regression, also known as Tikhonov regularization)
# $$ \umin{x} \norm{Ax-y}^2 + \la \norm{x}^2 $$
# where $\lambda>0$ is the regularization parameter.
#
#
# The solution is given by the following equivalent formulas
# $$ x = (A^\top A + \lambda \text{Id}_p )^{-1} A^\top y, $$
# $$ x = A^\top ( AA^\top + \lambda \text{Id}_n)^{-1} y. $$
# When $p < n$ (as is the case here) the first formula is cheaper, since it only requires inverting a $p \times p$ matrix; when $n < p$ the second one should be preferred.
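# A minimal sketch (not from the original code) of how the ridge solution could be
# computed from the two formulas above, reusing the training/test arrays defined
# earlier; the value of $\lambda$ is an arbitrary example, not a tuned choice.

# In[ ]:

lambda_demo = 1.0                                            # example regularization strength
x_ridge = np.linalg.solve( A.transpose().dot(A) + lambda_demo*np.eye(p),
                           A.transpose().dot(y) )            # (A'A + lambda Id_p)^{-1} A'y
x_ridge2 = A.transpose().dot( np.linalg.solve( A.dot(A.transpose()) + lambda_demo*np.eye(n), y ) )
print( np.allclose(x_ridge, x_ridge2) )                      # the two formulas agree
E_ridge = np.linalg.norm(A1.dot(x_ridge)-y1) / np.linalg.norm(y1)
print( 'Relative prediction error (ridge): ' + str(E_ridge) )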