#!/usr/bin/env python
# coding: utf-8

# Linear Regression
# ====================================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This tour studies the linear regression method in conjunction with
# regularization.
# It contrasts ridge regression and the Lasso.
# We recommend that, after completing this Numerical Tour, you apply it to your
# own data, for instance using a publicly available dataset.
#
# _Disclaimer:_ these machine learning tours are intended to be
# overly-simplistic implementations and applications of baseline machine learning methods.
# For more advanced uses and implementations, we recommend
# using a state-of-the-art library such as scikit-learn.

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Useful functions to convert to column/row vectors.

# In[3]:

# convert to a column vector
def MakeCol(y): return y.reshape(-1,1)
# convert to a row vector
def MakeRow(y): return y.reshape(1,-1)
# find non-zero/true elements
def find(x): return np.nonzero(x)[0]
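# As a quick illustration of these helpers (the array `v` below is a made-up
# example, not part of the dataset): `MakeCol`/`MakeRow` only reshape, and
# `find` returns the indices of the non-zero (or `True`) entries; `find` is
# used later to build the train/test split.

# In[ ]:

v = np.array([1., 2., 3.])
print(MakeCol(v).shape)              # (3, 1)
print(MakeRow(v).shape)              # (1, 3)
print(find(np.array([0, 1, 0, 2])))  # indices of non-zero entries: [1 3]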
# Dataset Loading
# ---------------
# We test the method on the prostate dataset of $n=97$ samples, with
# features $x_i \in \RR^p$ of dimension $p=8$. The goal is to predict a scalar
# value $y_i \in \RR$.
#
# Load the dataset.

# In[4]:

from scipy import io
name = 'prostate';
U = io.loadmat('nt_toolbox/data/ml-' + name)
A = U['A']
class_names = U['class_names']

# Randomly permute it.

# In[5]:

A = A[np.random.permutation(A.shape[0]),:]

# Separate the features $X$ from the values $y$ to predict, and the
# train/test indicator $c$.

# In[6]:

X = A[:,0:-2];
y = MakeCol( A[:,-2] )
c = MakeCol( A[:,-1] )

# $n$ is the number of samples, $p$ is the dimensionality of the features.

# In[7]:

[n,p] = X.shape
print(n,p)

# Split into training and testing sets.

# In[8]:

I0 = find(c==1) # train
I1 = find(c==0) # test
n0 = I0.size
n1 = n-n0
X0 = X[I0,:]
y0 = y[I0]
X1 = X[I1,:]
y1 = y[I1]

# Normalize the features by the mean and std of the *training* set.
# This is optional.

# In[9]:

mX0 = X0.mean(axis=0)
sX0 = X0.std(axis=0)
X0 = (X0-mX0)/sX0
X1 = (X1-mX0)/sX0

# Remove the mean (computed from the *training* set) to avoid introducing a bias term and a constant regressor.
# This is optional.

# In[10]:

m0 = y0.mean()
y0 = y0-m0
y1 = y1-m0

# Dimensionality Reduction and PCA
# --------------------------------
# In order to display the data in 2-D or 3-D, dimensionality reduction is needed.
# The simplest method is principal component analysis (PCA), which performs an
# orthogonal linear projection on the principal axes (eigenvectors) of the
# covariance matrix.
#
#
# Display the covariance matrix of the training set.

# In[10]:

C = X0.transpose().dot(X0)
plt.imshow(C);

# Display the correlation (up to a scaling) between each feature and the value $y$ to predict.

# In[11]:

u = X0.transpose().dot(y0)
plt.clf()
plt.bar(np.arange(1,p+1),u.flatten())
plt.axis('tight');

# Compute the PCA ortho-basis and the features in this basis.

# In[12]:

# note: np.linalg.svd returns V transposed as its third output,
# so X0.dot(V.transpose()) projects the samples on the principal axes
U, s, V = np.linalg.svd(X0)
X0r = X0.dot( V.transpose() )

# Plot the singular values of $X_0$ (the square roots of the covariance eigenvalues).

# In[13]:

plt.plot(s, '.-');

# Display the features.

# In[14]:

pmax = min(p,8)
k = 0
plt.clf()
for i in np.arange(0,pmax):
    for j in np.arange(0,pmax):
        k = k+1
        plt.subplot(pmax,pmax,k)
        if i==j:
            plt.hist(X0[:,i],6)
            plt.axis('tight')
        else:
            plt.plot(X0[:,j],X0[:,i], '.')
            plt.axis('tight')
        if i==0:  # put the feature names on the top row
            plt.title(class_names[0][j][0])
        plt.tick_params(axis='x', labelbottom=False)
        plt.tick_params(axis='y', labelleft=False)

# Display the point cloud of feature vectors in the 2-D PCA space.

# In[15]:

plt.plot(X0r[:,0], X0r[:,1], '.')
plt.axis('equal');

# 1-D plots of the value to regress along the main eigenvector axes.

# In[16]:

plt.clf()
for i in np.arange(0,3):
    plt.subplot(3,1,i+1)
    plt.plot(X0r[:,i], y0, '.')
    plt.axis('tight')

# Linear Regression
# -----------------
# We look for a linear relationship
# $ y_i = \dotp{w}{x_i} $,
# written in matrix format
# $ y = X w $,
# where the rows of $X \in \RR^{n \times p}$ store the features $x_i \in \RR^p$.
#
#
# Since here $ n > p $, this is an over-determined system, which can
# be solved in the least squares sense
# $$ \umin{ w } \norm{Xw-y}^2, $$
# whose solution is given using the Moore-Penrose pseudo-inverse
# $$ w = (X^\top X)^{-1} X^\top y. $$
#
#
# Least squares solution.

# In[11]:

w = np.linalg.solve( X0.transpose().dot(X0), X0.transpose().dot(y0) )

# Display the prediction $X_1 w$ on the test set together with the true values $y_1$.

# In[12]:

plt.clf()
plt.plot( X1.dot(w), '.-' )
plt.plot( y1, '.-' )
plt.axis('tight')
plt.legend(('$X_1 w$', '$y_1$'));

# Relative prediction error on the test set.

# In[13]:

E = np.linalg.norm(X1.dot(w)-y1) / np.linalg.norm(y1)
print(( 'Relative prediction error: ' + str(E) ) );
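# As a quick cross-check (not part of the original tour), the closed-form
# solution above can be compared with NumPy's built-in least-squares solver,
# which should return the same $w$ up to numerical precision.

# In[ ]:

# np.linalg.lstsq directly solves min_w ||X0 w - y0||^2
w_lstsq, _, _, _ = np.linalg.lstsq(X0, y0, rcond=None)
print(np.allclose(w, w_lstsq))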
# Although this is not an efficient method to solve this problem (a better
# approach is, for instance, the conjugate gradient method), one can use a
# gradient descent to minimize the function
# $$ \umin{w} J(w) = \frac{1}{2}\norm{X w-y}^2. $$

# In[14]:

def J(w): return 1/2*np.linalg.norm(X0.dot(w)-y0)**2

# The gradient of $J$ is
# $$ \nabla J(w) = X^\top (X w - y). $$

# In[15]:

def GradJ(w): return X0.transpose().dot(X0.dot(w)-y0)

# The maximum step size allowed by gradient descent is
# $$ \tau \leq \tau_{\max} \eqdef \frac{2}{\norm{X^\top X}_{op}} $$
# where $\norm{\cdot}_{op}$ is the operator norm (largest eigenvalue).

# In[16]:

tau = 1/np.linalg.norm(X0,2)**2  # = 1/||X0^T X0||_op, i.e. tau_max/2

# Initialize the algorithm to $w=0$.

# In[17]:

w = np.zeros((p,1))

# One step of gradient descent reads:
# $$ w \leftarrow w - \tau \nabla J(w). $$

# In[18]:

w = w - tau*GradJ(w)

# Step size multipliers tested in the exercise below.

# In[19]:

tau_mult = [.1, .5, 1, 1.8]

# __Exercise 0__
#
# Display the evolution of the training error $J(w)$ as a function of the number of iterations.
# Test for different values of $\tau$.

# In[26]:

get_ipython().run_line_magic('run', '-i nt_solutions/ml_2_regression/exo0')

# The optimal step size to minimize a quadratic function $\dotp{C w}{w}$ is
# $$ \tau_{\text{opt}} = \frac{2}{\sigma_{\min}(C) + \sigma_{\max}(C)}. $$

# In[27]:

C = X0.transpose().dot(X0)
tau_opt = 2 / ( np.linalg.norm(C,2) + np.linalg.norm(C,-2) )
print(( 'Optimal tau = ' + str( tau_opt * np.linalg.norm(X0,2)**2 ) ) + ' / |X_0 X_0^T|' );

# Stochastic Gradient Method
# ==========================
#
# We use SGD (which is *not* actually a descent algorithm) to minimize the quadratic risk
# $$ \umin{w} J(w) \eqdef \frac{1}{n} \sum_{i=1}^n f_i(w) = \frac{1}{n} \sum_{i=1}^n \frac{1}{2} ( \dotp{w}{x_i}-y_i )^2 $$
# where we define
# $$ f_i(w) \eqdef \frac{1}{2} ( \dotp{w}{x_i}-y_i )^2. $$
# The algorithm reads
# $$ w_{k+1} = w_k - \tau_k \nabla f_{i_k}(w_k) $$
# where at each iteration $i_k$ is drawn in $\{1,\ldots,n\}$ uniformly at random.

# In[20]:

w = np.zeros((p,1))

# Draw $i_k$ at random.

# In[21]:

ik = int( np.floor(np.random.rand()*n0) )

# Compute $\nabla f_{i_k}(w_k)$.

# In[22]:

gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )  # stochastic gradient, as a column vector

# Set the step size $\tau_k$ (for convergence it should decay to 0) and perform the update.

# In[31]:

tauk = 1/np.linalg.norm(X0,2)**2
w = w - tauk * gk

# Test different *fixed* step sizes $\tau_k=\tau$.

# In[56]:

wopt = np.linalg.solve( X0.transpose().dot(X0), X0.transpose().dot(y0) ) # least squares solution
niter = 5000
Jlist = np.zeros((niter,1))
tau_mult = [.05, .3, .8];
for itau in np.arange(0,len(tau_mult)):
    tauk = tau_mult[itau]/np.linalg.norm(X0,2)**2
    w = np.zeros((p,1))
    for i in np.arange(0,niter):
        ik = int( np.floor(np.random.rand()*n0) )
        gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )  # stochastic gradient
        # gk = X0.transpose().dot( X0.dot(w)-y0 )  # batch gradient
        w = w - tauk * gk
        Jlist[i] = J(w)
    plt.plot(Jlist/J(wopt)-1)

# Average over several runs to see the impact of the step size.

# In[67]:

niter = 8000
tau_mult = [.05, .8];
nruns = 10 # number of runs to compute the average performance
for itau in np.arange(0,len(tau_mult)):
    tauk = tau_mult[itau]/np.linalg.norm(X0,2)**2
    Jlist = np.zeros((niter,1))
    for iruns in np.arange(0,nruns):
        w = np.zeros((p,1))
        for i in np.arange(0,niter):
            ik = int( np.floor(np.random.rand()*n0) )
            gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )
            w = w - tauk * gk
            Jlist[i] = Jlist[i] + J(w)
    plt.plot( (Jlist/nruns)/J(wopt)-1 )

# Use a decaying step size $\tau_k=\frac{\tau_0}{1+k/k_0}$.

# In[68]:

niter = 20000
Jlist = np.zeros((niter,1))
w = np.zeros((p,1))
for i in np.arange(0,niter):
    tauk = 1/np.linalg.norm(X0,2)**2 * 1/(1+i/10)
    ik = int( np.floor(np.random.rand()*n0) )
    gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )
    w = w - tauk * gk
    Jlist[i] = J(w)
plt.plot(Jlist/J(wopt)-1 );
print( Jlist[-1]/J(wopt)-1 )

# Use a decaying step size $\tau_k=\frac{\tau_0}{1+\sqrt{k/k_0}}$ and average the
# iterates $\frac{1}{K}\sum_{k<K} w_k$.
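# A minimal sketch of this averaging scheme (an illustrative implementation:
# the values of $\tau_0$ and $k_0$ below are arbitrary choices, not taken from
# the tour).

# In[ ]:

niter = 20000
w = np.zeros((p,1))
w_avg = np.zeros((p,1))   # running average (1/K) sum_{k<K} w_k
tau0 = 1/np.linalg.norm(X0,2)**2
k0 = 10
for i in np.arange(0,niter):
    tauk = tau0 / (1+np.sqrt(i/k0))
    ik = int( np.floor(np.random.rand()*n0) )
    gk = MakeCol( (X0[ik,:].dot(w)-y0[ik]) * X0[ik,:] )
    w = w - tauk * gk
    w_avg = w_avg + (w - w_avg)/(i+1)   # incremental mean of the iterates
print( J(w_avg)/J(wopt)-1 )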
# Ridge Regression
# ----------------
# Regularization is obtained by introducing a penalty. When the penalty is the squared
# $\ell^2$ norm, the method is called ridge regression, and is defined as
# $$ \umin{w} \norm{Xw-y}^2 + \lambda \norm{w}^2 $$
# where $\lambda>0$ is the regularization parameter.
#
#
# The solution is given using the following equivalent formulas
# $$ w = (X^\top X + \lambda \text{Id}_p )^{-1} X^\top y, $$
# $$ w = X^\top ( XX^\top + \lambda \text{Id}_n)^{-1} y. $$
# When $p < n$ (as is the case here), the first formula should be preferred, since it only
# requires inverting a $p \times p$ matrix; otherwise the second one, which involves an
# $n \times n$ matrix, is cheaper.
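# A minimal sketch of the ridge estimator using the first ($p \times p$) formula
# above (the value of $\lambda$ is an arbitrary illustrative choice, not a tuned one).

# In[ ]:

Lambda = 1.0  # illustrative regularization parameter
w_ridge = np.linalg.solve( X0.transpose().dot(X0) + Lambda*np.eye(p), X0.transpose().dot(y0) )
E_ridge = np.linalg.norm(X1.dot(w_ridge)-y1) / np.linalg.norm(y1)
print( 'Relative prediction error (ridge): ' + str(E_ridge) )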