#!/usr/bin/env python
# coding: utf-8

# Logistic Classification
# =======================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This tour details the logistic classification method (for 2 classes and
# multi-classes).
#
#
# _Warning:_ Logistic classification is actually called ["logistic
# regression"](https://en.wikipedia.org/wiki/Logistic_regression) in the literature, but it is in fact a classification method.
#
#
# We recommend that, after doing this Numerical Tour, you apply it to your
# own data, for instance using a dataset from [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/).
#
#
# _Disclaimer:_ these machine learning tours are intended to be
# overly-simplistic implementations and applications of baseline machine learning methods.
# For more advanced uses and implementations, we recommend
# using a state-of-the-art library, the most well known being
# [Scikit-Learn](http://scikit-learn.org/).

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
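# If you want to experiment with your own data, note that scikit-learn can parse
# LibSVM-format files directly. The snippet below is a small optional sketch (not part
# of the original tour); the file name 'heart_scale' is a hypothetical example that you
# would first download from the LibSVM datasets page linked above.

# In[ ]:

import os
if os.path.exists('heart_scale'):
    from sklearn.datasets import load_svmlight_file
    X_own, y_own = load_svmlight_file('heart_scale')  # X_own is a sparse matrix
    X_own = X_own.toarray()                           # densify to reuse the code below
    print(X_own.shape, y_own.shape)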
# We define a few helpers.

# In[2]:

def find(x): return np.nonzero(x)[0]

# Two Classes Logistic Classification
# -----------------------------------
# Logistic classification is, together with [support vector machines (SVM)](https://en.wikipedia.org/wiki/Support_vector_machine), the baseline
# method to perform classification. Its main advantage over SVM is that it
# is a smooth minimization problem, and that it also outputs class
# probabilities, offering a probabilistic interpretation of the classification.
#
#
# To understand the behavior of the method, we generate synthetic data
# distributed according to a mixture of Gaussians with an overlap governed by an offset $\omega$.
# Here the class indices are set to $y_i \in \{-1,1\}$ to simplify the equations.

# In[39]:

n = 1000 # number of samples
p = 2 # dimensionality
omega = np.array([1,.5])*2.5 # offset
n1 = int(n/2)
X = np.vstack(( np.random.randn(n1,2), np.random.randn(n1,2)+np.ones([n1,1])*omega ))
y = np.vstack(( np.ones([n1,1]), -np.ones([n1,1]) ))

# Plot the classes.

# In[40]:

I = find(y==-1)
J = find(y==1)
plt.clf()
plt.plot(X[I,0], X[I,1], '.')
plt.plot(X[J,0], X[J,1], '.')
plt.axis('equal');

# Logistic classification minimizes a logistic loss in place of the usual
# $\ell^2$ loss used for regression
# $$ \umin{w} E(w) \eqdef \frac{1}{n} \sum_{i=1}^n L(\dotp{x_i}{w},y_i) $$
# where the logistic loss reads
# $$ L( s,y ) \eqdef \log( 1+\exp(-sy) ). $$
# This corresponds to a smooth convex minimization. If $X$ is injective,
# it is also strictly convex, hence it has a single global minimum.
#
#
# Compare the binary (ideal) 0-1 loss, the logistic loss and the hinge loss
# (the one used for SVM).

# In[41]:

t = np.linspace(-3,3,255).transpose()
plt.clf()
plt.plot(t, t>0)
plt.plot(t, np.log(1+np.exp(t)))
plt.plot(t, np.maximum(t,0) )
plt.axis('tight');
plt.legend(['Binary', 'Logistic', 'Hinge']);

# The logistic loss can be interpreted as a negative log-likelihood, when one
# models the probability of belonging to the two classes for a sample $x_i$ as
# $$ h(x_i) \eqdef (\th(\dotp{x_i}{w}),1-\th(\dotp{x_i}{w})) \qwhereq
#       \th(s) \eqdef \frac{e^{s}}{1+e^s} = (1+e^{-s})^{-1}. $$
#
#
# Re-writing the energy to minimize as
# $$ E(w) = \Ll(X w,y) \qwhereq \Ll(s,y)= \frac{1}{n} \sum_i L(s_i,y_i), $$
# its gradient reads
# $$ \nabla E(w) = X^\top \nabla \Ll(X w,y)
#       \qwhereq
#       \nabla \Ll(s,y) = -\frac{y}{n} \odot \th(-y \odot s), $$
# where $\odot$ is the pointwise multiplication operator, i.e. * in
# Python.
#
#
# Define the energies.

# In[42]:

def L(s,y): return 1/n * np.sum( np.log( 1 + np.exp(-s*y) ) )
def E(w,X,y): return L(X.dot(w),y)

# Define their gradients.

# In[43]:

def theta(v): return 1 / (1+np.exp(-v))
def nablaL(s,y): return - 1/n * y * theta(-s * y)
def nablaE(w,X,y): return X.transpose().dot( nablaL(X.dot(w),y) )

# _Important:_ in order to improve performance, it is important (especially
# in low dimension $p$) to add a constant bias term $w_{p+1} \in \RR$, and replace $\dotp{x_i}{w}$
# by $ \dotp{x_i}{w} + w_{p+1} $. This is equivalently achieved by
# adding an extra $(p+1)^{\text{th}}$ dimension equal to 1 to each
# $x_i$, which we do using a convenient macro.

# In[44]:

def AddBias(X): return np.hstack(( X, np.ones((np.size(X,0),1)) ))

# With this added bias term, we initialize $w_{\ell=0} \in \RR^{p+1}$
# (for instance at $0_{p+1}$).

# In[45]:

w = np.zeros((p+1,1))
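# Before iterating, it is good practice to sanity-check the gradient formula against a
# finite-difference approximation. This is a small optional sketch (not part of the
# original tour); the perturbation size eps is an arbitrary choice.

# In[ ]:

Xb = AddBias(X)
w0 = np.random.randn(p+1,1) # random test point
d = np.random.randn(p+1,1)  # random direction
eps = 1e-6
fd = ( E(w0+eps*d,Xb,y) - E(w0-eps*d,Xb,y) ) / (2*eps) # finite-difference directional derivative
an = float( np.vdot( d, nablaE(w0,Xb,y) ) )            # analytic directional derivative
print( abs(fd-an) ) # should be very small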
# A step of gradient descent then reads
# $$ w_{\ell+1} = w_\ell - \tau_\ell \nabla E(w_\ell). $$

# In[73]:

tau = 1 # here we are using a fixed step size tau
w = w - tau * nablaE(w,AddBias(X),y)

# The gradient descent with a fixed step size $\tau$ is guaranteed to converge when
# $$ \tau < \frac{2}{L}, $$
# where $L$ is the Lipschitz constant of $\nabla E$, which here satisfies
# $$ L \leq \frac{1}{4}\norm{X}^2. $$
# Hence if one chooses
# $$ \tau < \tau_{\max} \eqdef \frac{2}{\frac{1}{4}\norm{X}^2}, $$
# then one is sure that the gradient descent converges.

# In[82]:

tau_max = 2/( 1/4 * np.linalg.norm(AddBias(X), 2)**2 )
print(tau_max)

# __Exercise 1__
#
# Implement a gradient descent
# $$ w_{\ell+1} = w_\ell - \tau_\ell \nabla E(w_\ell). $$
# Monitor the energy decay.
# Test different step sizes, and compare with the theory (in particular
# plot in log domain to illustrate the linear rate).

# In[75]:

run -i nt_solutions/ml_3_classification/exo1

# In[76]:

print(w)

# In[77]:

## Insert your code here.

# Generate a 2D grid of points.

# In[78]:

q = 201
tx = np.linspace( X[:,0].min(), X[:,0].max(),num=q)
ty = np.linspace( X[:,1].min(), X[:,1].max(),num=q)
[B,A] = np.meshgrid( ty,tx )
G = np.vstack([A.flatten(), B.flatten()]).transpose()

# Evaluate the class probability associated to the weight vector on this grid.

# In[79]:

Theta = theta(AddBias(G).dot(w))
Theta = Theta.reshape((q,q))

# Display the data overlaid on top of the
# classification probability; this highlights the
# separating hyperplane $ \enscond{x}{\dotp{w}{x}=0} $.

# In[80]:

plt.clf()
plt.imshow(Theta.transpose(), origin="lower", extent=[tx.min(),tx.max(),ty.min(),ty.max()])
plt.axis('equal')
plt.plot(X[I,0], X[I,1], '.')
plt.plot(X[J,0], X[J,1], '.')
plt.axis('off');

# __Exercise 2__
#
# Test the influence of the separation offset $\omega$ on the result.

# In[27]:

run -i nt_solutions/ml_3_classification/exo2

# In[28]:

## Insert your code here.

# __Exercise 3__
#
# Test logistic classification on a real life dataset. You can look at the Numerical Tour on stochastic gradient descent
# for an example. Split the data into training and testing sets to evaluate the
# classification performance, and check the impact of regularization.

# In[29]:

run -i nt_solutions/ml_3_classification/exo3

# In[19]:

## Insert your code here.

# Kernelized Logistic Classification
# ----------------------------------
# Logistic classification tries to separate the classes using
# a linear separating hyperplane $ \enscond{x}{\dotp{w}{x}=0}. $
#
#
# In order to generate a non-linear decision boundary, one can replace the
# parametric linear model by a non-linear [non-parametric](https://en.wikipedia.org/wiki/Nonparametric_statistics) model, thanks to
# kernelization. It is non-parametric in the sense that the number of
# parameters grows with the number $n$ of samples (while for the basic
# method, the number of parameters is $p$). This allows in particular to
# generate decision boundaries of arbitrary complexity.
#
#
# The downside is that the numerical complexity of the method grows
# (at least) quadratically with $n$.
#
#
# The good news however is that thanks to the theory of
# [reproducing kernel Hilbert spaces](https://en.wikipedia.org/wiki/Reproducing_kernel_Hilbert_space)
# (RKHS), one can still compute this non-linear decision
# function using (almost) the same numerical algorithm.
#
#
# Given a kernel $ \kappa(x,z) \in \RR $ defined for $(x,z) \in \RR^p \times \RR^p$,
# the kernelized method replaces the linear decision functional $f(x) =
# \dotp{x}{w}$ by a sum of kernels centered on the samples
# $$ f_h(x) = \sum_{i=1}^n h_i \kappa(x_i,x) $$
# where $h \in \RR^n$ is the unknown vector of weights to find.
#
#
# When using the linear kernel $\kappa(x,y)=\dotp{x}{y}$, one retrieves
# the previously studied linear method.
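# As a quick optional illustration of this last point (a sketch, not part of the
# original tour), the following check verifies numerically that with the linear kernel
# the decision values $K h$ coincide with $X w$ for $w = X^\top h$.

# In[ ]:

h0 = np.random.randn(n,1)    # arbitrary kernel weights
K_lin = X.dot(X.transpose()) # linear kernel matrix
w_lin = X.transpose().dot(h0)
print( np.linalg.norm( K_lin.dot(h0) - X.dot(w_lin) ) ) # numerically zero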
# Macro to compute the pairwise squared Euclidean distance matrix.

# In[30]:

# slow
def distmat1(X,Z):
    D = np.zeros((X.shape[0],Z.shape[0]))
    for i in np.arange(0,X.shape[0]):
        for j in np.arange(0,Z.shape[0]):
            D[i,j] = np.linalg.norm( X[i,:]-Z[j,:] )**2
    return D

# In[31]:

# fast
from scipy import spatial
def distmat(X,Z): return spatial.distance.cdist(X,Z)**2

# The Gaussian kernel is the most well-known and widely used kernel
# $$ \kappa(x,y) \eqdef e^{-\frac{\norm{x-y}^2}{2\sigma^2}} . $$
# The bandwidth parameter $\si>0$ is crucial and controls the locality of
# the model. It is typically tuned through cross-validation.

# In[32]:

def kappa(X,Z,sigma): return np.exp( -distmat(X,Z)/(2*sigma**2) )

# We generate synthetic data in 2D which are not separable by a hyperplane.

# In[35]:

n = 1000
p = 2
t = 2*np.pi*np.random.rand(n1,1) # angles
R = 2.5
r = R*(1.5 + .2*np.random.randn(n1,1)) # radius
X1 = np.hstack((np.cos(t)*r, np.sin(t)*r))
X = np.vstack((np.random.randn(n1,2), X1))
y = np.vstack(( np.ones([n1,1]), -np.ones([n1,1]) ))

# Display the classes.

# In[36]:

I = find(y==-1)
J = find(y==1)
plt.plot(X[I,0], X[I,1], '.')
plt.plot(X[J,0], X[J,1], '.')
plt.axis('equal')
plt.axis('off');

# Once evaluated on the data points, the kernel defines a matrix
# $$ K = (\kappa(x_i,x_j))_{i,j=1}^n \in \RR^{n \times n}. $$

# In[37]:

sigma = 1
K = kappa(X,X,sigma)
plt.imshow(K);

# Valid kernels are those that give rise to symmetric positive semi-definite matrices
# $K$. The linear and Gaussian kernels are valid kernel functions. Other
# popular kernels include the polynomial kernel $ \dotp{x}{y}^a $ for $a
# \geq 1$ and the Laplacian kernel $ \exp( -\norm{x-y}/\si ) $.
#
#
# The kernelized logistic minimization reads
# $$ \umin{h} F(h) \eqdef \Ll(K h,y). $$

# In[38]:

def F(h,K,y): return L(K.dot(h),y)
def nablaF(h,K,y): return K.transpose().dot( nablaL(K.dot(h),y) )

# This minimization can be related to an infinite dimensional optimization
# problem where one minimizes directly over the function $f$. This
# is shown to be equivalent to the above finite-dimensional optimization problem
# thanks to the theory of RKHS.

# __Exercise 4__
#
# Implement a gradient descent to minimize $F(h)$.
# Monitor the energy decay.
# Test different step sizes, and compare with the theory.

# In[40]:

run -i nt_solutions/ml_3_classification/exo4

# In[41]:

## Insert your code here.

# Once this optimal $h$ has been found, the class probabilities at a point
# $x$ are obtained as
# $$ ( \th(f_h(x)), 1-\th(f_h(x)) ) $$
# where $f_h$ has been defined above.
#
#
# We evaluate this classification probability on a grid.

# In[42]:

q = 201
tmax = 5
t = np.linspace(-tmax,tmax,num=q)
[B,A] = np.meshgrid( t,t )
G = np.vstack([A.flatten(), B.flatten()]).transpose()
K1 = kappa(G,X,sigma)
Theta = theta( K1.dot(h) )
Theta = Theta.reshape((q,q))

# Display the classification probability.

# In[46]:

plt.clf()
plt.imshow(Theta.transpose(), origin="lower", extent=[-tmax, tmax, -tmax, tmax])
plt.plot(X[I,0], X[I,1], '.')
plt.plot(X[J,0], X[J,1], '.')
plt.axis('equal')
plt.axis('off');

# __Exercise 5__
#
# Display the evolution of the classification probability with $\sigma$.

# In[52]:

run -i nt_solutions/ml_3_classification/exo5

# In[49]:

## Insert your code here.

# __Exercise 6__
#
# Separate the dataset into a training set and a testing set. Evaluate the classification performance
# for varying $\si$. Try to introduce regularization and minimize
# $$ \umin{h} F(h) \eqdef \Ll(K h,y) + \la R(h) $$
# where for instance $R=\norm{\cdot}_2^2$ or $R=\norm{\cdot}_1$.
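# As a possible starting point for this exercise, here is a minimal sketch (not the
# official solution) of the ridge-regularized energy and its gradient for
# $R=\norm{\cdot}_2^2$; the value of $\la$ below is an arbitrary placeholder, to be
# tuned by cross-validation.

# In[ ]:

lambd = .01 # arbitrary regularization strength
def F_reg(h,K,y,lambd): return F(h,K,y) + lambd*np.sum(h**2)
def nablaF_reg(h,K,y,lambd): return nablaF(h,K,y) + 2*lambd*h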
# In[53]:

run -i nt_solutions/ml_3_classification/exo6

# In[54]:

## Insert your code here.

# Multi-Classes Logistic Classification
# -------------------------------------
# The logistic classification method is extended to an arbitrary number
# $k$ of classes by considering a family of weight vectors $ (w_\ell)_{\ell=1}^k $,
# which are conveniently stored as the columns of a matrix $W \in \RR^{p \times k}$.
#
#
# This allows one to model probabilistically the membership of a point $x \in \RR^p $ in
# the classes using an exponential model
# $$ h(x) = \pa{ \frac{ e^{\dotp{x}{w_\ell}} }{ \sum_m e^{\dotp{x}{w_m}} } }_\ell. $$
# This vector $h(x) \in [0,1]^k $ describes the probability of $x$
# belonging to the different classes, and $ \sum_\ell h(x)_\ell = 1 $.
#
#
# The computation of $W$ is obtained by solving a maximum likelihood
# estimator
# $$ \umax{W \in \RR^{p \times k}} \frac{1}{n} \sum_{i=1}^n \log( h(x_i)_{y_i} ) $$
# where we recall that $y_i \in \{1,\ldots,k\}$ is the class index of
# point $x_i$.
#
#
# This is conveniently rewritten as
# $$ \umin{W} \frac{1}{n} \pa{ \sum_i \text{LSE}( XW )_i - \dotp{XW}{D} } $$
# where $D \in \{0,1\}^{n \times k}$ is the binary class-indicator matrix
# $$ D_{i,\ell} = \choice{
#       1 \qifq y_i=\ell, \\
#       0 \quad \text{otherwise},
#   }
# $$
# and LSE is the log-sum-exp operator
# $$ \text{LSE}(S)_i = \log\pa{ \sum_\ell \exp(S_{i,\ell}) }, \qwhereq \text{LSE}(S) \in \RR^n. $$

# In[60]:

def LSE0(S): return np.log( np.exp(S).sum(axis=1,keepdims=1) )

# The computation of LSE is
# unstable for large values of $S_{i,\ell}$ (numerical overflow, producing Inf or NaN), but this can be
# fixed by subtracting the largest element in each row,
# since $ \text{LSE}(S+a)=\text{LSE}(S)+a $ if $a$ is constant along each row. This is
# the [celebrated LSE trick](https://en.wikipedia.org/wiki/LogSumExp).

# In[61]:

def max2(S): return np.tile( S.max(axis=1,keepdims=1), (1,S.shape[1]) )
def LSE(S): return LSE0( S-max2(S) ) + S.max(axis=1,keepdims=1)

# In[62]:

# check equality of LSE and LSE0
S = np.random.randn(4,5)
np.linalg.norm( LSE(S)-LSE0(S) )

# The gradient of the LSE operator is the so-called soft-max (SM) operator
# $$ \nabla \text{LSE}(S) = \text{SM}(S) \eqdef
#       \pa{
#               \frac{
#                       e^{S_{i,\ell}}
#               }{
#                       \sum_m e^{S_{i,m}}
#               } }_{i,\ell}. $$

# In[63]:

def SM0(S): return np.exp(S) / np.tile( np.exp(S).sum(axis=1,keepdims=1), (1, S.shape[1]) )

# Similarly to the LSE, it needs to be stabilized.

# In[64]:

def SM(S): return SM0(S-max2(S))

# In[67]:

# Check equality of SM and SM0
np.linalg.norm( SM(S)-SM0(S) )
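# To illustrate why the stabilization matters (this small check is not part of the
# original tour), compare LSE0 and LSE on entries of large magnitude: the naive
# implementation overflows, while the stabilized one returns the correct finite value.

# In[ ]:

S_big = np.array([[1000., -1000., 0., 2000.]])
print( LSE0(S_big) ) # overflow in np.exp produces inf
print( LSE(S_big) )  # correct value, approximately 2000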
# We load a dataset of $n$ images of size $p = 8 \times 8$, representing digits from 0
# to 9 (so there are $k=10$ classes).
#
#
# Load the dataset and randomly permute it.
# Separate the features $X$ from the class labels $y$ to predict.

# In[68]:

from scipy import io
name = 'digits'
U = io.loadmat('nt_toolbox/data/ml-' + name)
A = U['A']
A = A[np.random.permutation(A.shape[0]),:]
X = A[:,0:-1]
y = A[:,-1]

# $n$ is the number of samples, $p$ is the dimensionality of the features, $k$ is
# the number of classes.

# In[71]:

[n,p] = X.shape
p1 = int(np.sqrt(p))
CL = np.unique(y) # list of classes
k = np.size(CL)

# In[73]:

f = X[1,:]
f = np.reshape( f,(p1,p1) )
plt.imshow(f.max()-f, cmap="gray")
plt.axis('off');

# Display a few sample digits.

# In[74]:

q = 5
plt.clf()
for i in np.arange(0,k):
    I = find(y==CL[i])
    for j in np.arange(0,q):
        f = X[I[j],:]
        f = np.reshape( f,(p1,p1) )
        plt.subplot(q,k, j*k+i+1 )
        plt.imshow(f.max()-f, cmap="gray")
        plt.axis('off')

# Perform dimensionality reduction using PCA.

# In[75]:

# subtract the mean
X1 = X-X.mean(axis=0)
# covariance, not used
C = X1.transpose().dot( X1 )
# SVD, the V matrix contains the eigenvectors to project onto.
# WARNING: Matlab and Python do not have the same convention for the SVD.
# For Python, it reads X1=U*diag(s)*V
# For Matlab, it reads X1=U*diag(s)*transpose(V)
U, s, V = np.linalg.svd(X1)
Xr = X1.dot( V.transpose() )
# display the decay of the singular values
plt.plot(s);

# Display in 2D.

# In[76]:

col = np.array( [ [1,0,0], [0,1,0], [0,0,1], [0,0,0], [0,1,1], [1,0,1], [1,1,0], [1,.5,.5], [.5,1,.5], [.5,.5,1] ] ).transpose()
for i in np.arange(0,k):
    I = find(y==CL[i])
    plt.plot(Xr[I,0], Xr[I,1], '.', color=col[:,i])
plt.axis('equal');

# Display in 3D.

# In[79]:

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = Axes3D(fig)
for i in np.arange(0,k):
    I = find(y==CL[i])
    ax.scatter(Xr[I,0], Xr[I,1], Xr[I,2], '.', color=col[:,i])
plt.axis('tight');

# Compute the $D$ matrix.

# In[81]:

D = np.double( np.tile( CL, (n,1) ) == np.tile( y, (k,1) ).transpose() )

# Dot product between two matrices (as flattened vectors).

# In[82]:

def dotp(x,y): return x.flatten().dot( y.flatten().transpose() )

# Define the energy $E(W)$.

# In[83]:

def E(W): return 1/n*( LSE(X.dot(W)).sum() - dotp(X.dot(W),D) )

# Define its gradient
# $$ \nabla E(W) = \frac{1}{n} X^\top ( \text{SM}(X W) - D ). $$

# In[84]:

def nablaE(W): return 1/n * X.transpose().dot( SM(X.dot(W)) - D )

# __Exercise 7__
#
# Implement a gradient descent
# $$ W_{\ell+1} = W_\ell - \tau_\ell \nabla E(W_\ell). $$
# Monitor the energy decay.

# In[86]:

run -i nt_solutions/ml_3_classification/exo7

# In[55]:

## Insert your code here.

# Generate a 2D grid of points over the PCA space and map it to the feature space.

# In[87]:

M = np.abs(Xr.flatten()).max()
q = 201
t = np.linspace(-M,M,num=q)
[B,A] = np.meshgrid(t,t)
G0 = np.vstack([A.flatten(), B.flatten()]).transpose()
Xmean = np.tile( X.mean(axis=0,keepdims=1), (q**2,1) )
G = G0.dot(V[0:2,:]) + Xmean

# Evaluate the class probabilities associated to the weight vectors on this grid.

# In[88]:

Theta = SM(G.dot(W))
Theta = np.reshape(Theta, (q, q, k) )

# Display each probability map.

# In[90]:

plt.clf()
for i in np.arange(0,k):
    plt.subplot(3,4,i+1)
    plt.imshow(Theta[:,:,i].transpose())
    plt.title('Class ' + str(i+1))
    plt.axis('off')

# Build a single color image of this map.

# In[91]:

R = np.zeros((q,q,3))
for i in np.arange(0,k):
    for a in np.arange(0,3):
        R[:,:,a] = R[:,:,a] + Theta[:,:,i] * col[a,i]

# Display.

# In[92]:

plt.clf()
plt.imshow(R.transpose((1, 0, 2)), origin="lower", extent=[-M,M,-M,M])
for i in np.arange(0,k):
    I = find(y==CL[i])
    plt.plot(Xr[I,0], Xr[I,1], '.', color=col[:,i])
plt.axis('off');

# __Exercise 8__
#
# Separate the dataset into a training set and a testing set. Evaluate the classification performance
# and display the confusion matrix. You can also investigate the impact of kernelization and regularization.

# In[93]:

run -i nt_solutions/ml_3_classification/exo8

# In[62]:

## Insert your code here.
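# For reference, here is a minimal sketch of such an evaluation (not the official exo8
# solution): split the digits into two halves, run a basic gradient descent on the
# training half, and build the confusion matrix on the testing half. The number of
# iterations is an arbitrary choice, and the step size is a conservative heuristic.

# In[ ]:

ntrain = n//2
Xtr, Dtr = X[:ntrain,:], D[:ntrain,:]
Xte, yte = X[ntrain:,:], y[ntrain:]
tau = ntrain / np.linalg.norm(Xtr,2)**2 # conservative step size heuristic
Wtr = np.zeros((p,k))
for it in range(2000): # arbitrary number of iterations
    Wtr = Wtr - tau/ntrain * Xtr.transpose().dot( SM(Xtr.dot(Wtr)) - Dtr )
pred = CL[ np.argmax( Xte.dot(Wtr), axis=1 ) ] # predicted class labels
Conf = np.zeros((k,k))
for i in np.arange(0,k):
    for j in np.arange(0,k):
        Conf[i,j] = np.sum( (yte==CL[i]) & (pred==CL[j]) )
print('test accuracy:', np.trace(Conf)/(n-ntrain))
plt.imshow(Conf);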