#!/usr/bin/env python
# coding: utf-8

# # Introduction to machine learning
#
# *Maxime Sangnier*
#
# Fall, 2023
#
# ## Practical session 1: discriminant analysis, logistic regression and boosting
#
# Table of contents
# 1. [Discriminant analysis](#part1)
#     - [Linear discriminant analysis](#part1sec1)
#     - [Quadratic discriminant analysis](#part1sec2)
#     - [Fisher discriminant analysis](#part1sec3)
# 1. [Logistic regression](#part2)
# 1. [Adaboost](#part3)

# In[1]:

from mllab import *

# # Discriminant analysis
# ## Linear discriminant analysis
# >The `covariance` function makes it possible to build a $2 \times 2$ covariance matrix based on the spreads $\sigma_1$ and $\sigma_2$ and the angle $\theta$.

# In[2]:

get_ipython().run_line_magic('pinfo', 'covariance')

# >Based on the Cholesky decomposition of a $2 \times 2$ covariance matrix $\Sigma$, write a function that generates a multivariate Gaussian $n$-sample with mean $\mu \in \mathbb R^2$ and covariance $\Sigma$.
# The corresponding Numpy array should be of size $(n, 2)$.
#
# >Compute the mean and the empirical covariance of the sample using Numpy routines.

# In[ ]:

# Answer
def gaussian_sample(mu=[0, 0], sigma1=1., sigma2=1., theta=0., n=50):
    # Todo

    # End todo
    return X

# Todo

# End todo

# >Generate two multivariate Gaussian samples of size $n_1 = n_2 = 50$ with different means and equal covariance matrices.
# Plot both samples with different markers by using the function `plotXY`.

# In[ ]:

# Answer

# >Write a function that creates and returns two Numpy arrays `X` and `y` such that `X` is the data matrix (of size $n \times 2$) and `y` is the vector of labels (of size $n$, with values $\pm 1$) corresponding to i.i.d. copies of $(X, Y)$ distributed such that:
# $$
# \begin{cases}
# \forall i \in \{\pm 1\}: X~|~Y=i \sim \mathcal N (\mu_i, \Sigma_i)\\
# \mathbb P(Y=1) = \pi, \quad \mathbb P(Y=-1)=1-\pi.
# \end{cases}
# $$
# >
# >Create a dataset of size $n=100$.

# In[ ]:

# Answer
def sample_classif(weight=0.5, param1=dict(mu=[0, 0], sigma1=1., sigma2=1.),
                   param2=dict(mu=[0, 0], sigma1=1., sigma2=1.), n=50):
    Y = 2 * np.random.binomial(n=1, p=weight, size=n) - 1  # Labels 1 or -1
    # Todo

    # End todo
    return X, Y

# Todo

# End todo

# >Based on the following code, implement a linear discriminant classifier, taking as parameters an $n \times 2$ Numpy array of data and a size-$n$ array of labels.

# In[ ]:

# Answer
from sklearn.base import BaseEstimator
from sklearn.discriminant_analysis import LinearClassifierMixin


class LDA(BaseEstimator, LinearClassifierMixin):
    """
    LDA classifier for two classes.
    """
    def __init__(self, bias=False):
        """
        bias: default normalization (False) of the covariance matrix estimator is by
        ``(N - 2)``, where ``N`` is the number of observations given (unbiased estimate).
        If `bias` is True, then normalization is by ``N``.
        """
        self.bias = bias
        self.yvalues_ = None
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        self.yvalues_ = np.unique(y)
        assert self.yvalues_.size == 2

        # Estimate covariance matrix and means
        n_pos, n = np.sum(y == self.yvalues_[1]), X.shape[0]  # Number of positive labels and size of the dataset
        # Todo

        # End todo

        # Compute direction and intercept
        # Todo

        # End todo
        return self

    def decision_function(self, X):
        # Compute decisions
        # Todo

        # End todo
        return decisions

    def predict(self, X):
        # Compute predictions
        predictions = np.ones(X.shape[0]) * self.yvalues_[0]  # Negative label
        # Todo

        # End todo
        return predictions
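# >*Side note (not part of the exercise):* as a reminder of the closed-form solution that the `Todo` blocks above are meant to implement, here is a minimal standalone sketch of two-class LDA. It assumes labels in $\{-1, +1\}$ and relies only on Numpy; the helper name `lda_direction` is purely illustrative, and this is one possible implementation rather than the expected answer.

# In[ ]:

import numpy as np

def lda_direction(X, y):
    """Sketch: return (w, b) such that the LDA decision function is x -> w @ x + b.

    Assumes y contains exactly the two labels -1 and +1.
    """
    Xp, Xn = X[y == 1], X[y == -1]
    mu_p, mu_n = Xp.mean(axis=0), Xn.mean(axis=0)
    pi_p = Xp.shape[0] / X.shape[0]  # Estimated P(Y = 1)
    # Pooled within-class covariance, unbiased normalization by (N - 2)
    Sigma = (np.cov(Xp, rowvar=False, bias=True) * Xp.shape[0]
             + np.cov(Xn, rowvar=False, bias=True) * Xn.shape[0]) / (X.shape[0] - 2)
    w = np.linalg.solve(Sigma, mu_p - mu_n)  # Direction: Sigma^{-1} (mu_+ - mu_-)
    b = -0.5 * w @ (mu_p + mu_n) + np.log(pi_p / (1 - pi_p))  # Intercept
    return w, b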
# >Fit a linear discriminant classifier on the data `X, y`.
# Plot the data along with the decision boundary of the classifier (use the function `plot_frontiere`).

# In[ ]:

# Answer

# >Compare with the result of [scikit-learn LDA](http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html#sklearn.discriminant_analysis.LinearDiscriminantAnalysis) (decision function and decision boundary).

# In[ ]:

# Answer

# ## Quadratic discriminant analysis
# >Analyze the behavior of LDA and [QDA](http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis) when faced with anisotropic Gaussian samples (in particular, check whether the decision boundary is the perpendicular bisector of the segment joining the two class centers), and then with Gaussian samples having different covariance matrices (you can use `plot_frontiere` with a list of classifiers).

# In[ ]:

# Answer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis()

# Gaussian parameters
mu1 = [0, 0]
mu2 = [5, 3]

plt.figure(figsize=(20, 20))
for ifig, (param1, param2) in enumerate([
        (dict(mu=mu1, sigma1=1, sigma2=1, theta=0), dict(mu=mu2, sigma1=1, sigma2=1, theta=0)),
        (dict(mu=mu1, sigma1=1, sigma2=5, theta=0), dict(mu=mu2, sigma1=1, sigma2=5, theta=0)),
        (dict(mu=mu1, sigma1=1, sigma2=5, theta=np.pi/6), dict(mu=mu2, sigma1=1, sigma2=5, theta=np.pi/6)),
        (dict(mu=mu1, sigma1=1, sigma2=5, theta=0), dict(mu=mu2, sigma1=5, sigma2=1, theta=0)),
        (dict(mu=mu1, sigma1=1, sigma2=5, theta=0), dict(mu=mu2, sigma1=5, sigma2=1, theta=np.pi/3))]):
    # Dataset
    # Todo

    # End todo

    # Discriminant analysis
    # Todo

    # End todo

    # Class means
    # Todo

    # End todo

    # Plot decision boundaries and class means
    # Todo

    # End todo
    pass  # Placeholder so the skeleton remains valid Python until the loop body is filled in

# ## Fisher discriminant analysis
# >Implement the Fisher discriminant analysis based on the following code.
# In practice, what is the difference between LDA and FisherDA?

# In[ ]:

# Answer
class FisherDA(BaseEstimator, LinearClassifierMixin):
    """
    Fisher discriminant analysis for two classes.
    """
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        # Estimate prior, covariance matrix and means
        # Todo

        # End todo

        # Compute direction and intercept
        # Todo

        # End todo

        # Choose the intercept that minimizes the training error along the learned direction
        self.intercept_ = 0
        ypred = self.decision_function(X)
        ind = np.argsort(ypred)
        err = np.cumsum(y[ind]) + np.sum(y == y.min())
        #plt.figure()
        #plt.plot(ypred[ind], err)  # Error
        iintercept = np.argmin(err)
        if iintercept < y.size - 1:
            self.intercept_ = -0.5 * (ypred[ind[iintercept]] + ypred[ind[iintercept + 1]])
        else:
            self.intercept_ = -ypred[ind[iintercept]]
        return self

    def decision_function(self, X):
        # Compute decisions
        # Todo

        # End todo
        return decisions

    def predict(self, X):
        # Compute predictions
        # Todo

        # End todo
        return predictions

# In[ ]:

# Answer

# # Logistic regression
# >We consider that
# $$
# X|Y=1 \sim \mathcal N(0, I)
# \qquad \text{and} \qquad
# X|Y=-1 \sim 0.5 \, \mathcal N\left(\begin{pmatrix} 5 \\ 3 \end{pmatrix}, I\right) + 0.5 \, \mathcal N\left(\begin{pmatrix} 8 \\ 9 \end{pmatrix}, I\right)
# \quad \text{(non-Gaussian class)}.
# $$
# Compare LDA and [logistic regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

# In[ ]:

# Answer
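# >*Side note (not part of the exercise):* a minimal sketch of how such a dataset could be generated and how the two classifiers could be compared. It assumes the `gaussian_sample` function above has been completed, that $\mathbb P(Y=1)=0.5$, and that training accuracy is a sufficient first comparison; variable names are illustrative only.

# In[ ]:

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

n = 100
y = 2 * np.random.binomial(n=1, p=0.5, size=n) - 1  # Labels +1 / -1 (assumed equiprobable)
X = np.zeros((n, 2))
X[y == 1] = gaussian_sample(mu=[0, 0], n=np.sum(y == 1))  # Gaussian class

# Non-Gaussian class: mixture of two Gaussians, each component chosen with probability 0.5
n_neg = np.sum(y == -1)
component = np.random.binomial(n=1, p=0.5, size=n_neg).astype(bool)
X_neg = gaussian_sample(mu=[5, 3], n=n_neg)
X_neg[component] = gaussian_sample(mu=[8, 9], n=np.sum(component))
X[y == -1] = X_neg

for clf in (LinearDiscriminantAnalysis(), LogisticRegression()):
    clf.fit(X, y)
    print(clf.__class__.__name__, "training accuracy:", clf.score(X, y))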
# >What about with this dataset (class $-1$ is Gaussian but with an outlier)?

# In[13]:

# Dataset
X, Y = sample_classif(weight=.5, param1=dict(mu=[0, 0], sigma1=1, sigma2=1, theta=0),
                      param2=dict(mu=[5, 3], sigma1=1, sigma2=1, theta=0), n=100)
X[np.argmin(Y)] = np.random.randn(2) + 20  # Turn one point of class -1 into an outlier

# In[ ]:

# Answer

# # Adaboost
# >We consider the dataset defined below.

# In[15]:

# Dataset
X, Y = sample_classif(weight=.3, param1=dict(mu=[0, 0], sigma1=10, sigma2=1, theta=np.pi/6),
                      param2=dict(mu=[0, 0], sigma1=1, sigma2=1, theta=0), n=100)
X_ng, Y_ng = sample_classif(weight=0.5, param1=dict(mu=[5, 3], sigma1=3, sigma2=10, theta=np.pi/6),
                            param2=dict(mu=[-5, -2], sigma1=3, sigma2=10, theta=np.pi/10), n=X.shape[0])
X[Y == -1] = X_ng[:np.sum(Y == -1)]  # Replace the points of class -1 with a non-Gaussian sample

plotXY(X, Y)

# >Fit an [Adaboost](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html) classifier with $100$ weak learners and the algorithm SAMME.
# Map the classifier regions on a figure.

# In[ ]:

# Answer

# >Plot on a new figure the estimator errors (attribute `estimator_errors_`).
# What do you observe?

# In[ ]:

# Answer

# >Load the [digits dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits).
# How many observations, covariates and classes does it have?
# [Split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) the dataset into two equally sized subsets (one for training, the other for testing, i.e. estimating the true error).

# In[ ]:

# Answer

# >Plot the train and test errors of both algorithms, SAMME and SAMME.R, with respect to the number of iterations (from 1 to 200) on the digits dataset.
# For this purpose, use [`DecisionTreeClassifier(max_depth=5)`](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) as the base learner.

# In[ ]:

# Answer
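# >*Side note (not part of the exercise):* a minimal sketch of what the last question could look like, given as a hedged illustration rather than the expected answer. It assumes a 2023-era scikit-learn release in which `AdaBoostClassifier` still accepts both `algorithm='SAMME'` and `algorithm='SAMME.R'` (the latter has been deprecated in more recent releases), and it uses `staged_score` to recover the error after each boosting iteration.

# In[ ]:

import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Digits dataset split into two equally sized subsets (train / test)
X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, test_size=0.5, random_state=0)

plt.figure()
for algo in ("SAMME", "SAMME.R"):
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                             n_estimators=200, algorithm=algo).fit(X_train, y_train)
    # staged_score yields the accuracy after each boosting iteration (1 to 200)
    train_err = [1 - s for s in clf.staged_score(X_train, y_train)]
    test_err = [1 - s for s in clf.staged_score(X_test, y_test)]
    plt.plot(train_err, label=algo + " (train)")
    plt.plot(test_err, label=algo + " (test)")
plt.xlabel("Number of weak learners")
plt.ylabel("Misclassification error")
plt.legend()
plt.show()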