#!/usr/bin/env python
# coding: utf-8

# # Classifier models
#
# In our introductory materials on data, model, and algorithm objects, we used a regression task as a representative concrete example. Here we complement that material with implementations of some simple classifier objects.

# __Contents:__
#
# - Classifier base class
# - Multi-class logistic regression
#
# ___

# ## Classifier base class
#
# Here we put together `Classifier`, a base class for classification models, which naturally inherits from `Model`.

# In[ ]:


import numpy as np

import models


class Classifier(models.Model):
    '''
    Generic classifier model, an object with methods
    for both training and evaluating classifiers.
    '''

    def __init__(self, data=None, name=None):
        super(Classifier, self).__init__(name=name)

        # If given data, collect information about labels.
        if data is not None:
            self.labels = self.get_labels(data=data) # all unique labels.
            self.nc = self.labels.size # number of unique labels.

    def onehot(self, y):
        '''
        A function for encoding y into a one-hot vector.

        Input:
        y is a (k x 1) array, taking values in {0,1,...,nc-1}.
        '''
        nc = self.nc
        k = y.shape[0]
        C = np.zeros((k,nc), dtype=y.dtype)
        for i in range(k):
            j = y[i,0] # assumes y has only one column.
            C[i,j] = 1
        return C

    def get_labels(self, data):
        '''
        Get all the (unique) labels that appear in the data.
        '''
        A = (data.y_tr is None)
        B = (data.y_te is None)
        if (A and B):
            raise ValueError("No label data provided!")
        else:
            if A:
                out_labels = np.unique(data.y_te)
            elif B:
                out_labels = np.unique(data.y_tr)
            else:
                out_labels = np.unique(np.concatenate((data.y_tr,
                                                       data.y_te), axis=0))
            count = out_labels.size
            return out_labels.reshape((count,1))

    def classify(self, X):
        '''
        Must be implemented by sub-classes.
        '''
        raise NotImplementedError

    def class_perf(self, y_est, y_true):
        '''
        Given class label estimates and true values, compute the
        fraction of correct classifications made for each label,
        yielding the usual one-vs-rest performance metrics for
        each class.

        Input:
        y_est and y_true are (k x 1) matrices of labels.

        Output:
        Returns a dictionary with two components, (1) being the
        misclassification rate over all labels, and (2) being a
        dict of per-label precision/recall/F1 scores.
        '''

        # First, get the classification rate.
        k = y_est.size
        num_correct = (y_est == y_true).sum()
        frac_correct = num_correct / k
        frac_incorrect = 1.0 - frac_correct

        # Then, get precision/recall for each class.
        prec_rec = { i:None for i in range(self.nc) } # initialize

        for c in range(self.nc):
            idx_c = (y_true == c)
            idx_notc = np.logical_not(idx_c)

            TP = (y_est[idx_c] == c).sum()
            FN = idx_c.sum() - TP
            FP = (y_est[idx_notc] == c).sum()
            TN = idx_notc.sum() - FP

            # Precision.
            if (TP == 0 and FP == 0):
                prec = 0
            else:
                prec = TP / (TP+FP)

            # Recall.
            if (TP == 0 and FN == 0):
                rec = 0
            else:
                rec = TP / (TP+FN)

            # F1 (harmonic mean of precision and recall).
            if (prec == 0 or rec == 0):
                f1 = 0
            else:
                f1 = 2 * prec * rec / (prec + rec)

            prec_rec[c] = {"P": prec,
                           "R": rec,
                           "F1": f1}

        return {"rate": frac_incorrect,
                "PRF1": prec_rec}
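# As a quick sanity check of the ideas behind `onehot()` and `class_perf()`, the following cell is a minimal, self-contained sketch (independent of the `models` module) that builds a one-hot encoding and the per-class precision/recall/F1 values by hand. The toy labels are made up purely for illustration.

# In[ ]:


import numpy as np

# Toy labels in {0,1,2}; values chosen arbitrarily for illustration.
y_true = np.array([0, 1, 2, 2, 1, 0, 2, 1]).reshape((8,1))
y_est = np.array([0, 2, 2, 2, 1, 0, 1, 1]).reshape((8,1))
nc = 3

# One-hot encoding: row i is the indicator vector of y_true[i].
# (Equivalent to what onehot() builds with its explicit loop.)
C = np.eye(nc, dtype=y_true.dtype)[y_true[:,0]]
print(C)

# Per-class counts and metrics, mirroring the logic of class_perf().
for c in range(nc):
    idx_c = (y_true == c)
    TP = (y_est[idx_c] == c).sum()
    FN = idx_c.sum() - TP
    FP = (y_est[np.logical_not(idx_c)] == c).sum()
    prec = TP/(TP+FP) if (TP+FP) > 0 else 0
    rec = TP/(TP+FN) if (TP+FN) > 0 else 0
    f1 = 2*prec*rec/(prec+rec) if (prec > 0 and rec > 0) else 0
    print("Class", c, "P:", prec, "R:", rec, "F1:", f1)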
# __Exercises:__
#
# 0. Explain the functions of `onehot()`, `get_labels()`, and `class_perf()`.
# 0. What does `class_perf` return?

# ## Multi-class logistic regression
#
# The multi-class logistic regression model is among the simplest and most popular classification models used in practice. Let us sort out its key elements.
#
# - Data: instance-label pairs $(x,y)$, where $x \in \mathbb{R}^{d}$ and $y \in \{0,1,\ldots,K\}$. There are $K+1$ *classes* here.
#
# - Model: with controllable parameters $w_{0},w_{1},\ldots,w_{K} \in \mathbb{R}^{d}$, we model the class probabilities, conditioned on data $x$, as
#
# \begin{align*}
# P\{y = j | x\} = \frac{\exp(w_{j}^{T}x)}{\sum_{k=0}^{K}\exp(w_{k}^{T}x)}.
# \end{align*}
#
# - Loss: the negative log-likelihood $(-1) \log P\{y | x\}$, evaluated over the sample (details below).
#
# Let's make the loss expression a bit easier to compute by introducing some new notation. Assume that we are given an $n$-sized sample $(x_{1},y_{1}), \ldots, (x_{n},y_{n})$. Then for $i=1,\ldots,n$ and $j=0,\ldots,K$ define
#
# \begin{align*}
# p_{ij} & = P\{y_{i} = j | x_{i}\}\\
# c_{ij} & = I\{y_{i} = j\}.
# \end{align*}
#
# Collecting all the elements over index $j$, we get vectors of the form
#
# \begin{align*}
# p_{i} & = (p_{i0},\ldots,p_{iK})\\
# c_{i} & = (c_{i0},\ldots,c_{iK}).
# \end{align*}
#
# Note that $c_{i}$ gives us a handy "one-hot" vector representation of $y_{i}$. Using this notation, the probability our model assigns to example $i$ is $\prod_{j=0}^{K} p_{ij}^{c_{ij}}$. As a loss function, we use the negative log-likelihood,
#
# \begin{align*}
# L(w;x_{i},y_{i}) & = (-1)\log \prod_{j=0}^{K} p_{ij}^{c_{ij}}\\
# & = (-1)\sum_{j=0}^{K} c_{ij} \log p_{ij}\\
# & = (-1)\sum_{j=0}^{K} c_{ij}\left(w_{j}^{T} x_{i} - \log\left( \sum_{k=0}^{K}\exp(w_{k}^{T}x_{i}) \right) \right)\\
# & = \log\left( \sum_{k=0}^{K}\exp(w_{k}^{T}x_{i}) \right) - \sum_{j=0}^{K} c_{ij}w_{j}^{T} x_{i}.
# \end{align*}
#
# In the usual ERM fashion, the goal is to minimize the empirical risk (here written as a sum, i.e., the empirical mean rescaled by $n$):
#
# \begin{align*}
# \min_{w} \sum_{i=1}^{n} L(w;x_{i},y_{i}) \to \hat{w}.
# \end{align*}
#
# As a function of $w$, this objective is differentiable and convex, and gradient descent is a popular way to minimize it. The gradient of $L(w;x,y)$ with respect to $w$, evaluated at $(w,x_{i},y_{i})$, can be compactly written as
#
# \begin{align*}
# \nabla L(w;x_{i},y_{i}) = (p_{i}-c_{i}) \otimes x_{i}
# \end{align*}
#
# where the binary operator $\otimes$ is the Kronecker product of two matrices. If $U$ is $a \times b$ and $V$ is $c \times d$, then their Kronecker product is
#
# \begin{align*}
# U \otimes V =
# \begin{bmatrix}
# u_{1,1}V & \cdots & u_{1,b}V\\
# \vdots & \ddots & \vdots\\
# u_{a,1}V & \cdots & u_{a,b}V
# \end{bmatrix}
# \end{align*}
#
# where each block $u_{i,j}V$ has shape $c \times d$, and thus $U \otimes V$ has shape $ac \times bd$. In our special case, since $(p_{i}-c_{i})$ has shape $1 \times (K+1)$ and $x_{i}$ has shape $1 \times d$, the gradient has shape $1 \times d(K+1)$, matching the number of parameters, as we would expect. The Kronecker product is available in NumPy as `np.kron`, which makes implementing this gradient straightforward; a small numerical check is given below.
#
# __Effective degrees of freedom:__ It is perfectly fine to set $w = (w_{0},\ldots,w_{K})$ and control all $d(K+1)$ parameters, but note that because the probabilities must sum to one, once $p_{i1},\ldots,p_{iK}$ have been computed we necessarily have $p_{i0} = 1 - \sum_{j=1}^{K}p_{ij}$: one of the $K+1$ outputs is redundant. The parameters share this redundancy, since adding a common vector to every $w_{j}$ leaves the probabilities unchanged. This means we could perfectly well fix one of the weight vectors, say $w_{K}=(0,\ldots,0)$, and just optimize with respect to $dK$ parameters, rather than $d(K+1)$. This is reflected in our implementation of `LogisticReg`, a sub-class of `Classifier`, below, which pins the weights of the last class at zero.
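# Before looking at the class itself, here is the promised numerical check of the gradient formula, written as a minimal, self-contained sketch. The dimensions, the random example $(x,y)$, and the helper `nll()` below are made up purely for illustration (full parameterization, nothing pinned at zero); we compare $(p_{i}-c_{i}) \otimes x_{i}$, computed via `np.kron`, against a central finite-difference approximation.

# In[ ]:


import numpy as np

# Toy setting: d = 3 features and K+1 = 4 classes (arbitrary choices).
rng = np.random.default_rng(1)
d, nc = 3, 4
x = rng.normal(size=(d,))
y = 2
c = np.zeros(nc)
c[y] = 1.0
W = rng.normal(size=(nc, d)) # row j holds w_j.

def nll(W_flat):
    '''Negative log-likelihood of one example (full parameterization).'''
    s = W_flat.reshape((nc, d)).dot(x) # activations w_j^T x.
    return np.log(np.exp(s).sum()) - np.dot(c, s)

# Closed-form gradient: (p - c) Kronecker x, a vector of length d*(K+1).
s = W.dot(x)
p = np.exp(s - s.max())
p = p / p.sum()
grad_exact = np.kron(p - c, x)

# Central finite-difference approximation, coordinate by coordinate.
W_flat = W.reshape(-1)
delta = 1e-6
grad_fd = np.zeros(W_flat.size)
for i in range(W_flat.size):
    e = np.zeros(W_flat.size)
    e[i] = delta
    grad_fd[i] = (nll(W_flat + e) - nll(W_flat - e)) / (2*delta)

# The discrepancy should be tiny (pure numerical error).
print(np.max(np.abs(grad_exact - grad_fd)))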
# In[ ]:


class LogisticReg(Classifier):
    '''
    Multi-class logistic regression model.
    '''

    def __init__(self, data=None):

        # Given data info, load up the (X,y) data.
        super(LogisticReg, self).__init__(data=data)

        # Convert original labels to a one-hot binary representation.
        if data is not None:
            if data.y_tr is not None:
                self.C_tr = self.onehot(y=data.y_tr)
            if data.y_te is not None:
                self.C_te = self.onehot(y=data.y_te)

    def classify(self, w, X):
        '''
        Given learned weights (w) and a matrix of one or
        more observations, classify them as {0,...,nc-1}.

        Input:
        w is a (d x 1) matrix of weights, where d = numfeat*(nc-1).
        X is a (k x numfeat) matrix of k observations.
        NOTE: k can be anything, the training/test sample size.

        Output:
        A (k x 1) matrix of labels in {0,...,nc-1}.
        '''
        k, numfeat = X.shape
        A = np.zeros((self.nc,k), dtype=np.float32)

        # Get activations, with last row as zeros.
        A[:-1,:] = w.reshape((self.nc-1, numfeat)).dot(X.T)

        # Now convert activations to conditional probabilities.
        maxes = np.max(A, axis=0) # largest score for each obs.
        A = A - maxes
        A = np.exp(A)
        A = A / A.sum(axis=0) # (nc x k).

        # Assign classes with highest probability, (k x 1) array.
        return A.argmax(axis=0).reshape((k,1))

    def l_imp(self, w, X, C, lamreg=None):
        '''
        Implementation of the multi-class logistic regression
        loss function.

        Input:
        w is a (d x 1) matrix of weights, where d = numfeat*(nc-1).
        X is a (k x numfeat) matrix of k observations.
        C is a (k x nc) matrix giving a binarized encoding of the
        class labels for each observation; each row a one-hot vector.
        lamreg is a non-negative regularization parameter.
        NOTE: k can be anything, the training/test sample size.

        Output:
        A vector of length k with losses evaluated at k points.
        '''
        k, numfeat = X.shape
        A = np.zeros((self.nc,k), dtype=np.float64)

        # Get activations, with last row as zeros.
        A[:-1,:] = w.reshape((self.nc-1, numfeat)).dot(X.T)

        # Raw activations of all the correct weights.
        cvec = (A*C.T).sum(axis=0)

        # Compute the negative log-likelihoods.
        maxes = np.max(A, axis=0)
        err = (np.log(np.exp(A-maxes).sum(axis=0))+maxes)-cvec

        # Return the losses (all data points), with penalty if needed.
        if lamreg is None:
            return err
        else:
            penalty = lamreg * np.linalg.norm(w)**2
            return err + penalty

    def l_tr(self, w, data, n_idx=None, lamreg=None):
        if n_idx is None:
            return self.l_imp(w=w, X=data.X_tr,
                              C=self.C_tr,
                              lamreg=lamreg)
        else:
            return self.l_imp(w=w, X=data.X_tr[n_idx,:],
                              C=self.C_tr[n_idx,:],
                              lamreg=lamreg)

    def l_te(self, w, data, n_idx=None, lamreg=None):
        if n_idx is None:
            return self.l_imp(w=w, X=data.X_te,
                              C=self.C_te,
                              lamreg=lamreg)
        else:
            return self.l_imp(w=w, X=data.X_te[n_idx,:],
                              C=self.C_te[n_idx,:],
                              lamreg=lamreg)

    def g_imp(self, w, X, C, lamreg=None):
        '''
        Implementation of the gradient of the loss function
        used in multi-class logistic regression.

        Input:
        w is a (d x 1) matrix of weights, where d = numfeat*(nc-1).
        X is a (k x numfeat) matrix of k observations.
        C is a (k x nc) matrix giving a binarized encoding of the
        class labels for each observation; each row a one-hot vector.
        lamreg is a non-negative regularization parameter.
        NOTE: k can be anything, the training/test sample size.

        Output:
        A (k x d) matrix of gradients evaluated at k points.
        '''
        k, numfeat = X.shape
        A = np.zeros((self.nc,k), dtype=np.float32)

        # Get activations, with last row as zeros.
        A[:-1,:] = w.reshape((self.nc-1, numfeat)).dot(X.T)

        # Now convert activations to conditional probabilities.
        maxes = np.max(A, axis=0) # largest score for each obs.
        A = A - maxes
        A = np.exp(A)
        A = A / A.sum(axis=0) # (nc x k).

        # Initialize a large matrix (k x d) to house per-point grads.
        G = np.zeros((k,w.size), dtype=w.dtype)

        for i in range(k):
            # Each per-point gradient is one long vector (a single axis).
            G[i,:] = np.kron(a=(A[:-1,i]-C[i,:-1]), b=X[i,:])
            # Note we carefully remove the last class's elements.

        if lamreg is None:
            return G
        else:
            return G + lamreg*2*w.T

    def g_tr(self, w, data, n_idx=None, lamreg=None):
        if n_idx is None:
            return self.g_imp(w=w, X=data.X_tr,
                              C=self.C_tr,
                              lamreg=lamreg)
        else:
            return self.g_imp(w=w, X=data.X_tr[n_idx,:],
                              C=self.C_tr[n_idx,:],
                              lamreg=lamreg)

    def g_te(self, w, data, n_idx=None, lamreg=None):
        if n_idx is None:
            return self.g_imp(w=w, X=data.X_te,
                              C=self.C_te,
                              lamreg=lamreg)
        else:
            return self.g_imp(w=w, X=data.X_te[n_idx,:],
                              C=self.C_te[n_idx,:],
                              lamreg=lamreg)


# __Exercises:__
#
# 0. Explain what is being computed in each line of `classify`, `l_imp`, and `g_imp` above in `LogisticReg`.
#
# 0. In the special case of just two classes, where $y \in \{0,1\}$, the traditional model trains just one $d$-dimensional vector $w$, mapping $x \mapsto f(w^{T}x) = P\{y = 1 | x\}$, where $f(u) = 1/(1+\exp(-u))$ is the so-called logistic function. The gradient takes a very simple form in this case: try implementing a two-class logistic regression model object, without using the `np.kron` function. Numerically test to make sure the outputs of `g_imp` in your model and in the general `LogisticReg` are the same.
#
# 0. Why have we re-implemented `l_tr`, `l_te`, `g_tr`, and `g_te`?
#
# 0. What are we doing with `maxes`? Why are we doing this? For reference, given any vector $\mathbf{u}$ and scalar $a$, the following identities hold.
#
# \begin{align*}
# \text{softmax}(\mathbf{u}+a) & = \text{softmax}(\mathbf{u})\\
# \log \left( \sum_{j} \exp(u_{j}) \right) & = a + \log \left( \sum_{j} \exp(u_{j}-a) \right)
# \end{align*}
#
# ___
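# Finally, to see how the pieces above fit together, here is a rough usage sketch: plain batch gradient descent on synthetic three-class data. It assumes the cells above have been run (and that the `models` module from our earlier materials is importable); the synthetic data, the `SimpleNamespace` stand-in for a data object, the step size, and the iteration count are all arbitrary choices made purely for illustration.

# In[ ]:


import numpy as np
from types import SimpleNamespace

# Synthetic three-class data with well-separated class means.
rng = np.random.default_rng(0)
n_tr, n_te, d, num_classes = 150, 50, 2, 3
means = np.array([[0., 0.], [3., 0.], [0., 3.]])
y_tr = rng.integers(low=0, high=num_classes, size=(n_tr,1))
y_te = rng.integers(low=0, high=num_classes, size=(n_te,1))
X_tr = rng.normal(size=(n_tr,d)) + means[y_tr[:,0], :]
X_te = rng.normal(size=(n_te,d)) + means[y_te[:,0], :]

# Minimal stand-in for a data object with the attributes used above.
data = SimpleNamespace(X_tr=X_tr, y_tr=y_tr, X_te=X_te, y_te=y_te)

# Initialize the model; w has d*(nc-1) free parameters.
mod = LogisticReg(data=data)
w = np.zeros((d*(mod.nc-1), 1), dtype=np.float64)

# Plain batch gradient descent on the mean training loss.
step = 0.1
for t in range(250):
    grad = np.mean(mod.g_tr(w=w, data=data), axis=0).reshape(w.shape)
    w = w - step*grad

# Evaluate on the test set.
y_est = mod.classify(w=w, X=data.X_te)
perf = mod.class_perf(y_est=y_est, y_true=data.y_te)
print("Misclassification rate:", perf["rate"])
print("Per-class P/R/F1:", perf["PRF1"])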