#!/usr/bin/env python
# coding: utf-8

# In[1]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG


# A Brief Look at the Logistic Regression Implementation in scikit-learn
# =======================================================================
#
# Version of the code analyzed:
#
# ```bash
# ~/W/g/scikit-learn ❯❯❯ git log -n 1
# commit d161bfaa1a42da75f4940464f7f1c524ef53484f
# Author: John B Nelson
# Date:   Thu May 26 18:36:37 2016 -0400
#
#     Add missing double quote (#6831)
# ```

# ### 0. Overview
#
# The diagram below shows how logistic regression is organized in sklearn:

# In[2]:

SVG("./res/sklearn_lr.svg")


# As in [A Brief Look at the Logistic Regression Implementation in Spark](./spark_ml_lr.ipynb), we concentrate on the algorithmic code, namely the solvers and the loss functions.

# ### 1. Solvers
#
# sklearn supports four solvers: liblinear, sag, lbfgs, and newton-cg. The lbfgs solver comes from the scipy package, liblinear wraps the LIBLINEAR library, and the remaining two are implemented in sklearn itself. The dispatch code is easy to locate and its logic is clear:
#
# ```python
# 704         if solver == 'lbfgs':
# 705             try:
# 706                 w0, loss, info = optimize.fmin_l_bfgs_b(
# 707                     func, w0, fprime=None,
# 708                     args=(X, target, 1. / C, sample_weight),
# 709                     iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)
# 710             except TypeError:
# 711                 # old scipy doesn't have maxiter
# 712                 w0, loss, info = optimize.fmin_l_bfgs_b(
# 713                     func, w0, fprime=None,
# 714                     args=(X, target, 1. / C, sample_weight),
# 715                     iprint=(verbose > 0) - 1, pgtol=tol)
# 716             if info["warnflag"] == 1 and verbose > 0:
# 717                 warnings.warn("lbfgs failed to converge. Increase the number "
# 718                               "of iterations.")
# 719             try:
# 720                 n_iter_i = info['nit'] - 1
# 721             except:
# 722                 n_iter_i = info['funcalls'] - 1
# 723         elif solver == 'newton-cg':
# 724             args = (X, target, 1. / C, sample_weight)
# 725             w0, n_iter_i = newton_cg(hess, func, grad, w0, args=args,
# 726                                      maxiter=max_iter, tol=tol)
# 727         elif solver == 'liblinear':
# 728             coef_, intercept_, n_iter_i, = _fit_liblinear(
# 729                 X, target, C, fit_intercept, intercept_scaling, None,
# 730                 penalty, dual, verbose, max_iter, tol, random_state,
# 731                 sample_weight=sample_weight)
# 732             if fit_intercept:
# 733                 w0 = np.concatenate([coef_.ravel(), intercept_])
# 734             else:
# 735                 w0 = coef_.ravel()
# 736
# 737         elif solver == 'sag':
# 738             if multi_class == 'multinomial':
# 739                 target = target.astype(np.float64)
# 740                 loss = 'multinomial'
# 741             else:
# 742                 loss = 'log'
# 743
# 744             w0, n_iter_i, warm_start_sag = sag_solver(
# 745                 X, target, sample_weight, loss, 1. / C, max_iter, tol,
# 746                 verbose, random_state, False, max_squared_sum, warm_start_sag)
# ```
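# To make the lbfgs branch above concrete, here is a minimal standalone sketch (my own code, not sklearn's) of driving `optimize.fmin_l_bfgs_b` with a single objective that returns both the loss and the gradient, which is exactly why `fprime=None` is passed. The L2-regularized binary logistic loss and the helper name `logistic_loss_and_grad` are assumptions made for illustration:
#
# ```python
# import numpy as np
# from scipy import optimize
#
#
# def logistic_loss_and_grad(w, X, y, alpha):
#     """L2-regularized binary logistic loss with labels y in {-1, +1}.
#     Returns (loss, grad) so fmin_l_bfgs_b can be called with fprime=None."""
#     z = y * X.dot(w)
#     loss = np.logaddexp(0, -z).sum() + 0.5 * alpha * w.dot(w)
#     sig = 1.0 / (1.0 + np.exp(z))               # sigmoid(-z)
#     grad = X.T.dot(-y * sig) + alpha * w
#     return loss, grad
#
#
# rng = np.random.RandomState(0)
# X = rng.randn(100, 3)
# y = np.sign(X.dot([1.0, -2.0, 0.5]) + 0.1 * rng.randn(100))
# w0 = np.zeros(3)
# w_opt, loss, info = optimize.fmin_l_bfgs_b(
#     logistic_loss_and_grad, w0, fprime=None,
#     args=(X, y, 1.0), pgtol=1e-6, maxiter=100)
# ```
#
# The multinomial case follows the same pattern: only `func` changes, while the solver call stays as above.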
# ### 2. Loss functions
#
# #### 2.1 Binary classification
#
# The binary loss and its gradient are implemented in `_logistic_loss_and_grad`; the computation is the same as in [Logistic Regression: Introduction and Python Implementation](./demo.ipynb), so it is not repeated here.

# #### 2.2 Multiclass classification
#
# sklearn supports two multiclass schemes: ovr (one-vs-rest) and multinomial.
#
#
# ##### 2.2.0 ovr
#
# The default is ovr, which trains one binary classifier per label, $K$ in total. The training code is:
#
# ```python
# 1230         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
# 1231                                backend=backend)(
# 1232             path_func(X, y, pos_class=class_, Cs=[self.C],
# 1233                       fit_intercept=self.fit_intercept, tol=self.tol,
# 1234                       verbose=self.verbose, solver=self.solver, copy=False,
# 1235                       multi_class=self.multi_class, max_iter=self.max_iter,
# 1236                       class_weight=self.class_weight, check_input=False,
# 1237                       random_state=self.random_state, coef=warm_start_coef_,
# 1238                       max_squared_sum=max_squared_sum,
# 1239                       sample_weight=sample_weight)
# 1240             for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))
# ```
#
# Note that the loop over `classes_` at 1240L, combined with `pos_class=class_` at 1232L, is what takes the labels one at a time and trains a binary classifier for each.

# ##### 2.2.1 multinomial
#
# As mentioned above, ovr iterates over the labels and trains one classifier per label. To fit into that loop, the genuinely binary case has to be adjusted:
#
# ```python
# 1201         if len(self.classes_) == 2:
# 1202             n_classes = 1
# 1203             classes_ = classes_[1:]
# ```
#
# Likewise, the multinomial case handles all labels in a single pass and also has to be adjusted:
#
# ```python
# 1217         # Hack so that we iterate only once for the multinomial case.
# 1218         if self.multi_class == 'multinomial':
# 1219             classes_ = [None]
# 1220             warm_start_coef = [warm_start_coef]
# ```
#
# Next, let's look at how the multinomial loss and gradient are computed, in the function `_multinomial_loss_grad`.
#
# The multiclass code in sklearn uses the same formulas as [Logistic Regression: Introduction and Python Implementation](./demo.ipynb), namely:
#
# \begin{align}
# L(\beta) &= \log \left( \sum_i e^{\beta_{i0} + \beta_i x} \right) - (\beta_{k0} + \beta_k x) \\
# \frac{\partial L}{\partial \beta_k} &= x \left( \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}} - I(y = k) \right) \\
# \end{align}
#
# For the loss itself:
#
# ```python
# 244 def _multinomial_loss(w, X, Y, alpha, sample_weight):
# 245 #+-- 37 lines: """Computes multinomial loss and class probabilities.---
# 282     n_classes = Y.shape[1]
# 283     n_features = X.shape[1]
# 284     fit_intercept = w.size == (n_classes * (n_features + 1))
# 285     w = w.reshape(n_classes, -1)
# 286     sample_weight = sample_weight[:, np.newaxis]
# 287     if fit_intercept:
# 288         intercept = w[:, -1]
# 289         w = w[:, :-1]
# 290     else:
# 291         intercept = 0
# 292     p = safe_sparse_dot(X, w.T)
# 293     p += intercept
# 294     p -= logsumexp(p, axis=1)[:, np.newaxis]
# 295     loss = -(sample_weight * Y * p).sum()
# 296     loss += 0.5 * alpha * squared_norm(w)
# 297     p = np.exp(p, p)
# 298     return loss, p, w
# ```
#
# + 292L-293L compute $\beta_{i0} + \beta_i x$ for every class.
# + 294L computes $L(\beta)$: subtracting the log-sum-exp turns the scores into log-probabilities. The overflow protection is done inside `logsumexp`, using the same trick described in [A Brief Look at the Logistic Regression Implementation in Spark](./spark_ml_lr.ipynb).
# + 295L sums everything up (note that each row of $Y$ is a one-hot vector, so multiplying by $Y$ selects the entry for the true label $k$).
# + 296L adds the L2 penalty.
# + Note that 297L exponentiates `p` in place, turning it back into $\frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}}$ so that the gradient computation can use it directly.
#
# Now for the gradient:
#
# ```python
# 301 def _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
# 302 #+-- 37 lines: """Computes the multinomial loss, gradient and class probabilities.---
# 339     n_classes = Y.shape[1]
# 340     n_features = X.shape[1]
# 341     fit_intercept = (w.size == n_classes * (n_features + 1))
# 342     grad = np.zeros((n_classes, n_features + bool(fit_intercept)))
# 343     loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
# 344     sample_weight = sample_weight[:, np.newaxis]
# 345     diff = sample_weight * (p - Y)
# 346     grad[:, :n_features] = safe_sparse_dot(diff.T, X)
# 347     grad[:, :n_features] += alpha * w
# 348     if fit_intercept:
# 349         grad[:, -1] = diff.sum(axis=0)
# 350     return loss, grad.ravel(), p
# ```
#
# + 345L-346L correspond to the gradient formula above;
# + 347L adds the gradient of the L2 penalty;
# + 348L-349L handle the intercept terms.
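# To check that these formulas and the code really agree, here is a small standalone sketch (my own code, not sklearn's): it recomputes the multinomial loss and gradient with plain NumPy, ignoring the intercept and sample weights, and verifies the analytic gradient numerically with `scipy.optimize.check_grad`. The helper names are assumptions for illustration only:
#
# ```python
# import numpy as np
# from scipy.optimize import check_grad
#
#
# def logsumexp_rows(a):
#     """Row-wise log-sum-exp with the usual max-shift to avoid overflow,
#     i.e. the trick hidden inside logsumexp at 294L."""
#     m = a.max(axis=1, keepdims=True)
#     return m + np.log(np.exp(a - m).sum(axis=1, keepdims=True))
#
#
# def multinomial_loss_grad(w, X, Y, alpha):
#     """Softmax loss and gradient for a flattened (n_classes, n_features)
#     weight vector: a stripped-down analogue of _multinomial_loss_grad."""
#     n_classes, n_features = Y.shape[1], X.shape[1]
#     W = w.reshape(n_classes, n_features)
#     p = X.dot(W.T)                           # beta_i x, cf. 292L
#     p -= logsumexp_rows(p)                   # log-probabilities, cf. 294L
#     loss = -(Y * p).sum() + 0.5 * alpha * (W ** 2).sum()
#     p = np.exp(p)                            # back to probabilities, cf. 297L
#     grad = (p - Y).T.dot(X) + alpha * W      # cf. 345L-347L
#     return loss, grad.ravel()
#
#
# rng = np.random.RandomState(0)
# X = rng.randn(20, 4)
# Y = np.eye(3)[rng.randint(0, 3, size=20)]    # one-hot targets, one unit vector per row
# w0 = rng.randn(3 * 4)
# err = check_grad(lambda w: multinomial_loss_grad(w, X, Y, 1.0)[0],
#                  lambda w: multinomial_loss_grad(w, X, Y, 1.0)[1], w0)
# print(err)                                   # should be tiny, around 1e-6
# ```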
# #### 2.3 Hessian
#
# Since sklearn supports Newton's method (the newton-cg solver), the Hessian matrix is needed. See Wikipedia's [Hessian matrix](https://en.wikipedia.org/wiki/Hessian_matrix) for the definition,
#
# \begin{equation}
# {\mathbf H}={\begin{bmatrix}{\dfrac {\partial ^{2}f}{\partial x_{1}^{2}}}&{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{n}}}\\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{2}^{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{n}}}\\[2.2ex]\vdots &\vdots &\ddots &\vdots \\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{n}^{2}}}\end{bmatrix}}.
# \end{equation}
#
# It is simply the matrix of all second-order partial derivatives. The derivation is omitted here; if interested, see [Logistic Regression - Jia Li](http://sites.stat.psu.edu/~jiali/course/stat597e/notes2/logit.pdf) or [Logistic regression: a simple ANN, Nando de Freitas](https://www.cs.ox.ac.uk/people/nando.defreitas/machinelearning/lecture6.pdf).
#
# The core formula (ignoring the L2 term and the intercept) is $\mathbf{H} = \mathbf{X}^T \operatorname{diag}(\pi_i (1 - \pi_i)) \mathbf{X}$, where $\pi_i = \operatorname{sigm}(x_i \beta)$.
#
# ```python
# 167 def _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
# 168 #+-- 33 lines: """Computes the gradient and the Hessian, in the case of a logistic loss.
# 201     w, c, yz = _intercept_dot(w, X, y)
# 202 #+--  4 lines: if sample_weight is None:---------
# 206     z = expit(yz)
# 207 #+--  8 lines: z0 = sample_weight * (z - 1) * y---
# 215     # The mat-vec product of the Hessian
# 216     d = sample_weight * z * (1 - z)
# 217     if sparse.issparse(X):
# 218         dX = safe_sparse_dot(sparse.dia_matrix((d, 0),
# 219                              shape=(n_samples, n_samples)), X)
# 220     else:
# 221         # Precompute as much as possible
# 222         dX = d[:, np.newaxis] * X
# 223
# 224     if fit_intercept:
# 225         # Calculate the double derivative with respect to intercept
# 226         # In the case of sparse matrices this returns a matrix object.
# 227         dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))
# 228
# 229     def Hs(s):
# 230         ret = np.empty_like(s)
# 231         ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
# 232         ret[:n_features] += alpha * s[:n_features]
# 233
# 234         # For the fit intercept case.
# 235         if fit_intercept:
# 236             ret[:n_features] += s[-1] * dd_intercept
# 237             ret[-1] = dd_intercept.dot(s[:n_features])
# 238             ret[-1] += d.sum() * s[-1]
# 239         return ret
# 240
# 241     return grad, Hs
# ```
#
# + 201L, 206L, and 216L compute the middle factor $\pi_i (1 - \pi_i)$.
# + 217L-222L precompute the right half of the formula, $\operatorname{diag}(d)\,\mathbf{X}$ (as a sparse diagonal product or a broadcast); combined with the `X.T.dot(...)` at 231L this yields the whole expression. Note that the Hessian is never formed explicitly: `Hs` only returns Hessian-vector products, which is all newton-cg needs. 232L adds the L2 term, and 235L-238L handle the intercept.
#
# I only understand this part at a surface level for now; I'll dig deeper when I have time. A quick numerical check of the Hessian-vector product pattern is sketched in the appendix cell at the end of this notebook.

# ### 3. Summary
#
# This note briefly walked through the logistic regression implementation in sklearn, mapping the binary and multiclass code to the corresponding formulas.

# In[ ]:
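# Appendix: a quick numerical check of the Hessian-vector product pattern from section 2.3. This is my own sketch rather than sklearn's code: it builds an `Hs`-style closure for a plain binary logistic loss (no intercept, no sample weights) and compares it against an explicitly assembled $\mathbf{X}^T \operatorname{diag}(\pi_i (1 - \pi_i)) \mathbf{X} + \alpha I$:
#
# ```python
# import numpy as np
#
#
# def logistic_hessp(w, X, y, alpha):
#     """Return a Hessian-vector product closure, mirroring Hs in
#     _logistic_grad_hess (binary case, no intercept, no sample weights)."""
#     z = 1.0 / (1.0 + np.exp(-y * X.dot(w)))      # expit(yz), cf. 206L
#     d = z * (1.0 - z)                            # pi_i * (1 - pi_i), cf. 216L
#     dX = d[:, np.newaxis] * X                    # diag(d) @ X, cf. 222L
#
#     def Hs(s):
#         return X.T.dot(dX.dot(s)) + alpha * s    # cf. 231L-232L
#
#     return Hs
#
#
# rng = np.random.RandomState(0)
# X = rng.randn(50, 3)
# y = np.sign(rng.randn(50))
# w, s = rng.randn(3), rng.randn(3)
#
# Hs = logistic_hessp(w, X, y, alpha=1.0)
#
# # Assemble the Hessian explicitly and compare the matrix-vector products.
# pi = 1.0 / (1.0 + np.exp(-X.dot(w)))
# H = X.T.dot((pi * (1 - pi))[:, np.newaxis] * X) + 1.0 * np.eye(3)
# print(np.allclose(Hs(s), H.dot(s)))              # expected: True
# ```
#
# Returning the closure instead of a matrix is what lets newton-cg work with Hessian-vector products only, without ever forming the full Hessian.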