In [1]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import SVG

A Brief Introduction to the Implementation of Logistic Regression in scikit-learn

Code version used for this analysis:

~/W/g/scikit-learn ❯❯❯ git log -n 1
commit d161bfaa1a42da75f4940464f7f1c524ef53484f
Author: John B Nelson <[email protected]>
Date:   Thu May 26 18:36:37 2016 -0400

    Add missing double quote (#6831)

0. Overview

Below is how logistic regression is organized in sklearn:

In [2]:
SVG("./res/sklearn_lr.svg")
Out[2]:
[Class diagram: LogisticRegression (fit, predict_proba, predict_log_proba) inherits from BaseEstimator (get_params, set_params, _get_param_names), LinearClassifierMixin (decision_function, predict, _predict_proba_lr), _LearntSelectorMixin (transform) and SparseCoefMixin (densify, sparsify); the supporting module-level functions live in base.py (_fit_liblinear), logistic.py (logistic_regression_path, _check_solver_option, _logistic_loss, _logistic_loss_and_grad, _multinomial_loss, _multinomial_loss_grad, _multinomial_grad_hess), optimize.py (newton_cg) and sag.py (sag_solver).]

As in "A Brief Introduction to the Implementation of Logistic Regression in Spark", we will focus on the core algorithmic code: the optimization solvers and the loss functions.

1. Optimization solvers

sklearn supports four solvers: liblinear, sag, lbfgs and newton-cg. lbfgs comes from the scipy package, liblinear wraps the LibLinear library, and the remaining two are implemented in sklearn itself. The dispatch code is easy to locate and its logic is straightforward:

        if solver == 'lbfgs':
            try:
                w0, loss, info = optimize.fmin_l_bfgs_b(
                    func, w0, fprime=None,
                    args=(X, target, 1. / C, sample_weight),
                    iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)
            except TypeError:
                # old scipy doesn't have maxiter
                w0, loss, info = optimize.fmin_l_bfgs_b(
                    func, w0, fprime=None,
                    args=(X, target, 1. / C, sample_weight),
                    iprint=(verbose > 0) - 1, pgtol=tol)
            if info["warnflag"] == 1 and verbose > 0:
                warnings.warn("lbfgs failed to converge. Increase the number "
                              "of iterations.")
            try:
                n_iter_i = info['nit'] - 1
            except:
                n_iter_i = info['funcalls'] - 1
        elif solver == 'newton-cg':
            args = (X, target, 1. / C, sample_weight)
            w0, n_iter_i = newton_cg(hess, func, grad, w0, args=args,
                                     maxiter=max_iter, tol=tol)
        elif solver == 'liblinear':
            coef_, intercept_, n_iter_i, = _fit_liblinear(
                X, target, C, fit_intercept, intercept_scaling, None,
                penalty, dual, verbose, max_iter, tol, random_state,
                sample_weight=sample_weight)
            if fit_intercept:
                w0 = np.concatenate([coef_.ravel(), intercept_])
            else:
                w0 = coef_.ravel()

        elif solver == 'sag':
            if multi_class == 'multinomial':
                target = target.astype(np.float64)
                loss = 'multinomial'
            else:
                loss = 'log'

            w0, n_iter_i, warm_start_sag = sag_solver(
                X, target, sample_weight, loss, 1. / C, max_iter, tol,
                verbose, random_state, False, max_squared_sum, warm_start_sag)
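From the user's side, the branch taken above is selected through the solver argument of LogisticRegression. A minimal sketch on made-up toy data (not from the sklearn sources):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = (X[:, 0] + 0.5 * rng.randn(100) > 0).astype(int)

for solver in ('liblinear', 'lbfgs', 'newton-cg', 'sag'):
    # each value maps to one of the branches quoted above
    clf = LogisticRegression(solver=solver, max_iter=200).fit(X, y)
    print(solver, clf.n_iter_)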

2. Loss functions

2.1 Binary classification

The binary loss and its gradient are implemented in _logistic_loss_and_grad; the math is the same as in "A Brief Introduction to the Logistic Regression Algorithm and a Python Implementation", so we will not repeat it here.
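For reference, the same math as a self-contained NumPy sketch (labels $y \in \{-1, +1\}$, no intercept or sample weights, alpha being the L2 strength $1/C$; this mirrors the formulas, not the exact sklearn code):

import numpy as np
from scipy.special import expit

def logistic_loss_and_grad(w, X, y, alpha):
    yz = y * X.dot(w)                     # margins y_i * x_i beta
    # -sum log sigmoid(yz) + L2, with logaddexp for numerical stability
    loss = np.logaddexp(0, -yz).sum() + 0.5 * alpha * w.dot(w)
    z0 = (expit(yz) - 1) * y              # d(loss)/d(margin) times y
    grad = X.T.dot(z0) + alpha * w
    return loss, grad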

2.2 Multiclass classification

For multiclass problems sklearn supports two schemes: ovr (one vs rest) and multinomial.
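Both are selected through the multi_class argument; note that in this version multinomial is only supported by the lbfgs, newton-cg and sag solvers, not by liblinear:

from sklearn.linear_model import LogisticRegression

clf_ovr = LogisticRegression(multi_class='ovr')  # the default
clf_mn = LogisticRegression(multi_class='multinomial', solver='lbfgs')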

2.2.0 ovr

The default is ovr: it trains one binary classifier per label, $K$ in total. The training code is at:

1230         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
1231                                backend=backend)(
1232             path_func(X, y, pos_class=class_, Cs=[self.C],
1233                       fit_intercept=self.fit_intercept, tol=self.tol,
1234                       verbose=self.verbose, solver=self.solver, copy=False,
1235                       multi_class=self.multi_class, max_iter=self.max_iter,
1236                       class_weight=self.class_weight, check_input=False,
1237                       random_state=self.random_state, coef=warm_start_coef_,
1238                       max_squared_sum=max_squared_sum,
1239                       sample_weight=sample_weight)
1240             for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))

Note that the for class_ in classes_ at 1240L, combined with pos_class=class_ at 1232L, is exactly the logic of training one label at a time.
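Conceptually the loop boils down to the following sketch, where fit_binary is a hypothetical stand-in for path_func, i.e. logistic_regression_path with pos_class fixed:

def one_vs_rest_fit(X, y, classes_, fit_binary):
    # one binary problem per label: class_ against everything else
    coefs = []
    for class_ in classes_:
        coefs.append(fit_binary(X, y, pos_class=class_))
    return coefs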

2.2.1 multinomial

As described above, ovr iterates over the labels and trains one classifier at a time. To stay compatible with that loop, the genuinely binary case needs a tweak:

        if len(self.classes_) == 2:
            n_classes = 1
            classes_ = classes_[1:]
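A quick way to see this tweak from the outside is the shape of coef_: two classes yield a single row of weights (for classes_[1]), while $K > 2$ classes yield $K$ rows:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(60, 4)
print(LogisticRegression().fit(X, rng.randint(2, size=60)).coef_.shape)  # (1, 4)
print(LogisticRegression().fit(X, rng.randint(3, size=60)).coef_.shape)  # (3, 4)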

Likewise, multinomial processes all labels in a single pass, so it too needs a tweak:

        # Hack so that we iterate only once for the multinomial case.
        if self.multi_class == 'multinomial':
            classes_ = [None]
            warm_start_coef = [warm_start_coef]

Next, let us look at the multinomial loss and gradient code, the function _multinomial_loss_grad.

The formulas used by the multiclass code in sklearn are the same as in "A Brief Introduction to the Logistic Regression Algorithm and a Python Implementation", namely:

\begin{align} L(\beta) &= \log \left( \sum_i e^{\beta_{i0} + \beta_i x} \right) - (\beta_{k0} + \beta_k x) \\ \frac{\partial L}{\partial \beta_k} &= x \left( \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}} - I(y = k) \right) \end{align}

Starting with the loss function:

244 def _multinomial_loss(w, X, Y, alpha, sample_weight):
 245 #+-- 37 lines: """Computes multinomial loss and class probabilities.---
 282     n_classes = Y.shape[1]
 283     n_features = X.shape[1]
 284     fit_intercept = w.size == (n_classes * (n_features + 1))
 285     w = w.reshape(n_classes, -1)
 286     sample_weight = sample_weight[:, np.newaxis]
 287     if fit_intercept:
 288         intercept = w[:, -1]
 289         w = w[:, :-1]
 290     else:
 291         intercept = 0
 292     p = safe_sparse_dot(X, w.T)
 293     p += intercept
 294     p -= logsumexp(p, axis=1)[:, np.newaxis]
 295     loss = -(sample_weight * Y * p).sum()
 296     loss += 0.5 * alpha * squared_norm(w)
 297     p = np.exp(p, p)
 298     return loss, p, w
  • 292L-293L compute $\beta_{i0} + \beta_i x$.
  • 294L computes $L(\beta)$. Note that the protection against overflow is done inside logsumexp, on the same principle as in "A Brief Introduction to the Implementation of Logistic Regression in Spark".
  • 295L sums everything up (note that each row of $Y$ is a unit vector, so it selects the term whose $k$ matches the label).
  • 296L adds the L2 penalty.
  • Note that 297L turns p back into $\frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}}$, ready to be used directly in the gradient.
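The same steps in a compact, self-contained sketch (one-hot $Y$, no intercept or sample weights; a sketch of the math, not sklearn's exact code):

import numpy as np
from scipy.special import logsumexp

def multinomial_loss(w, X, Y, alpha):
    # w: (n_classes, n_features), Y: one-hot (n_samples, n_classes)
    p = X.dot(w.T)                            # 292L-293L: per-class scores (intercept omitted)
    p -= logsumexp(p, axis=1)[:, np.newaxis]  # 294L: stable log-softmax
    loss = -(Y * p).sum()                     # 295L: Y picks out the true class k
    loss += 0.5 * alpha * (w ** 2).sum()      # 296L: L2 penalty
    return loss, np.exp(p)                    # 297L: probabilities, reused by the gradient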

Now, the gradient computation:

301 def _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
 302 #+-- 37 lines: """Computes the multinomial loss, gradient and class probabilities.---
 339     n_classes = Y.shape[1]
 340     n_features = X.shape[1]
 341     fit_intercept = (w.size == n_classes * (n_features + 1))
 342     grad = np.zeros((n_classes, n_features + bool(fit_intercept)))
 343     loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
 344     sample_weight = sample_weight[:, np.newaxis]
 345     diff = sample_weight * (p - Y)
 346     grad[:, :n_features] = safe_sparse_dot(diff.T, X)
 347     grad[:, :n_features] += alpha * w
 348     if fit_intercept:
 349         grad[:, -1] = diff.sum(axis=0)
 350     return loss, grad.ravel(), p
  • 345L-346L correspond to the gradient formula;
  • 347L adds the derivative of the L2 term;
  • 348L-349L compute the intercept part.
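And the matching gradient sketch, with a forward-difference spot check (the names come from the loss sketch above, not from sklearn; sample weights still omitted):

def multinomial_grad(w, X, Y, alpha):
    loss, p = multinomial_loss(w, X, Y, alpha)
    grad = (p - Y).T.dot(X) + alpha * w   # 345L-347L: X^T (p - Y) plus the L2 term
    return loss, grad

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
Y = np.eye(4)[rng.randint(4, size=20)]    # one-hot labels, 4 classes
w = rng.randn(4, 3)
loss, grad = multinomial_grad(w, X, Y, alpha=1.0)
e = np.zeros_like(w)
e[0, 0] = 1e-6
num = (multinomial_loss(w + e, X, Y, 1.0)[0] - loss) / 1e-6
print(abs(num - grad[0, 0]))              # tiny, up to forward-difference error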

2.3 Hessian

Note that sklearn supports Newton-type methods, which require the Hessian matrix; see the Wikipedia entry "Hessian matrix" for the definition:

\begin{equation} {\mathbf H}={\begin{bmatrix}{\dfrac {\partial ^{2}f}{\partial x_{1}^{2}}}&{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{n}}}\\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{2}^{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{n}}}\\[2.2ex]\vdots &\vdots &\ddots &\vdots \\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{n}^{2}}}\end{bmatrix}}. \end{equation}

It is simply the matrix of all second-order partial derivatives. We skip the detailed derivation; the interested reader can consult "Logistic Regression" by Jia Li and "Logistic regression: a simple ANN" by Nando de Freitas.

The key formula is $\mathbf{H} = \mathbf{X}^T \operatorname{diag}(\pi_i (1 - \pi_i)) \mathbf{X}$, where $\pi_i = \operatorname{sigm}(x_i \beta)$.
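For the binary case this follows from the gradient in one step, using $\operatorname{sigm}'(z) = \operatorname{sigm}(z)\,(1 - \operatorname{sigm}(z))$ (a sketch with $y_i \in \{0, 1\}$ and the regularizer omitted):

\begin{align} \frac{\partial L}{\partial \beta} &= \sum_i x_i (\pi_i - y_i) \\ \frac{\partial^2 L}{\partial \beta \, \partial \beta^T} &= \sum_i x_i \frac{\partial \pi_i}{\partial \beta^T} = \sum_i \pi_i (1 - \pi_i) \, x_i x_i^T = \mathbf{X}^T \operatorname{diag}(\pi_i (1 - \pi_i)) \mathbf{X} \end{align}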

167 def _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
 168 #+-- 33 lines: """Computes the gradient and the Hessian, in the case of a logistic loss.
 201     w, c, yz = _intercept_dot(w, X, y)
 202 #+--  4 lines: if sample_weight is None:---------
 206     z = expit(yz)
 207 #+--  8 lines: z0 = sample_weight * (z - 1) * y---
 215     # The mat-vec product of the Hessian
 216     d = sample_weight * z * (1 - z)
 217     if sparse.issparse(X):
 218         dX = safe_sparse_dot(sparse.dia_matrix((d, 0),
 219                              shape=(n_samples, n_samples)), X)
 220     else:
 221         # Precompute as much as possible
 222         dX = d[:, np.newaxis] * X
 223
 224     if fit_intercept:
 225         # Calculate the double derivative with respect to intercept
 226         # In the case of sparse matrices this returns a matrix object.
 227         dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))
 228
 229     def Hs(s):
 230         ret = np.empty_like(s)
 231         ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
 232         ret[:n_features] += alpha * s[:n_features]
 233
 234         # For the fit intercept case.
 235         if fit_intercept:
 236             ret[:n_features] += s[-1] * dd_intercept
 237             ret[-1] = dd_intercept.dot(s[:n_features])
 238             ret[-1] += d.sum() * s[-1]
 239         return ret
 240
 241     return grad, Hs
  • 201L, 206L and 216L compute the middle factor $\pi_i (1 - \pi_i)$.
  • 217L-222L fold that factor in as a diagonal matrix, precomputing the second half of the formula; together with 231L this completes the whole expression.
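Without the intercept and the sparse branch, Hs boils down to a few lines. A sketch that mirrors the formula rather than the exact sklearn code:

import numpy as np
from scipy.special import expit

def hessian_vector_product(w, X, alpha, s):
    pi = expit(X.dot(w))    # pi_i = sigm(x_i beta)
    d = pi * (1 - pi)       # the diagonal factor
    # H s = X^T diag(d) X s + alpha s, computed without ever forming H
    return X.T.dot(d * X.dot(s)) + alpha * s

This also explains the design: newton_cg only ever needs the Hessian applied to a vector, which is why _logistic_grad_hess returns the closure Hs instead of the full matrix.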

Here I only know that it works, not every detail of why; I will dig deeper when time permits.

3. Summary

This post gave a brief tour of the logistic regression implementation in sklearn, mapping both the binary and the multiclass code to their corresponding formulas.