In [1]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import SVG


# Notes on the Implementation of Logistic Regression in scikit-learn
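These notes are based on the following commit of the scikit-learn source tree; all quoted line numbers refer to it: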

~/W/g/scikit-learn ❯❯❯ git log -n 1
commit d161bfaa1a42da75f4940464f7f1c524ef53484f
Author: John B Nelson <[email protected]>
Date:   Thu May 26 18:36:37 2016 -0400



### 0. Overview

In [2]:
SVG("./res/sklearn_lr.svg")

Out[2]:
[SVG: overview diagram of the sklearn logistic regression code, ./res/sklearn_lr.svg]

### 1. Solvers

sklearn supports four solvers: liblinear, sag, lbfgs, and newton-cg. Of these, lbfgs comes from the scipy package and liblinear wraps the LibLinear library; the other two (newton-cg and sag, stochastic average gradient) are implemented in sklearn itself. The dispatch code is easy to locate and its logic is clear, so it needs little commentary:

704         if solver == 'lbfgs':
705             try:
706                 w0, loss, info = optimize.fmin_l_bfgs_b(
707                     func, w0, fprime=None,
708                     args=(X, target, 1. / C, sample_weight),
709                     iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)
710             except TypeError:
711                 # old scipy doesn't have maxiter
712                 w0, loss, info = optimize.fmin_l_bfgs_b(
713                     func, w0, fprime=None,
714                     args=(X, target, 1. / C, sample_weight),
715                     iprint=(verbose > 0) - 1, pgtol=tol)
716             if info["warnflag"] == 1 and verbose > 0:
717                 warnings.warn("lbfgs failed to converge. Increase the number "
718                               "of iterations.")
719             try:
720                 n_iter_i = info['nit'] - 1
721             except:
722                 n_iter_i = info['funcalls'] - 1
723         elif solver == 'newton-cg':
724             args = (X, target, 1. / C, sample_weight)
725             w0, n_iter_i = newton_cg(hess, func, grad, w0, args=args,
726                                      maxiter=max_iter, tol=tol)
727         elif solver == 'liblinear':
728             coef_, intercept_, n_iter_i, = _fit_liblinear(
729                 X, target, C, fit_intercept, intercept_scaling, None,
730                 penalty, dual, verbose, max_iter, tol, random_state,
731                 sample_weight=sample_weight)
732             if fit_intercept:
733                 w0 = np.concatenate([coef_.ravel(), intercept_])
734             else:
735                 w0 = coef_.ravel()
736
737         elif solver == 'sag':
738             if multi_class == 'multinomial':
739                 target = target.astype(np.float64)
740                 loss = 'multinomial'
741             else:
742                 loss = 'log'
743
744             w0, n_iter_i, warm_start_sag = sag_solver(
745                 X, target, sample_weight, loss, 1. / C, max_iter, tol,
746                 verbose, random_state, False, max_squared_sum, warm_start_sag)
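For reference, a minimal sketch of how these four branches are reached through the public API (the toy dataset and parameter values here are made up purely for illustration):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# Each value of `solver` dispatches to one of the four branches above.
for solver in ('liblinear', 'lbfgs', 'newton-cg', 'sag'):
    clf = LogisticRegression(solver=solver, C=1.0, max_iter=200,
                             random_state=0).fit(X, y)
    print(solver, clf.n_iter_)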


### 2. Loss Function

#### 2.2 Multiclass

sklearn supports two multiclass schemes: ovr (one vs. rest) and multinomial.
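A minimal sketch of the two schemes through the public API (iris is used only as a convenient multiclass dataset; parameter choices are illustrative):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

# ovr: one independent binary problem per class.
ovr = LogisticRegression(multi_class='ovr', solver='liblinear').fit(X, y)

# multinomial: a single joint softmax problem (needs lbfgs, newton-cg or sag).
mn = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(X, y)

# Both end up with one row of coefficients per class: (3, 4) on iris.
print(ovr.coef_.shape, mn.coef_.shape)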

##### 2.2.0 ovr

1230         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
1231                                backend=backend)(
1232             path_func(X, y, pos_class=class_, Cs=[self.C],
1233                       fit_intercept=self.fit_intercept, tol=self.tol,
1234                       verbose=self.verbose, solver=self.solver, copy=False,
1235                       multi_class=self.multi_class, max_iter=self.max_iter,
1236                       class_weight=self.class_weight, check_input=False,
1237                       random_state=self.random_state, coef=warm_start_coef_,
1238                       max_squared_sum=max_squared_sum,
1239                       sample_weight=sample_weight)
1240             for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))
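In other words, ovr dispatches one binary subproblem per class: each parallel call to path_func (a delayed wrapper around the path-fitting routine) fixes a different pos_class and reuses the matching slice of the warm-start coefficients.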


##### 2.2.1 multinomial

1201         if len(self.classes_) == 2:
1202             n_classes = 1
1203             classes_ = classes_[1:]


1217         # Hack so that we iterate only once for the multinomial case.
1218         if self.multi_class == 'multinomial':
1219             classes_ = [None]
1220             warm_start_coef = [warm_start_coef]
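So in the multinomial case the same parallel loop as in ovr runs exactly once: classes_ is collapsed to [None] and the warm-start coefficients are wrapped in a single-element list, giving one joint fit over all classes (and 1201L-1203L show that the binary case degenerates to a single coefficient vector).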


The multiclass code in sklearn uses the same formulas as the companion note 逻辑回归算法简介和Python实现, namely:

\begin{align}
L(\beta) &= \log \left( \sum_i e^{\beta_{i0} + \beta_i x} \right) - (\beta_{k0} + \beta_k x) \\
\frac{\partial L}{\partial \beta_k} &= x \left( \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}} - I(y = k) \right)
\end{align}

244 def _multinomial_loss(w, X, Y, alpha, sample_weight):
245 #+-- 37 lines: """Computes multinomial loss and class probabilities.---
282     n_classes = Y.shape[1]
283     n_features = X.shape[1]
284     fit_intercept = w.size == (n_classes * (n_features + 1))
285     w = w.reshape(n_classes, -1)
286     sample_weight = sample_weight[:, np.newaxis]
287     if fit_intercept:
288         intercept = w[:, -1]
289         w = w[:, :-1]
290     else:
291         intercept = 0
292     p = safe_sparse_dot(X, w.T)
293     p += intercept
294     p -= logsumexp(p, axis=1)[:, np.newaxis]
295     loss = -(sample_weight * Y * p).sum()
296     loss += 0.5 * alpha * squared_norm(w)
297     p = np.exp(p, p)
298     return loss, p, w

• 292L-293L compute $\beta_{i0} + \beta_i x$.
• 294L computes the log-softmax term of $L(\beta)$. Note that overflow protection happens inside the logsumexp function, on the same principle as in the companion note 逻辑回归在spark中的实现简介.
• 295L sums everything up (note that each row of $Y$ is a one-hot vector, so it selects the term for the true label $k$).
• 296L adds the L2 penalty.
• Note that 297L turns p back into $\frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}}$, so it can be reused directly when computing the gradient (see the sketch after this list).
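To make this concrete, a standalone sketch of the same loss (my own simplification: no intercept, unit sample weights, and scipy's logsumexp in place of the internal helper sklearn uses; the function name is made up):

import numpy as np
from scipy.special import logsumexp

def multinomial_loss_sketch(w, X, Y, alpha):
    # w: (n_classes * n_features,), X: (n_samples, n_features),
    # Y: (n_samples, n_classes) with one-hot rows.
    w = w.reshape(Y.shape[1], -1)
    p = X.dot(w.T)                             # beta_{i0} + beta_i x (292L-293L, sans intercept)
    p -= logsumexp(p, axis=1)[:, np.newaxis]   # stabilized log-softmax (294L)
    loss = -(Y * p).sum()                      # one-hot rows of Y pick label k (295L)
    loss += 0.5 * alpha * (w ** 2).sum()       # L2 penalty (296L)
    return loss, np.exp(p), w                  # probabilities reused by the gradient (297L)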

301 def _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
302 #+-- 37 lines: """Computes the multinomial loss, gradient and class probabilities.---
339     n_classes = Y.shape[1]
340     n_features = X.shape[1]
341     fit_intercept = (w.size == n_classes * (n_features + 1))
342     grad = np.zeros((n_classes, n_features + bool(fit_intercept)))
343     loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
344     sample_weight = sample_weight[:, np.newaxis]
345     diff = sample_weight * (p - Y)
346     grad[:, :n_features] = safe_sparse_dot(diff.T, X)
347     grad[:, :n_features] += alpha * w
348     if fit_intercept:
349         grad[:, -1] = diff.sum(axis=0)

• 345L-346L correspond to the gradient formula (see the sketch after this list);
• 347L adds the derivative of the L2 term;
• 348L-349L compute the intercept component of the gradient.
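Continuing the sketch above, the gradient then simply reuses the probabilities returned by the loss (same simplifications: no intercept, unit sample weights):

def multinomial_grad_sketch(w, X, Y, alpha):
    loss, p, w = multinomial_loss_sketch(w, X, Y, alpha)
    diff = p - Y              # softmax - I(y = k) (345L)
    grad = diff.T.dot(X)      # multiply by x and sum over samples (346L)
    grad += alpha * w         # derivative of the L2 term (347L)
    return loss, grad.ravel()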

#### 2.3 Hessian

$${\mathbf H}={\begin{bmatrix}{\dfrac {\partial ^{2}f}{\partial x_{1}^{2}}}&{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{n}}}\\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{2}^{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{n}}}\\[2.2ex]\vdots &\vdots &\ddots &\vdots \\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{n}^{2}}}\end{bmatrix}}.$$
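For the binary logistic loss with L2 penalty, this general definition specializes to $H = X^T D X + \alpha I$ with $D = \operatorname{diag}\big(\pi_i (1 - \pi_i)\big)$. Note that _logistic_grad_hess never materializes $H$: it returns a closure Hs computing the matrix-vector product $s \mapsto H s$, which is all the newton-cg solver needs.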

167 def _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
168 #+-- 33 lines: """Computes the gradient and the Hessian, in the case of a logistic loss.
201     w, c, yz = _intercept_dot(w, X, y)
202 #+--  4 lines: if sample_weight is None:---------
206     z = expit(yz)
207 #+--  8 lines: z0 = sample_weight * (z - 1) * y---
215     # The mat-vec product of the Hessian
216     d = sample_weight * z * (1 - z)
217     if sparse.issparse(X):
218         dX = safe_sparse_dot(sparse.dia_matrix((d, 0),
219                              shape=(n_samples, n_samples)), X)
220     else:
221         # Precompute as much as possible
222         dX = d[:, np.newaxis] * X
223
224     if fit_intercept:
225         # Calculate the double derivative with respect to intercept
226         # In the case of sparse matrices this returns a matrix object.
227         dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))
228
229     def Hs(s):
230         ret = np.empty_like(s)
231         ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
232         ret[:n_features] += alpha * s[:n_features]
233
234         # For the fit intercept case.
235         if fit_intercept:
236             ret[:n_features] += s[-1] * dd_intercept
237             ret[-1] = dd_intercept.dot(s[:n_features])
238             ret[-1] += d.sum() * s[-1]
239         return ret
240

• 201L, 206L, and 216L compute the intermediate quantity $\pi_i (1 - \pi_i)$; the sketch below shows the resulting matrix-vector product.
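A minimal sketch of the same Hessian-vector product (my own simplification: no intercept, unit sample weights, dense X only; the function name is made up):

import numpy as np
from scipy.special import expit

def logistic_hessp_sketch(w, X, y, alpha):
    # y takes values in {-1, +1}, as in _intercept_dot.
    z = expit(y * X.dot(w))    # pi_i of the true class (cf. 201L, 206L)
    d = z * (1 - z)            # the diagonal pi_i (1 - pi_i) (cf. 216L)

    def Hs(s):
        # H s = X^T diag(d) X s + alpha s, computed without ever forming H.
        return X.T.dot(d * X.dot(s)) + alpha * s

    return Hs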