#!/usr/bin/env python
# coding: utf-8

# In[1]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG


# A Brief Look at the Logistic Regression Implementation in scikit-learn
# =======================================================================
#
# Version of the code analyzed:
#
# ```bash
# ~/W/g/scikit-learn ❯❯❯ git log -n 1
# commit d161bfaa1a42da75f4940464f7f1c524ef53484f
# Author: John B Nelson
# Date:   Thu May 26 18:36:37 2016 -0400
#
#     Add missing double quote (#6831)
# ```

# ### 0. Overview
#
# The diagram below shows how logistic regression is organized in sklearn:

# In[2]:

SVG("./res/sklearn_lr.svg")


# As in [A Brief Look at the Logistic Regression Implementation in Spark](./spark_ml_lr.ipynb), we concentrate on the algorithmic code, namely the solvers and the loss functions.

# ### 1. Solvers
#
# sklearn supports four solvers: liblinear, sag, lbfgs, and newton-cg. The lbfgs solver comes from the scipy package, liblinear wraps the LIBLINEAR library, and the remaining two are implemented in sklearn itself. The dispatch code is easy to locate and its logic is clear:
#
# ```python
# 704         if solver == 'lbfgs':
# 705             try:
# 706                 w0, loss, info = optimize.fmin_l_bfgs_b(
# 707                     func, w0, fprime=None,
# 708                     args=(X, target, 1. / C, sample_weight),
# 709                     iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)
# 710             except TypeError:
# 711                 # old scipy doesn't have maxiter
# 712                 w0, loss, info = optimize.fmin_l_bfgs_b(
# 713                     func, w0, fprime=None,
# 714                     args=(X, target, 1. / C, sample_weight),
# 715                     iprint=(verbose > 0) - 1, pgtol=tol)
# 716             if info["warnflag"] == 1 and verbose > 0:
# 717                 warnings.warn("lbfgs failed to converge. Increase the number "
# 718                               "of iterations.")
# 719             try:
# 720                 n_iter_i = info['nit'] - 1
# 721             except:
# 722                 n_iter_i = info['funcalls'] - 1
# 723         elif solver == 'newton-cg':
# 724             args = (X, target, 1. / C, sample_weight)
# 725             w0, n_iter_i = newton_cg(hess, func, grad, w0, args=args,
# 726                                      maxiter=max_iter, tol=tol)
# 727         elif solver == 'liblinear':
# 728             coef_, intercept_, n_iter_i, = _fit_liblinear(
# 729                 X, target, C, fit_intercept, intercept_scaling, None,
# 730                 penalty, dual, verbose, max_iter, tol, random_state,
# 731                 sample_weight=sample_weight)
# 732             if fit_intercept:
# 733                 w0 = np.concatenate([coef_.ravel(), intercept_])
# 734             else:
# 735                 w0 = coef_.ravel()
# 736
# 737         elif solver == 'sag':
# 738             if multi_class == 'multinomial':
# 739                 target = target.astype(np.float64)
# 740                 loss = 'multinomial'
# 741             else:
# 742                 loss = 'log'
# 743
# 744             w0, n_iter_i, warm_start_sag = sag_solver(
# 745                 X, target, sample_weight, loss, 1. / C, max_iter, tol,
# 746                 verbose, random_state, False, max_squared_sum, warm_start_sag)
# ```
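# To make the lbfgs branch above concrete, here is a minimal standalone sketch (my own code, not sklearn's) of driving `optimize.fmin_l_bfgs_b` with a single objective that returns both the loss and the gradient, which is exactly why `fprime=None` is passed. The L2-regularized binary logistic loss and the helper name `logistic_loss_and_grad` are assumptions made for illustration:
#
# ```python
# import numpy as np
# from scipy import optimize
#
#
# def logistic_loss_and_grad(w, X, y, alpha):
#     """L2-regularized binary logistic loss with labels y in {-1, +1}.
#     Returns (loss, grad) so fmin_l_bfgs_b can be called with fprime=None."""
#     z = y * X.dot(w)
#     loss = np.logaddexp(0, -z).sum() + 0.5 * alpha * w.dot(w)
#     sig = 1.0 / (1.0 + np.exp(z))               # sigmoid(-z)
#     grad = X.T.dot(-y * sig) + alpha * w
#     return loss, grad
#
#
# rng = np.random.RandomState(0)
# X = rng.randn(100, 3)
# y = np.sign(X.dot([1.0, -2.0, 0.5]) + 0.1 * rng.randn(100))
# w0 = np.zeros(3)
# w_opt, loss, info = optimize.fmin_l_bfgs_b(
#     logistic_loss_and_grad, w0, fprime=None,
#     args=(X, y, 1.0), pgtol=1e-6, maxiter=100)
# ```
#
# The multinomial case follows the same pattern: only `func` changes, while the solver call stays as above.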
# ### 2. Loss functions
#
# #### 2.1 Binary classification
#
# The binary loss and its gradient are implemented in `_logistic_loss_and_grad`; the computation is the same as in [Logistic Regression: Introduction and Python Implementation](./demo.ipynb), so it is not repeated here.

# #### 2.2 Multiclass classification
#
# sklearn supports two multiclass schemes: ovr (one-vs-rest) and multinomial.
#
#
# ##### 2.2.0 ovr
#
# The default is ovr, which trains one binary classifier per label, $K$ in total. The training code is:
#
# ```python
# 1230         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
# 1231                                backend=backend)(
# 1232             path_func(X, y, pos_class=class_, Cs=[self.C],
# 1233                       fit_intercept=self.fit_intercept, tol=self.tol,
# 1234                       verbose=self.verbose, solver=self.solver, copy=False,
# 1235                       multi_class=self.multi_class, max_iter=self.max_iter,
# 1236                       class_weight=self.class_weight, check_input=False,
# 1237                       random_state=self.random_state, coef=warm_start_coef_,
# 1238                       max_squared_sum=max_squared_sum,
# 1239                       sample_weight=sample_weight)
# 1240             for (class_, warm_start_coef_) in zip(classes_, warm_start_coef))
# ```
#
# Note that the loop over `classes_` at 1240L, combined with `pos_class=class_` at 1232L, is what takes the labels one at a time and trains a binary classifier for each.

# ##### 2.2.1 multinomial
#
# As mentioned above, ovr iterates over the labels and trains one classifier per label. To fit into that loop, the genuinely binary case has to be adjusted:
#
# ```python
# 1201         if len(self.classes_) == 2:
# 1202             n_classes = 1
# 1203             classes_ = classes_[1:]
# ```
#
# Likewise, the multinomial case handles all labels in a single pass and also has to be adjusted:
#
# ```python
# 1217         # Hack so that we iterate only once for the multinomial case.
# 1218         if self.multi_class == 'multinomial':
# 1219             classes_ = [None]
# 1220             warm_start_coef = [warm_start_coef]
# ```
#
# Next, let's look at how the multinomial loss and gradient are computed, in the function `_multinomial_loss_grad`.
#
# The multiclass code in sklearn uses the same formulas as [Logistic Regression: Introduction and Python Implementation](./demo.ipynb), namely:
#
# \begin{align}
# L(\beta) &= \log \left( \sum_i e^{\beta_{i0} + \beta_i x} \right) - (\beta_{k0} + \beta_k x) \\
# \frac{\partial L}{\partial \beta_k} &= x \left( \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}} - I(y = k) \right) \\
# \end{align}
#
# For the loss itself:
#
# ```python
# 244 def _multinomial_loss(w, X, Y, alpha, sample_weight):
# 245 #+-- 37 lines: """Computes multinomial loss and class probabilities.---
# 282     n_classes = Y.shape[1]
# 283     n_features = X.shape[1]
# 284     fit_intercept = w.size == (n_classes * (n_features + 1))
# 285     w = w.reshape(n_classes, -1)
# 286     sample_weight = sample_weight[:, np.newaxis]
# 287     if fit_intercept:
# 288         intercept = w[:, -1]
# 289         w = w[:, :-1]
# 290     else:
# 291         intercept = 0
# 292     p = safe_sparse_dot(X, w.T)
# 293     p += intercept
# 294     p -= logsumexp(p, axis=1)[:, np.newaxis]
# 295     loss = -(sample_weight * Y * p).sum()
# 296     loss += 0.5 * alpha * squared_norm(w)
# 297     p = np.exp(p, p)
# 298     return loss, p, w
# ```
#
# + 292L-293L compute $\beta_{i0} + \beta_i x$ for every class.
# + 294L computes $L(\beta)$: subtracting the log-sum-exp turns the scores into log-probabilities. The overflow protection is done inside `logsumexp`, using the same trick described in [A Brief Look at the Logistic Regression Implementation in Spark](./spark_ml_lr.ipynb).
# + 295L sums everything up (note that each row of $Y$ is a one-hot vector, so multiplying by $Y$ selects the entry for the true label $k$).
# + 296L adds the L2 penalty.
# + Note that 297L exponentiates `p` in place, turning it back into $\frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}}$ so that the gradient computation can use it directly.
#
# Now for the gradient:
#
# ```python
# 301 def _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
# 302 #+-- 37 lines: """Computes the multinomial loss, gradient and class probabilities.---
# 339     n_classes = Y.shape[1]
# 340     n_features = X.shape[1]
# 341     fit_intercept = (w.size == n_classes * (n_features + 1))
# 342     grad = np.zeros((n_classes, n_features + bool(fit_intercept)))
# 343     loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
# 344     sample_weight = sample_weight[:, np.newaxis]
# 345     diff = sample_weight * (p - Y)
# 346     grad[:, :n_features] = safe_sparse_dot(diff.T, X)
# 347     grad[:, :n_features] += alpha * w
# 348     if fit_intercept:
# 349         grad[:, -1] = diff.sum(axis=0)
# 350     return loss, grad.ravel(), p
# ```
#
# + 345L-346L correspond to the gradient formula above;
# + 347L adds the gradient of the L2 penalty;
# + 348L-349L handle the intercept terms.
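# To check that these formulas and the code really agree, here is a small standalone sketch (my own code, not sklearn's): it recomputes the multinomial loss and gradient with plain NumPy, ignoring the intercept and sample weights, and verifies the analytic gradient numerically with `scipy.optimize.check_grad`. The helper names are assumptions for illustration only:
#
# ```python
# import numpy as np
# from scipy.optimize import check_grad
#
#
# def logsumexp_rows(a):
#     """Row-wise log-sum-exp with the usual max-shift to avoid overflow,
#     i.e. the trick hidden inside logsumexp at 294L."""
#     m = a.max(axis=1, keepdims=True)
#     return m + np.log(np.exp(a - m).sum(axis=1, keepdims=True))
#
#
# def multinomial_loss_grad(w, X, Y, alpha):
#     """Softmax loss and gradient for a flattened (n_classes, n_features)
#     weight vector: a stripped-down analogue of _multinomial_loss_grad."""
#     n_classes, n_features = Y.shape[1], X.shape[1]
#     W = w.reshape(n_classes, n_features)
#     p = X.dot(W.T)                           # beta_i x, cf. 292L
#     p -= logsumexp_rows(p)                   # log-probabilities, cf. 294L
#     loss = -(Y * p).sum() + 0.5 * alpha * (W ** 2).sum()
#     p = np.exp(p)                            # back to probabilities, cf. 297L
#     grad = (p - Y).T.dot(X) + alpha * W      # cf. 345L-347L
#     return loss, grad.ravel()
#
#
# rng = np.random.RandomState(0)
# X = rng.randn(20, 4)
# Y = np.eye(3)[rng.randint(0, 3, size=20)]    # one-hot targets, one unit vector per row
# w0 = rng.randn(3 * 4)
# err = check_grad(lambda w: multinomial_loss_grad(w, X, Y, 1.0)[0],
#                  lambda w: multinomial_loss_grad(w, X, Y, 1.0)[1], w0)
# print(err)                                   # should be tiny, around 1e-6
# ```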
# #### 2.3 Hessian
#
# Since sklearn supports Newton's method (the newton-cg solver), the Hessian matrix is needed. See Wikipedia's [Hessian matrix](https://en.wikipedia.org/wiki/Hessian_matrix) for the definition,
#
# \begin{equation}
# {\mathbf H}={\begin{bmatrix}{\dfrac {\partial ^{2}f}{\partial x_{1}^{2}}}&{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{1}\,\partial x_{n}}}\\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{2}^{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{2}\,\partial x_{n}}}\\[2.2ex]\vdots &\vdots &\ddots &\vdots \\[2.2ex]{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{1}}}&{\dfrac {\partial ^{2}f}{\partial x_{n}\,\partial x_{2}}}&\cdots &{\dfrac {\partial ^{2}f}{\partial x_{n}^{2}}}\end{bmatrix}}.
# \end{equation}
#
# It is simply the matrix of all second-order partial derivatives. The derivation is omitted here; if interested, see [Logistic Regression - Jia Li](http://sites.stat.psu.edu/~jiali/course/stat597e/notes2/logit.pdf) or [Logistic regression: a simple ANN, Nando de Freitas](https://www.cs.ox.ac.uk/people/nando.defreitas/machinelearning/lecture6.pdf).
#
# The core formula (ignoring the L2 term and the intercept) is $\mathbf{H} = \mathbf{X}^T \operatorname{diag}(\pi_i (1 - \pi_i)) \mathbf{X}$, where $\pi_i = \operatorname{sigm}(x_i \beta)$.
#
# ```python
# 167 def _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
# 168 #+-- 33 lines: """Computes the gradient and the Hessian, in the case of a logistic loss.
# 201     w, c, yz = _intercept_dot(w, X, y)
# 202 #+--  4 lines: if sample_weight is None:---------
# 206     z = expit(yz)
# 207 #+--  8 lines: z0 = sample_weight * (z - 1) * y---
# 215     # The mat-vec product of the Hessian
# 216     d = sample_weight * z * (1 - z)
# 217     if sparse.issparse(X):
# 218         dX = safe_sparse_dot(sparse.dia_matrix((d, 0),
# 219                              shape=(n_samples, n_samples)), X)
# 220     else:
# 221         # Precompute as much as possible
# 222         dX = d[:, np.newaxis] * X
# 223
# 224     if fit_intercept:
# 225         # Calculate the double derivative with respect to intercept
# 226         # In the case of sparse matrices this returns a matrix object.
# 227         dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))
# 228
# 229     def Hs(s):
# 230         ret = np.empty_like(s)
# 231         ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
# 232         ret[:n_features] += alpha * s[:n_features]
# 233
# 234         # For the fit intercept case.
# 235         if fit_intercept:
# 236             ret[:n_features] += s[-1] * dd_intercept
# 237             ret[-1] = dd_intercept.dot(s[:n_features])
# 238             ret[-1] += d.sum() * s[-1]
# 239         return ret
# 240
# 241     return grad, Hs
# ```
#
# + 201L, 206L, and 216L compute the middle factor $\pi_i (1 - \pi_i)$.
# + 217L-222L precompute the right half of the formula, $\operatorname{diag}(d)\,\mathbf{X}$ (as a sparse diagonal product or a broadcast); combined with the `X.T.dot(...)` at 231L this yields the whole expression. Note that the Hessian is never formed explicitly: `Hs` only returns Hessian-vector products, which is all newton-cg needs. 232L adds the L2 term, and 235L-238L handle the intercept.
#
# I only understand this part at a surface level for now; I'll dig deeper when I have time. A quick numerical check of the Hessian-vector product pattern is sketched in the appendix cell at the end of this notebook.

# ### 3. Summary
#
# This note briefly walked through the logistic regression implementation in sklearn, mapping the binary and multiclass code to the corresponding formulas.

# In[ ]:
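# Appendix: a quick numerical check of the Hessian-vector product pattern from section 2.3. This is my own sketch rather than sklearn's code: it builds an `Hs`-style closure for a plain binary logistic loss (no intercept, no sample weights) and compares it against an explicitly assembled $\mathbf{X}^T \operatorname{diag}(\pi_i (1 - \pi_i)) \mathbf{X} + \alpha I$:
#
# ```python
# import numpy as np
#
#
# def logistic_hessp(w, X, y, alpha):
#     """Return a Hessian-vector product closure, mirroring Hs in
#     _logistic_grad_hess (binary case, no intercept, no sample weights)."""
#     z = 1.0 / (1.0 + np.exp(-y * X.dot(w)))      # expit(yz), cf. 206L
#     d = z * (1.0 - z)                            # pi_i * (1 - pi_i), cf. 216L
#     dX = d[:, np.newaxis] * X                    # diag(d) @ X, cf. 222L
#
#     def Hs(s):
#         return X.T.dot(dX.dot(s)) + alpha * s    # cf. 231L-232L
#
#     return Hs
#
#
# rng = np.random.RandomState(0)
# X = rng.randn(50, 3)
# y = np.sign(rng.randn(50))
# w, s = rng.randn(3), rng.randn(3)
#
# Hs = logistic_hessp(w, X, y, alpha=1.0)
#
# # Assemble the Hessian explicitly and compare the matrix-vector products.
# pi = 1.0 / (1.0 + np.exp(-X.dot(w)))
# H = X.T.dot((pi * (1 - pi))[:, np.newaxis] * X) + 1.0 * np.eye(3)
# print(np.allclose(Hs(s), H.dot(s)))              # expected: True
# ```
#
# Returning the closure instead of a matrix is what lets newton-cg work with Hessian-vector products only, without ever forming the full Hessian.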