In [34]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy as sp

import pandas as pd
pd.options.display.max_rows = 20


# 逻辑回归算法简介和Python实现¶

### 0. 实验数据¶

In [35]:
names = [("x", k) for k in range(8)] + [("y", 8)]

Out[35]:
x y
0 1 2 3 4 5 6 7 8
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1

### 1. 二分类¶

#### 1.0 基本原理¶

In [36]:
x = np.linspace(-1.5, 1.5, 1000)

y1 = 0.5 * x + 0.5
y2 = sp.special.expit(5 * x)

pd.DataFrame({'linear': y1, 'logistic regression': y2}).plot()

Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x1146cca20>

$$g(x) = \sigma(f(x)) = \frac1{1 + e^{-(w^T x + b)}}$$

##### 1.0.0 损失函数¶

\begin{align} g(x) & \to 1 \implies y = 1 \\ 1 - g(x) & \to 1 \implies y = 0 \end{align}

\begin{align} P(y = 1 | x, w) &= g(x) & &= \frac1{1 + e^{-z}} &= \frac1{1 + e^{-y z}} \\ P(y = 0 | x, w) &= 1 - g(x) &= 1 - \frac1{1 + e^{-z}} &= \frac1{1 + e^z} &= \frac1{1 + e^{-y z}} \\ \end{align}

$${\mathcal {L}}(\theta \,;\,x_{1},\ldots ,x_{n})=f(x_{1},x_{2},\ldots ,x_{n}\mid \theta )=\prod _{i=1}^{n}f(x_{i}\mid \theta )$$

\begin{align} w &= \operatorname{arg \ max} \prod_i^n P(y_i | x_i, w) \\ &= \operatorname{arg \ min} - \log \left ( \prod_i^n P(y_i | x_i, w) \right ) \quad \text{用negative log likelihood转成极小值} \\ & = \operatorname{arg \ min} - \log \left ( \prod_i^n \frac1{1 + e^{-y z}} \right ) \\ & = \operatorname{arg \ min} \sum_i^n - \log \left ( \frac1{1 + e^{-y z}} \right ) \\ & = \operatorname{arg \ min} \sum_i^n \log ( 1 + e^{-y z} ) \\ \end{align}

$$L(w) = \log (1 + e^{-y z}) = \log \left ( 1 + e^{-y (w^T x + b)} \right )$$

##### 1.0.1 一阶导数¶

\begin{align} \frac{\partial L}{\partial w} &= \frac1{1 + e^{-y (w^T x + b)}} \cdot e^{-yb} \cdot -yx e^{-y w^T x} \\ &= \frac{e^{-y (w^T x + b)}}{1 + e^{-y (w^T x + b)}} \cdot -y x \\ &= -y \left ( 1 - \frac1{1 + e^{-y (w^T x + b)}} \right ) x \end{align}

#### 1.1 实现演示¶

\begin{align} L(w) &= \log \left ( 1 + e^{-y (w^T x + b)} \right ) \\ \frac{\partial L}{\partial w} &= -y \left ( 1 - \frac1{1 + e^{-y (w^T x + b)}} \right ) x \end{align}

\begin{align} L(w) &= \sum_i^n \log \left ( 1 + e^{-y (w^T x + b)} \right ) \\ \frac{\partial L}{\partial w} &= \sum_i^n -y \left ( 1 - \frac1{1 + e^{-y (w^T x + b)}} \right ) x \end{align}

\begin{align} z &= \exp \left ( -y \cdot (X \times w) \right ) \\ L(w) &= \sum \log (1 + z) \\ \frac{\partial L}{\partial w} &= (-y \cdot (1 - \frac1{1 + z}))^T \times X \end{align}

In [37]:
def logit_loss_and_grad(w, X, y):
w = w[:, None] if len(w.shape) == 1 else w

z = np.exp(np.multiply(-y, np.dot(X, w)))
loss = np.sum(np.log1p(z))
grad = np.dot((np.multiply(-y, (1 - 1 / (1 + z)))).T, X)


In [38]:
# 测试数据
X = df["x"].as_matrix()
# 标签
y = df["y"].as_matrix()

In [39]:
# 初始权重值
w0 = np.zeros(X.shape[1])

In [40]:
# 演示一轮损失函数和导数值

Out[40]:
(532.33703467003795,
array([  -652.   , -18928.5  ,  -9490.5  ,  -2970.   , -13445.   ,
-4709.1  ,    -73.767,  -4967.   ]))
In [41]:
# 调过数值寻优方法，求解得到w

(w, loss, info) = sp.optimize.fmin_l_bfgs_b(logit_loss_and_grad, w0, args=(X, y))

w

Out[41]:
array([ 0.02490757,  0.72310265,  0.36255412,  0.11345933,  0.51362311,
0.17989607,  0.00281803,  0.18974831])
In [42]:
# 预测概率值
y_pred_probability = 1 / (1 + np.exp(np.multiply(-y, np.dot(X, w[:, None]))))
# 预测结果
y_pred = (y_pred_probability >= 0.5).astype(int)

In [43]:
from sklearn.metrics import accuracy_score, auc, precision_score

In [44]:
# 预测准确度
accuracy_score(y, y_pred)

Out[44]:
0.34895833333333331
In [45]:
auc(y, y_pred_probability, reorder=True)

Out[45]:
0.75

### 2. 多分类¶

#### 2.0 基本原理¶

In mathematics, the softmax function, or normalized exponential function, is a generalization of the logistic function）$\sigma (\mathbf {z} )_{j}={\frac {e^{z_{j}}}{\sum _{k=1}^{K}e^{z_{k}}}}$

\begin{align} \log \frac{P(y = 1 | x)}{P(y = 0 | x)} &= \beta_{10} + \beta_1 x \\ \cdots \\ \log \frac{P(y = K - 1 | x)}{P(y = 0 | x)} &= \beta_{(K-1)0} + \beta_{K-1} x \\ \end{align}

\begin{align} \log \frac{P(y = 0 | x)}{P(y = 0 | x)} &= \log(1) \\ & = 0 \\ & = 0 + [0, 0, \dots, 0] x \\ & = \beta_{00} + \beta_0 x \quad \text{令$\beta_0$是零阵} \\ \end{align}

$$\log \frac{P(y = k | x)}{P(y = 0 | x)} = \beta_{k0} + \beta_k x$$

$$P(y = k | x) = \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}}$$

\begin{align} L(\beta) &= -log P(y = k | x, \beta) \\ &= \log(\sum_i e^{\beta_{i0} + \beta_i x)}) - (\beta_{k0} + \beta_k x) \end{align}

\begin{align} \frac{\partial L}{\partial \beta} &= \frac1{\sum_i e^{\beta_{i0} + \beta_i x} x} e^{\beta_{k0} + \beta_k x} - x I(y = k) \\ &= x \left ( \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}} - I(y = k) \right ) \\ \end{align}

#### 2.1 特例¶

\begin{align} P(y = 1 | x) &= \frac{e^{\beta_{k0} + \beta_k x}}{\sum_i e^{\beta_{i0} + \beta_i x}} |_{k = 1} \\ &= \frac{e^{\beta_{10} + \beta_1 x}}{e^{\beta_{00} + \beta_0 x} + e^{\beta_{10} + \beta_1 x}} \\ &= \frac{e^{\beta_{10} + \beta_1 x}}{1 + e^{\beta_{10} + \beta_1 x}} \\ &= \frac1{1 + e^{- (\beta_{10} + \beta_1 x)}} \\ &= \frac1{1 + e^{- (w^T x + b)}} \\ \end{align}

### 3.0 小结¶

In [ ]: