### 一.简介¶

（1）分得对：怎么能让超平面将正负样本分得开；
（2）分得好：怎么能让距离超平面最近的点到超平面的距离尽可能的大。

$$f(x)=sign(w^Tx+b)$$

（1）怎么度量距离？ （2）距离超平面最近的点如何定义？

$$d=\frac{|w^Tx+b|}{||w||}$$

$$d^*=\frac{1}{||w||}$$

$$\min_{w,b} \frac{1}{2}w^Tw \\ s.t.y_i(w^Tx_i+b)\geq 1,i=1,2,...,N$$

### 二.原优化问题的对偶问题¶

$$L(w,b,\alpha)=\frac{1}{2}w^Tw+\sum_{i=1}^N \alpha_i(1-y_i(w^Tx_i+b)),\alpha=[\alpha_1,\alpha_2,...,\alpha_N]$$

$$\min_{w,b}\max_{\alpha}L(w,b,\alpha)\\ s.t.\alpha_i\geq 0,i=1,2,...,N$$

$$\sum_{i=1}^N\alpha_i^*(1-y_i({w^*}^Tx_i+b^*))=0(条件1)$$

$$1-y_i({w^*}^Tx_i+b^*)\leq0,i=1,2,...,N(条件2)\\ \alpha_i^*\geq0,i=1,2,...,N(条件3)\\$$

$$\alpha_i^*(1-y_i({w^*}^Tx_i+b^*))=0,i=1,2,...,N(条件4)$$

$$\forall \alpha_i^*>0\Rightarrow 1-y_i({w^*}^Tx_i+b^*)=0(关系1)\\ \forall 1-y_i({w^*}^Tx_i+b^*)<0\Rightarrow \alpha_i^*=0(关系2)$$

$$\max_{\alpha}\min_{w,b}L(w,b,\alpha)\\ s.t.\alpha_i\geq 0,i=1,2,...,N$$

$$w=\sum_{i=1}^N\alpha_iy_ix_i(条件5)\\ 0=\sum_{i=1}^N\alpha_iy_i(条件6)$$

$$\max_{\alpha} \sum_{i=1}^N\alpha_i-\frac{1}{2}\sum_{i=1}^N\sum_{j=1}^N\alpha_i\alpha_jy_iy_jx_i^Tx_j\\ s.t.\sum_{i=1}^N\alpha_iy_i=0,\\ \alpha_i\geq0,i=1,2,...,N$$

$$\min_{\alpha} \frac{1}{2}\sum_{i=1}^N\sum_{j=1}^N\alpha_i\alpha_jy_iy_jx_i^Tx_j-\sum_{i=1}^N\alpha_i\\ s.t.\sum_{i=1}^N\alpha_iy_i=0,\\ \alpha_i\geq0,i=1,2,...,N$$

$$w^*=\sum_{i=1}^N\alpha_i^*y_ix_i$$

### 三.SMO求解对偶问题最优解¶

$$\alpha_1=-y_1\sum_{i=2}^N\alpha_iy_i(关系3)$$

$$\min_{\alpha_1,\alpha_2}\frac{1}{2}\alpha_1^2 x_1^Tx_1+\frac{1}{2}\alpha_2^2x_2^Tx_2+\alpha_1\alpha_2y_1y_2x_1^Tx_2+\alpha_1y_1x_1^T\sum_{i=3}^N\alpha_iy_ix_i+\alpha_2y_2x_2^T\sum_{i=3}^N\alpha_iy_ix_i-\alpha_1-\alpha_2\\ s.t.\alpha_1y_1+\alpha_2y_2=-\sum_{i=3}^N\alpha_iy_i=\eta\\ \alpha_1\geq0,\alpha_2\geq0$$

$$\min_{\alpha_2}\frac{1}{2}(x_1-x_2)^T(x_1-x_2)\alpha_2^2+(-y_2\eta x_1^Tx_1+y_2\eta x_1^Tx_2+y_2x_2^T\gamma-y_2x_1^T\gamma-1+y_1y_2)\alpha_2\\ s.t.\alpha_2\geq0,y_1(\eta-\alpha_2y_2)\geq0$$

$$\alpha_2^{unc}=-\frac{-y_2\eta x_1^Tx_1+y_2\eta x_1^Tx_2+y_2x_2^T\gamma-y_2x_1^T\gamma-1+y_1y_2}{(x_1-x_2)^T(x_1-x_2)}(公式1)$$

$$g(x)=\sum_{i=1}^N\alpha_iy_ix_i^Tx+b$$

$$E_i=g(x_i)-y_i$$

$$x_1^T\gamma=g(x_1)-\alpha_1^{old}y_1x_1^Tx_1-\alpha_2^{old}y_2x_2^Tx_1-b^{old}\\ x_2^T\gamma=g(x_2)-\alpha_1^{old}y_1x_1^Tx_2-\alpha_2^{old}y_2x_2^Tx_2-b^{old}$$

$$\eta=\alpha_1^{old}y_1+\alpha_2^{old}y_2$$

$$\alpha_2^{unc}=\alpha_2^{old}+\frac{y_2(E_1^{old}-E_2^{old})}{\beta}$$

$$\alpha_2^{new}=\left\{\begin{matrix} \alpha_2^{unc} & \alpha_2^{unc}\geq max\{0,\alpha_2^{old}-\alpha_1^{old}\}\\ max\{0,\alpha_2^{old}-\alpha_1^{old}\} & \alpha_2^{unc}< max\{0, \alpha_2^{old}-\alpha_1^{old}\} \end{matrix}\right.$$

$$\alpha_1^{new}=\alpha_1^{old}+y_1y_2(\alpha_2^{old}-\alpha_2^{new})$$

$$w^{new}=w^{old}+(\alpha_1^{new}-\alpha_1^{old})y_1x_1+(\alpha_2^{new}-\alpha_2^{old})y_2x_2$$

$${w^{new}}^Tx_1+b=y_1(关系4)\\ {w^{new}}^Tx_2+b=y_2(关系5)\\$$

$$b^{new}=\frac{b_1^{new}+b_2^{new}}{2}$$

$$E_1^{new}={w^{new}}^Tx_1+b^{new}-y_1\\ E_2^{new}={w^{new}}^Tx_2+b^{new}-y_2$$

$\alpha_1$的选择

$$\left\{\begin{matrix} \alpha_i=0\Leftrightarrow y_i(w^Tx_i+b)\geq1\\ \alpha_i>0\Leftrightarrow y_i(w^Tx_i+b)=1 \end{matrix}\right.$$

$\alpha_2$的选择

### 四.代码实现¶

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import copy
import random
import os
os.chdir('../')
from ml_models import utils
%matplotlib inline

In [2]:
#定义一个绘制决策边界以及支持向量的函数（并放到utils中）
def plot_decision_function(X, y, clf, support_vectors=None):
    """Plot clf's decision regions over a 2-D dataset and, optionally,
    circle the support vectors (this helper is also exported to utils).

    :param X: ndarray, shape (n_samples, 2)
    :param y: labels used only for coloring the scatter points
    :param clf: fitted classifier exposing ``predict``
    :param support_vectors: optional index array of support-vector rows
    """
    step = 0.02
    x0_lo, x0_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    x1_lo, x1_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
    grid_x, grid_y = np.meshgrid(np.arange(x0_lo, x0_hi, step),
                                 np.arange(x1_lo, x1_hi, step))

    # classify every grid point, then shade the regions
    preds = clf.predict(np.c_[grid_x.ravel(), grid_y.ravel()]).reshape(grid_x.shape)
    plt.contourf(grid_x, grid_y, preds, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k')
    # highlight the support vectors with hollow red circles
    if support_vectors is not None:
        plt.scatter(X[support_vectors, 0], X[support_vectors, 1],
                    s=80, c='none', alpha=0.7, edgecolor='red')

In [3]:
"""

"""
class HardMarginSVM(object):
    """
    Hard-margin linear SVM trained with a simplified SMO procedure.

    ``fit`` expects labels in {0, 1}; they are mapped to {-1, +1}
    internally.  No slack variables (no C), so the data must be
    linearly separable for training to converge.
    """

    def __init__(self, epochs=100):
        """
        :param epochs: maximum number of full passes over the training set
        """
        self.w = None       # weight vector, shape (n_features,)
        self.b = None       # bias term
        self.alpha = None   # Lagrange multipliers, shape (n_samples,)
        self.E = None       # cached errors E_i = g(x_i) - y_i
        self.epochs = epochs
        # indices of the support vectors (filled in by fit)
        self.support_vectors = None

    def init_params(self, X, y):
        """
        Initialize w, b, alpha and the error cache E.

        :param X: ndarray, shape (n_samples, n_features)
        :param y: ndarray, shape (n_samples,), labels already in {-1, +1}
        """
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = .0
        self.alpha = np.zeros(n_samples)
        # vectorized error cache; with w = 0, b = 0 this is simply -y
        self.E = X.dot(self.w) + self.b - y

    def _select_j(self, best_i):
        """
        Select the second index j of the SMO working pair.

        Among samples with alpha_j > 0, prefer the j maximizing
        |E_i - E_j|; otherwise fall back to a random index != best_i.

        :param best_i: index of the first (KKT-violating) sample
        :return: chosen index j
        """
        valid_j_list = [j for j in range(len(self.alpha)) if self.alpha[j] > 0 and j != best_i]
        if valid_j_list:
            # max() returns the first maximal element, so ties are handled
            # deterministically.  (The old "best_j = -1" scan required a
            # strictly positive |E_i - E_j| and, when all differences were
            # zero, silently indexed the LAST sample via alpha[-1].)
            return max(valid_j_list, key=lambda j: np.abs(self.E[best_i] - self.E[j]))
        # no positive multiplier available yet: pick any other index at random
        candidates = list(range(len(self.alpha)))
        candidates.remove(best_i)
        return random.choice(candidates)

    def _meet_kkt(self, w, b, x_i, y_i, alpha_i):
        """
        Check the hard-margin KKT conditions for one sample:
          alpha_i = 0  <=>  y_i (w^T x_i + b) >= 1
          alpha_i > 0  <=>  y_i (w^T x_i + b) == 1 (within tolerance)

        :return: True if sample i satisfies its KKT condition
        """
        if alpha_i < 1e-7:
            return y_i * (np.dot(w, x_i) + b) >= 1
        else:
            return abs(y_i * (np.dot(w, x_i) + b) - 1) < 1e-7

    def fit(self, X, y2, show_train_process=False):
        """
        Train the SVM via pairwise SMO updates.

        :param X: ndarray, shape (n_samples, n_features)
        :param y2: ndarray, shape (n_samples,), labels in {0, 1}
        :param show_train_process: if True, plot the decision boundary after
               each pair update (requires the project's ml_models.utils)
        :return: None (learned state stored on self)
        """
        # work on a copy so the caller's labels are not mutated
        y = copy.deepcopy(y2)
        y[y == 0] = -1
        self.init_params(X, y)
        for _ in range(self.epochs):
            if_all_match_kkt = True
            for i in range(len(self.alpha)):
                x_i = X[i, :]
                y_i = y[i]
                alpha_i_old = self.alpha[i]
                E_i_old = self.E[i]
                # outer loop: pick a sample i that violates KKT
                if not self._meet_kkt(self.w, self.b, x_i, y_i, alpha_i_old):
                    if_all_match_kkt = False
                    # inner loop: pick j maximizing |E_i - E_j|
                    best_j = self._select_j(i)

                    alpha_j_old = self.alpha[best_j]
                    x_j = X[best_j, :]
                    y_j = y[best_j]
                    E_j_old = self.E[best_j]

                    # 1. unclipped optimum for alpha_j
                    eta = np.dot(x_i - x_j, x_i - x_j)
                    # skip near-identical points: eta ~ 0 would blow up
                    if eta < 1e-3:
                        continue
                    alpha_j_unc = alpha_j_old + y_j * (E_i_old - E_j_old) / eta
                    # 2. clip alpha_j to the feasible segment
                    if y_i == y_j:
                        # alpha_i + alpha_j is conserved
                        upper = alpha_i_old + alpha_j_old
                        if alpha_j_unc < 0:
                            alpha_j_new = 0
                        elif alpha_j_unc <= upper:
                            alpha_j_new = alpha_j_unc
                        else:
                            alpha_j_new = upper
                    else:
                        # alpha_j - alpha_i is conserved
                        lower = max(0, alpha_j_old - alpha_i_old)
                        alpha_j_new = alpha_j_unc if alpha_j_unc >= lower else lower

                    # skip negligible updates
                    if np.abs(alpha_j_new - alpha_j_old) < 1e-5:
                        continue
                    # 3. alpha_i from the equality constraint
                    alpha_i_new = alpha_i_old + y_i * y_j * (alpha_j_old - alpha_j_new)
                    # 4. update w
                    self.w = self.w + (alpha_i_new - alpha_i_old) * y_i * x_i \
                             + (alpha_j_new - alpha_j_old) * y_j * x_j
                    # 5. store the new multipliers
                    self.alpha[i] = alpha_i_new
                    self.alpha[best_j] = alpha_j_new
                    # 6. update b from whichever updated point is on the margin
                    b_i_new = y_i - np.dot(self.w, x_i)
                    b_j_new = y_j - np.dot(self.w, x_j)
                    if alpha_i_new > 0:
                        self.b = b_i_new
                    elif alpha_j_new > 0:
                        self.b = b_j_new
                    else:
                        self.b = (b_i_new + b_j_new) / 2.0
                    # 7. refresh the whole error cache (vectorized)
                    self.E = X.dot(self.w) + self.b - y
                    # optionally visualize this update step
                    if show_train_process is True:
                        utils.plot_decision_function(X, y2, self, [i, best_j])
                        utils.plt.pause(0.1)
                        utils.plt.clf()

            # stop early once every sample satisfies KKT
            if if_all_match_kkt is True:
                break
        # support vectors = samples with non-negligible multipliers
        self.support_vectors = np.where(self.alpha > 1e-3)[0]
        # re-estimate b as the average over all support vectors
        self.b = np.mean([y[s] - np.dot(self.w, X[s, :]) for s in self.support_vectors.tolist()])
        # show the final result
        if show_train_process is True:
            utils.plot_decision_function(X, y2, self, self.support_vectors)
            utils.plt.show()

    def get_params(self):
        """
        :return: (w, b) of the learned separating hyperplane
        """

        return self.w, self.b

    def predict_proba(self, x):
        """
        :param x: ndarray, shape (m, n)
        :return: probability-like scores in (0, 1), shape (m,)
        """
        z = x.dot(self.w) + self.b
        # Inline, overflow-safe logistic function instead of utils.sigmoid.
        # NOTE(review): assumes utils.sigmoid is the standard logistic
        # 1/(1+e^-z) -- confirm against ml_models.utils.
        return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))

    def predict(self, x):
        """
        :param x: ndarray, shape (m, n)
        :return: predicted labels in {0, 1}, shape (m,)
        """
        proba = self.predict_proba(x)
        return (proba >= 0.5).astype(int)


### 五.效果检验¶

In [9]:
from sklearn.datasets import make_classification
# Generate a 2-D binary classification dataset; class_sep=2.0 keeps the two
# clusters well separated so the hard-margin assumption (linear separability)
# is likely to hold.
data, target = make_classification(n_samples=100, n_features=2, n_classes=2, n_informative=1, n_redundant=0,
n_repeated=0, n_clusters_per_class=1, class_sep=2.0)

In [10]:
# Visualize the generated 2-D data, colored by class label
plt.scatter(data[:,0],data[:,1],c=target)

Out[10]:
<matplotlib.collections.PathCollection at 0x19e700bb390>
In [11]:
# Train the hard-margin SVM on the generated data
svm = HardMarginSVM()
svm.fit(data, target)

In [12]:
# Plot the learned decision boundary and circle the support vectors
plot_decision_function(data, target, svm, svm.support_vectors)

In [8]:
# Visualize the training process; better run in PyCharm -- in a notebook this
# produces many intermediate figures.
# svm = HardMarginSVM()
# svm.fit(data, target,show_train_process=True)


### 六.问题讨论¶

#### 2.原问题本就是凸优化问题，为何还要转对偶问题求解？¶

In [ ]: