### 一.E步：确定隐变量，写出Q函数¶

$$Z_{i,k}=\left\{\begin{matrix} 1 & 第i个观测来自第k个NB模型\\ 0 & 否则 \end{matrix}\right.,i=1,2,...,M,k=1,2,...,K$$

$$P(X_i,Z_{i,k}=1\mid \theta)=\alpha_k\prod_{j=1}^Np_k(X_i^j)$$

$$P(Z_{i,k}=1\mid X_i,\theta^t)=\frac{\alpha_k^t\prod_{j=1}^Np_k^t(X_i^j)}{\sum_{l=1}^K\alpha_l^t\prod_{j=1}^Np_l^t(X_i^j)}=w_{i,k}^t$$

ps：注意有这样一个性质$\sum_{k=1}^Kw_{i,k}^t=1$

$$Q(\theta \mid \theta^t)=\sum_{Z_{i,k}}logP(X_i,Z_{i,k}=1\mid \theta)\cdot P(Z_{i,k}=1\mid X_i,\theta^t)\\ =\sum_{i=1}^M\sum_{k=1}^Kw_{i,k}^t[log\alpha_k+\sum_{j=1}^Nlogp_k(X_i^j)]\\ =\sum_{i=1}^M\sum_{k=1}^Kw_{i,k}^tlog\alpha_k+\sum_{k=1}^K\sum_{i=1}^Mw_{i,k}^t\sum_{j=1}^N\sum_{l=1}^{S(j)}logp_k(X_i^j)^{I(X_i^j=f_{j,l})}\\ s.t. \sum_{k=1}^K\alpha_k=1\\ \sum_{l=1}^{S(j)} p_k(X_i^j=f_{j,l})=1$$

### 二.M步：极大化Q函数¶

$\alpha_k$与$p_k(X_i^j)$相互没有关联，我们可以将$Q$函数拆解为两个目标函数的极大化过程，由于均含有约束条件，所以需要转换为拉格朗日函数，通过求KKT条件的方式来求解；

#### 求$\alpha_k^{t+1}$¶

$$L(\alpha,\beta)=-\sum_{i=1}^M\sum_{k=1}^Kw_{i,k}^tlog\alpha_k+\beta(1-\sum_{k=1}^K\alpha_k),\alpha=\{\alpha_1,\alpha_2,...,\alpha_K\}$$

$$\alpha^*=\arg\min_{\alpha}\max_{\beta}L(\alpha,\beta)$$

$$\left\{\begin{matrix} \frac{\partial L(\alpha,\beta)}{\partial \alpha_k}=\frac{-\sum_{i=1}^Mw_{i,k}^t}{\alpha_k}-\beta=0 & k=1,2,...,K\\ 1-\sum_{k=1}^K\alpha_k=0 & \end{matrix}\right. \Rightarrow\alpha_k^{t+1}=\frac{\sum_{i=1}^Mw_{i,k}^t}{M}$$

#### 求$p_k(X_i^j)^{t+1}$¶

$$L(p,\beta)=-\sum_{k=1}^K\sum_{i=1}^Mw_{i,k}^t\sum_{j=1}^N\sum_{l=1}^{S(j)}logp_k(X_i^j)^{I(X_i^j=f_{j,l})}+\sum_{k=1}^K\sum_{j=1}^N\beta_{k,j}(1-\sum_{l=1}^{S(j)} p_k(X_i^j=f_{j,l}))$$

$$p^*=arg\min_p\max_\beta L(p,\beta)$$

$$\left\{\begin{matrix} \frac{\partial L(p,\beta)}{\partial p_k(X_i^j=f_{j,l})} =-\frac{\sum_{i=1}^Mw_{i,k}^tI(X_i^j=f_{j,l})}{p_k(X_i^j=f_{j,l})}-\beta_{k,j}=0&k=1,2,...,K,j=1,2,...,N \\ 1-\sum_{l=1}^{S(j)}p_k(X_i^j=f_{j,l})=0 & k=1,2,...,K,j=1,2,...,N \end{matrix}\right.\\ \Rightarrow p_k(X_i^j=f_{j,l})^{t+1}=\frac{\sum_{i=1}^Mw_{i,k}^tI(X_i^j=f_{j,l})}{\sum_{i=1}^Mw_{i,k}^t}$$

### 三.代码实现¶

In [1]:
import os
os.chdir('../')
import numpy as np
from ml_models import utils
from ml_models.wrapper_models import DataBinWrapper
%matplotlib inline

"""

"""

class NaiveBayesCluster(object):
    """
    EM-based clustering with a mixture of discrete naive Bayes components.

    Features are first discretized with ``DataBinWrapper``; each of the
    ``n_components`` mixture components is then a naive Bayes model over the
    binned features.  Fitting alternates an E step (posterior responsibilities
    ``W`` via softmax over per-component log joints) and an M step (closed-form
    updates of the mixing weights and per-feature categorical probabilities),
    exactly as derived in the notes above.
    """

    def __init__(self, n_components=1, tol=1e-5, n_iter=100, max_bins=10, verbose=False):
        """
        :param n_components: number of naive Bayes mixture components
        :param tol: stop training once the log-likelihood gain is < tol
        :param n_iter: maximum number of EM iterations
        :param max_bins: maximum number of bins used to discretize each feature
        :param verbose: whether to visualize the training process
        """
        self.n_components = n_components
        self.tol = tol
        self.n_iter = n_iter
        self.verbose = verbose
        # feature discretization wrapper
        self.dbw = DataBinWrapper(max_bins=max_bins)
        # parameters: p_y[k] = log P(y=k); p_x_y[k][j][v] = log P(x_j=v | y=k)
        self.p_y = {}
        self.p_x_y = {}
        # fallback log-probabilities for unseen components / feature values
        self.default_y_prob = None  # default log-prob for y
        self.default_x_prob = {}  # default log-prob for x given y

    def get_log_w(self, x_bins):
        """
        Compute the unnormalized log joint log P(x_i, z_i=k) for every sample
        and component (softmax over components gives the responsibilities).

        :param x_bins: binned samples, shape (n_samples, n_features)
        :return: ndarray of shape (n_samples, n_components)
        """
        W = []
        for x_row in x_bins:
            tmp = []
            for k in range(0, self.n_components):
                llh = self.p_y[k]
                for j, x_ij in enumerate(x_row):
                    llh += self.p_x_y[k][j][x_ij]
                tmp.append(llh)
            W.append(tmp)
        W = np.asarray(W)
        return W

    def fit(self, x):
        n_sample = x.shape[0]
        self.dbw.fit(x)
        x_bins = self.dbw.transform(x)
        # ---- initialize model parameters ----
        self.default_y_prob = np.log(0.5 / self.n_components)  # default p_y
        for y_label in range(0, self.n_components):
            self.p_x_y[y_label] = {}
            self.p_y[y_label] = np.log(1.0 / self.n_components)  # uniform prior for every component
            self.default_x_prob[y_label] = np.log(0.5 / n_sample)  # default p_x_y
            for j in range(0, x_bins.shape[1]):
                self.p_x_y[y_label][j] = {}
                x_j_set = set(x_bins[:, j])
                for x_j in x_j_set:
                    # break symmetry between components: estimate each conditional
                    # as a Laplace-smoothed frequency on a random subsample
                    sample_x_index = np.random.choice(n_sample, n_sample // self.n_components)
                    sample_x_bins = x_bins[sample_x_index]
                    p_x_y = (np.sum(sample_x_bins[:, j] == x_j) + 1) / (
                        sample_x_bins.shape[0] + len(x_j_set))
                    self.p_x_y[y_label][j][x_j] = np.log(p_x_y)
        # ---- E step: responsibilities ----
        W_log = self.get_log_w(x_bins)
        W = utils.softmax(W_log)
        W_gen = np.exp(W_log)
        current_log_loss = np.log(W_gen.sum(axis=1)).sum()  # current log-likelihood
        # ---- EM iterations ----
        current_epoch = 0
        for _ in range(0, self.n_iter):
            if self.verbose is True:
                utils.plot_decision_function(x, self.predict(x), self)
                utils.plt.pause(0.1)
                utils.plt.clf()
            # M step: update model parameters from the responsibilities
            for k in range(0, self.n_components):
                self.p_y[k] = np.log(np.sum(W[:, k]) / n_sample)
                for j in range(0, x_bins.shape[1]):
                    x_j_set = set(x_bins[:, j])
                    for x_j in x_j_set:
                        # 1e-10 guards against log(0) for empty responsibilities
                        self.p_x_y[k][j][x_j] = np.log(
                            1e-10 + np.sum(W[:, k] * (x_bins[:, j] == x_j)) / np.sum(W[:, k]))

            # E step: update responsibilities
            W_log = self.get_log_w(x_bins)
            W = utils.softmax(W_log)
            W_gen = np.exp(W_log)
            # compute the new log-likelihood and check convergence
            new_log_loss = np.log(W_gen.sum(axis=1)).sum()
            if new_log_loss - current_log_loss > self.tol:
                current_log_loss = new_log_loss
                current_epoch += 1
            else:
                print('total epochs:', current_epoch)
                break
        if self.verbose:
            utils.plot_decision_function(x, self.predict(x), self)
            utils.plt.show()

    def predict_proba(self, x):
        """Return the posterior P(z=k | x) for every sample, shape (n_samples, n_components)."""
        x_bins = self.dbw.transform(x)
        rst = []
        for x_row in x_bins:
            tmp = []
            for y_index in range(0, self.n_components):
                try:
                    p_y_log = self.p_y[y_index]
                except KeyError:
                    # component never trained: fall back to the default prior
                    p_y_log = self.default_y_prob
                for i, xij in enumerate(x_row):
                    try:
                        p_y_log += self.p_x_y[y_index][i][xij]
                    except KeyError:
                        # unseen bin value: fall back to the default conditional
                        p_y_log += self.default_x_prob[y_index]
                tmp.append(p_y_log)
            rst.append(tmp)
        return utils.softmax(np.asarray(rst))

    def predict(self, x):
        """Return the most likely component index for every sample."""
        return np.argmax(self.predict_proba(x), axis=1)

In [2]:
# Generate synthetic data.
# NOTE: `sklearn.datasets.samples_generator` was a private module removed in
# scikit-learn 0.24; `make_blobs` is importable from `sklearn.datasets` directly.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.85, random_state=0)
X = X[:, ::-1]  # swap the two feature columns for plotting

In [3]:
# Check the clustering result of the discrete naive Bayes mixture on the blob data.
nbc = NaiveBayesCluster(n_iter=500, tol=1e-5, n_components=4, max_bins=20, verbose=False)
nbc.fit(X)
utils.plot_decision_function(X, y, nbc)

total epochs: 319


### 四.gaussian实现¶

$$p_k(X_i^j)=\frac{1}{\sqrt{2\pi}\sigma_{k,j}}exp(-\frac{(X_i^j-u_{k,j})^2}{2\sigma_{k,j}^2})$$

$$u_{k,j}^{t+1}=\frac{\sum_{i=1}^Mw_{i,k}^tX_i^j}{\sum_{i=1}^Mw_{i,k}^t}\\$$$$\sigma_{k,j}^{t+1}=\sqrt{\frac{\sum_{i=1}^Mw_{i,k}^t(X_i^j-u_{k,j}^{t+1})^2}{\sum_{i=1}^Mw_{i,k}^t}}$$
In [4]:
"""

"""

class GaussianNBCluster(object):
    """
    EM-based clustering with a mixture of Gaussian naive Bayes components.

    Each component models every feature with an independent 1-D Gaussian
    (diagonal-covariance GMM).  ``p_y[k]`` stores the mixing weight as a plain
    probability; ``p_x_y[k][j] = [mean, std]`` stores the per-feature Gaussian
    parameters.  Fitting follows the EM updates derived in the notes above.
    """

    def __init__(self, n_components=1, tol=1e-5, n_iter=100, verbose=False):
        """
        :param n_components: number of naive Bayes mixture components
        :param tol: stop training once the log-likelihood gain is < tol
        :param n_iter: maximum number of EM iterations
        :param verbose: whether to visualize the training process
        """
        self.n_components = n_components
        self.tol = tol
        self.n_iter = n_iter
        self.verbose = verbose

        # parameters: p_y[k] = P(y=k) (probability, NOT log);
        # p_x_y[k][j] = [mean, std] of feature j under component k
        self.p_y = {}
        self.p_x_y = {}
        # fallback log-probability for an untrained component
        self.default_y_prob = None

    def get_w(self, x):
        """
        Compute the unnormalized joint P(x_i, z_i=k) for every sample/component.

        :param x: samples, shape (n_samples, n_features)
        :return: ndarray of shape (n_samples, n_components)
        """
        W = []
        for k in range(0, self.n_components):
            tmp = []
            for j in range(0, x.shape[1]):
                tmp.append(np.log(utils.gaussian_1d(x[:, j], self.p_x_y[k][j][0], self.p_x_y[k][j][1])))
            W.append(np.sum(tmp, axis=0) + np.log(self.p_y[k]))
        W = np.asarray(W)
        return np.exp(W.T)

    def fit(self, x):
        n_sample = x.shape[0]

        # ---- initialize model parameters ----
        self.default_y_prob = np.log(0.5 / self.n_components)  # default log p_y
        for y_label in range(0, self.n_components):
            self.p_x_y[y_label] = {}
            self.p_y[y_label] = 1.0 / self.n_components  # uniform mixing weights
            for j in range(0, x.shape[1]):
                # random jitter on the mean breaks symmetry between components
                u = np.mean(x[:, j], axis=0) + np.random.random() * (x[:, j].max() + x[:, j].min()) / 2
                sigma = np.std(x[:, j])
                self.p_x_y[y_label][j] = [u, sigma]

        # ---- E step: responsibilities ----
        W = self.get_w(x)
        current_log_loss = np.log(W.sum(axis=1)).sum()  # current log-likelihood
        W = W / np.sum(W, axis=1, keepdims=True)
        # ---- EM iterations ----
        current_epoch = 0
        for _ in range(0, self.n_iter):
            if self.verbose is True:
                utils.plot_decision_function(x, self.predict(x), self)
                utils.plt.pause(0.1)
                utils.plt.clf()
            # M step: update mixing weights and Gaussian parameters
            for k in range(0, self.n_components):
                self.p_y[k] = np.sum(W[:, k]) / n_sample
                for j in range(0, x.shape[1]):
                    x_j = x[:, j]
                    u = np.sum(x_j * W[:, k]) / np.sum(W[:, k])
                    sigma = np.sqrt(np.sum((x_j - u) * (x_j - u) * W[:, k]) / np.sum(W[:, k]))
                    self.p_x_y[k][j] = [u, sigma]

            # E step: update responsibilities
            W = self.get_w(x)
            # compute the new log-likelihood and check convergence
            new_log_loss = np.log(W.sum(axis=1)).sum()
            W = W / np.sum(W, axis=1, keepdims=True)
            if new_log_loss - current_log_loss > self.tol:
                current_log_loss = new_log_loss
                current_epoch += 1
            else:
                print('total epochs:', current_epoch)
                break
        if self.verbose:
            utils.plot_decision_function(x, self.predict(x), self)
            utils.plt.show()

    def predict_proba(self, x):
        """Return the posterior P(z=k | x) for every sample, shape (n_samples, n_components)."""
        rst = []
        for x_row in x:
            tmp = []
            for y_index in range(0, self.n_components):
                try:
                    # BUGFIX: p_y stores a plain probability (see get_w / fit),
                    # so it must be logged before accumulating log-densities
                    p_y_log = np.log(self.p_y[y_index])
                except KeyError:
                    p_y_log = self.default_y_prob  # already a log-probability
                for i, xij in enumerate(x_row):
                    # 1e-12 guards against log(0) far from every component
                    p_y_log += np.log(
                        1e-12 + utils.gaussian_1d(xij, self.p_x_y[y_index][i][0], self.p_x_y[y_index][i][1]))
                tmp.append(p_y_log)
            rst.append(tmp)
        return utils.softmax(np.asarray(rst))

    def predict(self, x):
        """Return the most likely component index for every sample."""
        return np.argmax(self.predict_proba(x), axis=1)

In [5]:
# Check the clustering result of the Gaussian naive Bayes mixture on the blob data.
gnb = GaussianNBCluster(n_iter=200, tol=1e-5, n_components=4,verbose=False)
gnb.fit(X)
utils.plot_decision_function(X, y, gnb)

total epochs: 59

In [ ]: