### 0.补充¶

#### 高斯-Wishart分布¶

$$\mathcal{W}(\Lambda\mid W,\mathcal{v})=B|\Lambda|^{\frac{\mathcal{v}-D-1}{2}}exp(-\frac{1}{2}Tr(W^{-1}\Lambda))$$

$$B(W,\mathcal{v})=|W|^{-\frac{\mathcal{v}}{2}}(2^{\frac{\mathcal {v}D}{2}}\pi^{\frac{D(D-1)}{4}}\prod_{i=1}^D\Gamma(\frac{\mathcal{v}+1-i}{2}))^{-1}$$

$$p(\mu,\Lambda\mid\mu_0,\beta,W,\mathcal{v})=N(\mu\mid\mu_0,(\beta\Lambda)^{-1})\mathcal{W}(\Lambda\mid W,\mathcal{v})$$

#### 学生t分布¶

$$p(x\mid \mu,a,b)=\int_0^\infty N(x\mid\mu,\tau^{-1})Gam(\tau\mid a,b)d\tau\\ =\int_0^\infty \frac{b^ae^{(-b\tau)}\tau^{a-1}}{\Gamma(a)}(\frac{\tau}{2\pi})^{\frac{1}{2}}exp(-\frac{\tau}{2}(x-\mu)^2)d\tau\\ =\frac{b^a}{\Gamma(a)}(\frac{1}{2\pi})^{\frac{1}{2}}\int_0^\infty(\frac{z}{b+\frac{(x-\mu)^2}{2}})^{a-\frac{1}{2}}e^{-z}[b+\frac{(x-\mu)^2}{2}]^{-1}dz（令z=\tau(b+\frac{(x-\mu)^2}{2})）\\ =\frac{b^a}{\Gamma(a)}(\frac{1}{2\pi})^{\frac{1}{2}}(b+\frac{(x-\mu)^2}{2})^{-a-\frac{1}{2}}\int_0^\infty z^{(a+\frac{1}{2})-1}e^{-z}dz\\ =\frac{b^a}{\Gamma(a)}(\frac{1}{2\pi})^{\frac{1}{2}}(b+\frac{(x-\mu)^2}{2})^{-a-\frac{1}{2}}\Gamma(a+\frac{1}{2})$$

$$St(x\mid\mu,\lambda,\mathcal{v})=\frac{\Gamma(\frac{\mathcal{v}}{2}+\frac{1}{2})}{\Gamma(\frac{\mathcal{v}}{2})}(\frac{\lambda}{\pi\mathcal{v}})^{\frac{1}{2}}(1+\frac{\lambda(x-\mu)^2}{\mathcal{v}})^{-\frac{\mathcal{v}}{2}-\frac{1}{2}}$$

$$St(x\mid\mu,\Lambda,\mathcal{v})=\frac{\Gamma(\frac{D}{2}+\frac{\mathcal{v}}{2})}{\Gamma(\frac{\mathcal{v}}{2})}\frac{|\Lambda|^{\frac{1}{2}}}{(\pi\mathcal{v})^{\frac{D}{2}}}[1+\frac{(x-\mu)^T\Lambda(x-\mu)}{\mathcal{v}}]^{-\frac{D}{2}-\frac{\mathcal{v}}{2}}$$

（1）$E[x]=\mu,\mathcal{v}>1$；

（2）$cov[x]=\frac{\mathcal{v}}{\mathcal{v}-2}\Lambda^{-1},\mathcal{v}>2$；

（3）$mode[x]=\mu$

### 一.联合概率分布¶

$$p(Z\mid\pi)=\prod_{n=1}^N\prod_{k=1}^K\pi_k^{z_{nk}}$$

$$p(X\mid Z,\mu,\Lambda)=\prod_{n=1}^N\prod_{k=1}^KN(x_n\mid\mu_k,\Lambda_k^{-1})^{z_{nk}}$$

$$p(\pi)=Dir(\pi\mid\alpha_0)=C(\alpha_0)\prod_{k=1}^K\pi_k^{\alpha_0-1}$$

$$p(\mu,\Lambda)=p(\mu\mid\Lambda)p(\Lambda)\\ =\prod_{k=1}^KN(\mu_k\mid m_0,(\beta_0\Lambda_k)^{-1})\mathcal{W}(\Lambda_k\mid W_0,\mathcal{v}_0)$$

$$p(X,Z,\pi,\mu,\Lambda)=p(X\mid Z,\mu,\Lambda)p(Z\mid\pi)p(\pi)p(\mu\mid\Lambda)p(\Lambda)$$

### 二.变分分布¶

$$q(Z,\pi,\mu,\Lambda)\rightarrow p(Z,\pi,\mu,\Lambda\mid X)$$

$$q(Z,\pi,\mu,\Lambda)=q(Z)q(\pi,\mu,\Lambda)$$

$$ln\ q^*(Z)=E_{\pi,\mu,\Lambda}[ln\ p(X,Z,\pi,\mu,\Lambda)]+const$$

$$ln\ q^*(Z)=E_\pi[ln\ p(Z\mid\pi)]+E_{\mu,\Lambda}[ln\ p(X\mid Z,\mu,\Lambda)]+const$$

$$ln\ q^*(Z)=\sum_{n=1}^N\sum_{k=1}^Kz_{nk}ln\ \rho_{nk}+const$$

$$ln\ \rho_{nk}=E[ln\ \pi_k]+\frac{1}{2}E[ln\ |\Lambda_k|]-\frac{D}{2}ln(2\pi)-\frac{1}{2}E_{\mu_k,\Lambda_k}[(x_n-\mu_k)^T\Lambda_k(x_n-\mu_k)]$$

$$q^*(Z)\propto\prod_{n=1}^N\prod_{k=1}^K\rho_{nk}^{z_{nk}}$$

$$q^*(Z)=\prod_{n=1}^N\prod_{k=1}^Kr_{nk}^{z_{nk}}\\ r_{nk}=\frac{\rho_{nk}}{\sum_{j=1}^K\rho_{nj}}$$

$$E[z_{nk}]=r_{nk}$$

$$N_k=\sum_{n=1}^Nr_{nk}\\ \bar{x}_k=\frac{1}{N_k}\sum_{n=1}^Nr_{nk}x_n\\ S_k=\frac{1}{N_k}\sum_{n=1}^Nr_{nk}(x_n-\bar{x}_k)(x_n-\bar{x}_k)^T$$

$$ln\ q^*(\pi,\mu,\Lambda)=ln\ p(\pi)+\sum_{k=1}^Kln\ p(\mu_k,\Lambda_k)+E_Z[ln\ p(Z\mid\pi)]+\sum_{k=1}^K\sum_{n=1}^NE[z_{nk}]ln\ N(x_n\mid\mu_k,\Lambda_k^{-1})+const$$

$$q(\pi,\mu,\Lambda)=q(\pi)\prod_{k=1}^Kq(\mu_k,\Lambda_k)$$

$$ln\ q^*(\pi)=(\alpha_0-1)\sum_{k=1}^Kln\ \pi_k+\sum_{k=1}^K\sum_{n=1}^Nr_{nk}ln\ \pi_k+const$$

$$q^*(\pi)=Dir(\pi\mid\alpha),\alpha_k=\alpha_0+N_k$$

$$q^*(\mu_k,\Lambda_k)=N(\mu_k\mid m_k,(\beta_k\Lambda_k)^{-1})\mathcal{W}(\Lambda_k\mid W_k,\mathcal{v}_k)$$

$$\beta_k=\beta_0+N_k\\ m_k=\frac{1}{\beta_k}(\beta_0m_0+N_k\bar{x}_k)\\ W_k^{-1}=W_0^{-1}+N_kS_k+\frac{\beta_0N_k}{\beta_0+N_k}(\bar{x}_k-m_0)(\bar{x}_k-m_0)^T\\ \mathcal{v}_k=\mathcal{v}_0+N_k$$

### 三.迭代更新¶

$$ln\ \rho_{nk}=E[ln\ \pi_k]+\frac{1}{2}E[ln\ |\Lambda_k|]-\frac{D}{2}ln(2\pi)-\frac{1}{2}E_{\mu_k,\Lambda_k}[(x_n-\mu_k)^T\Lambda_k(x_n-\mu_k)]$$

$$E[ln\ |\Lambda_k|]=\sum_{i=1}^D\psi(\frac{\mathcal{v}_k+1-i}{2})+Dln2+ln|W_k|\\ E[ln\ \pi_k]=\psi(\alpha_k)-\psi(\hat{\alpha})$$

$$\psi(a)=\frac{d}{da}ln\Gamma(a)$$

$$E_{\mu_k,\Lambda_k}[(x_n-\mu_k)^T\Lambda_k(x_n-\mu_k)]=D\beta_k^{-1}+\mathcal{v}_k(x_n-m_k)^TW_k(x_n-m_k)$$

### 四.变分分布的应用：概率密度预测¶

$$p(\hat{x}\mid X)=\sum_{\hat{z}}\int\int\int p(\hat{x}\mid \hat{z},\mu,\Lambda)p(\hat{z}\mid \pi)p(\pi,\mu,\Lambda\mid X)d\pi d\mu d\Lambda\\ =\sum_{k=1}^K\int\int\int \pi_kN(\hat{x}\mid \mu_k,\Lambda_k^{-1})p(\pi,\mu,\Lambda\mid X)d\pi d\mu d\Lambda（消去\hat{x}）\\ \simeq \sum_{k=1}^K\int\int\int \pi_kN(\hat{x}\mid \mu_k,\Lambda_k^{-1})q(\pi)q(\mu_k,\Lambda_k)d\pi d\mu_k d\Lambda_k（变分分布替换后验分布）\\ =\frac{1}{\hat{\alpha}}\sum_{k=1}^K\alpha_kSt(\hat{x}\mid m_k,L_k,\mathcal{v}_k+1-D)$$

$$L_k=\frac{(\mathcal{v}_k+1-D)\beta_k}{1+\beta_k}W_k$$

### 五.代码实现¶

In [1]:
import numpy as np
from scipy import special
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
"""

"""

def dirichlet(u, alpha):
    """
    Dirichlet distribution density.

    :param u: random variable — a point on the simplex, same length as alpha
    :param alpha: concentration hyper-parameters (positive values)
    :return: probability density at u
    """
    u = np.asarray(u)
    alpha = np.asarray(alpha)
    # Normalization constant B(alpha) = Gamma(sum(alpha)) / prod(Gamma(alpha_k)).
    # (The original looped and divided by gamma(np.sum(alp)) element-wise; the
    # np.sum over a scalar was a no-op and the loop is replaced by vectorized ops.)
    beta = special.gamma(np.sum(alpha)) / np.prod(special.gamma(alpha))
    # Density: B(alpha) * prod_k u_k^{alpha_k - 1}
    return beta * np.prod(np.power(u, alpha - 1))

def wishart(Lambda, W, v):
"""
wishart分布
:param Lambda:随机变量
:param W:超参数
:param v:超参数
:return:
"""
# 维度
D = W.shape[0]
# 先计算归一化因子
B = np.power(np.linalg.det(W), -1 * v / 2)
B_ = np.power(2.0, v * D / 2) * np.power(np.pi, D * (D - 1) / 4)
for i in range(1, D + 1):
B_ *= special.gamma((v + 1 - i) / 2)
B = B / B_
# 计算剩余部分
rst = B * np.power(np.linalg.det(Lambda), (v - D - 1) / 2)
rst *= np.exp(-0.5 * np.trace(np.linalg.inv(W) @ Lambda))
return rst

def St(X, mu, Lambda, v):
    """
    Multivariate Student's t density St(x | mu, Lambda, v), evaluated row-wise.

    :param X: samples, shape (n_sample, D)
    :param mu: location parameter, shape (D,)
    :param Lambda: precision matrix, shape (D, D)
    :param v: degrees of freedom
    :return: densities, shape (n_sample,)
    """
    n_sample, D = X.shape
    diff = X - mu
    # Quadratic form (x - mu)^T Lambda (x - mu) for every row of X
    quad = np.sum(diff @ Lambda * diff, axis=1)
    coef = special.gamma(D / 2 + v / 2) / special.gamma(v / 2)
    coef *= np.power(np.linalg.det(Lambda), 0.5) / np.power(np.pi * v, D / 2)
    return coef * np.power(1 + quad / v, -1.0 * D / 2 - v / 2)


In [3]:
import os
os.chdir('../')
from ml_models import utils

"""

"""

class GMMCluster(object):
    def __init__(self, n_components=1, tol=1e-6, n_iter=100, prior_dirichlet_alpha=None, prior_gaussian_mean=None,
                 prior_gaussian_beta=None, prior_wishart_w=None, prior_wishart_v=None,
                 verbose=False):
        """
        Variational-inference (VI) implementation of a Gaussian mixture model.

        :param n_components: number of Gaussian mixture components
        :param tol: stop training when the mean absolute change of the sample
                    generation probability is < tol
        :param n_iter: maximum number of iterations
        :param prior_dirichlet_alpha: hyper-parameter of the prior Dirichlet distribution
        :param prior_gaussian_mean: mean of the prior Gaussian distribution
        :param prior_gaussian_beta: beta of the prior Gaussian, i.e. the coefficient on the precision
        :param prior_wishart_w: W of the prior Wishart distribution
        :param prior_wishart_v: v of the prior Wishart distribution
        :param verbose: whether to visualize the training process
        """
        self.n_components = n_components
        self.tol = tol
        self.n_iter = n_iter
        self.verbose = verbose
        self.prior_dirichlet_alpha = prior_dirichlet_alpha
        self.prior_gaussian_mean = prior_gaussian_mean
        self.prior_gaussian_beta = prior_gaussian_beta
        self.prior_wishart_w = prior_wishart_w
        self.prior_wishart_v = prior_wishart_v
        # Fill defaults.  The *_0 baselines (original prior values) are needed by
        # the update equations, so they must be recorded in BOTH branches; the
        # original code only set them when the caller did not supply a prior,
        # which made _update_single_step crash for user-supplied priors.
        if self.prior_dirichlet_alpha is None:
            self.prior_dirichlet_alpha_0 = np.asarray([1] * n_components)
            self.prior_dirichlet_alpha = self.prior_dirichlet_alpha_0 + np.random.random(size=n_components) * 0.1
        else:
            self.prior_dirichlet_alpha = np.asarray(self.prior_dirichlet_alpha, dtype=float)
            self.prior_dirichlet_alpha_0 = self.prior_dirichlet_alpha.copy()
        if self.prior_gaussian_beta is None:
            self.prior_gaussian_beta_0 = np.asarray([1] * n_components)
            self.prior_gaussian_beta = self.prior_gaussian_beta_0 + np.random.random(size=n_components) * 0.1
        else:
            self.prior_gaussian_beta = np.asarray(self.prior_gaussian_beta, dtype=float)
            self.prior_gaussian_beta_0 = self.prior_gaussian_beta.copy()

        # Gaussian model parameters (kept for interface compatibility)
        self.params = []
        # data dimensionality, set in fit()
        self.D = None

    def _init_params(self):
        """
        Initialize the remaining parameters — those that depend on the data dimension D.
        """
        if self.prior_gaussian_mean is None:
            self.prior_gaussian_mean_0 = np.zeros(shape=(self.n_components, self.D))
            self.prior_gaussian_mean = self.prior_gaussian_mean_0 + np.random.random(
                size=(self.n_components, self.D)) * 0.1
        else:
            self.prior_gaussian_mean = np.asarray(self.prior_gaussian_mean, dtype=float)
            self.prior_gaussian_mean_0 = self.prior_gaussian_mean.copy()
        if self.prior_wishart_w is None:
            self.prior_wishart_w_0 = [np.identity(self.D)] * self.n_components  # identity matrices
            self.prior_wishart_w = []
            for w in self.prior_wishart_w_0:
                self.prior_wishart_w.append(w + np.random.random(size=(self.D, self.D)) * 0.1)
        else:
            self.prior_wishart_w_0 = [np.asarray(w, dtype=float).copy() for w in self.prior_wishart_w]

        if self.prior_wishart_v is None:
            self.prior_wishart_v_0 = np.asarray([self.D] * self.n_components)
            self.prior_wishart_v = self.prior_wishart_v_0 + np.random.random(size=self.n_components) * 0.1
        else:
            self.prior_wishart_v = np.asarray(self.prior_wishart_v, dtype=float)
            self.prior_wishart_v_0 = self.prior_wishart_v.copy()

    def _update_single_step(self, X):
        """One coordinate-ascent update of the variational distributions."""
        # First compute the three expectations.
        # E[ln |Lambda_k|] = sum_i psi((v_k + 1 - i)/2) + D ln 2 + ln |W_k|
        E_ln_Lambda = []
        for k in range(0, self.n_components):
            value = self.D * np.log(2) + np.log(np.linalg.det(self.prior_wishart_w[k]))
            for i in range(1, self.D + 1):
                value += utils.special.digamma((self.prior_wishart_v[k] + 1 - i) / 2)
            E_ln_Lambda.append(value)
        # E[ln pi_k] = psi(alpha_k) - psi(hat_alpha)
        E_ln_pi = []
        hat_alpha = np.sum(self.prior_dirichlet_alpha)
        for k in range(0, self.n_components):
            E_ln_pi.append(utils.special.digamma(self.prior_dirichlet_alpha[k]) - utils.special.digamma(hat_alpha))
        # E[(x_n - mu_k)^T Lambda_k (x_n - mu_k)] = D / beta_k + v_k (x_n - m_k)^T W_k (x_n - m_k)
        E_mu_Lambda = []
        for k in range(0, self.n_components):
            value = self.D * (1.0 / self.prior_gaussian_beta[k]) + np.sum(self.prior_wishart_v[k] * (
                X - self.prior_gaussian_mean[k]) @ self.prior_wishart_w[k] * (X - self.prior_gaussian_mean[k]), axis=1)
            E_mu_Lambda.append(value)
        # Then compute the responsibilities r_nk
        rho_n_k = []
        for k in range(0, self.n_components):
            value = np.exp(E_ln_pi[k] + 0.5 * E_ln_Lambda[k] - self.D / 2.0 * np.log(2 * np.pi) - 0.5 * E_mu_Lambda[k])
            rho_n_k.append(value)
        rho_n_k = np.asarray(rho_n_k).T
        r_n_k = rho_n_k / np.sum(np.asarray(rho_n_k), axis=1, keepdims=True)

        # Then compute the statistics N_k, \bar{x}_k, S_k
        N_k = np.sum(r_n_k, axis=0)
        x_k = []
        for k in range(0, self.n_components):
            x_k.append(np.sum(r_n_k[:, [k]] * X, axis=0) / N_k[k])
        S_k = []
        for k in range(0, self.n_components):
            # S_k = (1/N_k) sum_n r_nk (x_n - x_k)(x_n - x_k)^T.  Weight the
            # outer product by r_nk exactly once: the original multiplied both
            # factors by r_nk, effectively weighting by r_nk ** 2.
            diff = X - x_k[k]
            S_k.append(np.transpose(diff) @ (r_n_k[:, [k]] * diff) / N_k[k])

        # Finally update the variational parameters
        for k in range(0, self.n_components):
            self.prior_dirichlet_alpha[k] = self.prior_dirichlet_alpha_0[k] + N_k[k]

            self.prior_gaussian_beta[k] = self.prior_gaussian_beta_0[k] + N_k[k]

            self.prior_gaussian_mean[k] = 1.0 / self.prior_gaussian_beta[k] * (
                self.prior_gaussian_beta_0[k] * self.prior_gaussian_mean_0[k] + N_k[k] * x_k[k])

            # (x_k - m_0)(x_k - m_0)^T must be the D x D outer product; with 1-D
            # arrays `a.T @ a` is the scalar inner product (np.transpose is a
            # no-op on 1-D), so use np.outer explicitly.
            mean_diff = x_k[k] - self.prior_gaussian_mean_0[k]
            W_k_inv = np.linalg.inv(self.prior_wishart_w_0[k]) + N_k[k] * S_k[k] + (
                self.prior_gaussian_beta_0[k] * N_k[k]) / (
                self.prior_gaussian_beta_0[k] + N_k[k]) * np.outer(mean_diff, mean_diff)

            self.prior_wishart_w[k] = np.linalg.inv(W_k_inv)

            self.prior_wishart_v[k] = self.prior_wishart_v_0[k] + N_k[k]

    def fit(self, X):
        """Run coordinate-ascent VI until convergence or n_iter is reached."""
        n_sample, n_feature = X.shape
        self.D = n_feature
        self._init_params()
        last_rst = np.zeros(n_sample)
        # iterative training
        for _ in range(0, self.n_iter):
            self._update_single_step(X)
            if self.verbose:
                utils.plot_contourf(X, lambda x: self.predict_sample_generate_proba(x), lines=5)
                utils.plt.pause(0.1)
                utils.plt.clf()
            # stop once the per-sample generation probabilities barely change
            current_rst = self.predict_sample_generate_proba(X)
            if np.mean(np.abs(current_rst - last_rst)) < self.tol:
                break
            last_rst = current_rst

        if self.verbose:
            utils.plot_contourf(X, lambda x: self.predict_sample_generate_proba(x), lines=5)
            utils.plt.show()

    def predict_proba(self, X):
        """Posterior probability of each sample under each Student-t component."""
        _, D = X.shape
        hat_alpha = np.sum(self.prior_dirichlet_alpha)
        W = np.asarray([utils.St(X, self.prior_gaussian_mean[k],
                                 (self.prior_wishart_v[k] + 1 - D) / (1 + self.prior_gaussian_beta[k]) *
                                 self.prior_wishart_w[k] * self.prior_gaussian_beta[k],
                                 self.prior_wishart_v[k] + 1 - D) * self.prior_dirichlet_alpha[
                            k] / hat_alpha for k in range(0, self.n_components)]).T
        W = W / np.sum(W, axis=1, keepdims=True)
        return W

    def predict(self, X):
        """Index of the component most likely to have generated each sample."""
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_sample_generate_proba(self, X):
        """Predictive density p(x_hat | X): a weighted mixture of Student-t densities."""
        _, D = X.shape
        hat_alpha = np.sum(self.prior_dirichlet_alpha)
        W = np.asarray([utils.St(X, self.prior_gaussian_mean[k],
                                 (self.prior_wishart_v[k] + 1 - D) / (1 + self.prior_gaussian_beta[k]) *
                                 self.prior_wishart_w[k] * self.prior_gaussian_beta[k],
                                 self.prior_wishart_v[k] + 1 - D) * self.prior_dirichlet_alpha[
                            k] / hat_alpha for k in range(0, self.n_components)]).T
        return np.sum(W, axis=1)


In [4]:
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# import make_blobs from the public `sklearn.datasets` location instead.
from sklearn.datasets import make_blobs

# 400 samples from 4 clusters; swap the two feature columns for plotting
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.55, random_state=0)
X = X[:, ::-1]

In [5]:
# Train; set verbose=True to watch the convergence process
gmm = GMMCluster(verbose=False, n_iter=100, n_components=4)
gmm.fit(X)

In [6]:
# Visualize the result (contours of the sample generation probability)
utils.plot_contourf(X,gmm.predict_sample_generate_proba,lines=5)

D:\app\Anaconda3\lib\site-packages\matplotlib\contour.py:967: UserWarning: The following kwargs were not used by contour: 'linewidth'
s)

In [7]:
# Visualize the classification boundary
utils.plot_decision_function(X,gmm.predict(X),gmm)


In [8]:
# Fit again with a much larger number of components (20) and view the density
gmm = GMMCluster(verbose=False, n_iter=100, n_components=20)
gmm.fit(X)
utils.plot_contourf(X,gmm.predict_sample_generate_proba,lines=5)

D:\app\Anaconda3\lib\site-packages\matplotlib\contour.py:967: UserWarning: The following kwargs were not used by contour: 'linewidth'
s)


### 六.总结一下¶

In [ ]: