### 一.分类原理¶

In [1]:
import os
os.chdir('../')
import numpy as np
%matplotlib inline
from ml_models.em import GMMCluster

"""

"""

class GMMClassifier(object):
def __init__(self, cluster_each_class=1, n_iter=100, tol=1e-3, shr_cov=False):
"""
:param cluster_each_class:每个类需要几个高斯模型去拟合，默认1个
:param n_iter:迭代次数
:param tol: -log likehold增益<tol时，停止训练
:param shr_cov:是否共享协方差矩阵
"""
self.cluster_each_class = cluster_each_class
self.n_iter = n_iter
self.tol = tol
self.shr_cov = shr_cov
self.models = []

def fit(self, X, y):
for y_index in range(y.max() + 1):
new_X = X[y == y_index]
cluster = GMMCluster(n_components=self.cluster_each_class, tol=self.tol, n_iter=self.n_iter)
cluster.fit(new_X)
self.models.append(cluster)
if self.shr_cov:
# 获取所有的协方差矩阵
sigmas = []
for model in self.models:
params = model.params
for param in params:
sigmas.append(param[2])
# 求平均
ave_sigma = np.mean(sigmas, axis=0)
# 更新
for model in self.models:
params = model.params
for param in params:
param[2] = ave_sigma

def predict_proba(self, X):
W = np.asarray([model.predict_sample_generate_proba(X) for model in self.models]).T
W = W / np.sum(W, axis=1, keepdims=True)
return W

def predict(self, X):
return np.argmax(self.predict_proba(X), axis=1)

In [2]:
#造数据
from sklearn.datasets.samples_generator import make_blobs
from ml_models import utils

X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.85, random_state=0)
X = X[:, ::-1]

D:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:144: FutureWarning: The sklearn.datasets.samples_generator module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.datasets. Anything that cannot be imported from sklearn.datasets is now part of the private API.
warnings.warn(message, FutureWarning)

In [3]:
#训练模型并可视化
gmm = GMMClassifier(n_iter=100)
gmm.fit(X, y)
utils.plot_decision_function(X, y, gmm)


In [4]:
gmm = GMMClassifier(n_iter=100,shr_cov=True)
gmm.fit(X, y)
utils.plot_decision_function(X, y, gmm)


### 二.协方差矩阵共享的GMM模型与LogisticRegression的关系¶

$$p(x\mid C_1)=N(x\mid u_1,\Sigma)\\ p(x\mid C_2)=N(x\mid u_2,\Sigma)\\ p(C_1)+p(C_2)=1$$

$$p(x)=p(C_1)p(x\mid C_1)+p(C_2)p(x\mid C_2)$$

$$p(C_1\mid x)=\frac{p(C_1)p(x\mid C_1)}{p(x)}\\ =\frac{p(C_1)p(x\mid C_1)}{p(C_1)p(x\mid C_1)+p(C_2)p(x\mid C_2)}\\ =\frac{1}{1+exp(-Z)}$$

$$Z=log\frac{p(C_1)p(x\mid C_1)}{p(C_2)p(x\mid C_2)}\\ =log\frac{p(x\mid C_1)}{p(x\mid C_2)}+log\frac{p(C_1)}{p(C_2)}\\ =....省略....\\ =(u_1-u_2)^T\Sigma^{-1}x-\frac{1}{2}u_1^T\Sigma^{-1}u_1+\frac{1}{2}u_2^T\Sigma^{-1}u_2+log\frac{N_1}{N_2}(N_1,N_2表示1,2类的样本量\frac{p(C_1)}{p(C_2)}=\frac{N_1}{N_2})$$

$$w^T=(u_1-u_2)^T\Sigma^{-1}\\ b=-\frac{1}{2}u_1^T\Sigma^{-1}u_1+\frac{1}{2}u_2^T\Sigma^{-1}u_2+log\frac{N_1}{N_2}$$

$$Z=w^Tx+b$$

$$p(C_1\mid x)=\frac{1}{1+exp(-w^Tx-b)}$$

In [ ]: