### 一.证据近似的理论推导¶

$$p(\hat{t}\mid t)=\int\int\int p(\hat{t}\mid w,\beta)p(w\mid t,\alpha,\beta)p(\alpha,\beta\mid t)\ dw\ d\alpha\ d\beta$$

$$p(\hat{t}\mid t)\simeq p(\hat{t}\mid t,\hat{\alpha},\hat{\beta})=\int p(\hat{t}\mid w,\hat{\beta})p(w\mid t,\hat{\alpha},\hat{\beta})dw\int\int p(\alpha,\beta\mid t)\ d\alpha\ d\beta=\int p(\hat{t}\mid w,\hat{\beta})p(w\mid t,\hat{\alpha},\hat{\beta})dw$$

$$p(\alpha,\beta \mid t)\propto p(t\mid \alpha,\beta)p(\alpha,\beta)$$

### 二.求证据函数¶

$$p(t\mid\alpha,\beta)=\int\ p(t\mid w,\beta)p(w\mid\alpha)dw\\ =\left(\frac{\beta}{2\pi}\right)^{\frac{N}{2}}\left(\frac{\alpha}{2\pi}\right)^{\frac{M}{2}}\int exp\{-E(w)\}dw$$

$$E(w)=\beta E_D(w)+\alpha E_W(w)\\ =\frac{\beta}{2}\left\|t-\Phi w\right\|^2+\frac{\alpha}{2}w^Tw$$

$$E(w)=E(m_N)+\frac{1}{2}(w-m_N)^TA(w-m_N)$$

$$A=\alpha I+\beta\Phi^T\Phi\\ m_N=\beta A^{-1}\Phi^Tt\\ E(m_N)=\frac{\beta}{2}\left\|t-\Phi m_N\right\|^2+\frac{\alpha}{2}m_N^Tm_N$$

$$\int exp\{-E(w)\}dw\\ =exp\{-E(m_N)\}\int exp\left\{-\frac{1}{2}(w-m_N)^TA(w-m_N)\right\}dw\\ =exp\left\{-E(m_N)\right\}(2\pi)^{\frac{M}{2}}\left|A\right|^{-\frac{1}{2}}$$

$$ln\ p(t\mid\alpha,\beta)=\frac{M}{2}ln\ \alpha+\frac{N}{2}ln\ \beta-E(m_N)-\frac{1}{2}ln\ |A|-\frac{N}{2}ln(2\pi)$$

### 三.极大化证据函数¶

$$(\beta\Phi^T\Phi)\mu_i=\lambda_i\mu_i,i=1,2,...,M$$

$$\frac{d}{d\alpha}\ln|A|=\frac{d}{d\alpha}\ln\prod_i(\lambda_i+\alpha)=\frac{d}{d\alpha}\sum_i \ln(\lambda_i+\alpha)=\sum_i\frac{1}{\lambda_i+\alpha}$$

$$0=\frac{M}{2\alpha}-\frac{1}{2}m_N^Tm_N-\frac{1}{2}\sum_i\frac{1}{\lambda_i+\alpha}$$

$$\gamma=\sum_i\frac{\lambda_i}{\lambda_i+\alpha}$$

$$\alpha=\frac{\gamma}{m_N^Tm_N}$$

$$\frac{d}{d\beta}ln\ |A|=\frac{d}{d\beta}\sum_i ln(\lambda_i+\alpha)=\frac{1}{\beta}\sum_i\frac{\lambda_i}{\lambda_i+\alpha}=\frac{\gamma}{\beta}$$

$$0=\frac{N}{2\beta}-\frac{1}{2}\sum_{n=1}^N\left\{t_n-m_N^T\phi(x_n)\right\}^2-\frac{\gamma}{2\beta}$$

$$\frac{1}{\beta}=\frac{1}{N-\gamma}\sum_{n=1}^N\left\{t_n-m_N^T\phi(x_n)\right\}^2$$

### 四.代码实现¶

$$\alpha=\frac{\gamma}{m_N^Tm_N}\\ \frac{1}{\beta}=\frac{1}{N-\gamma}\sum_{n=1}^N\left\{t_n-m_N^T\phi(x_n)\right\}^2$$
In [1]:
"""
Bayesian linear regression fitted with the evidence approximation
(empirical Bayes), following PRML chapter 3: alpha and beta are
re-estimated by maximizing the marginal likelihood.
"""
import numpy as np
import matplotlib.pyplot as plt

class LinearRegression(object):
    """
    Bayesian linear regression with evidence-approximation (empirical Bayes)
    updates of the precision hyper-parameters alpha and beta (PRML ch. 3).
    """

    def __init__(self, basis_func=None, tol=1e-7, epochs=100, normalized=True):
        """
        :param basis_func: list of basis-function names; supported values are
                           rbf, sigmoid, poly_{num} ({num} = highest polynomial
                           degree), linear, fm (pairwise feature crosses).
                           None defaults to ['linear'].
        :param tol: stop iterating once the mean absolute change of w between
                    two consecutive iterations drops below tol
        :param epochs: maximum number of iterations
        :param normalized: whether to standardize the features before fitting
        """
        if basis_func is None:
            self.basis_func = ['linear']
        else:
            self.basis_func = basis_func
        self.tol = tol
        self.epochs = epochs
        self.normalized = normalized
        # feature mean / std recorded during fit and reused in predict
        self.feature_mean = None
        self.feature_std = None
        # learned weights m_N (column vector; first entry is the bias)
        self.w = None

    def _map_basis(self, X):
        """
        Apply the configured basis functions to X and concatenate the
        transformed columns.

        :param X: 2-D array (n_sample, n_feature)
        :return: 2-D array with all basis expansions stacked column-wise
        """
        n_sample, n_feature = X.shape
        x_list = []
        for basis_func in self.basis_func:
            if basis_func == "linear":
                x_list.append(X)
            elif basis_func == "rbf":
                x_list.append(np.exp(-0.5 * X * X))
            elif basis_func == "sigmoid":
                x_list.append(1 / (1 + np.exp(-1 * X)))
            elif basis_func.startswith("poly"):
                degree = int(basis_func.split("_")[1])
                # `power`, not `pow`: avoid shadowing the builtin
                for power in range(1, degree + 1):
                    x_list.append(np.power(X, power))
            elif basis_func == 'fm':
                # all pairwise products x_i * x_j with i < j
                X_fm = np.zeros(shape=(n_sample, int(n_feature * (n_feature - 1) / 2)))
                c = 0
                for i in range(0, n_feature - 1):
                    for j in range(i + 1, n_feature):
                        X_fm[:, c] = X[:, i] * X[:, j]
                        c += 1
                x_list.append(X_fm)
        return np.concatenate(x_list, axis=1)

    def fit(self, X, y):
        """
        Alternate the posterior-mean solve for w with the evidence-maximization
        updates of alpha and beta until convergence.

        :param X: 2-D array of raw features
        :param y: targets, shape (n_sample,) or (n_sample, 1)
        :return: final (alpha, beta)
        """
        if self.normalized:
            self.feature_mean = np.mean(X, axis=0)
            self.feature_std = np.std(X, axis=0) + 1e-8  # guard constant columns
            X_ = (X - self.feature_mean) / self.feature_std
        else:
            X_ = X
        X_ = self._map_basis(X_)
        X_ = np.c_[np.ones(X_.shape[0]), X_]  # prepend bias column
        n_sample, n_feature = X_.shape
        t = y.reshape((-1, 1))
        gram = X_.T @ X_
        Xt_t = X_.T @ t
        # eig(beta * Phi^T Phi) = beta * eig(Phi^T Phi), so the decomposition is
        # computed at most once (lazily) instead of every epoch.  eigvalsh is used
        # because the Gram matrix is symmetric PSD: it returns real eigenvalues,
        # whereas np.linalg.eig could return complex values and leak a complex
        # gamma into the beta update.
        base_eig = None
        alpha = 1.0
        beta = 1.0
        current_w = None
        for _ in range(0, self.epochs):
            A = alpha * np.eye(n_feature) + beta * gram
            # solve(A, b) instead of inv(A) @ b: cheaper and better conditioned
            self.w = beta * np.linalg.solve(A, Xt_t)  # this is m_N
            if current_w is not None and np.mean(np.abs(current_w - self.w)) < self.tol:
                break
            current_w = self.w
            # residuals on the already-built design matrix (equivalent to
            # self.predict(X) but avoids redoing normalization/basis mapping)
            residual_ss = np.sum(np.power((t - X_ @ self.w).reshape(-1), 2))
            w_sq = float(self.w.reshape(-1) @ self.w.reshape(-1))
            # update alpha, beta
            if n_sample // n_feature >= 100:
                # large-N simplification, PRML eqs. 3.98/3.99 (gamma ≈ M):
                # skips the eigenvalue computation entirely
                alpha = n_feature / w_sq
                beta = n_sample / residual_ss
            else:
                if base_eig is None:
                    base_eig = np.linalg.eigvalsh(gram)
                lam = beta * base_eig  # eigenvalues of beta * Phi^T Phi
                gamma = float(np.sum(lam / (lam + alpha)))  # effective number of parameters
                alpha = gamma / w_sq
                beta = (n_sample - gamma) / residual_ss
        # the ml_models package version does not return; kept here for analysis
        return alpha, beta

    def predict(self, X):
        """
        Predict targets for X.

        :param X: 2-D array of raw features
        :return: 1-D array of predictions
        """
        if self.normalized:
            X_ = (X - self.feature_mean) / self.feature_std
        else:
            X_ = X
        X_ = self._map_basis(X_)
        X_ = np.c_[np.ones(X_.shape[0]), X_]
        return (X_ @ self.w).reshape(-1)

    def plot_fit_boundary(self, x, y):
        """
        Plot the fitted curve over a scatter of (x[:, 0], y).

        :param x: 2-D feature array (first column used for the x-axis)
        :param y: targets
        :return:
        """
        plt.scatter(x[:, 0], y)
        plt.plot(x[:, 0], self.predict(x), 'r')


### 测试¶

In [2]:
%matplotlib inline

In [3]:
# Build a synthetic sample: targets follow y = 3*x + 2 (second column of X is a
# constant bias feature). The RNG is unseeded, so each run draws different noise.
X=np.linspace(0,100,100)
X=np.c_[X,np.ones(100)]
w=np.asarray([3,2])
Y=X.dot(w)
X=X.astype('float')
Y=Y.astype('float')
X[:,0]+=np.random.normal(size=(X[:,0].shape))*3  # add Gaussian noise to the inputs
Y=Y.reshape(100,1)

In [4]:
# Append five outlier points whose targets lie far off the y = 3x + 2 line,
# to see how strongly they pull the fitted curve.
X=np.concatenate([X,np.asanyarray([[100,1],[101,1],[102,1],[103,1],[104,1]])])
Y=np.concatenate([Y,np.asanyarray([[3000],[3300],[3600],[3800],[3900]])])

In [5]:
lr=LinearRegression()  # default: linear basis, standardized features
# drop the constant column — the model prepends its own bias term
alpha,beta=lr.fit(X[:,:-1],Y)
lr.plot_fit_boundary(X[:,:-1],Y)

In [6]:
# The ratio alpha/beta acts as the effective ridge-regularization coefficient
# lambda learned from the data (cf. the regularized least-squares view in PRML).
alpha/beta

Out[6]:
4.080800093752806

In [ ]: