一.原理介绍¶

$$\hat{y}\rightarrow \mu_{ML}$$

二.泊松回归¶

$$P(y\mid\lambda)=\frac{\lambda^y}{y!}e^{-\lambda}$$

$$\lambda=e^{\hat{y}}$$

$$\prod_{i=1}^N\frac{e^{y_i\hat{y_i}}e^{-e^{\hat{y_i}}}}{y_i!}$$

$$L(y,\hat{y})=\sum_{i=1}^N(e^{\hat{y_i}}-y_i\hat{y_i})$$

$$\frac{\partial L(y,\hat{y})}{\partial \hat{y}}=e^{\hat{y}}-y\\ \frac{\partial^2 L(y,\hat{y})}{{\partial \hat{y}}^2}=e^{\hat{y}}\\$$

三.gamma回归¶

gamma分布如下：

$$p(y\mid\alpha,\lambda)=\frac{1}{\Gamma(\alpha)\lambda^\alpha}y^{\alpha-1}e^{-y/\lambda}$$

$$p(y\mid\mu,\phi)=\frac{1}{y\Gamma(1/\phi)}(\frac{y}{\mu\phi})^{1/\phi}exp[-\frac{y}{\mu\phi}]$$

$$p(y\mid\mu,\phi)=exp[\frac{-y/\mu-ln\mu}{\phi}+\frac{1-\phi}{\phi}lny-\frac{ln\phi}{\phi}-ln\Gamma(\frac{1}{\phi})]$$

$$\mu=e^{\hat{y}}$$

$$L(y,\hat{y})=\sum_{i=1}^N(\frac{y_i}{e^{\hat{y_i}}}+\hat{y_i})$$

$$\frac{\partial L(y,\hat{y})}{\partial \hat{y}}=1-ye^{-\hat{y}}\\ \frac{\partial^2 L(y,\hat{y})}{{\partial \hat{y}}^2}=ye^{-\hat{y}}\\$$

四.tweedie回归¶

tweedie回归是多个分布的组合体，包括gamma分布，泊松分布，高斯分布等，tweedie回归由一个超参数$p$控制，$p$不同，则其对应的对数似然函数也不同：

$$\log P(y\mid\mu,\phi)=g(y,\phi)+\left\{\begin{matrix} \frac{1}{\phi}(ylog(\mu)-\frac{\mu^{2-p}}{2-p}) & p=1\\ \frac{1}{\phi}(y\frac{\mu^{1-p}}{1-p}-log\mu) & p=2 \\ \frac{1}{\phi}(y\frac{\mu^{1-p}}{1-p}-\frac{\mu^{2-p}}{2-p}) & p\neq 1,p\neq 2 \end{matrix}\right.$$

$$\mu=e^{\hat{y}}$$

$$L(y,\hat{y})=\left\{\begin{matrix} \sum_{i=1}^n(\frac{e^{\hat{y_i}(2-p)}}{2-p}-y_i\hat{y_i})=\sum_{i=1}^n(e^{\hat{y_i}}-y_i\hat{y_i}) & p=1\\ \sum_{i=1}^n(\hat{y_i}+y_ie^{-\hat{y_i}}) & p=2 \\ \sum_{i=1}^n(\frac{exp[\hat{y_i}(2-p)]}{2-p}-y_i\frac{exp[\hat{y_i}(1-p)]}{1-p}) & p\neq 1,p\neq 2 \end{matrix}\right.$$

$$\frac{\partial L(y,\hat{y})}{\partial \hat{y}}=\left\{\begin{matrix} e^{\hat{y}}-y & p=1\\ 1-ye^{-\hat{y}} & p=2 \\ e^{\hat{y}(2-p)}-ye^{\hat{y}(1-p)} & p\neq 1,p\neq 2 \end{matrix}\right.$$

$$\frac{\partial^2 L(y,\hat{y})}{{\partial \hat{y}}^2}=\left\{\begin{matrix} e^{\hat{y}} & p=1\\ ye^{-\hat{y}} & p=2 \\ (2-p)e^{\hat{y}(2-p)}-(1-p)ye^{\hat{y}(1-p)} & p\neq 1,p\neq 2 \end{matrix}\right.$$

五.代码实现¶

In [1]:
import os
os.chdir('../')
import matplotlib.pyplot as plt
%matplotlib inline
from ml_models.ensemble import XGBoostBaseTree
from ml_models import utils
import copy
import numpy as np

"""
xgboost回归树的实现，封装到ml_models.ensemble
"""

class XGBoostRegressor(object):
    """
    XGBoost-style regressor: boosts base trees fitted to second-order
    (gradient g, hessian h) loss information.

    Supported losses: squarederror, logistic, poisson, gamma, tweedie.
    The poisson/gamma/tweedie losses use a log link, i.e. mu = exp(y_hat),
    matching the derivations above; ``predict`` applies the inverse link.
    """

    def __init__(self, base_estimator=None, n_estimators=10, learning_rate=1.0, loss='squarederror', p=2.5):
        """
        :param base_estimator: base learner; a single estimator is deep-copied
                               once per round, a list is used as-is
        :param n_estimators: number of boosting rounds
        :param learning_rate: shrinkage on each round's contribution; lowers the
                              weight of later learners to curb overfitting
        :param loss: loss function: squarederror, logistic, poisson, gamma, tweedie
        :param p: power parameter, only used when loss == 'tweedie'
        """
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        if self.base_estimator is None:
            # default base learner: a decision stump
            self.base_estimator = XGBoostBaseTree()
        if not isinstance(base_estimator, list):
            # homogeneous learners: replicate one prototype per round
            prototype = self.base_estimator
            self.base_estimator = [copy.deepcopy(prototype) for _ in range(self.n_estimators)]
        else:
            # heterogeneous learners: the list length defines the round count
            self.n_estimators = len(self.base_estimator)
        self.loss = loss
        self.p = p

    def _get_gradient_hess(self, y, y_pred):
        """
        Return first- and second-order derivatives (g, h) of the loss with
        respect to the raw score y_pred (see the formulas derived above).

        :param y: ground-truth targets
        :param y_pred: current raw (pre-link) predictions
        :return: tuple (g, h) of arrays shaped like y
        """
        if self.loss == 'squarederror':
            return y_pred - y, np.ones_like(y)
        elif self.loss == 'logistic':
            return utils.sigmoid(y_pred) - utils.sigmoid(y), utils.sigmoid(y_pred) * (1 - utils.sigmoid(y_pred))
        elif self.loss == 'poisson':
            # g = e^{y_hat} - y, h = e^{y_hat}
            return np.exp(y_pred) - y, np.exp(y_pred)
        elif self.loss == 'gamma':
            # g = 1 - y e^{-y_hat}, h = y e^{-y_hat}
            return 1.0 - y * np.exp(-1.0 * y_pred), y * np.exp(-1.0 * y_pred)
        elif self.loss == 'tweedie':
            if self.p == 1:
                # degenerates to the poisson case
                return np.exp(y_pred) - y, np.exp(y_pred)
            elif self.p == 2:
                # degenerates to the gamma case
                return 1.0 - y * np.exp(-1.0 * y_pred), y * np.exp(-1.0 * y_pred)
            else:
                return np.exp(y_pred * (2.0 - self.p)) - y * np.exp(y_pred * (1.0 - self.p)), (2.0 - self.p) * np.exp(
                    y_pred * (2.0 - self.p)) - (1.0 - self.p) * y * np.exp(y_pred * (1.0 - self.p))

    def fit(self, x, y):
        """
        Boost: each round fits a base learner to the current (g, h) and
        accumulates its shrunken raw prediction into y_pred.
        """
        # float accumulator: integer targets must not truncate/raise on +=
        y_pred = np.zeros_like(y, dtype=float)
        for index in range(self.n_estimators):
            # fix: the original cell referenced g, h without ever computing them
            g, h = self._get_gradient_hess(y, y_pred)
            self.base_estimator[index].fit(x, g, h)
            y_pred += self.base_estimator[index].predict(x) * self.learning_rate

    def predict(self, x):
        """
        Sum the base learners' raw scores and apply the inverse link (exp)
        for the poisson/gamma/tweedie losses.
        """
        if self.n_estimators == 1:
            # fix: the original summed base_estimator[0] twice for a single round
            rst_np = self.base_estimator[0].predict(x)
        else:
            # NOTE(review): the first and last learners are summed with full
            # weight while fit() shrinks every round by learning_rate; the two
            # agree only for the default learning_rate=1.0 — confirm intent
            # before using other rates.
            rst_np = np.sum(
                [self.base_estimator[0].predict(x)] +
                [self.learning_rate * self.base_estimator[i].predict(x) for i in
                 range(1, self.n_estimators - 1)] +
                [self.base_estimator[self.n_estimators - 1].predict(x)]
                , axis=0)
        if self.loss in ["poisson", "gamma", "tweedie"]:
            return np.exp(rst_np)
        else:
            return rst_np


In [2]:
# Noisy sine-wave regression set: 100 points on [1, 10].
data = np.linspace(1, 10, num=100)
# Shift by 1 before adding uniform [0, 1) noise so every target stays positive
# (the poisson/gamma/tweedie log links require positive targets).
target = 1 + np.sin(data) + np.random.random(size=100)
# column-vector features for the estimators
data = data[:, np.newaxis]

In [3]:
# Poisson-loss fit; the red curve is the model's prediction exp(y_hat)
# (predict applies the exp inverse link for this loss).
model = XGBoostRegressor(loss='poisson')
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')

Out[3]:
[<matplotlib.lines.Line2D at 0x29c33c1a9b0>]
In [4]:
# Gamma-loss fit on the same data; predictions again go through the exp link.
model = XGBoostRegressor(loss='gamma')
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')

Out[4]:
[<matplotlib.lines.Line2D at 0x29c33c1a400>]
In [5]:
# Tweedie loss with power p=2.5 (exercises the generic p != 1, p != 2 branch).
model = XGBoostRegressor(loss='tweedie',p=2.5)
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')

Out[5]:
[<matplotlib.lines.Line2D at 0x29c33c1aba8>]
In [6]:
# Tweedie loss with power p=1.5.
model = XGBoostRegressor(loss='tweedie',p=1.5)
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')

Out[6]:
[<matplotlib.lines.Line2D at 0x29c33c43d68>]

In [7]:
# Tweedie loss with p=0.1.
# NOTE(review): powers in (0, 1) are outside the usual tweedie range
# (1 < p < 2 compound poisson-gamma); this just exercises the generic
# gradient branch — confirm the choice is intentional.
model = XGBoostRegressor(loss='tweedie',p=0.1)
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')

Out[7]:
[<matplotlib.lines.Line2D at 0x29c44ed2cc0>]
In [8]:
# Tweedie loss with an extreme power p=20 (stress-tests the generic branch;
# exp(y_hat * (2 - p)) can under/overflow for large |y_hat|).
model = XGBoostRegressor(loss='tweedie',p=20)
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')

Out[8]:
[<matplotlib.lines.Line2D at 0x29c44e94f28>]