import numpy as np
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.stats import _weighted_percentile
from sklearn.ensemble import GradientBoostingRegressor as skGradientBoostingRegressor
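# Three from-scratch versions of gradient boosting regression follow, each checked against
# scikit-learn on the Boston housing data: squared-error loss (scikit-learn's default),
# least absolute deviation (loss="lad"), and Huber loss (loss="huber"). Note: the comparisons
# assume an older scikit-learn release in which load_boston, the presort parameter, loss="lad",
# and sklearn.utils.stats._weighted_percentile are all still available.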
class GradientBoostingRegressor():
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3, random_state=0):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.estimators_ = np.empty((self.n_estimators, 1), dtype=object)
        raw_predictions = np.zeros(X.shape[0])
        rng = np.random.RandomState(0)
        for i in range(self.n_estimators):
            # negative gradient of the squared-error loss is the plain residual
            residual = y - raw_predictions
            tree = DecisionTreeRegressor(criterion="friedman_mse", max_depth=self.max_depth,
                                         random_state=rng)
            tree.fit(X, residual)
            raw_predictions += self.learning_rate * tree.predict(X)
            self.estimators_[i, 0] = tree
        return self

    def predict(self, X):
        raw_predictions = np.zeros(X.shape[0])
        for i in range(self.n_estimators):
            raw_predictions += self.learning_rate * self.estimators_[i, 0].predict(X)
        return raw_predictions

    @property
    def feature_importances_(self):
        all_importances = np.zeros(self.n_features_)
        for i in range(self.n_estimators):
            all_importances += self.estimators_[i, 0].tree_.compute_feature_importances(normalize=False)
        return all_importances / np.sum(all_importances)

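# init="zero" makes scikit-learn start boosting from an all-zero raw prediction, matching the
# np.zeros initialisation in fit above, so the two models should agree on the training data.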
X, y = load_boston(return_X_y=True)
clf1 = GradientBoostingRegressor().fit(X, y)
clf2 = skGradientBoostingRegressor(init="zero", presort=False, random_state=0).fit(X, y)
assert np.allclose(clf1.feature_importances_, clf2.feature_importances_)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.allclose(pred1, pred2)
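
# The variant below implements least absolute deviation: trees are fit to sign(y - raw_predictions)
# and each leaf is then overwritten with the median of the raw residuals in that leaf.
# scikit-learn computes this median with _weighted_percentile, which returns an element of the
# array (the lower weighted median) rather than interpolating like np.median. A minimal
# illustration with arbitrary toy values (expected: 2.0 from _weighted_percentile vs 2.5 from
# np.median, though the exact behaviour may depend on the scikit-learn version):
_demo = np.array([1.0, 2.0, 3.0, 4.0])
print(_weighted_percentile(_demo, np.ones(4)), np.median(_demo))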
class GradientBoostingRegressor():
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3, random_state=0):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.estimators_ = np.empty((self.n_estimators, 1), dtype=object)
        raw_predictions = np.zeros(X.shape[0])
        rng = np.random.RandomState(0)
        for i in range(self.n_estimators):
            # negative gradient of the absolute-error loss is the sign of the residual
            residual = np.sign(y - raw_predictions)
            tree = DecisionTreeRegressor(criterion="friedman_mse", max_depth=self.max_depth,
                                         random_state=rng)
            tree.fit(X, residual)
            terminal_regions = tree.apply(X)
            # overwrite each leaf with the median of the raw residuals that fall into it
            for leaf in np.where(tree.tree_.children_left == -1)[0]:
                cur = np.where(terminal_regions == leaf)[0]
                # scikit-learn uses _weighted_percentile, whose result can differ from np.median
                tree.tree_.value[leaf, 0, 0] = _weighted_percentile(y[cur] - raw_predictions[cur],
                                                                    np.ones(cur.shape[0]))
            raw_predictions += self.learning_rate * tree.tree_.value[:, 0, 0][terminal_regions]
            self.estimators_[i, 0] = tree
        return self

    def predict(self, X):
        raw_predictions = np.zeros(X.shape[0])
        for i in range(self.n_estimators):
            raw_predictions += self.learning_rate * self.estimators_[i, 0].predict(X)
        return raw_predictions

    @property
    def feature_importances_(self):
        all_importances = np.zeros(self.n_features_)
        for i in range(self.n_estimators):
            all_importances += self.estimators_[i, 0].tree_.compute_feature_importances(normalize=False)
        return all_importances / np.sum(all_importances)

X, y = load_boston(return_X_y=True)
clf1 = GradientBoostingRegressor().fit(X, y)
clf2 = skGradientBoostingRegressor(init="zero", loss="lad", presort=False, random_state=0).fit(X, y)
assert np.allclose(clf1.feature_importances_, clf2.feature_importances_)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.allclose(pred1, pred2)
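
# The Huber variant below clips the residual at gamma, the alpha-quantile (alpha=0.9 by default)
# of |y - raw_predictions| at the current iteration; the two-mask assignment in fit is equivalent
# to np.clip(diff, -gamma, gamma). Each leaf is then set to the median of its residuals plus the
# mean of a gamma-clipped correction around that median, mirroring scikit-learn's loss="huber".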
class GradientBoostingRegressor():
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3,
                 random_state=0, alpha=0.9):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.alpha = alpha

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.estimators_ = np.empty((self.n_estimators, 1), dtype=object)
        raw_predictions = np.zeros(X.shape[0])
        rng = np.random.RandomState(0)
        for i in range(self.n_estimators):
            # Huber pseudo-residual: keep the residual inside the gamma band, clip it outside
            residual = np.zeros(X.shape[0])
            diff = y - raw_predictions
            gamma = _weighted_percentile(np.abs(diff), np.ones(diff.shape[0]), self.alpha * 100)
            gamma_mask = np.abs(diff) <= gamma
            residual[gamma_mask] = diff[gamma_mask]
            residual[~gamma_mask] = gamma * np.sign(diff[~gamma_mask])
            tree = DecisionTreeRegressor(criterion="friedman_mse", max_depth=self.max_depth,
                                         random_state=rng)
            tree.fit(X, residual)
            terminal_regions = tree.apply(X)
            # leaf update: median of the residuals plus a gamma-clipped correction around it
            for leaf in np.where(tree.tree_.children_left == -1)[0]:
                cur = np.where(terminal_regions == leaf)[0]
                diff = y[cur] - raw_predictions[cur]
                # scikit-learn uses _weighted_percentile, whose result can differ from np.median
                median = _weighted_percentile(diff, np.ones(diff.shape[0]))
                diff_minus_median = diff - median
                tree.tree_.value[leaf, 0, 0] = median + np.mean(np.sign(diff_minus_median)
                                                                * np.minimum(np.abs(diff_minus_median), gamma))
            raw_predictions += self.learning_rate * tree.tree_.value[:, 0, 0][terminal_regions]
            self.estimators_[i, 0] = tree
        return self

    def predict(self, X):
        raw_predictions = np.zeros(X.shape[0])
        for i in range(self.n_estimators):
            raw_predictions += self.learning_rate * self.estimators_[i, 0].predict(X)
        return raw_predictions

    @property
    def feature_importances_(self):
        all_importances = np.zeros(self.n_features_)
        for i in range(self.n_estimators):
            all_importances += self.estimators_[i, 0].tree_.compute_feature_importances(normalize=False)
        return all_importances / np.sum(all_importances)

X, y = load_boston(return_X_y=True)
clf1 = GradientBoostingRegressor().fit(X, y)
clf2 = skGradientBoostingRegressor(init="zero", loss="huber", presort=False, random_state=0).fit(X, y)
assert np.allclose(clf1.feature_importances_, clf2.feature_importances_)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.allclose(pred1, pred2)