import numpy as np
from scipy.special import xlogy
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier as skAdaBoostClassifier
class AdaBoostClassifier():
    """Discrete multi-class AdaBoost (SAMME) over depth-1 decision trees.

    Parameters
    ----------
    n_estimators : int, default=50
        Maximum number of boosting rounds. Boosting stops earlier when a weak
        learner reaches zero weighted training error or is no better than
        random guessing.
    random_state : int, default=0
        Seed of the ``RandomState`` that draws one integer seed per weak
        learner.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    n_classes_ : int
        Number of entries in ``classes_``.
    estimators_ : list
        Fitted weak learners that were kept, in boosting order.
    estimator_weights_ : ndarray of shape (n_estimators,)
        Vote weight of each kept learner; rounds that never ran stay at 0.
    estimator_errors_ : ndarray of shape (n_estimators,)
        Weighted training error of each kept learner; rounds that never ran
        stay at 1.
    """

    def __init__(self, n_estimators=50, random_state=0):
        self.n_estimators = n_estimators
        self.random_state = random_state

    def _make_estimator(self, random_state):
        """Return an unfitted decision stump seeded with *random_state*."""
        return DecisionTreeClassifier(max_depth=1, random_state=random_state)

    def fit(self, X, y):
        """Fit the boosted ensemble on ``X`` with labels ``y``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the very first weak learner is no better than random guessing,
            so no ensemble can be built.
        """
        # Labels are re-encoded as 0..n_classes-1 so that weak-learner
        # predictions can be used directly as column indices later on.
        self.classes_, y_train = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        n_samples = X.shape[0]
        sample_weight = np.full(n_samples, 1 / n_samples)
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators)
        self.estimator_errors_ = np.ones(self.n_estimators)
        max_int = np.iinfo(np.int32).max
        rng = np.random.RandomState(self.random_state)
        for i in range(self.n_estimators):
            est = self._make_estimator(rng.randint(max_int))
            est.fit(X, y_train, sample_weight=sample_weight)
            incorrect = est.predict(X) != y_train
            estimator_error = np.average(incorrect, weights=sample_weight)

            if estimator_error <= 0:
                # Perfect fit: the log-odds weight would be infinite and turn
                # every sample weight into NaN. Keep the learner with unit
                # weight and stop; nothing is left to correct.
                self.estimators_.append(est)
                self.estimator_errors_[i] = 0.0
                self.estimator_weights_[i] = 1.0
                break

            if estimator_error >= 1.0 - 1.0 / self.n_classes_:
                # No better than random: its weight would be <= 0, so the
                # learner is discarded and boosting ends.
                if not self.estimators_:
                    raise ValueError(
                        "Weak learner is no better than random guessing; "
                        "the ensemble cannot be fit."
                    )
                break

            estimator_weight = (np.log((1 - estimator_error) / estimator_error)
                                + np.log(self.n_classes_ - 1))
            self.estimators_.append(est)
            self.estimator_errors_[i] = estimator_error
            self.estimator_weights_[i] = estimator_weight

            # Up-weight the misclassified samples, then renormalise.
            sample_weight *= np.exp(estimator_weight * incorrect)
            weight_sum = np.sum(sample_weight)
            if not np.isfinite(weight_sum) or weight_sum <= 0:
                break
            sample_weight /= weight_sum
        return self

    def decision_function(self, X):
        """Return the normalised weighted vote for each sample.

        For two classes a 1-D array (vote for class 1 minus vote for class 0)
        is returned; otherwise an array of shape (n_samples, n_classes).
        """
        n_samples = X.shape[0]
        rows = np.arange(n_samples)
        pred = np.zeros((n_samples, self.n_classes_))
        # Iterate over the learners actually kept by ``fit``; there may be
        # fewer than ``n_estimators`` after an early stop.
        for est, weight in zip(self.estimators_, self.estimator_weights_):
            pred[rows, est.predict(X)] += weight
        pred /= np.sum(self.estimator_weights_)
        if self.n_classes_ == 2:
            return pred[:, 1] - pred[:, 0]
        return pred

    def predict(self, X):
        """Return the predicted label (an entry of ``classes_``) per sample."""
        scores = self.decision_function(X)
        if scores.ndim == 1:
            # Binary case: the sign of the vote margin selects the class.
            indices = (scores > 0).astype(int)
        else:
            indices = np.argmax(scores, axis=1)
        return self.classes_[indices]
# Cross-check the SAMME implementation above against scikit-learn's
# AdaBoostClassifier on the breast-cancer and iris datasets.
# NOTE(review): the `algorithm` argument is deprecated in recent scikit-learn
# releases - confirm the installed version still accepts it.
for load_dataset in (load_breast_cancer, load_iris):
    X, y = load_dataset(return_X_y=True)
    clf1 = AdaBoostClassifier(random_state=0).fit(X, y)
    clf2 = skAdaBoostClassifier(algorithm="SAMME", random_state=0).fit(X, y)

    # Per-round training errors and learner weights must agree.
    assert np.allclose(clf1.estimator_errors_, clf2.estimator_errors_)
    assert np.allclose(clf1.estimator_weights_, clf2.estimator_weights_)

    # Predicted labels must be identical.
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)

    # Decision scores must match up to floating-point tolerance.
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
class AdaBoostClassifier():
    """Real-valued multi-class AdaBoost (SAMME.R) over depth-1 decision trees.

    Each weak learner contributes through its predicted class probabilities
    rather than through a hard vote, and every kept learner has weight 1.

    Parameters
    ----------
    n_estimators : int, default=50
        Maximum number of boosting rounds. Boosting stops earlier when a weak
        learner reaches zero weighted training error.
    random_state : int, default=0
        Seed of the ``RandomState`` that draws one integer seed per weak
        learner.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    n_classes_ : int
        Number of entries in ``classes_``.
    estimators_ : list
        Fitted weak learners, in boosting order.
    estimator_weights_ : ndarray of shape (n_estimators,)
        1 for every fitted learner; rounds that never ran stay at 0.
    estimator_errors_ : ndarray of shape (n_estimators,)
        Weighted training error of each fitted learner; rounds that never ran
        stay at 1.
    """

    def __init__(self, n_estimators=50, random_state=0):
        self.n_estimators = n_estimators
        self.random_state = random_state

    def _make_estimator(self, random_state):
        """Return an unfitted decision stump seeded with *random_state*."""
        return DecisionTreeClassifier(max_depth=1, random_state=random_state)

    def _samme_proba(self, est, X):
        """Return one learner's contribution to the decision function.

        Computes ``(K - 1) * (log p - mean_k(log p))`` from the learner's
        class probabilities, where ``K`` is ``n_classes_``.
        """
        proba = est.predict_proba(X)
        # Clip away exact zeros so the logarithm stays finite.
        np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)
        log_proba = np.log(proba)
        centred = log_proba - (1 / self.n_classes_) * log_proba.sum(axis=1)[:, np.newaxis]
        return (self.n_classes_ - 1) * centred

    def fit(self, X, y):
        """Fit the boosted ensemble on ``X`` with labels ``y`` and return self."""
        # Labels are re-encoded as 0..n_classes-1 so they can index columns
        # of the probability matrix.
        self.classes_, y_train = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        n_samples = X.shape[0]
        sample_weight = np.full(n_samples, 1 / n_samples)
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators)
        self.estimator_errors_ = np.ones(self.n_estimators)
        max_int = np.iinfo(np.int32).max
        rng = np.random.RandomState(self.random_state)
        # Built lazily: the coding divides by (n_classes - 1), which is zero
        # for single-class input - a case that always stops at the
        # perfect-fit check below before the coding is needed.
        y_coding = None
        for i in range(self.n_estimators):
            est = self._make_estimator(rng.randint(max_int))
            est.fit(X, y_train, sample_weight=sample_weight)
            estimator_error = np.average(est.predict(X) != y_train,
                                         weights=sample_weight)
            self.estimators_.append(est)
            self.estimator_errors_[i] = estimator_error
            self.estimator_weights_[i] = 1

            if estimator_error <= 0:
                # Perfect fit: further rounds would only refit the same
                # problem, so stop here.
                break

            if y_coding is None:
                # +1 for the true class, -1/(K-1) for every other class.
                y_coding = np.full((n_samples, self.n_classes_),
                                   -1 / (self.n_classes_ - 1))
                y_coding[np.arange(n_samples), y_train] = 1

            proba = est.predict_proba(X)
            np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)
            scale = (self.n_classes_ - 1) / self.n_classes_
            sample_weight *= np.exp(-scale * xlogy(y_coding, proba).sum(axis=1))
            weight_sum = np.sum(sample_weight)
            if not np.isfinite(weight_sum) or weight_sum <= 0:
                break
            sample_weight /= weight_sum
        return self

    def decision_function(self, X):
        """Return the averaged per-class score for each sample.

        For two classes a 1-D array (score of class 1 minus score of class 0)
        is returned; otherwise an array of shape (n_samples, n_classes).
        """
        pred = np.zeros((X.shape[0], self.n_classes_))
        # Iterate over the learners actually fitted; there may be fewer than
        # ``n_estimators`` after an early stop.
        for est in self.estimators_:
            pred += self._samme_proba(est, X)
        pred /= np.sum(self.estimator_weights_)
        if self.n_classes_ == 2:
            return pred[:, 1] - pred[:, 0]
        return pred

    def predict(self, X):
        """Return the predicted label (an entry of ``classes_``) per sample."""
        scores = self.decision_function(X)
        if scores.ndim == 1:
            # Binary case: the sign of the score margin selects the class.
            indices = (scores > 0).astype(int)
        else:
            indices = np.argmax(scores, axis=1)
        return self.classes_[indices]
# Cross-check the SAMME.R implementation above against scikit-learn's
# AdaBoostClassifier on the breast-cancer and iris datasets.
# NOTE(review): the `algorithm` argument is deprecated in recent scikit-learn
# releases - confirm the installed version still accepts "SAMME.R".
for load_dataset, size_kwargs in (
    (load_breast_cancer, {}),
    (load_iris, {"n_estimators": 50}),
):
    X, y = load_dataset(return_X_y=True)
    clf1 = AdaBoostClassifier(random_state=0, **size_kwargs).fit(X, y)
    clf2 = skAdaBoostClassifier(
        algorithm="SAMME.R", random_state=0, **size_kwargs
    ).fit(X, y)

    # Per-round training errors and learner weights must agree.
    assert np.allclose(clf1.estimator_errors_, clf2.estimator_errors_)
    assert np.allclose(clf1.estimator_weights_, clf2.estimator_weights_)

    # Predicted labels must be identical.
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)

    # Decision scores must match up to floating-point tolerance.
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)