This notebook contains code and comments from Sections 4.1 and 4.2 of the book Ensemble Methods for Machine Learning. Please see the book for additional details on this topic. This notebook and code are released under the MIT license.
Decision stumps (trees of depth 1, shown on the left in the figure below) are commonly used as weak learners in sequential ensemble methods such as boosting. As tree depth increases, the tree becomes a stronger classifier and its performance improves. However, we cannot increase the strength of classifiers arbitrarily: sufficiently deep trees begin to overfit the training data, which hurts their prediction performance when deployed.
Below, we visualize the decision boundaries of a weak tree and a strong tree.
from sklearn.datasets import make_circles
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from plot_utils import plot_2d_data, plot_2d_classifier
# Generate the data
X, y = make_circles(n_samples=500, noise=0.15, factor=0.2, random_state=13)
X, Xtst, y, ytst = train_test_split(X, y, train_size=450, random_state=13)
# Train the classifiers
weak_learner = DecisionTreeClassifier(max_depth=1)
weak_learner.fit(X, y)
weak_learner_acc = accuracy_score(ytst, weak_learner.predict(Xtst))
strong_learner = DecisionTreeClassifier(max_depth=4)
strong_learner.fit(X, y)
strong_learner_acc = accuracy_score(ytst, strong_learner.predict(Xtst))
# Plot the two trees
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 3.5))
title = 'Weak Learner (depth=1, acc={0:3.1f}%)'.format(100 * weak_learner_acc)
plot_2d_classifier(ax[0], X, y, predict_function=weak_learner.predict,
                   alpha=0.2, xlabel='$x_1$', ylabel='$x_2$', s=80,
                   title=title, colormap='Blues')
title = 'Strong Learner (depth=4, acc={0:3.1f}%)'.format(100 * strong_learner_acc)
plot_2d_classifier(ax[1], X, y, predict_function=strong_learner.predict,
                   alpha=0.2, xlabel='$x_1$', ylabel=None, s=80,
                   title=title, colormap='Blues')
fig.tight_layout()
# plt.savefig('./figures/CH04_F02_Kunapuli.png', format='png', dpi=300, bbox_inches='tight', pad_inches=0)
# plt.savefig('./figures/CH04_F02_Kunapuli.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0)
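To illustrate the overfitting claim above, here is a minimal sketch (not from the book) that reuses the same training and test split: it sweeps over tree depths and compares training and test accuracy. Typically, training accuracy keeps climbing with depth while test accuracy eventually stalls or drops, which marks the overfitting regime.
# Sketch (assumes X, y, Xtst, ytst from the cell above): compare train vs. test accuracy as depth grows
for depth in [1, 2, 4, 8, 12, None]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=13)
    tree.fit(X, y)
    print('max_depth={0}: train acc={1:.3f}, test acc={2:.3f}'.format(
        depth, accuracy_score(y, tree.predict(X)), accuracy_score(ytst, tree.predict(Xtst))))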
AdaBoost is an adaptive algorithm: at every iteration, it trains a new base estimator that fixes the mistakes made by the previous base estimator. Thus, it needs some way to ensure that the base learning algorithm prioritizes misclassified training examples. AdaBoost does this by maintaining weights over individual training examples.
Let’s visualize the first few iterations of boosting. Each iteration performs the same steps: normalize the example weights, train a decision stump using these weights, compute its weighted error and the corresponding learner weight, and then reweight the examples so that misclassified points receive more attention in the next iteration.
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as col
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.05, random_state=13)
y = 2 * y - 1 # Convert labels from [0, 1] to [-1, 1]
n_samples, n_features = X.shape
n_estimators = 3 # Set the number of weak learners in the ensemble
D = np.ones((n_samples, )) # Initialize example weights
ensemble = [] # Initialize an empty ensemble
cmap = cm.get_cmap('Blues')
colors = cmap(np.linspace(0, 0.5, num=2))
for t in range(n_estimators):
    # -- Initialize plotting
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
    # --
    D = D / np.sum(D)  # Normalize the sample weights
    # -- Plot the training examples in different sizes proportional to their weights
    s = D / np.max(D)
    s[(0.00 <= s) & (s < 0.25)] = 16
    s[(0.25 <= s) & (s < 0.50)] = 32
    s[(0.50 <= s) & (s < 0.75)] = 64
    s[(0.75 <= s) & (s <= 1.00)] = 128
    ax[0].scatter(X[y <= 0, 0], X[y <= 0, 1], s=s[y <= 0], marker='o', c=col.rgb2hex(colors[0]), edgecolors='k', alpha=0.5)
    ax[0].scatter(X[y > 0, 0], X[y > 0, 1], s=s[y > 0], marker='s', c=col.rgb2hex(colors[1]), edgecolors='k', alpha=0.5)
    ax[0].set_title('Iteration {0}: Sample weights'.format(t + 1))
    ax[0].set_xticks([])
    ax[0].set_yticks([])
    # --
    h = DecisionTreeClassifier(max_depth=1)  # Initialize a decision stump
    h.fit(X, y, sample_weight=D)  # Train a weak learner using sample weights
    ypred = h.predict(X)  # Predict using the weak learner
    # -- Plot the individual weak learner
    err = (1 - accuracy_score(y, ypred)) * 100
    title = 'Iteration {0}: Weak Learner (err = {1:3.1f}%)'.format(t + 1, err)
    plot_2d_classifier(ax[1], X, y, predict_function=h.predict,
                       alpha=0.25, xlabel=None, ylabel=None,
                       title=title, colormap='Blues')
    pos_err = (y > 0) & (y != ypred)
    pos_cor = (y > 0) & (y == ypred)
    neg_err = (y <= 0) & (y != ypred)
    neg_cor = (y <= 0) & (y == ypred)
    ax[1].scatter(X[neg_err, 0], X[neg_err, 1], marker='o', c=col.rgb2hex(colors[0]), edgecolors='k', s=80)
    # ax[1].scatter(X[neg_cor, 0], X[neg_cor, 1], marker='o', c='lightgray', edgecolors='k')
    ax[1].scatter(X[pos_err, 0], X[pos_err, 1], marker='s', c=col.rgb2hex(colors[1]), edgecolors='k', s=80)
    # ax[1].scatter(X[pos_cor, 0], X[pos_cor, 1], marker='s', c='lightgray', edgecolors='k')
    ax[1].set_xticks([])
    ax[1].set_yticks([])
    # --
    e = 1 - accuracy_score(y, ypred, sample_weight=D)  # Weighted error of the weak learner
    a = 0.5 * np.log((1 - e) / e)  # Weak learner weight
    m = (y == ypred) * 1 + (y != ypred) * -1  # Identify correctly classified and misclassified points
    D *= np.exp(-a * m)  # Update the sample weights
    ensemble.append((a, h))  # Save the weighted weak hypothesis
    # Save the figure
    fig.tight_layout()
    # plt.savefig('./figures/CH04_F0{0}_Kunapuli.png'.format(t + 3), format='png', dpi=300)
    # plt.savefig('./figures/CH04_F0{0}_Kunapuli.pdf'.format(t + 3), format='pdf', dpi=300)
After three iterations, we can combine the three individual weak learners into a strong learner. Next, we visualize this boosted ensemble. We define a function predict_boosting here to make predictions using the boosted ensemble; we will revisit this function more formally in Section 4.2 below.
def predict_boosting(X, estimators):
    pred = np.zeros((X.shape[0], ))
    for a, h in estimators:
        pred += a * h.predict(X)
    y = np.sign(pred)
    return y
ypred = predict_boosting(X, ensemble)
err = (1 - accuracy_score(y, ypred)) * 100
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4))
plot_2d_classifier(ax, X, y, predict_function=predict_boosting, predict_args=ensemble,
                   boundary_level=[0.0], alpha=0.25, xlabel='$x_1$', ylabel='$x_2$', s=80,
                   title=title, colormap='Blues')
fig.tight_layout()
ax.set_title('Overall ensemble (err = {0:3.1f}%)'.format(err), fontsize=12)
# plt.savefig('./figures/CH04_F06_Kunapuli.png', dpi=300, format='png', bbox_inches='tight', pad_inches=0)
# plt.savefig('./figures/CH04_F06_Kunapuli.pdf', dpi=300, format='pdf', bbox_inches='tight', pad_inches=0)
Text(0.5, 1.0, 'Overall ensemble (err = 9.0%)')
AdaBoost is fairly straightforward to implement. The basic algorithmic outline of the $t$-th iteration can be described by the following steps:
1. Train a weak learner $h_t(x)$ using the weighted training examples, $(x_i, y_i, D_i)$.
2. Update the weights of the training examples, $D_i$.
At the end of $T$ iterations, we have weak learners $h_t$ along with the corresponding weak learner weights $\alpha_t$. The overall classifier after $T$ iterations is just a weighted ensemble:

$$H(x) = \sum_{t=1}^{T} \alpha_t h_t(x).$$

Two key questions we now need to answer are: (1) how do we update the weights on the training examples, $D_i$; and (2) how do we compute the weight of each base estimator, $\alpha_t$?
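Before answering these questions, here is a quick numerical illustration of the weighted ensemble with made-up values (hypothetical weights and votes, not from the book): suppose three weak learners predict +1, -1 and +1 for some point, with weights 0.9, 0.6 and 0.4.
# Hypothetical example: combine three weak learner votes by their weights (numpy was imported above as np)
alphas = np.array([0.9, 0.6, 0.4])     # assumed weak learner weights
votes = np.array([+1, -1, +1])         # assumed weak learner predictions for a single point
print(np.sign(np.dot(alphas, votes)))  # 0.9 - 0.6 + 0.4 = 0.7, so the ensemble predicts +1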
At each iteration $t$, we obtain a base estimator $h_t(x)$. The training error $\epsilon_t$ of $h_t(x)$ is a measure of its performance. AdaBoost computes its weight as

$$\alpha_t = \frac{1}{2} \log \frac{1 - \epsilon_t}{\epsilon_t}.$$

Why this particular formulation? Let’s look at the relationship between $\alpha_t$ and the error $\epsilon_t$.
epsilon = np.linspace(0.01, 0.5, num=50)
alpha = 0.5 * np.log((1 - epsilon) / epsilon)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 4))
ax.plot(epsilon, alpha, linewidth=3)
ax.set_xlabel('Training error, $\\epsilon_t$', fontsize=12)
ax.set_ylabel('Estimator weight, $\\alpha_t$', fontsize=12)
ax.set_title('Error vs. Weight, $\\alpha_t \\, = \\, \\frac{1}{2} \\log{\\frac{1 - \\epsilon_t}{\\epsilon_t}}$')
ax.text(0.04, 1.75, 'classifiers with the lowest errors ($\\epsilon_t \\approx 0$)\n'
                    'are the strongest learners; they will have\n'
                    'the highest weights', fontsize=12)
ax.text(0.29, 0.5, 'classifiers that are only slightly better than\n'
                   'random guessing ($\\epsilon_t \\approx 0.5$) are the weakest\n'
                   'learners; they will have the lowest weights', fontsize=12)
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
fig.tight_layout()
# plt.savefig('./figures/CH04_F07_Kunapuli.png', dpi=300, format='png', bbox_inches='tight')
# plt.savefig('./figures/CH04_F07_Kunapuli.pdf', dpi=300, format='pdf', bbox_inches='tight')
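As a quick numeric check of the relationship plotted above (the error values below are chosen arbitrarily):
# Sketch: evaluate the weight formula for a few arbitrary error values
for eps in [0.05, 0.25, 0.45]:
    print('epsilon = {0:.2f} -> alpha = {1:.3f}'.format(eps, 0.5 * np.log((1 - eps) / eps)))
# A strong learner (epsilon = 0.05) gets alpha of roughly 1.47, while a barely-better-than-random
# learner (epsilon = 0.45) gets alpha of roughly 0.10.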
The base estimator weight ($\alpha_t$) can also be used to update the weights of each training example. AdaBoost updates example weights as

$$D_i^{t+1} = D_i^{t} \cdot \begin{cases} e^{\alpha_t}, & \text{if $x_i$ is misclassified}, \\ e^{-\alpha_t}, & \text{if $x_i$ is correctly classified}. \end{cases}$$

For example, let's say we have two training examples $x_1$ and $x_2$, both with weights $D_1 = D_2 = 0.75$. The current weak learner $h_t$ has weight $\alpha_t = 1.5$. $x_1$ is correctly classified, so its weight should decrease by a factor of $e^{\alpha_t}$.
D1 = 0.75
alpha_t = 1.5
new_D1 = D1 / np.exp(alpha_t)
print(new_D1)
0.16734762011132237
Conversely, $x_2$ is misclassified, so its weight should increase by a factor of $e^{\alpha_t}$.
D2 = 0.75
alpha_t = 1.5
new_D2 = D2 * np.exp(alpha_t)
print(new_D2)
3.3612668027535486
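In the full algorithm (see Listing 4.1 below), the updated weights are renormalized at the start of the next iteration so that they sum to 1. Continuing this toy example:
# Continuing the toy example: renormalize the two updated weights so they sum to 1
total = new_D1 + new_D2
print(new_D1 / total, new_D2 / total)  # the misclassified example x2 now carries most of the weight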
Listing 4.1: Training an ensemble of weak learners using AdaBoost
This is essentially the same code used to generate the figures in Section 4.1.1, but with the plotting code removed.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
def fit_boosting(X, y, n_estimators=10):
    n_samples, n_features = X.shape
    D = np.ones((n_samples, ))  # Initialize example weights
    estimators = []             # Initialize an empty ensemble
    for t in range(n_estimators):
        D = D / np.sum(D)                                  # Normalize the sample weights
        h = DecisionTreeClassifier(max_depth=1)            # Initialize a decision stump
        h.fit(X, y, sample_weight=D)                       # Train a weak learner using sample weights
        ypred = h.predict(X)                               # Predict using the weak learner
        e = 1 - accuracy_score(y, ypred, sample_weight=D)  # Weighted error of the weak learner
        a = 0.5 * np.log((1 - e) / e)                      # Weak learner weight
        m = (y == ypred) * 1 + (y != ypred) * -1           # +1 for correctly classified, -1 for misclassified points
        D *= np.exp(-a * m)                                # Update the sample weights
        estimators.append((a, h))                          # Save the weighted weak hypothesis
    return estimators
Listing 4.2: Making predictions with AdaBoost
This is exactly the same function used to generate the figure for the final boosted ensemble in Section 4.1.1 above; it is replicated here for completeness.
def predict_boosting(X, estimators):
    pred = np.zeros((X.shape[0], ))
    for a, h in estimators:
        pred += a * h.predict(X)
    y = np.sign(pred)
    return y
Let us see how this works on a simple, synthetic data set.
NOTE: The boosting algorithm we have implemented requires negative and positive examples to be labeled -1 and 1, respectively. In the example above, since the function make_moons returns labels y with negative and positive examples labeled 0 and 1, respectively, we convert them to -1 and 1 with y = (2 * y) - 1.
X, y = make_moons(n_samples=200, noise=0.1, random_state=13)
y = (2 * y) - 1
Xtrn, Xtst, ytrn, ytst = train_test_split(X, y, test_size=0.25, random_state=13)
estimators = fit_boosting(Xtrn, ytrn)
ypred = predict_boosting(Xtst, estimators)
from sklearn.metrics import accuracy_score
tst_err = 1 - accuracy_score(ytst, ypred)
tst_err
0.020000000000000018
Finally, we visualize the performance of AdaBoost as the number of base estimators increases in the figure below. As we add more and more weak learners into the mix, the overall ensemble is increasingly boosted into a stronger, more complex and more nonlinear classifier.
n_samples, n_features = X.shape
n_estimators = 20
D = np.ones((n_samples, )) # Initialize example weights
ensemble = [] # Initialize an empty ensemble
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
subplot_id = -1
for t in range(n_estimators):
    D = D / np.sum(D)  # Normalize the sample weights
    h = DecisionTreeClassifier(max_depth=1)  # Initialize a decision stump
    h.fit(X, y, sample_weight=D)  # Train a weak learner using sample weights
    ypred = h.predict(X)  # Predict using the weak learner
    e = 1 - accuracy_score(y, ypred, sample_weight=D)  # Weighted error of the weak learner
    a = 0.5 * np.log((1 - e) / e)  # Weak learner weight
    m = (y == ypred) * 1 + (y != ypred) * -1  # Identify correctly classified and misclassified points
    D *= np.exp(-a * m)  # Update the sample weights
    ensemble.append((a, h))  # Save the weighted weak hypothesis
    # Plot the ensemble
    if t in [0, 2, 4, 9, 14, 19]:
        subplot_id += 1
        r, c = np.divmod(subplot_id, 3)
        err = (1 - accuracy_score(y, predict_boosting(X, ensemble))) * 100
        title = 'Iteration {0}: err = {1:4.2f}%'.format(t + 1, err)
        plot_2d_classifier(ax[r, c], X, y,
                           predict_function=predict_boosting, predict_args=ensemble,
                           alpha=0.25, xlabel=None, ylabel=None, s=80,
                           title=title, colormap='Blues')
        ax[r, c].set_xticks([])
        ax[r, c].set_yticks([])
fig.tight_layout()
# plt.savefig('./figures/CH04_F09_Kunapuli.png', dpi=300, format='png', bbox_inches='tight');
# plt.savefig('./figures/CH04_F09_Kunapuli.pdf', dpi=300, format='pdf', bbox_inches='tight');
scikit-learn's AdaBoostClassifier provides additional functionality, including support for multi-class classification as well as base learning algorithms other than decision trees. There are three important arguments that the AdaBoostClassifier class takes:
- base_estimator, the base learning algorithm AdaBoost uses to train weak learners,
- n_estimators, the number of weak learners that will be trained sequentially by AdaBoost, and
- learning_rate, an additional parameter that progressively shrinks the contribution of each successive weak learner.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
X, y = load_breast_cancer(return_X_y=True)
Xtrn, Xtst, ytrn, ytst = train_test_split(X, y, test_size=0.25, random_state=13)
shallow_tree = DecisionTreeClassifier(max_depth=2)
ensemble = AdaBoostClassifier(base_estimator=shallow_tree,
                              n_estimators=20, learning_rate=0.75)
ensemble.fit(Xtrn, ytrn)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), learning_rate=0.75, n_estimators=20)
ypred = ensemble.predict(Xtst)
err = 1 - accuracy_score(ytst, ypred)
print(err)
0.05594405594405594
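To see how the test error evolves as weak learners are added, we can use AdaBoostClassifier's staged_predict method, which yields the ensemble's predictions after each boosting iteration. A minimal sketch reusing the breast-cancer split above:
# Sketch: test error of the boosted ensemble after each boosting iteration
staged_errs = [1 - accuracy_score(ytst, staged_pred)
               for staged_pred in ensemble.staged_predict(Xtst)]
for t, e_t in enumerate(staged_errs, start=1):
    print('{0:2d} weak learners: test err = {1:.4f}'.format(t, e_t))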
AdaBoostClassifier also contains a multi-class implementation of AdaBoost called SAMME; the example below uses its real-valued variant, SAMME.R.
from sklearn.datasets import load_iris
from sklearn.utils.multiclass import unique_labels
X, y = load_iris(return_X_y=True)
Xtrn, Xtst, ytrn, ytst = train_test_split(X, y, test_size=0.25, random_state=13)
unique_labels(y)
array([0, 1, 2])
ensemble = AdaBoostClassifier(base_estimator=shallow_tree,
                              n_estimators=20,
                              learning_rate=0.75, algorithm='SAMME.R')
ensemble.fit(Xtrn, ytrn)
ypred = ensemble.predict(Xtst)
err = 1 - accuracy_score(ytst, ypred)
print(err)
0.07894736842105265
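For comparison, the discrete SAMME variant can be selected with algorithm='SAMME'. A quick sketch on the same iris split (the exact error will depend on the data and random seed):
# Sketch: train the discrete SAMME variant on the same iris split for comparison
ensemble_samme = AdaBoostClassifier(base_estimator=shallow_tree,
                                    n_estimators=20,
                                    learning_rate=0.75, algorithm='SAMME')
ensemble_samme.fit(Xtrn, ytrn)
print(1 - accuracy_score(ytst, ensemble_samme.predict(Xtst)))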