For example, we can train $M$ different trees $f_m$ on different subsets of the data (chosen randomly with replacement) and compute the ensemble:

$$f(x) = \frac{1}{M}\sum_{m=1}^{M} f_m(x)$$

%matplotlib inline
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
plt.style.use("fivethirtyeight")
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
np.random.seed(0)
import warnings
def fxn():
    warnings.warn("deprecated", DeprecationWarning)
    warnings.warn("ignore", FutureWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()
iris = datasets.load_iris()
X, y = iris.data[:, 0:2], iris.target
clf1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
clf2 = KNeighborsClassifier(n_neighbors=1)
bagging1 = BaggingClassifier(base_estimator=clf1, n_estimators=10, max_samples=0.8, max_features=0.8)
bagging2 = BaggingClassifier(base_estimator=clf2, n_estimators=10, max_samples=0.8, max_features=0.8)
labels = ['Decision Tree', 'K-NN', 'Bagging Tree', 'Bagging K-NN']
clf_list = [clf1, clf2, bagging1, bagging2]
fig = plt.figure(figsize=(20, 12))
gs = gridspec.GridSpec(2, 2)
grid = itertools.product([0,1],repeat=2)
for clf, label, grd in zip(clf_list, labels, grid):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(label)
plt.show()
Accuracy: 0.63 (+/- 0.02) [Decision Tree]
Accuracy: 0.70 (+/- 0.02) [K-NN]
Accuracy: 0.66 (+/- 0.02) [Bagging Tree]
Accuracy: 0.61 (+/- 0.02) [Bagging K-NN]
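To make the averaging formula above concrete, here is a minimal from-scratch sketch of bagging (illustrative only, not the code used above): draw bootstrap samples, fit one tree per sample, and combine the $M$ predictions by majority vote, which is the classification analogue of averaging. The choice of 10 rounds and depth-3 trees is an assumption.
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)
X = X[:, 0:2]                                   # same two features as above
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

M = 10                                          # number of bootstrap rounds (assumed)
rng = np.random.default_rng(0)
preds = np.empty((M, len(X_te)), dtype=int)
for m in range(M):
    idx = rng.integers(0, len(X_tr), size=len(X_tr))   # bootstrap: sample rows with replacement
    tree = DecisionTreeClassifier(max_depth=3).fit(X_tr[idx], y_tr[idx])
    preds[m] = tree.predict(X_te)

# majority vote across the M trees (the classification analogue of averaging)
vote = np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, preds)
print("bagged test accuracy: %.2f" % (vote == y_te).mean())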
#plot learning curves
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
plt.figure(figsize=(20,8))
plot_learning_curves(X_train, y_train, X_test, y_test, bagging1, print_model=False, style='ggplot')
plt.show()
#Ensemble Size
num_est = np.linspace(1,100,20, dtype=int)
bg_clf_cv_mean = []
bg_clf_cv_std = []
for n_est in num_est:
    bg_clf = BaggingClassifier(base_estimator=clf1, n_estimators=n_est, max_samples=0.8, max_features=0.8)
    scores = cross_val_score(bg_clf, X, y, cv=3, scoring='accuracy')
    bg_clf_cv_mean.append(scores.mean())
    bg_clf_cv_std.append(scores.std())
plt.figure(figsize=(20,8))
(_, caps, _) = plt.errorbar(num_est, bg_clf_cv_mean, yerr=bg_clf_cv_std, c='red', fmt='-o', capsize=5)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.ylabel('Accuracy'); plt.xlabel('Ensemble Size'); plt.title('Bagging Tree Ensemble');
plt.show()
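RandomForestClassifier is imported above but not used in this cell; it is essentially the same idea as the bagged trees, with an extra layer of randomness (a random subset of features is considered at each split). A quick comparison under the same 3-fold cross-validation, reusing the X and y defined above (the first two iris features); the 10-tree setting is an assumption:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# random forest = bagged decision trees + random feature subsetting at each split
rf = RandomForestClassifier(n_estimators=10, random_state=0)
scores = cross_val_score(rf, X, y, cv=3, scoring='accuracy')
print("Accuracy: %.2f (+/- %.2f) [Random Forest]" % (scores.mean(), scores.std()))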
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
iris = datasets.load_iris()
X, y = iris.data[:, 0:2], iris.target
#XOR dataset
#X = np.random.randn(200, 2)
#y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=1)
num_est = [1, 2, 3, 10]
labels = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=10)']
fig = plt.figure(figsize=(20, 13))
gs = gridspec.GridSpec(2, 2)
grid = itertools.product([0,1],repeat=2)
for n_est, label, grd in zip(num_est, labels, grid):
    boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est)
    boosting.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=boosting, legend=2)
    plt.title(label)
plt.show()
#plot learning curves
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=10)
plt.figure(figsize=(20,8))
plot_learning_curves(X_train, y_train, X_test, y_test, boosting, print_model=False, style='ggplot')
plt.show()
#Ensemble Size
num_est = np.linspace(1,100,20, dtype=int)
ada_clf_cv_mean = []
ada_clf_cv_std = []
for n_est in num_est:
    ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est)
    scores = cross_val_score(ada_clf, X, y, cv=3, scoring='accuracy')
    ada_clf_cv_mean.append(scores.mean())
    ada_clf_cv_std.append(scores.std())
plt.figure(figsize=(20,8))
(_, caps, _) = plt.errorbar(num_est, ada_clf_cv_mean, yerr=ada_clf_cv_std, c='red', fmt='-o', capsize=5)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.ylabel('Accuracy'); plt.xlabel('Ensemble Size'); plt.title('AdaBoost Ensemble');
plt.show()
The figure above shows how the cross-validated accuracy improves with the size of the ensemble.
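AdaBoost builds this ensemble sequentially: after each round the misclassified training points receive larger weights, so the next stump concentrates on them, and each stump's vote is weighted by its accuracy. A minimal SAMME-style sketch of that loop (illustrative only; the variable names and 10 rounds are assumptions, and sklearn's AdaBoostClassifier adds learning-rate and edge-case handling):
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

X, y = datasets.load_iris(return_X_y=True)
X = X[:, 0:2]                                  # same two features as above
n, K, M = len(y), len(np.unique(y)), 10
w = np.full(n, 1.0 / n)                        # start from uniform sample weights
stumps, alphas = [], []
for m in range(M):
    stump = DecisionTreeClassifier(criterion='entropy', max_depth=1).fit(X, y, sample_weight=w)
    miss = stump.predict(X) != y
    err = np.dot(w, miss) / w.sum()                    # weighted error of this stump
    alpha = np.log((1 - err) / err) + np.log(K - 1)    # stump weight (multiclass SAMME)
    w *= np.exp(alpha * miss)                          # up-weight the misclassified points
    w /= w.sum()
    stumps.append(stump)
    alphas.append(alpha)

# final prediction: weighted vote of the stumps
votes = np.zeros((n, K))
for stump, alpha in zip(stumps, alphas):
    votes[np.arange(n), stump.predict(X)] += alpha
print("training accuracy: %.2f" % (votes.argmax(axis=1) == y).mean())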
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
labels = ['KNN', 'Random Forest', 'Naive Bayes', 'Stacking Classifier']
clf_list = [clf1, clf2, clf3, sclf]
fig = plt.figure(figsize=(20,13))
gs = gridspec.GridSpec(2, 2)
grid = itertools.product([0,1],repeat=2)
clf_cv_mean = []
clf_cv_std = []
for clf, label, grd in zip(clf_list, labels, grid):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf)
    plt.title(label)
plt.show()
Accuracy: 0.91 (+/- 0.01) [KNN]
Accuracy: 0.93 (+/- 0.05) [Random Forest]
Accuracy: 0.92 (+/- 0.03) [Naive Bayes]
Accuracy: 0.95 (+/- 0.03) [Stacking Classifier]
#plot classifier accuracy
plt.figure(figsize=(20,8))
(_, caps, _) = plt.errorbar(range(4), clf_cv_mean, yerr=clf_cv_std, c='orange', fmt='-o', capsize=5)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.xticks(range(4), ['KNN', 'RF', 'NB', 'Stacking'])
plt.ylabel('Accuracy'); plt.xlabel('Classifier'); plt.title('Stacking Ensemble');
plt.show()
#plot learning curves
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
plt.figure(figsize=(20,8))
plot_learning_curves(X_train, y_train, X_test, y_test, sclf, print_model=False, style='ggplot')
plt.show()
We can see that stacking achieves higher accuracy than the individual classifiers and, based on the learning curves, it shows no signs of overfitting.
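For intuition, here is a hand-rolled version of what the stacker is doing: the base classifiers are fit first, their predictions are stacked column-wise into meta-features, and the logistic-regression meta-classifier learns how to combine them. This is a sketch under those assumptions, not mlxtend's internals; note that building meta-features from in-sample base predictions can overfit, which cross-validated stacking (mlxtend's StackingCVClassifier) is designed to avoid.
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

X, y = datasets.load_iris(return_X_y=True)
X = X[:, 1:3]                                   # same two features as the stacking cell above
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)

base = [KNeighborsClassifier(n_neighbors=1), RandomForestClassifier(random_state=1), GaussianNB()]
for b in base:
    b.fit(X_tr, y_tr)

# base-model predictions become the meta-model's input features
meta_tr = np.column_stack([b.predict(X_tr) for b in base])
meta_te = np.column_stack([b.predict(X_te) for b in base])

meta = LogisticRegression().fit(meta_tr, y_tr)  # meta-classifier combines the base outputs
print("stacked test accuracy: %.2f" % meta.score(meta_te, y_te))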