#!/usr/bin/env python # coding: utf-8 # Copyright (c) 2015, 2016 [Sebastian Raschka](sebastianraschka.com) # # https://github.com/rasbt/python-machine-learning-book # # [MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt) # # Python Machine Learning - Code Examples # # Chapter 7 - Combining Different Models for Ensemble Learning # Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s). # In[1]: get_ipython().run_line_magic('load_ext', 'watermark') get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scipy,sklearn") # *The use of `watermark` is optional. You can install this IPython extension via "`pip install watermark`". For more information, please see: https://github.com/rasbt/watermark.* #
#
# ### Overview # - [Learning with ensembles](#Learning-with-ensembles) # - [Implementing a simple majority vote classifier](#Implementing-a-simple-majority-vote-classifier) # - [Combining different algorithms for classification with majority vote](#Combining-different-algorithms-for-classification-with-majority-vote) # - [Evaluating and tuning the ensemble classifier](#Evaluating-and-tuning-the-ensemble-classifier) # - [Bagging – building an ensemble of classifiers from bootstrap samples](#Bagging----Building-an-ensemble-of-classifiers-from-bootstrap-samples) # - [Leveraging weak learners via adaptive boosting](#Leveraging-weak-learners-via-adaptive-boosting) # - [Summary](#Summary) #
#
# In[2]: from IPython.display import Image get_ipython().run_line_magic('matplotlib', 'inline') # In[3]: # Added version check for recent scikit-learn 0.18 checks from distutils.version import LooseVersion as Version from sklearn import __version__ as sklearn_version # # Learning with ensembles # In[4]: Image(filename='./images/07_01.png', width=500) # In[5]: Image(filename='./images/07_02.png', width=500) # In[6]: from scipy.misc import comb import math def ensemble_error(n_classifier, error): k_start = math.ceil(n_classifier / 2.0) probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k) for k in range(k_start, n_classifier + 1)] return sum(probs) # **Note** # # For historical reasons, Python 2.7's `math.ceil` returns a `float` instead of an integer like in Python 3.x. Although Although this book was written for Python >3.4, let's make it compatible to Python 2.7 by casting it to an it explicitely: # In[7]: from scipy.misc import comb import math def ensemble_error(n_classifier, error): k_start = int(math.ceil(n_classifier / 2.0)) probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k) for k in range(k_start, n_classifier + 1)] return sum(probs) # In[8]: ensemble_error(n_classifier=11, error=0.25) # In[9]: import numpy as np error_range = np.arange(0.0, 1.01, 0.01) ens_errors = [ensemble_error(n_classifier=11, error=error) for error in error_range] # In[10]: import matplotlib.pyplot as plt plt.plot(error_range, ens_errors, label='Ensemble error', linewidth=2) plt.plot(error_range, error_range, linestyle='--', label='Base error', linewidth=2) plt.xlabel('Base error') plt.ylabel('Base/Ensemble error') plt.legend(loc='upper left') plt.grid() plt.tight_layout() # plt.savefig('./figures/ensemble_err.png', dpi=300) plt.show() #
#
# # Implementing a simple majority vote classifier # In[11]: import numpy as np np.argmax(np.bincount([0, 0, 1], weights=[0.2, 0.2, 0.6])) # In[12]: ex = np.array([[0.9, 0.1], [0.8, 0.2], [0.4, 0.6]]) p = np.average(ex, axis=0, weights=[0.2, 0.2, 0.6]) p # In[13]: np.argmax(p) # In[14]: from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin from sklearn.preprocessing import LabelEncoder from sklearn.externals import six from sklearn.base import clone from sklearn.pipeline import _name_estimators import numpy as np import operator class MajorityVoteClassifier(BaseEstimator, ClassifierMixin): """ A majority vote ensemble classifier Parameters ---------- classifiers : array-like, shape = [n_classifiers] Different classifiers for the ensemble vote : str, {'classlabel', 'probability'} (default='label') If 'classlabel' the prediction is based on the argmax of class labels. Else if 'probability', the argmax of the sum of probabilities is used to predict the class label (recommended for calibrated classifiers). weights : array-like, shape = [n_classifiers], optional (default=None) If a list of `int` or `float` values are provided, the classifiers are weighted by importance; Uses uniform weights if `weights=None`. """ def __init__(self, classifiers, vote='classlabel', weights=None): self.classifiers = classifiers self.named_classifiers = {key: value for key, value in _name_estimators(classifiers)} self.vote = vote self.weights = weights def fit(self, X, y): """ Fit classifiers. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Matrix of training samples. y : array-like, shape = [n_samples] Vector of target class labels. Returns ------- self : object """ if self.vote not in ('probability', 'classlabel'): raise ValueError("vote must be 'probability' or 'classlabel'" "; got (vote=%r)" % self.vote) if self.weights and len(self.weights) != len(self.classifiers): raise ValueError('Number of classifiers and weights must be equal' '; got %d weights, %d classifiers' % (len(self.weights), len(self.classifiers))) # Use LabelEncoder to ensure class labels start with 0, which # is important for np.argmax call in self.predict self.lablenc_ = LabelEncoder() self.lablenc_.fit(y) self.classes_ = self.lablenc_.classes_ self.classifiers_ = [] for clf in self.classifiers: fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y)) self.classifiers_.append(fitted_clf) return self def predict(self, X): """ Predict class labels for X. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Matrix of training samples. Returns ---------- maj_vote : array-like, shape = [n_samples] Predicted class labels. """ if self.vote == 'probability': maj_vote = np.argmax(self.predict_proba(X), axis=1) else: # 'classlabel' vote # Collect results from clf.predict calls predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T maj_vote = np.apply_along_axis( lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions) maj_vote = self.lablenc_.inverse_transform(maj_vote) return maj_vote def predict_proba(self, X): """ Predict class probabilities for X. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. Returns ---------- avg_proba : array-like, shape = [n_samples, n_classes] Weighted average probability for each class per sample. """ probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_]) avg_proba = np.average(probas, axis=0, weights=self.weights) return avg_proba def get_params(self, deep=True): """ Get classifier parameter names for GridSearch""" if not deep: return super(MajorityVoteClassifier, self).get_params(deep=False) else: out = self.named_classifiers.copy() for name, step in six.iteritems(self.named_classifiers): for key, value in six.iteritems(step.get_params(deep=True)): out['%s__%s' % (name, key)] = value return out #
#
# ## Combining different algorithms for classification with majority vote # In[15]: from sklearn import datasets from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import LabelEncoder if Version(sklearn_version) < '0.18': from sklearn.cross_validation import train_test_split else: from sklearn.model_selection import train_test_split iris = datasets.load_iris() X, y = iris.data[50:, [1, 2]], iris.target[50:] le = LabelEncoder() y = le.fit_transform(y) X_train, X_test, y_train, y_test =\ train_test_split(X, y, test_size=0.5, random_state=1) # In[16]: import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import Pipeline if Version(sklearn_version) < '0.18': from sklearn.cross_validation import cross_val_score else: from sklearn.model_selection import cross_val_score clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=0) clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0) clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski') pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]]) pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]]) clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN'] print('10-fold cross validation:\n') for clf, label in zip([pipe1, clf2, pipe3], clf_labels): scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc') print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) # In[17]: # Majority Rule (hard) Voting mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3]) clf_labels += ['Majority Voting'] all_clf = [pipe1, clf2, pipe3, mv_clf] for clf, label in zip(all_clf, clf_labels): scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc') print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) #
#
# # Evaluating and tuning the ensemble classifier # In[18]: from sklearn.metrics import roc_curve from sklearn.metrics import auc colors = ['black', 'orange', 'blue', 'green'] linestyles = [':', '--', '-.', '-'] for clf, label, clr, ls \ in zip(all_clf, clf_labels, colors, linestyles): # assuming the label of the positive class is 1 y_pred = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1] fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred) roc_auc = auc(x=fpr, y=tpr) plt.plot(fpr, tpr, color=clr, linestyle=ls, label='%s (auc = %0.2f)' % (label, roc_auc)) plt.legend(loc='lower right') plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2) plt.xlim([-0.1, 1.1]) plt.ylim([-0.1, 1.1]) plt.grid() plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') # plt.tight_layout() # plt.savefig('./figures/roc.png', dpi=300) plt.show() # In[19]: sc = StandardScaler() X_train_std = sc.fit_transform(X_train) # In[20]: from itertools import product all_clf = [pipe1, clf2, pipe3, mv_clf] x_min = X_train_std[:, 0].min() - 1 x_max = X_train_std[:, 0].max() + 1 y_min = X_train_std[:, 1].min() - 1 y_max = X_train_std[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) f, axarr = plt.subplots(nrows=2, ncols=2, sharex='col', sharey='row', figsize=(7, 5)) for idx, clf, tt in zip(product([0, 1], [0, 1]), all_clf, clf_labels): clf.fit(X_train_std, y_train) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.3) axarr[idx[0], idx[1]].scatter(X_train_std[y_train==0, 0], X_train_std[y_train==0, 1], c='blue', marker='^', s=50) axarr[idx[0], idx[1]].scatter(X_train_std[y_train==1, 0], X_train_std[y_train==1, 1], c='red', marker='o', s=50) axarr[idx[0], idx[1]].set_title(tt) plt.text(-3.5, -4.5, s='Sepal width [standardized]', ha='center', va='center', fontsize=12) plt.text(-10.5, 4.5, s='Petal length [standardized]', ha='center', va='center', fontsize=12, rotation=90) plt.tight_layout() # plt.savefig('./figures/voting_panel', bbox_inches='tight', dpi=300) plt.show() # In[21]: mv_clf.get_params() # In[22]: if Version(sklearn_version) < '0.18': from sklearn.cross_validation import GridSearchCV else: from sklearn.model_selection import GridSearchCV params = {'decisiontreeclassifier__max_depth': [1, 2], 'pipeline-1__clf__C': [0.001, 0.1, 100.0]} grid = GridSearchCV(estimator=mv_clf, param_grid=params, cv=10, scoring='roc_auc') grid.fit(X_train, y_train) if Version(sklearn_version) < '0.18': for params, mean_score, scores in grid.grid_scores_: print("%0.3f +/- %0.2f %r" % (mean_score, scores.std() / 2.0, params)) else: cv_keys = ('mean_test_score', 'std_test_score','params') for r, _ in enumerate(grid.cv_results_['mean_test_score']): print("%0.3f +/- %0.2f %r" % (grid.cv_results_[cv_keys[0]][r], grid.cv_results_[cv_keys[1]][r] / 2.0, grid.cv_results_[cv_keys[2]][r])) # In[23]: print('Best parameters: %s' % grid.best_params_) print('Accuracy: %.2f' % grid.best_score_) # **Note** # By default, the default setting for `refit` in `GridSearchCV` is `True` (i.e., `GridSeachCV(..., refit=True)`), which means that we can use the fitted `GridSearchCV` estimator to make predictions via the `predict` method, for example: # # grid = GridSearchCV(estimator=mv_clf, # param_grid=params, # cv=10, # scoring='roc_auc') # grid.fit(X_train, y_train) # y_pred = grid.predict(X_test) # # In addition, the "best" estimator can directly be accessed via the `best_estimator_` attribute. # In[24]: grid.best_estimator_.classifiers # In[25]: mv_clf = grid.best_estimator_ # In[26]: mv_clf.set_params(**grid.best_estimator_.get_params()) # In[27]: mv_clf #
#
# # Bagging -- Building an ensemble of classifiers from bootstrap samples # In[28]: Image(filename='./images/07_06.png', width=500) # In[29]: Image(filename='./images/07_07.png', width=400) # In[30]: import pandas as pd df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/' 'machine-learning-databases/wine/wine.data', header=None) df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'] # drop 1 class df_wine = df_wine[df_wine['Class label'] != 1] y = df_wine['Class label'].values X = df_wine[['Alcohol', 'Hue']].values # In[31]: from sklearn.preprocessing import LabelEncoder if Version(sklearn_version) < '0.18': from sklearn.cross_validation import train_test_split else: from sklearn.model_selection import train_test_split le = LabelEncoder() y = le.fit_transform(y) X_train, X_test, y_train, y_test =\ train_test_split(X, y, test_size=0.40, random_state=1) # In[32]: from sklearn.ensemble import BaggingClassifier from sklearn.tree import DecisionTreeClassifier tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1) bag = BaggingClassifier(base_estimator=tree, n_estimators=500, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=1) # In[33]: from sklearn.metrics import accuracy_score tree = tree.fit(X_train, y_train) y_train_pred = tree.predict(X_train) y_test_pred = tree.predict(X_test) tree_train = accuracy_score(y_train, y_train_pred) tree_test = accuracy_score(y_test, y_test_pred) print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test)) bag = bag.fit(X_train, y_train) y_train_pred = bag.predict(X_train) y_test_pred = bag.predict(X_test) bag_train = accuracy_score(y_train, y_train_pred) bag_test = accuracy_score(y_test, y_test_pred) print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test)) # In[34]: import numpy as np import matplotlib.pyplot as plt x_min = X_train[:, 0].min() - 1 x_max = X_train[:, 0].max() + 1 y_min = X_train[:, 1].min() - 1 y_max = X_train[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row', figsize=(8, 3)) for idx, clf, tt in zip([0, 1], [tree, bag], ['Decision Tree', 'Bagging']): clf.fit(X_train, y_train) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) axarr[idx].contourf(xx, yy, Z, alpha=0.3) axarr[idx].scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], c='blue', marker='^') axarr[idx].scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], c='red', marker='o') axarr[idx].set_title(tt) axarr[0].set_ylabel('Alcohol', fontsize=12) plt.text(10.2, -1.2, s='Hue', ha='center', va='center', fontsize=12) plt.tight_layout() # plt.savefig('./figures/bagging_region.png', # dpi=300, # bbox_inches='tight') plt.show() #
#
# # Leveraging weak learners via adaptive boosting # In[35]: Image(filename='./images/07_09.png', width=400) # In[36]: Image(filename='./images/07_10.png', width=500) # In[37]: from sklearn.ensemble import AdaBoostClassifier tree = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0) ada = AdaBoostClassifier(base_estimator=tree, n_estimators=500, learning_rate=0.1, random_state=0) # In[38]: tree = tree.fit(X_train, y_train) y_train_pred = tree.predict(X_train) y_test_pred = tree.predict(X_test) tree_train = accuracy_score(y_train, y_train_pred) tree_test = accuracy_score(y_test, y_test_pred) print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test)) ada = ada.fit(X_train, y_train) y_train_pred = ada.predict(X_train) y_test_pred = ada.predict(X_test) ada_train = accuracy_score(y_train, y_train_pred) ada_test = accuracy_score(y_test, y_test_pred) print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test)) # In[39]: x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1 y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3)) for idx, clf, tt in zip([0, 1], [tree, ada], ['Decision Tree', 'AdaBoost']): clf.fit(X_train, y_train) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) axarr[idx].contourf(xx, yy, Z, alpha=0.3) axarr[idx].scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], c='blue', marker='^') axarr[idx].scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], c='red', marker='o') axarr[idx].set_title(tt) axarr[0].set_ylabel('Alcohol', fontsize=12) plt.text(10.2, -1.2, s='Hue', ha='center', va='center', fontsize=12) plt.tight_layout() # plt.savefig('./figures/adaboost_region.png', # dpi=300, # bbox_inches='tight') plt.show() #
#
# # Summary # ...