In averaging methods, the driving principle is to build several estimators independently and then to average their predictions. On average, the combined estimator is usually better than any single base estimator because its variance is reduced. Examples: Bagging methods, Forests of randomized trees, …
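To make the variance claim concrete, here is a minimal synthetic sketch (plain NumPy; the noise level and the 25-estimator count are arbitrary choices). Each "estimator" predicts the true value plus independent noise, so averaging 25 of them shrinks the variance by roughly a factor of 25.
import numpy as np
rng = np.random.RandomState(0)
# 10,000 trials; in each, 25 independent "estimators" predict the true value 1.0 plus unit-variance noise
predictions = 1.0 + rng.normal(0, 1, size=(10000, 25))
print("variance of a single estimator:      ", predictions[:, 0].var())     # ~1.0
print("variance of the 25-estimator average:", predictions.mean(axis=1).var())  # ~1/25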
By contrast, in boosting methods, base estimators are built sequentially and one tries to reduce the bias of the combined estimator. The motivation is to combine several weak models to produce a powerful ensemble. Examples: AdaBoost, Gradient Tree Boosting, …
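And a minimal sketch of the boosting idea on synthetic data (the dataset and estimator counts are arbitrary): a single depth-1 tree ("stump") is a weak learner, and AdaBoost, whose default base estimator in scikit-learn is exactly such a stump, combines many of them into a noticeably stronger model.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
Xd, yd = make_classification(n_samples=1000, n_features=20, random_state=0)
stump = DecisionTreeClassifier(max_depth=1)                     # a single weak learner
boosted = AdaBoostClassifier(n_estimators=100, random_state=0)  # boosts 100 such stumps sequentially
print("lone stump:", cross_val_score(stump, Xd, yd, cv=5).mean())
print("boosted:   ", cross_val_score(boosted, Xd, yd, cv=5).mean())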
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
A modified version of the "Titanic" data can be found at: http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv. The original, unmodified Titanic data is available at CMU StatLib.
url = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv"
titanic = pd.read_csv(url)
titanic.head(10)
titanic.describe(include="all")
titanic[titanic.age.isnull()].shape
age_mean = titanic.age.mean()
titanic['age'] = titanic['age'].fillna(age_mean)  # impute missing ages with the mean (chained inplace fillna is deprecated in recent pandas)
titanic.dropna(axis=0, inplace=True)
titanic.shape
titanic.set_index('pid', drop=True, inplace=True)
titanic.head()
titanic.describe().T
titanic_ssf = pd.get_dummies(titanic)
titanic_ssf.head(10)
titanic_names = titanic_ssf.columns.values
titanic_names
y = titanic_ssf['survived']
X = titanic_ssf[titanic_names[1:]]
X.head()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)
from sklearn import metrics
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='gini')
dt = dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=False)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, random_state=33)
rf = rf.fit(X_train, y_train)
measure_performance(X_test, y_test, rf, show_confusion_matrix=False, show_classification_report=False)
print(rf.get_params())
from sklearn.model_selection import KFold
def calc_params(X, y, clf, param_values, param_name, K):
    # convert input to NumPy arrays
    X = np.array(X)
    y = np.array(y)
    # initialize training and testing score arrays with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))
    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        # set classifier parameters
        clf.set_params(**{param_name: param_value})
        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)
        # create KFold cross validation
        cv = KFold(n_splits=K, shuffle=True, random_state=0)
        # iterate over the K folds
        for j, (train, test) in enumerate(cv.split(X)):
            # fit the classifier in the corresponding fold
            # and obtain the corresponding accuracy scores on train and test sets
            clf.fit(X[train], y[train])
            k_train_scores[j] = clf.score(X[train], y[train])
            k_test_scores[j] = clf.score(X[test], y[test])
        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)
        print(param_name, '=', param_value, "Train =", train_scores[i], "Test =", test_scores[i])
    # plot the mean training and cross-validation scores for each parameter value
    plt.plot(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.plot(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")
    # return the training and testing scores for each parameter value
    return train_scores, test_scores
msl = range(1,6)
print(list(msl))
rf = RandomForestClassifier(n_estimators=10, random_state=33)
train_scores, test_scores = calc_params(X_train, y_train, rf, msl, 'min_samples_leaf', 5)
m_depth = [1, 2, 3, 4, 5, 6, 7, 8]
rf = RandomForestClassifier(n_estimators=10, random_state=33)
train_scores, test_scores = calc_params(X_train, y_train, rf, m_depth, 'max_depth', 5)
nest = range(5, 101, 5)
print(list(nest))
rf = RandomForestClassifier(n_estimators=10, random_state=33)
train_scores, test_scores = calc_params(X_train, y_train, rf, nest, 'n_estimators', 5)
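The three parameters above were tuned one at a time; they can also be searched jointly, exactly as is done for AdaBoost further below. A minimal self-contained sketch with GridSearchCV (the grid values are an illustrative subset of the ranges explored above):
from sklearn.model_selection import GridSearchCV
rf_grid = {
    'n_estimators': [10, 25, 50],
    'min_samples_leaf': [1, 3, 5],
    'max_depth': [3, 4, 6],
}
gs_rf = GridSearchCV(RandomForestClassifier(random_state=33), rf_grid, cv=5)
gs_rf.fit(X_train, y_train)
gs_rf.best_params_, gs_rf.best_score_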
rf = RandomForestClassifier(n_estimators=25, min_samples_leaf=3, max_depth=4)
rf = rf.fit(X_train, y_train)
measure_performance(X_test, y_test, rf, show_confusion_matrix=False, show_classification_report=False)
rf.feature_importances_
def plot_feature_importances(model, n_features, feature_names):
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
features = titanic_names[1:]
plot_feature_importances(rf, len(features), features)
rf.estimators_[:3]
fig, ax = plt.subplots(figsize=(50,20))
tree.plot_tree(rf.estimators_[0], feature_names=features, class_names=["No","Yes"], filled=True, ax=ax);
import graphviz
from sklearn.tree import export_graphviz
dot_data = export_graphviz(rf.estimators_[0],out_file=None, feature_names=features,
class_names=["No","Yes"], filled=True, rotate=True)
graph = graphviz.Source(dot_data)
graph
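If graphviz is not available, scikit-learn's export_text renders the same tree as plain text; a minimal sketch, truncated to the top two levels:
from sklearn.tree import export_text
print(export_text(rf.estimators_[0], feature_names=list(features), max_depth=2))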
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier()
ab = ab.fit(X_train, y_train)
measure_performance(X_test, y_test, ab, show_confusion_matrix=False, show_classification_report=False)
train_scores, test_scores = calc_params(X_train, y_train, ab, nest, 'n_estimators', 5)
ab = AdaBoostClassifier(n_estimators=10)
ab = ab.fit(X_train, y_train)
measure_performance(X_test, y_test, ab, show_confusion_matrix=False, show_classification_report=False)
lr = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0, 1.3, 1.5, 1.8, 2.0]
ab = AdaBoostClassifier()
train_scores, test_scores = calc_params(X_train, y_train, ab, lr, 'learning_rate', 5)
ab = AdaBoostClassifier(learning_rate=1.3)
ab = ab.fit(X_train, y_train)
measure_performance(X_test, y_test, ab, show_confusion_matrix=False, show_classification_report=False)
from sklearn.model_selection import GridSearchCV
ab = AdaBoostClassifier()
parameters = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0, 1.3, 1.5, 1.8, 2.0],
    'n_estimators': range(5, 101, 5),
}
gs = GridSearchCV(ab, parameters, cv=3)
%time _ = gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
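A small shortcut worth noting: GridSearchCV refits the best configuration on the whole training set by default (refit=True), so the tuned model can be evaluated directly via best_estimator_, without retraining it by hand as done in the next cell.
measure_performance(X_test, y_test, gs.best_estimator_, show_confusion_matrix=False, show_classification_report=False)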
ab = AdaBoostClassifier(n_estimators=20, learning_rate=1.3)
ab = ab.fit(X_train, y_train)
measure_performance(X_test, y_test, ab, show_confusion_matrix=False, show_classification_report=False)
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=45, learning_rate=1.0,
max_depth=3, random_state=0).fit(X_train, y_train)
gb.score(X_test, y_test)
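The calc_params helper defined above can be reused to see how sensitive gradient boosting is to the learning rate (a sketch; the candidate values below are arbitrary):
gb = GradientBoostingClassifier(n_estimators=45, max_depth=3, random_state=0)
train_scores, test_scores = calc_params(X_train, y_train, gb, [0.01, 0.05, 0.1, 0.5, 1.0], 'learning_rate', 5)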
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5,
n_estimators=25, random_state=5)
bagging.fit(X_train, y_train)
bagging.score(X_test, y_test)
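Because each bagged estimator is trained on a bootstrap sample, the rows it never saw ("out-of-bag" rows) provide a built-in validation estimate. A minimal sketch using the oob_score option (same hyperparameters as above):
bagging_oob = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5,
                                n_estimators=25, random_state=5, oob_score=True)
bagging_oob.fit(X_train, y_train)
bagging_oob.oob_score_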