by Alejandro Correa Bahnsen and Jesus Solano
version 1.5, February 2019
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Special thanks go to Kevin Markham.
Why are we learning about ensembling?
Students will be able to:
import pandas as pd
import numpy as np
# read in the data
url = 'https://raw.githubusercontent.com/albahnsen/PracticalMachineLearningClass/master/datasets/hitters.csv'
hitters = pd.read_csv(url)
# remove rows with missing values
hitters.dropna(inplace=True)
hitters.head()
AtBat | Hits | HmRun | Runs | RBI | Walks | Years | CAtBat | CHits | CHmRun | CRuns | CRBI | CWalks | League | Division | PutOuts | Assists | Errors | Salary | NewLeague | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 315 | 81 | 7 | 24 | 38 | 39 | 14 | 3449 | 835 | 69 | 321 | 414 | 375 | N | W | 632 | 43 | 10 | 475.0 | N |
2 | 479 | 130 | 18 | 66 | 72 | 76 | 3 | 1624 | 457 | 63 | 224 | 266 | 263 | A | W | 880 | 82 | 14 | 480.0 | A |
3 | 496 | 141 | 20 | 65 | 78 | 37 | 11 | 5628 | 1575 | 225 | 828 | 838 | 354 | N | E | 200 | 11 | 3 | 500.0 | N |
4 | 321 | 87 | 10 | 39 | 42 | 30 | 2 | 396 | 101 | 12 | 48 | 46 | 33 | N | E | 805 | 40 | 4 | 91.5 | N |
5 | 594 | 169 | 4 | 74 | 51 | 35 | 11 | 4408 | 1133 | 19 | 501 | 336 | 194 | A | W | 282 | 421 | 25 | 750.0 | A |
# encode categorical variables as integers
hitters['League'] = pd.factorize(hitters.League)[0]
hitters['Division'] = pd.factorize(hitters.Division)[0]
hitters['NewLeague'] = pd.factorize(hitters.NewLeague)[0]
hitters.head()
AtBat | Hits | HmRun | Runs | RBI | Walks | Years | CAtBat | CHits | CHmRun | CRuns | CRBI | CWalks | League | Division | PutOuts | Assists | Errors | Salary | NewLeague | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 315 | 81 | 7 | 24 | 38 | 39 | 14 | 3449 | 835 | 69 | 321 | 414 | 375 | 0 | 0 | 632 | 43 | 10 | 475.0 | 0 |
2 | 479 | 130 | 18 | 66 | 72 | 76 | 3 | 1624 | 457 | 63 | 224 | 266 | 263 | 1 | 0 | 880 | 82 | 14 | 480.0 | 1 |
3 | 496 | 141 | 20 | 65 | 78 | 37 | 11 | 5628 | 1575 | 225 | 828 | 838 | 354 | 0 | 1 | 200 | 11 | 3 | 500.0 | 0 |
4 | 321 | 87 | 10 | 39 | 42 | 30 | 2 | 396 | 101 | 12 | 48 | 46 | 33 | 0 | 1 | 805 | 40 | 4 | 91.5 | 0 |
5 | 594 | 169 | 4 | 74 | 51 | 35 | 11 | 4408 | 1133 | 19 | 501 | 336 | 194 | 1 | 0 | 282 | 421 | 25 | 750.0 | 1 |
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# scatter plot of Years versus Hits colored by Salary
hitters.plot(kind='scatter', x='Years', y='Hits', c='Salary', colormap='jet', xlim=(0, 25), ylim=(0, 250))
# define features: exclude career statistics (which start with "C") and the response (Salary)
feature_cols = hitters.columns[~hitters.columns.str.startswith('C')].drop('Salary')
feature_cols
Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'League', 'Division', 'PutOuts', 'Assists', 'Errors', 'NewLeague'], dtype='object')
hitters.Salary.describe()
count     263.000000
mean      535.925882
std       451.118681
min        67.500000
25%       190.000000
50%       425.000000
75%       750.000000
max      2460.000000
Name: Salary, dtype: float64
# define X and y
X = hitters[feature_cols]
y = (hitters.Salary > 425).astype(int)  # 425 is the median Salary, so the classes are roughly balanced
X.columns
Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'League', 'Division', 'PutOuts', 'Assists', 'Errors', 'NewLeague'], dtype='object')
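Below, the hyperparameters for a decision tree built from scratch are fixed: num_pct is the number of percentile-based candidate split points per feature, min_gain is the minimum Gini gain required to keep splitting, and max_depth limits the depth of the tree (None means no limit).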
max_depth = None
num_pct = 10
max_features = None
min_gain = 0.001
For feature 1 (Hits), calculate the possible splitting points.
j = 1
print(X.columns[j])
Hits
# Split the variable into num_pct percentile-based candidate points
splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / num_pct).tolist())
# Keep only unique values (handles binary and low-cardinality features)
splits = np.unique(splits)
splits
array([ 1. , 52. , 66.8, 77. , 92. , 103. , 120. , 136. , 148.6, 168. ])
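These candidate thresholds are simply the deciles (the 0th through 90th percentiles) of Hits.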
Split the data using split k = 5.
k = 5
filter_l = X.iloc[:, j] < splits[k]
y_l = y.loc[filter_l]
y_r = y.loc[~filter_l]
The Gini impurity of a node is the probability that a randomly chosen sample in the node would be incorrectly labeled if it were labeled randomly according to the distribution of labels in the node.
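In general $G = 1 - \sum_k p_k^2$ over the class proportions $p_k$; for a binary target with positive rate $p$ this reduces to $1 - \big(p^2 + (1-p)^2\big)$. For example, a node with 40% positives has $G = 1 - (0.4^2 + 0.6^2) = 0.48$.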
The helper below computes this impurity for each node:
def gini(y):
    # Gini impurity of a set of binary labels
    if y.shape[0] == 0:
        return 0
    else:
        return 1 - (y.mean()**2 + (1 - y.mean())**2)
gini_l = gini(y_l)
gini_l
0.39928079856159704
gini_r = gini(y_r)
gini_r
0.42690311418685123
The Gini gain of a split is the parent node's Gini impurity minus the impurity of each child node weighted by the fraction of the parent's samples that fall into it: $\Delta = G_{parent} - \big(\tfrac{n_l}{n_l+n_r} G_l + \tfrac{n_r}{n_l+n_r} G_r\big)$, which is what gini_impurity below returns.
def gini_impurity(X_col, y, split):
    "Calculate the Gini gain of splitting X_col at the given threshold"
    filter_l = X_col < split
    y_l = y.loc[filter_l]
    y_r = y.loc[~filter_l]
    n_l = y_l.shape[0]
    n_r = y_r.shape[0]
    gini_y = gini(y)
    gini_l = gini(y_l)
    gini_r = gini(y_r)
    gini_impurity_ = gini_y - (n_l / (n_l + n_r) * gini_l + n_r / (n_l + n_r) * gini_r)
    return gini_impurity_
gini_impurity(X.iloc[:, j], y, splits[k])
0.0862547016583845
def best_split(X, y, num_pct=10):
    features = range(X.shape[1])
    best_split = [0, 0, 0]  # j, split, gain
    # For all features
    for j in features:
        splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / (num_pct+1)).tolist())
        splits = np.unique(splits)[1:]
        # For all splits
        for split in splits:
            gain = gini_impurity(X.iloc[:, j], y, split)
            if gain > best_split[2]:
                best_split = [j, split, gain]
    return best_split
j, split, gain = best_split(X, y, 5)
j, split, gain
(6, 6.0, 0.1428365268140297)
filter_l = X.iloc[:, j] < split
y_l = y.loc[filter_l]
y_r = y.loc[~filter_l]
y.shape[0], y_l.shape[0], y_r.shape[0]
(263, 116, 147)
y.mean(), y_l.mean(), y_r.mean()
(0.49049429657794674, 0.1896551724137931, 0.7278911564625851)
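So the best first split is Years < 6: about 19% of players with fewer than 6 years of experience earn an above-median salary, versus about 73% of the more experienced players.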
def tree_grow(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10):
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    # Calculate the best split
    j, split, gain = best_split(X, y, num_pct)
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5)
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    # Check stopping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree
    # No stopping criterion was met, so continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]
    # Next iteration to each split
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    return tree
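A note on the leaf estimates: the Laplace correction (y.sum() + 1) / (n + 2) adds one pseudo-count per class, so small leaves never report a probability of exactly 0 or 1.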
tree_grow(X, y, level=0, min_gain=0.001, max_depth=1, num_pct=10)
{'y_pred': 0, 'y_prob': 0.49056603773584906, 'level': 0, 'split': [6, 5.0], 'n_samples': 263, 'gain': 0.15865574114903452, 'sl': {'y_pred': 0, 'y_prob': 0.10869565217391304, 'level': 1, 'split': -1, 'n_samples': 90, 'gain': 0.01935558112773289}, 'sr': {'y_pred': 1, 'y_prob': 0.6914285714285714, 'level': 1, 'split': -1, 'n_samples': 173, 'gain': 0.1127122881295256}}
tree = tree_grow(X, y, level=0, min_gain=0.001, max_depth=3, num_pct=10)
tree
{'y_pred': 0, 'y_prob': 0.49056603773584906, 'level': 0, 'split': [6, 5.0], 'n_samples': 263, 'gain': 0.15865574114903452, 'sl': {'y_pred': 0, 'y_prob': 0.10869565217391304, 'level': 1, 'split': [5, 65.0], 'n_samples': 90, 'gain': 0.01935558112773289, 'sl': {'y_pred': 0, 'y_prob': 0.07407407407407407, 'level': 2, 'split': [0, 185.0], 'n_samples': 79, 'gain': 0.009619566461418955, 'sl': {'y_pred': 0, 'y_prob': 0.3333333333333333, 'level': 3, 'split': -1, 'n_samples': 7, 'gain': 0.40816326530612246}, 'sr': {'y_pred': 0, 'y_prob': 0.05405405405405406, 'level': 3, 'split': -1, 'n_samples': 72, 'gain': 0.009027777777777565}}, 'sr': {'y_pred': 0, 'y_prob': 0.38461538461538464, 'level': 2, 'split': [0, 470.90909090909093], 'n_samples': 11, 'gain': 0.2203856749311295, 'sl': {'y_pred': 0, 'y_prob': 0.14285714285714285, 'level': 3, 'split': -1, 'n_samples': 5, 'gain': 0}, 'sr': {'y_pred': 1, 'y_prob': 0.625, 'level': 3, 'split': -1, 'n_samples': 6, 'gain': 0.4444444444444444}}}, 'sr': {'y_pred': 1, 'y_prob': 0.6914285714285714, 'level': 1, 'split': [1, 103.0], 'n_samples': 173, 'gain': 0.1127122881295256, 'sl': {'y_pred': 0, 'y_prob': 0.43037974683544306, 'level': 2, 'split': [5, 22.0], 'n_samples': 77, 'gain': 0.07695385846646363, 'sl': {'y_pred': 0, 'y_prob': 0.17857142857142858, 'level': 3, 'split': -1, 'n_samples': 26, 'gain': 0.06860475087899842}, 'sr': {'y_pred': 1, 'y_prob': 0.5660377358490566, 'level': 3, 'split': -1, 'n_samples': 51, 'gain': 0.09501691508611931}}, 'sr': {'y_pred': 1, 'y_prob': 0.8979591836734694, 'level': 2, 'split': [2, 6.0], 'n_samples': 96, 'gain': 0.01107413837448551, 'sl': {'y_pred': 1, 'y_prob': 0.7058823529411765, 'level': 3, 'split': -1, 'n_samples': 15, 'gain': 0.16547008547008554}, 'sr': {'y_pred': 1, 'y_prob': 0.927710843373494, 'level': 3, 'split': -1, 'n_samples': 81, 'gain': 0.006994315787586275}}}}
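The nested dictionary is hard to read. As an optional aid (not part of the original notebook), a small recursive helper can print it as an indented tree; it assumes only the keys produced by tree_grow above.
def tree_print(tree, feature_names, indent=''):
    # Leaf nodes store split=-1; internal nodes store split=[j, threshold]
    if tree['split'] == -1:
        print(f"{indent}leaf: y_pred={tree['y_pred']}, y_prob={tree['y_prob']:.3f}, n={tree['n_samples']}")
    else:
        j, split = tree['split']
        print(f"{indent}{feature_names[j]} < {split:g}  (n={tree['n_samples']})")
        tree_print(tree['sl'], feature_names, indent + '    ')
        tree_print(tree['sr'], feature_names, indent + '    ')
tree_print(tree, X.columns)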
def tree_predict(X, tree, proba=False):
    predicted = np.ones(X.shape[0])
    # Check if final node
    if tree['split'] == -1:
        if not proba:
            predicted = predicted * tree['y_pred']
        else:
            predicted = predicted * tree['y_prob']
    else:
        j, split = tree['split']
        filter_l = (X.iloc[:, j] < split)
        X_l = X.loc[filter_l]
        X_r = X.loc[~filter_l]
        if X_l.shape[0] == 0:  # If left node is empty only continue with right
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
        elif X_r.shape[0] == 0:  # If right node is empty only continue with left
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
        else:
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
    return predicted
tree_predict(X, tree)
array([1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1.])
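As a quick sanity check (not in the original notebook), these predictions can be compared against y to get the training accuracy of the hand-rolled tree. This is an optimistic estimate, since it is measured on the training data.
# training accuracy of the manual depth-3 tree
y_pred_manual = tree_predict(X, tree)
print('Training accuracy:', (y_pred_manual == y.values).mean())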
# list of values to try for max_depth
max_depth_range = range(1, 21)
# list to store the average accuracy for each value of max_depth
accuracy_scores = []
# use 10-fold cross-validation with each value of max_depth
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
for depth in max_depth_range:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=1)
    accuracy_scores.append(cross_val_score(clf, X, y, cv=10, scoring='accuracy').mean())
# plot max_depth (x-axis) versus accuracy (y-axis)
plt.plot(max_depth_range, accuracy_scores)
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
# show the best accuracy and the corresponding max_depth
sorted(zip(accuracy_scores, max_depth_range))[::-1][0]
(0.8205754985754986, 4)
# max_depth=4 was best, so fit a tree using that parameter
clf = DecisionTreeClassifier(max_depth=4, random_state=1)
clf.fit(X, y)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=1, splitter='best')
# compute feature importances
pd.DataFrame({'feature':feature_cols, 'importance':clf.feature_importances_}).sort_values('importance')
feature | importance | |
---|---|---|
0 | AtBat | 0.000000 |
7 | League | 0.000000 |
8 | Division | 0.000000 |
10 | Assists | 0.000000 |
11 | Errors | 0.000000 |
12 | NewLeague | 0.000000 |
9 | PutOuts | 0.006048 |
2 | HmRun | 0.010841 |
4 | RBI | 0.012073 |
3 | Runs | 0.021020 |
5 | Walks | 0.103473 |
1 | Hits | 0.298269 |
6 | Years | 0.548277 |
pd.Series(cross_val_score(clf, X, y, cv=10)).describe()
count    10.000000
mean      0.820575
std       0.083007
min       0.692308
25%       0.751781
50%       0.830484
75%       0.879630
max       0.923077
dtype: float64
Random Forests are a slight variation of bagged trees that typically achieve even better performance:
What's the point?
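To make the idea concrete before switching to scikit-learn, here is a minimal from-scratch sketch that reuses tree_grow and tree_predict from above: each tree sees a bootstrap sample of the rows and a random subset of the columns, and the forest predicts by majority vote. The settings n_trees=20, n_sub_features=6 and max_depth=3 are arbitrary choices for illustration, and scikit-learn actually re-draws the feature subset at every split rather than once per tree.
# Illustrative from-scratch random forest (assumed settings, for intuition only)
n_trees = 20
n_sub_features = 6
rng = np.random.RandomState(1)
forest = []
for _ in range(n_trees):
    # bootstrap sample of the rows (sampling with replacement)
    boot_idx = rng.choice(X.shape[0], size=X.shape[0], replace=True)
    # random subset of the columns, a new one for each tree
    col_idx = rng.choice(X.shape[1], size=n_sub_features, replace=False)
    X_boot = X.iloc[boot_idx, col_idx].reset_index(drop=True)
    y_boot = y.iloc[boot_idx].reset_index(drop=True)
    forest.append((col_idx, tree_grow(X_boot, y_boot, max_depth=3)))
# majority vote: average the individual tree predictions and threshold at 0.5
votes = np.mean([tree_predict(X.iloc[:, cols], t) for cols, t in forest], axis=0)
y_pred_forest = (votes >= 0.5).astype(int)
print('Training accuracy of the from-scratch forest:', (y_pred_forest == y.values).mean())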
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
pd.Series(cross_val_score(clf, X, y, cv=10)).describe()
count    10.000000
mean      0.817749
std       0.062496
min       0.703704
25%       0.783333
50%       0.811254
75%       0.846154
max       0.923077
dtype: float64
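With the default 10 trees, the forest's cross-validated accuracy (about 0.818) is roughly on par with the tuned single tree (about 0.821); a random forest typically needs more trees and some tuning before its advantage shows.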
One important tuning parameter is n_estimators, which is the number of trees that should be grown. It should be a large enough value that the error seems to have "stabilized".
# list of values to try for n_estimators
estimator_range = range(10, 310, 10)
# list to store the average Accuracy for each value of n_estimators
accuracy_scores = []
# use 5-fold cross-validation with each value of n_estimators (WARNING: SLOW!)
for estimator in estimator_range:
    clf = RandomForestClassifier(n_estimators=estimator, random_state=1, n_jobs=-1)
    accuracy_scores.append(cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean())
plt.plot(estimator_range, accuracy_scores)
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
The other important tuning parameter is max_features, which is the number of features that should be considered at each split.
# list of values to try for max_features
feature_range = range(1, len(feature_cols)+1)
# list to store the average Accuracy for each value of max_features
accuracy_scores = []
# use 5-fold cross-validation with each value of max_features (WARNING: SLOW!)
for feature in feature_range:
    clf = RandomForestClassifier(n_estimators=200, max_features=feature, random_state=1, n_jobs=-1)
    accuracy_scores.append(cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean())
plt.plot(feature_range, accuracy_scores)
plt.xlabel('max_features')
plt.ylabel('Accuracy')
# max_features=6 is best and n_estimators=200 is sufficiently large
clf = RandomForestClassifier(n_estimators=200, max_features=6, random_state=1, n_jobs=-1)
clf.fit(X, y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1, oob_score=False, random_state=1, verbose=0, warm_start=False)
# compute feature importances
pd.DataFrame({'feature':feature_cols, 'importance':clf.feature_importances_}).sort_values('importance')
feature | importance | |
---|---|---|
8 | Division | 0.006081 |
7 | League | 0.008834 |
12 | NewLeague | 0.009709 |
11 | Errors | 0.032638 |
10 | Assists | 0.040503 |
2 | HmRun | 0.047118 |
9 | PutOuts | 0.051506 |
0 | AtBat | 0.078822 |
3 | Runs | 0.080185 |
5 | Walks | 0.082160 |
4 | RBI | 0.091048 |
1 | Hits | 0.132156 |
6 | Years | 0.339239 |
Advantages of Random Forests:
Disadvantages of Random Forests: