Improving a gradient boosting classifier¶

In [32]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
sns.set_style('whitegrid')

In [15]:

df = pd.read_csv((
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv")).dropna()
df.head()

Out[15]:

	cntry	idno	year	tvtot	ppltrst	pplfair	pplhlp	happy	sclmeet	sclact	gndr	agea	partner
0	CH	5.0	6	3.0	3.0	10.0	5.0	8.0	5.0	4.0	2.0	60.0	1.0
1	CH	25.0	6	6.0	5.0	7.0	5.0	9.0	3.0	2.0	2.0	59.0	1.0
2	CH	26.0	6	1.0	8.0	8.0	8.0	7.0	6.0	3.0	1.0	24.0	2.0
3	CH	28.0	6	4.0	6.0	6.0	7.0	10.0	6.0	2.0	2.0	64.0	1.0
4	CH	29.0	6	5.0	6.0	7.0	5.0	8.0	7.0	2.0	2.0	55.0	1.0

In [3]:

df.cntry.value_counts()

Out[3]:

ES    2292
SE    1726
CH    1475
NO    1420
CZ    1207
DE      27
Name: cntry, dtype: int64

In [16]:

df['partner'] = df.partner - 1
df['gndr'] = df.gndr - 1

Notes: Dummies¶

We can generate dummy features for the countries listed.

In [17]:

df = pd.concat([df, pd.get_dummies(df.cntry, prefix='cntry')], 1).drop('cntry', 1)

In [18]:

df.head()

Out[18]:

	idno	year	tvtot	ppltrst	pplfair	pplhlp	happy	sclmeet	sclact	gndr	agea	partner	cntry_CH
0	5.0	6	3.0	3.0	10.0	5.0	8.0	5.0	4.0	1.0	60.0	0.0	1
1	25.0	6	6.0	5.0	7.0	5.0	9.0	3.0	2.0	1.0	59.0	0.0	1
2	26.0	6	1.0	8.0	8.0	8.0	7.0	6.0	3.0	0.0	24.0	1.0	1
3	28.0	6	4.0	6.0	6.0	7.0	10.0	6.0	2.0	1.0	64.0	0.0	1
4	29.0	6	5.0	6.0	7.0	5.0	8.0	7.0	2.0	1.0	55.0	0.0	1

In [20]:

X = df.drop('partner', 1)
y = df.partner

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

GridSearchCV¶

Let's try several hyperparameters and see how much the predictions react to them.¶

In [62]:

params = {'learning_rate': [0.1, 0.33], 'subsample': [0.25, 0.33, 1], 
          'n_estimators': [100, 300], 'max_depth': [2, 3, 4]}
gbc_clf = GradientBoostingClassifier()
search = GridSearchCV(gbc_clf, params, n_jobs=-1)
search.fit(X_train, y_train)

Out[62]:

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.33], 'subsample': [0.25, 0.33, 1], 'n_estimators': [100, 300], 'max_depth': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [63]:

search.best_params_

Out[63]:

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33}

In [64]:

search.best_score_

Out[64]:

0.7552554856529078

In [65]:

search.grid_scores_

Out[65]:

[mean: 0.75004, std: 0.00237, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.75234, std: 0.00388, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.74988, std: 0.00212, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 1},
 mean: 0.74467, std: 0.00289, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.25},
 mean: 0.75203, std: 0.00571, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.33},
 mean: 0.75326, std: 0.00310, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 1},
 mean: 0.75403, std: 0.00163, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.75526, std: 0.00218, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.75464, std: 0.00060, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1},
 mean: 0.74344, std: 0.00278, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.25},
 mean: 0.74482, std: 0.00281, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.33},
 mean: 0.74835, std: 0.00285, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1},
 mean: 0.74866, std: 0.00566, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.74820, std: 0.00175, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.75433, std: 0.00199, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 1},
 mean: 0.72764, std: 0.00338, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.25},
 mean: 0.73316, std: 0.00619, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.33},
 mean: 0.74513, std: 0.00151, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1},
 mean: 0.73669, std: 0.00861, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.74467, std: 0.00174, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.75295, std: 0.00322, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 100, 'subsample': 1},
 mean: 0.72257, std: 0.00256, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.25},
 mean: 0.72871, std: 0.00255, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.33},
 mean: 0.74052, std: 0.00430, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 300, 'subsample': 1},
 mean: 0.72595, std: 0.00484, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.73408, std: 0.00324, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.74083, std: 0.00423, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1},
 mean: 0.70615, std: 0.00621, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.25},
 mean: 0.70953, std: 0.00718, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.33},
 mean: 0.72457, std: 0.00707, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1},
 mean: 0.71106, std: 0.00643, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.72334, std: 0.00245, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.73792, std: 0.00168, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 100, 'subsample': 1},
 mean: 0.69526, std: 0.00864, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.25},
 mean: 0.68513, std: 0.00250, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.33},
 mean: 0.72625, std: 0.00526, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1}]

In [66]:

new_gbc_clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.33)
new_gbc_clf.fit(X_train, y_train).score(X_test, y_test)

Out[66]:

0.7429447852760737

In [67]:

confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))

Out[67]:

array([[918, 106],
       [309, 297]], dtype=int64)

In [68]:

print('Type I error: {}'.format(confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))[0][1] / len(y_test)))
print('Type II error: {}'.format(confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))[1][0] / len(y_test)))

Type I error: 0.06134969325153374
Type II error: 0.19631901840490798

Notes: GridSearchCV¶

After trying several different hyperparameters, the scores don't seem to be changing very much at all - they sit at roughly 0.70 to 0.75. We picked the best results it returned.