import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
%matplotlib inline
sns.set_style('whitegrid')
df = pd.read_csv((
"https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
"master/ESS_practice_data/ESSdata_Thinkful.csv")).dropna()
df.head()
cntry | idno | year | tvtot | ppltrst | pplfair | pplhlp | happy | sclmeet | sclact | gndr | agea | partner | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CH | 5.0 | 6 | 3.0 | 3.0 | 10.0 | 5.0 | 8.0 | 5.0 | 4.0 | 2.0 | 60.0 | 1.0 |
1 | CH | 25.0 | 6 | 6.0 | 5.0 | 7.0 | 5.0 | 9.0 | 3.0 | 2.0 | 2.0 | 59.0 | 1.0 |
2 | CH | 26.0 | 6 | 1.0 | 8.0 | 8.0 | 8.0 | 7.0 | 6.0 | 3.0 | 1.0 | 24.0 | 2.0 |
3 | CH | 28.0 | 6 | 4.0 | 6.0 | 6.0 | 7.0 | 10.0 | 6.0 | 2.0 | 2.0 | 64.0 | 1.0 |
4 | CH | 29.0 | 6 | 5.0 | 6.0 | 7.0 | 5.0 | 8.0 | 7.0 | 2.0 | 2.0 | 55.0 | 1.0 |
df.cntry.value_counts()
ES 2292 SE 1726 CH 1475 NO 1420 CZ 1207 DE 27 Name: cntry, dtype: int64
df['partner'] = df.partner - 1
df['gndr'] = df.gndr - 1
We can generate dummy features for the countries listed.
df = pd.concat([df, pd.get_dummies(df.cntry, prefix='cntry')], 1).drop('cntry', 1)
df.head()
idno | year | tvtot | ppltrst | pplfair | pplhlp | happy | sclmeet | sclact | gndr | agea | partner | cntry_CH | cntry_CZ | cntry_DE | cntry_ES | cntry_NO | cntry_SE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5.0 | 6 | 3.0 | 3.0 | 10.0 | 5.0 | 8.0 | 5.0 | 4.0 | 1.0 | 60.0 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 25.0 | 6 | 6.0 | 5.0 | 7.0 | 5.0 | 9.0 | 3.0 | 2.0 | 1.0 | 59.0 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 26.0 | 6 | 1.0 | 8.0 | 8.0 | 8.0 | 7.0 | 6.0 | 3.0 | 0.0 | 24.0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 28.0 | 6 | 4.0 | 6.0 | 6.0 | 7.0 | 10.0 | 6.0 | 2.0 | 1.0 | 64.0 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 29.0 | 6 | 5.0 | 6.0 | 7.0 | 5.0 | 8.0 | 7.0 | 2.0 | 1.0 | 55.0 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 |
X = df.drop('partner', 1)
y = df.partner
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
params = {'learning_rate': [0.1, 0.33], 'subsample': [0.25, 0.33, 1],
'n_estimators': [100, 300], 'max_depth': [2, 3, 4]}
gbc_clf = GradientBoostingClassifier()
search = GridSearchCV(gbc_clf, params, n_jobs=-1)
search.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise', estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, presort='auto', random_state=None, subsample=1.0, verbose=0, warm_start=False), fit_params=None, iid=True, n_jobs=-1, param_grid={'learning_rate': [0.1, 0.33], 'subsample': [0.25, 0.33, 1], 'n_estimators': [100, 300], 'max_depth': [2, 3, 4]}, pre_dispatch='2*n_jobs', refit=True, return_train_score='warn', scoring=None, verbose=0)
search.best_params_
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33}
search.best_score_
0.7552554856529078
search.grid_scores_
[mean: 0.75004, std: 0.00237, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.25}, mean: 0.75234, std: 0.00388, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.33}, mean: 0.74988, std: 0.00212, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 1}, mean: 0.74467, std: 0.00289, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.25}, mean: 0.75203, std: 0.00571, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.33}, mean: 0.75326, std: 0.00310, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 1}, mean: 0.75403, std: 0.00163, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.25}, mean: 0.75526, std: 0.00218, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33}, mean: 0.75464, std: 0.00060, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}, mean: 0.74344, std: 0.00278, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.25}, mean: 0.74482, std: 0.00281, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.33}, mean: 0.74835, std: 0.00285, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1}, mean: 0.74866, std: 0.00566, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.25}, mean: 0.74820, std: 0.00175, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.33}, mean: 0.75433, std: 0.00199, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 1}, mean: 0.72764, std: 0.00338, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.25}, mean: 0.73316, std: 0.00619, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.33}, mean: 0.74513, std: 0.00151, params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1}, mean: 0.73669, std: 0.00861, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.25}, mean: 0.74467, std: 0.00174, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.33}, mean: 0.75295, std: 0.00322, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 100, 'subsample': 1}, mean: 0.72257, std: 0.00256, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.25}, mean: 0.72871, std: 0.00255, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.33}, mean: 0.74052, std: 0.00430, params: {'learning_rate': 0.33, 'max_depth': 2, 'n_estimators': 300, 'subsample': 1}, mean: 0.72595, std: 0.00484, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.25}, mean: 0.73408, std: 0.00324, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33}, mean: 0.74083, std: 0.00423, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}, mean: 0.70615, std: 0.00621, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.25}, mean: 0.70953, std: 0.00718, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.33}, mean: 0.72457, std: 0.00707, params: {'learning_rate': 0.33, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1}, mean: 0.71106, std: 0.00643, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.25}, mean: 0.72334, std: 0.00245, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.33}, mean: 0.73792, std: 0.00168, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 100, 'subsample': 1}, mean: 0.69526, std: 0.00864, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.25}, mean: 0.68513, std: 0.00250, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.33}, mean: 0.72625, std: 0.00526, params: {'learning_rate': 0.33, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1}]
new_gbc_clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.33)
new_gbc_clf.fit(X_train, y_train).score(X_test, y_test)
0.7429447852760737
confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))
array([[918, 106], [309, 297]], dtype=int64)
print('Type I error: {}'.format(confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))[0][1] / len(y_test)))
print('Type II error: {}'.format(confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))[1][0] / len(y_test)))
Type I error: 0.06134969325153374 Type II error: 0.19631901840490798
After trying several different hyperparameters, the scores don't seem to be changing very much at all - they sit at roughly 0.70 to 0.75. We picked the best results it returned.