Decision Trees with the Kaggle Titanic Dataset

In [1]:
import pandas as pd
from dtreeviz.trees import dtreeviz  # dtreeviz < 2.0 API

data = pd.read_csv("/Users/likejazz/workspace/github.com/likejazz/jupyter-notebooks/machine-learning/data/titanic.csv")
data.head()
Out[1]:
   pclass  survived  name                                             sex     age    sibsp  parch  ticket  fare      cabin    embarked  boat  body   home.dest
0  1       1         Allen, Miss. Elisabeth Walton                    female  29.00  0      0      24160   211.3375  B5       S         2     NaN    St Louis, MO
1  1       1         Allison, Master. Hudson Trevor                   male    0.92   1      2      113781  151.5500  C22 C26  S         11    NaN    Montreal, PQ / Chesterville, ON
2  1       0         Allison, Miss. Helen Loraine                     female  2.00   1      2      113781  151.5500  C22 C26  S         NaN   NaN    Montreal, PQ / Chesterville, ON
3  1       0         Allison, Mr. Hudson Joshua Creighton             male    30.00  1      2      113781  151.5500  C22 C26  S         NaN   135.0  Montreal, PQ / Chesterville, ON
4  1       0         Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.00  1      2      113781  151.5500  C22 C26  S         NaN   NaN    Montreal, PQ / Chesterville, ON
In [2]:
y = data['survived']
X = data[['pclass', 'sex', 'age', 'sibsp', 'fare']].fillna(0)
X['sex'] = X['sex'].replace(['female', 'male'], [0, 1])  # encode sex: female=0, male=1; assignment avoids the SettingWithCopyWarning raised by inplace=True on a column
X['age'] = X['age'].astype(int)
y.head(), X.head()
Out[2]:
(0    1
 1    1
 2    0
 3    0
 4    0
 Name: survived, dtype: int64,    pclass  sex  age  sibsp      fare
 0       1    0   29      0  211.3375
 1       1    1    0      1  151.5500
 2       1    0    2      1  151.5500
 3       1    1   30      1  151.5500
 4       1    0   25      1  151.5500)
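One caveat on the cleanup above: fillna(0) turns every missing age into 0, which the tree cannot distinguish from an infant. A median fill is a common alternative (a minimal sketch, not what was run above):

X['age'] = data['age'].fillna(data['age'].median()).astype(int)  # alternative: impute missing ages with the median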
In [3]:
X.columns
Out[3]:
Index(['pclass', 'sex', 'age', 'sibsp', 'fare'], dtype='object')
In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=123)
len(X_train), len(y_train), len(X_test), len(y_test)
Out[4]:
(1047, 1047, 262, 262)
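Since survivors are the minority class, it can help to keep the survival ratio identical across the two splits; train_test_split supports this via stratify (an optional variant of the call above, not what was run here):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)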
In [5]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=42, max_depth=3)  # shallow tree keeps the visualization readable
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[5]:
0.8091603053435115
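A single 80/20 split makes the 0.809 score somewhat luck-dependent; 5-fold cross-validation gives a steadier estimate (a quick sketch, not part of the original run):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree.DecisionTreeClassifier(random_state=42, max_depth=3), X, y, cv=5)
scores.mean(), scores.std()  # mean accuracy across folds, plus its spread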
In [6]:
import os
from tempfile import mkstemp

from sklearn.tree import export_graphviz  # sklearn.tree.export was removed in scikit-learn 0.24

def convert_decision_tree_to_ipython_image(clf, feature_names=None, class_names=None,
                                           tmp_dir=None):
    # Export the fitted tree to a temporary Graphviz .dot file, render it
    # inline in the notebook, then remove the temporary file.
    import graphviz
    from IPython.display import display

    dot_filename = mkstemp(suffix='.dot', dir=tmp_dir)[1]
    with open(dot_filename, "w") as out_file:
        export_graphviz(clf, out_file=out_file,
                        feature_names=feature_names,
                        class_names=class_names,
                        filled=True, rounded=True,
                        special_characters=True)

    with open(dot_filename) as f:
        dot_graph = f.read()
    display(graphviz.Source(dot_graph))
    os.remove(dot_filename)

convert_decision_tree_to_ipython_image(clf, feature_names=X.columns, class_names=["dead", "surv"])
[Graphviz rendering of the tree: the root splits on sex ≤ 0.5 (1047 samples, gini 0.468, class = dead); the female branch splits on pclass ≤ 2.5 and then fare, the male branch on pclass ≤ 1.5 and then age, giving eight leaves at depth 3.]
In [7]:
viz = dtreeviz(clf, X_train, y_train, target_name='survived',
               feature_names=X.columns,
               class_names=["dead", "surv"],
)
viz
Out[7]:
[dtreeviz rendering of the same tree, with class histograms at each split and a dead/surv legend.]
In [8]:
clf.feature_importances_
Out[8]:
array([0.20633449, 0.69231053, 0.03019166, 0.        , 0.07116332])
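The bare array is ordered like X.columns, so pairing the two makes it explicit that sex dominates, followed by pclass:

dict(zip(X.columns, clf.feature_importances_))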
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
def plot_feature_importances(clf):
    # Horizontal bar chart of impurity-based importances, labeled with the column names.
    n_features = X_train.shape[1]
    plt.barh(range(n_features), clf.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), X.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
    
plot_feature_importances(clf)
In [10]:
from sklearn import ensemble
clf_rf = ensemble.RandomForestClassifier(n_estimators=50, random_state=800)
clf_rf.fit(X_train, y_train)
clf_rf.score(X_test, y_test)
Out[10]:
0.8129770992366412
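A random forest can also estimate its generalization error from the bootstrap samples themselves, without touching the test set (a sketch using scikit-learn's oob_score flag; clf_oob is a new name introduced here):

clf_oob = ensemble.RandomForestClassifier(n_estimators=50, random_state=800, oob_score=True)
clf_oob.fit(X_train, y_train)
clf_oob.oob_score_  # out-of-bag accuracy estimate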
In [11]:
plot_feature_importances(clf_rf)
In [12]:
from sklearn import ensemble
clf_gd = ensemble.GradientBoostingClassifier(random_state=800)
clf_gd.fit(X_train, y_train)
clf_gd.score(X_test, y_test)
Out[12]:
0.8282442748091603
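With boosting, test accuracy as a function of the number of trees can be read off with staged_predict, which replays predictions after each stage (a sketch, assuming the default 100 estimators):

from sklearn.metrics import accuracy_score

staged_acc = [accuracy_score(y_test, y_pred) for y_pred in clf_gd.staged_predict(X_test)]
max(staged_acc), staged_acc.index(max(staged_acc)) + 1  # best test accuracy and the stage that reaches it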
In [13]:
plot_feature_importances(clf_gd)
In [14]:
from rfpimp import importances, plot_importances

imp = importances(clf_rf, X_test, y_test)  # permutation importances on the held-out set
viz = plot_importances(imp)
viz
Out[14]:
In [15]:
from rfpimp import importances, plot_importances

imp = importances(clf_gd, X_test, y_test)
viz = plot_importances(imp)
viz
Out[15]:
In [16]:
from rfpimp import plot_corr_heatmap
viz = plot_corr_heatmap(X_train)
viz
Out[16]:
In [17]:
from rfpimp import feature_dependence_matrix
D = feature_dependence_matrix(X_test, sort_by_dependence=True)
D
Out[17]:
Each row reports how well that feature can be predicted from the remaining features (the Dependence column, roughly an R², so negative values mean no dependence), plus the per-feature contributions:

        Dependence  pclass     sex        age        sibsp      fare
pclass   0.835376   1          0.0168474  0          0.216722   1
fare     0.499473   1          0.0711578  0.155285   0.132715   1
sibsp    0.494857   0.353675   0          0.173024   1          0.816689
age     -0.0650702  0.607611   0          1          0          0.0387899
sex     -0.0918864  0.0435091  1          0.0148263  0.0904816  0.161317
In [18]:
from rfpimp import plot_dependence_heatmap
viz = plot_dependence_heatmap(D)
viz
Out[18]:
In [19]:
from eli5.sklearn import PermutationImportance
import eli5
import pandas as pd

perm = PermutationImportance(clf_rf).fit(X_test, y_test)  # permutation importance scored on the test set
I = pd.DataFrame(data={"columns":X_test.columns, "importances":perm.feature_importances_})
I = I.set_index("columns")
I = I.sort_values('importances', ascending=True)
I.plot.barh()
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x12cc3ba90>
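eli5 can also render the same permutation result itself, including the spread over shuffles (a sketch using eli5's display helper):

eli5.show_weights(perm, feature_names=X_test.columns.tolist())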
In [20]:
import shap
shap.initjs()
In [21]:
i = 13  # index of a single test-set passenger to explain
explainer = shap.TreeExplainer(clf_rf)
shap_values = explainer.shap_values(X_test.iloc[i])
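For a classifier, TreeExplainer returns one SHAP array per class, so shap_values here is a two-element list (class 0 = dead, class 1 = surv); a quick shape check, assuming this older list-returning shap interface:

len(shap_values), shap_values[0].shape  # (2, (5,)) -- one attribution vector per class over the 5 features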
In [22]:
shap.summary_plot(explainer.shap_values(X_test), X_test)
In [23]:
c = 0 # died
shap.force_plot(explainer.expected_value[c], shap_values[c], X_test.iloc[i])
Out[23]:
[SHAP force plot for class 0 (dead) on passenger i.]
In [24]:
c = 1 # survived
shap.force_plot(explainer.expected_value[c], shap_values[c], X_test.iloc[i])
Out[24]:
[SHAP force plot for class 1 (surv) on the same passenger.]
In [25]:
shap.decision_plot(explainer.expected_value[c], shap_values[c], X_test.iloc[i], feature_display_range=slice(-1, -6, -1))
In [26]:
viz = dtreeviz(clf, X_train, y_train, target_name='survived', 
               feature_names=X.columns,
               class_names=["dead", "surv"],
               X=X_test.iloc[i]
)
viz
Out[26]:
[dtreeviz rendering of the tree with the decision path for passenger i highlighted.]