Decision Trees with the Titanic Kaggle Dataset

In [1]:
import pandas
from dtreeviz.trees import dtreeviz
data = pandas.read_csv("/Users/kaonpark/workspace/github.com/likejazz/jupyter-notebooks/data/titanic.csv")
data.head()
Out[1]:
   pclass  survived  name                                             sex     age    sibsp  parch  ticket  fare      cabin    embarked  boat  body   home.dest
0  1       1         Allen, Miss. Elisabeth Walton                    female  29.00  0      0      24160   211.3375  B5       S         2     NaN    St Louis, MO
1  1       1         Allison, Master. Hudson Trevor                   male    0.92   1      2      113781  151.5500  C22 C26  S         11    NaN    Montreal, PQ / Chesterville, ON
2  1       0         Allison, Miss. Helen Loraine                     female  2.00   1      2      113781  151.5500  C22 C26  S         NaN   NaN    Montreal, PQ / Chesterville, ON
3  1       0         Allison, Mr. Hudson Joshua Creighton             male    30.00  1      2      113781  151.5500  C22 C26  S         NaN   135.0  Montreal, PQ / Chesterville, ON
4  1       0         Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.00  1      2      113781  151.5500  C22 C26  S         NaN   NaN    Montreal, PQ / Chesterville, ON
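The frame mixes numeric and text columns, and several of them are sparse. A quick sketch (reusing the same data frame) to count missing values per column before picking features:

# How many NaNs does each column contain?
data.isnull().sum()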
In [2]:
y, X = data['survived'], data[['pclass','sex','age','sibsp', 'fare']].fillna(0)
X['sex'] = X['sex'].replace(['female', 'male'], [0, 1])  # encode sex as 0/1
X['age'] = X['age'].astype(int)
y.head(), X.head()
Out[2]:
(0    1
 1    1
 2    0
 3    0
 4    0
 Name: survived, dtype: int64,    pclass  sex  age  sibsp      fare
 0       1    0   29      0  211.3375
 1       1    1    0      1  151.5500
 2       1    0    2      1  151.5500
 3       1    1   30      1  151.5500
 4       1    0   25      1  151.5500)
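Note that fillna(0) turns every missing age into 0, which the later astype(int) makes indistinguishable from an infant. A minimal alternative sketch using median imputation (the X_alt name is hypothetical and not used below):

# Impute missing ages with the median so unknown ages aren't treated as newborns.
X_alt = data[['pclass', 'sex', 'age', 'sibsp', 'fare']].copy()
X_alt['age'] = X_alt['age'].fillna(X_alt['age'].median())
X_alt['sex'] = X_alt['sex'].map({'female': 0, 'male': 1})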
In [3]:
X.columns
Out[3]:
Index(['pclass', 'sex', 'age', 'sibsp', 'fare'], dtype='object')
In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=123)
len(X_train), len(y_train), len(X_test), len(y_test)
Out[4]:
(1047, 1047, 262, 262)
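The split above is purely random. If you want both halves to keep the same survival ratio, train_test_split accepts a stratify argument; a sketch (the _s variables are hypothetical):

# Stratified split preserves the class balance of y in train and test sets.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=123, stratify=y)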
In [5]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[5]:
0.8091603053435115
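A single 80/20 split gives a noisy accuracy estimate. As a sketch, 5-fold cross-validation over the full data gives a steadier number:

# Average accuracy across 5 folds instead of one fixed split.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=3), X, y, cv=5)
scores.mean(), scores.std()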
In [6]:
viz = dtreeviz(clf, X_train, y_train, target_name='survived', 
               feature_names=X.columns,
               class_names=["dead", "surv"],
)
viz.view()
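view() opens the rendering in an external viewer; assuming the installed dtreeviz version also exposes save, the same SVG can be written to disk:

viz.save("titanic-tree.svg")  # keep the rendering as a file instead of only viewing it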
In [7]:
import os
from tempfile import mkstemp

import graphviz
from IPython.display import display
from sklearn.tree import export_graphviz

def convert_decision_tree_to_ipython_image(clf, feature_names=None, class_names=None,
                                           image_filename=None, tmp_dir=None):
    # Export the fitted tree to Graphviz DOT format in a temporary file.
    dot_filename = mkstemp(suffix='.dot', dir=tmp_dir)[1]
    with open(dot_filename, "w") as out_file:
        export_graphviz(clf, out_file=out_file,
                        feature_names=feature_names,
                        class_names=class_names,
                        filled=True, rounded=True,
                        special_characters=True)

    with open(dot_filename) as f:
        dot_graph = f.read()
    src = graphviz.Source(dot_graph)
    if image_filename is not None:
        # Also render the tree to a PNG on disk, since the caller passes a filename.
        with open(image_filename, 'wb') as img:
            img.write(src.pipe(format='png'))
    display(src)
    os.remove(dot_filename)

convert_decision_tree_to_ipython_image(clf, image_filename='titanic.png', feature_names=X.columns, class_names=["dead", "surv"])
[Decision tree diagram rendered by Graphviz: the root splits on sex ≤ 0.5 (1047 samples, value [655, 392]); the female branch then splits on pclass and fare, the male branch on pclass and age, down to depth 3. Each node shows its gini impurity, sample count, class counts, and majority class ("dead"/"surv").]
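If Graphviz is not installed, the same splits can be printed as plain text with export_text (available since scikit-learn 0.21); a sketch:

# Text rendering of the fitted tree, one line per node.
from sklearn.tree import export_text
print(export_text(clf, feature_names=list(X.columns)))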
In [8]:
clf.feature_importances_
Out[8]:
array([0.20633449, 0.69231053, 0.03019166, 0.        , 0.07116332])
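The array follows the column order of X, so sex (index 1) dominates. Pairing the values with the column names makes that explicit; a sketch:

# Sort (feature, importance) pairs from most to least important.
sorted(zip(X.columns, clf.feature_importances_), key=lambda p: p[1], reverse=True)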
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
def plot_feature_importances(clf):
    # Horizontal bar chart of a fitted model's feature importances,
    # labeled with the column names of X (reads X_train and X from the enclosing scope).
    n_features = X_train.shape[1]
    plt.barh(range(n_features), clf.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), X.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
    
plot_feature_importances(clf)
In [10]:
from sklearn import ensemble
clf_rf = ensemble.RandomForestClassifier(n_estimators=50, random_state=800)
clf_rf.fit(X_train, y_train)
clf_rf.score(X_test, y_test)
Out[10]:
0.8129770992366412
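n_estimators=50 is a guess. A small grid-search sketch over forest size and depth (the grid values are assumptions, not tuned choices):

from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, None]}
search = GridSearchCV(ensemble.RandomForestClassifier(random_state=800),
                      param_grid, cv=5)
search.fit(X_train, y_train)
search.best_params_, search.best_score_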
In [11]:
plot_feature_importances(clf_rf)
In [12]:
from sklearn import ensemble
clf_gd = ensemble.GradientBoostingClassifier(random_state=800)
clf_gd.fit(X_train, y_train)
clf_gd.score(X_test, y_test)
Out[12]:
0.8282442748091603
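Gradient boosting builds its trees sequentially, so test accuracy can be traced per stage with staged_predict; a sketch reusing the fitted clf_gd and the earlier numpy/matplotlib imports:

# Accuracy on the test set after each boosting stage.
staged_acc = [np.mean(pred == y_test) for pred in clf_gd.staged_predict(X_test)]
plt.plot(range(1, len(staged_acc) + 1), staged_acc)
plt.xlabel("Boosting stages")
plt.ylabel("Test accuracy")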
In [13]:
plot_feature_importances(clf_gd)
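To wrap up, a quick comparison sketch of the three fitted models on the same held-out set:

# Decision tree vs. random forest vs. gradient boosting, same test split.
for name, model in [("decision tree", clf),
                    ("random forest", clf_rf),
                    ("gradient boosting", clf_gd)]:
    print(f"{name}: {model.score(X_test, y_test):.4f}")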