In [1]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Import the Titanic data using pandas

A modified version of the "Titanic" data can be found at http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv. The original, unmodified Titanic data is available from CMU StatLib.

In [3]:
url = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv"
titanic = pd.read_csv(url)
titanic.head(10)
Out[3]:
pid pclass survived sex age sibsp parch fare embarked
0 1 1st 1 female 29.0 0 0 211.337494 Southampton
1 2 1st 1 male NaN 1 2 151.550003 Southampton
2 3 1st 0 female 2.0 1 2 151.550003 Southampton
3 4 1st 0 male 30.0 1 2 151.550003 Southampton
4 5 1st 0 female 25.0 1 2 151.550003 Southampton
5 6 1st 1 male 48.0 0 0 26.549999 Southampton
6 7 1st 1 female 63.0 1 0 77.958298 Southampton
7 8 1st 0 male 39.0 0 0 0.000000 Southampton
8 9 1st 1 female 53.0 2 0 51.479198 Southampton
9 10 1st 0 male 71.0 0 0 49.504200 Cherbourg
In [5]:
titanic.describe(include="all")
Out[5]:
pid pclass survived sex age sibsp parch fare embarked
count 1309.000000 1309 1309.000000 1309 1045.000000 1309.000000 1309.000000 1308.000000 1307
unique NaN 3 NaN 2 NaN NaN NaN NaN 3
top NaN 3rd NaN male NaN NaN NaN NaN Southampton
freq NaN 709 NaN 843 NaN NaN NaN NaN 914
mean 655.000000 NaN 0.381971 NaN 29.908852 0.498854 0.385027 33.295479 NaN
std 378.020061 NaN 0.486055 NaN 14.392485 1.041658 0.865560 51.758669 NaN
min 1.000000 NaN 0.000000 NaN 0.166700 0.000000 0.000000 0.000000 NaN
25% 328.000000 NaN 0.000000 NaN NaN 0.000000 0.000000 NaN NaN
50% 655.000000 NaN 0.000000 NaN NaN 0.000000 0.000000 NaN NaN
75% 982.000000 NaN 1.000000 NaN NaN 1.000000 0.000000 NaN NaN
max 1309.000000 NaN 1.000000 NaN 80.000000 8.000000 9.000000 512.329224 NaN

Handling missing values

In [6]:
titanic[titanic.age.isnull()].shape
Out[6]:
(264, 9)
In [7]:
age_mean = titanic.age.mean()
titanic.age.fillna(age_mean, axis=0, inplace=True)
titanic.dropna(axis=0, inplace=True)
In [8]:
titanic.shape
Out[8]:
(1306, 9)
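
Before re-indexing, it is worth confirming that the imputation and row-dropping left no missing values; a quick check (not part of the original run):

titanic.isnull().sum()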
In [9]:
titanic.set_index('pid', drop=True, inplace=True)
titanic.head()
Out[9]:
pclass survived sex age sibsp parch fare embarked
pid
1 1st 1 female 29.000000 0 0 211.337494 Southampton
2 1st 1 male 29.908852 1 2 151.550003 Southampton
3 1st 0 female 2.000000 1 2 151.550003 Southampton
4 1st 0 male 30.000000 1 2 151.550003 Southampton
5 1st 0 female 25.000000 1 2 151.550003 Southampton

Creating dummy variables for categorical features

In [10]:
titanic_ssf = pd.get_dummies(titanic)
titanic_ssf.head(10)
Out[10]:
survived age sibsp parch fare pclass_1st pclass_2nd pclass_3rd sex_female sex_male embarked_Cherbourg embarked_Queenstown embarked_Southampton
pid
1 1 29.000000 0 0 211.337494 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2 1 29.908852 1 2 151.550003 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
3 0 2.000000 1 2 151.550003 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
4 0 30.000000 1 2 151.550003 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
5 0 25.000000 1 2 151.550003 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
6 1 48.000000 0 0 26.549999 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
7 1 63.000000 1 0 77.958298 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
8 0 39.000000 0 0 0.000000 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
9 1 53.000000 2 0 51.479198 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
10 0 71.000000 0 0 49.504200 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0
In [11]:
titanic_names = titanic_ssf.columns.values
titanic_names
Out[11]:
array(['survived', 'age', 'sibsp', 'parch', 'fare', 'pclass_1st',
       'pclass_2nd', 'pclass_3rd', 'sex_female', 'sex_male',
       'embarked_Cherbourg', 'embarked_Queenstown', 'embarked_Southampton'], dtype=object)
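
pd.get_dummies expands each object-typed column into one indicator column per category and leaves the numeric columns untouched. The same encoding can be produced for a single column; a minimal sketch, not part of the original run:

pd.get_dummies(titanic['sex'], prefix='sex').head()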
In [12]:
y = titanic_ssf['survived']
X = titanic_ssf[titanic_names[1:]]
X.head()
Out[12]:
age sibsp parch fare pclass_1st pclass_2nd pclass_3rd sex_female sex_male embarked_Cherbourg embarked_Queenstown embarked_Southampton
pid
1 29.000000 0 0 211.337494 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2 29.908852 1 2 151.550003 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
3 2.000000 1 2 151.550003 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
4 30.000000 1 2 151.550003 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
5 25.000000 1 2 151.550003 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
In [13]:
titanic_ssf.describe().T
Out[13]:
count mean std min 25% 50% 75% max
survived 1306.0 0.381317 0.485896 0.0000 0.0000 0.000000 1.000 1.000000
age 1306.0 29.854661 12.812320 0.1667 22.0000 29.908852 35.000 80.000000
sibsp 1306.0 0.500000 1.042580 0.0000 0.0000 0.000000 1.000 8.000000
parch 1306.0 0.385911 0.866357 0.0000 0.0000 0.000000 0.000 9.000000
fare 1306.0 33.223956 51.765986 0.0000 7.8958 14.454200 31.275 512.329224
pclass_1st 1306.0 0.245789 0.430719 0.0000 0.0000 0.000000 0.000 1.000000
pclass_2nd 1306.0 0.212098 0.408950 0.0000 0.0000 0.000000 0.000 1.000000
pclass_3rd 1306.0 0.542113 0.498414 0.0000 0.0000 1.000000 1.000 1.000000
sex_female 1306.0 0.355283 0.478782 0.0000 0.0000 0.000000 1.000 1.000000
sex_male 1306.0 0.644717 0.478782 0.0000 0.0000 1.000000 1.000 1.000000
embarked_Cherbourg 1306.0 0.206738 0.405121 0.0000 0.0000 0.000000 0.000 1.000000
embarked_Queenstown 1306.0 0.094181 0.292192 0.0000 0.0000 0.000000 0.000 1.000000
embarked_Southampton 1306.0 0.699081 0.458833 0.0000 0.0000 1.000000 1.000 1.000000

Build the training and testing datasets

In [14]:
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer releases
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)
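
A quick sanity check on the split sizes and class balance (not part of the original run):

print X_train.shape, X_test.shape
print y_train.mean(), y_test.mean()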

A versatile function to measure performance of a model

In [15]:
from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    # evaluate a fitted classifier on (X, y) and print the requested metrics
    y_pred = clf.predict(X)
    if show_accuracy:
        print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n"
    if show_classification_report:
        print "Classification report"
        print metrics.classification_report(y, y_pred), "\n"
    if show_confusion_matrix:
        print "Confusion matrix"
        print metrics.confusion_matrix(y, y_pred), "\n"

Let's now compare the performance of a standard decision tree classifier against two ensemble methods: the random forest classifier and AdaBoost.

In [16]:
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)
In [17]:
from sklearn import metrics
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=False)
Accuracy:0.748 
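
An unpruned tree typically fits the training data almost perfectly, which hints at overfitting; comparing the training-set score with the 0.748 test accuracy above makes that visible (a quick check, not part of the original run):

measure_performance(X_train, y_train, dt, show_confusion_matrix=False, show_classification_report=False)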

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf = rf.fit(X_train, y_train)
In [25]:
measure_performance(X_test, y_test, rf, show_confusion_matrix=False, show_classification_report=False)
Accuracy:0.775 

Exploring and comparing model parameters

In [26]:
print rf.get_params()
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 10, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'gini', 'random_state': None, 'max_features': 'auto', 'max_depth': None, 'class_weight': None}

The following is a general function that performs K-fold cross-validation over a range of values for a specified parameter of a model.

In [27]:
from sklearn.cross_validation import KFold  # old API; newer releases use sklearn.model_selection.KFold

def calc_params(X, y, clf, param_values, param_name, K):
    
    # Convert input to Numpy arrays
    X = np.array(X)
    y = np.array(y)

    # initialize training and testing scores with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))
    
    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print param_name, ' = ', param_value
        
        # set classifier parameters
        clf.set_params(**{param_name:param_value})
        
        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)
        
        # create KFold cross validation
        cv = KFold(len(X), K, shuffle=True, random_state=0)
        
        # iterate over the K folds
        for j, (train, test) in enumerate(cv):
            # fit the classifier in the corresponding fold
            # and obtain the corresponding accuracy scores on train and test sets
            clf.fit(X[train], y[train])
            k_train_scores[j] = clf.score(X[train], y[train])
            k_test_scores[j] = clf.score(X[test], y[test])
            
        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)
       
    # plot the mean training and cross-validation scores for each parameter value
    plt.plot(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.plot(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    # return the training and testing scores on each parameter value
    return train_scores, test_scores
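
calc_params sweeps one parameter at a time; scikit-learn's GridSearchCV can run a similar (or joint) cross-validated search directly. A minimal sketch, assuming the same pre-0.18 scikit-learn used above (newer releases import it from sklearn.model_selection):

from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {'min_samples_leaf': [1, 2, 3, 4, 5],
              'n_estimators': [10, 25, 50, 100]}
gs = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
gs = gs.fit(X_train, y_train)
print gs.best_params_, gs.best_score_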

Now we can explore the impact of min_samples_leaf more systematically.

In [29]:
msl = range(1,6)
print msl
[1, 2, 3, 4, 5]
In [30]:
train_scores, test_scores = calc_params(X_train, y_train, rf, msl, 'min_samples_leaf', 5)
min_samples_leaf  =  1
min_samples_leaf  =  2
min_samples_leaf  =  3
min_samples_leaf  =  4
min_samples_leaf  =  5
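
Besides the plot produced by calc_params, the returned score arrays can be inspected directly (not part of the original run):

for v, tr, te in zip(msl, train_scores, test_scores):
    print "min_samples_leaf=%d  train=%.3f  cv=%.3f" % (v, tr, te)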
In [31]:
nest = range(5, 101, 5)
print nest
[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
In [32]:
train_scores, test_scores = calc_params(X_train, y_train, rf, nest, 'n_estimators', 5)
n_estimators  =  5
n_estimators  =  10
n_estimators  =  15
n_estimators  =  20
n_estimators  =  25
n_estimators  =  30
n_estimators  =  35
n_estimators  =  40
n_estimators  =  45
n_estimators  =  50
n_estimators  =  55
n_estimators  =  60
n_estimators  =  65
n_estimators  =  70
n_estimators  =  75
n_estimators  =  80
n_estimators  =  85
n_estimators  =  90
n_estimators  =  95
n_estimators  =  100
In [33]:
rf = RandomForestClassifier(n_estimators=25, min_samples_leaf=3)
rf = rf.fit(X_train, y_train)

measure_performance(X_test, y_test, rf, show_confusion_matrix=False, show_classification_report=False)
Accuracy:0.798 

In [34]:
rf.feature_importances_
Out[34]:
array([ 0.16267378,  0.03331261,  0.04349472,  0.19574659,  0.05559431,
        0.02378845,  0.04649006,  0.1534259 ,  0.24818645,  0.01787115,
        0.0058645 ,  0.01355147])
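
The importances line up with the columns of X, so pairing them with the feature names makes the ranking easier to read (a small sketch, not part of the original run):

for name, score in sorted(zip(titanic_names[1:], rf.feature_importances_), key=lambda t: -t[1]):
    print "%-22s %.3f" % (name, score)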
In [36]:
rf.estimators_
Out[36]:
[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1220789620, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1228144379, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=149402042, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=510474853, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=212645109, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=239210460, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=13181958, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=230145210, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1302917111, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=199570140, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1961979718, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=356906911, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1588804417, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1181114181, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1488247353, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=369129032, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1058873726, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=863720699, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=528807630, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=632890149, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=2129067588, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=897741105, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=360263804, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1179795812, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=1488820203, splitter='best')]
In [37]:
from sklearn.ensemble import AdaBoostClassifier
In [38]:
ab = AdaBoostClassifier()
ab = ab.fit(X_train, y_train)
In [39]:
measure_performance(X_test, y_test, ab, show_confusion_matrix=False, show_classification_report=False)
Accuracy:0.782 
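
By default AdaBoostClassifier boosts 50 depth-1 decision stumps. A different base learner can be supplied through the base_estimator parameter (renamed to estimator in recent scikit-learn releases); a minimal sketch, not part of the original run:

from sklearn import tree
ab2 = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=2), n_estimators=50)
ab2 = ab2.fit(X_train, y_train)
measure_performance(X_test, y_test, ab2, show_confusion_matrix=False, show_classification_report=False)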

In [40]:
train_scores, test_scores = calc_params(X_train, y_train, ab, nest, 'n_estimators', 5)
n_estimators  =  5
n_estimators  =  10
n_estimators  =  15
n_estimators  =  20
n_estimators  =  25
n_estimators  =  30
n_estimators  =  35
n_estimators  =  40
n_estimators  =  45
n_estimators  =  50
n_estimators  =  55
n_estimators  =  60
n_estimators  =  65
n_estimators  =  70
n_estimators  =  75
n_estimators  =  80
n_estimators  =  85
n_estimators  =  90
n_estimators  =  95
n_estimators  =  100
In [43]:
ab = AdaBoostClassifier(n_estimators=20)
ab = ab.fit(X_train, y_train)
measure_performance(X_test, y_test, ab, show_confusion_matrix=False, show_classification_report=False)
Accuracy:0.809 
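
To see more than overall accuracy for the tuned AdaBoost model, the helper defined earlier can print the full classification report and confusion matrix:

measure_performance(X_test, y_test, ab)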

In [ ]: