Supervised Machine Learning basics: Titanic example¶

Florent Leclercq,
Imperial Centre for Inference and Cosmology, Imperial College London,
florent.leclercq@polytechnique.org

In [1]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

Load training data set¶

In [2]:

# data set available at this address: https://www.kaggle.com/c/titanic/data
# (version slightly modified to be conveniently loaded with numpy)
dtype = {'names':('PassengerId','Survived','Pclass','Name','Sex',
                        'Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked'),
               'formats': ('i4','i4','i4','S20','S6','S20','i4','i4','S20','f8','S20','S20')}
data = np.loadtxt("data/titanic.csv", dtype=dtype, delimiter=";", comments="#")

Data dictionary

Variable	Definition	Key
Survived	Survival	0 = No, 1 = Yes
Pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
Sex	Sex
Age	Age in years
SibSp	# of siblings / spouses aboard the Titanic
Parch	# of parents / children aboard the Titanic
Ticket	Ticket number
Fare	Passenger fare
Cabin	Cabin number
Embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

In [3]:

data[500]

Out[3]:

(501, 0, 3, b'"Calic, Mr. Petar"', b'male', b'17', 0, 0, b'315086', 8.6625, b'', b'S')

In [4]:

data[600]

Out[4]:

(601, 1, 2, b'"Jacobsohn, Mrs. Sid', b'female', b'24', 2, 1, b'243847', 27., b'', b'S')

In [5]:

def perform_splitting(condition):
    selected_passengers = data[np.where(condition)]
    Nsplit = selected_passengers.size
    Nsurvived_split = np.sum(selected_passengers['Survived']==1)
    Ndied_split = np.sum(selected_passengers['Survived']==0)
    return Nsurvived_split, Ndied_split, Nsplit
    
def entropy(Nsurvived, Ndied, Ntot):
    assert(Nsurvived + Ndied == Ntot)
    return -Nsurvived/Ntot*np.log2(Nsurvived/Ntot) -Ndied/Ntot*np.log2(Ndied/Ntot)

Parent Entropy¶

In [6]:

Ntot = data['Survived'].size
Nsurvived = np.sum(data['Survived']==1)
Ndied = np.sum(data['Survived']==0)
H_parent = entropy(Nsurvived, Ndied, Ntot)
print("Ndied={}, Nsurvived={}, Ntot={}".format(Ndied,Nsurvived,Ntot))
print("H_parent={}".format(H_parent))

Ndied=549, Nsurvived=342, Ntot=891
H_parent=0.9607079018756469

Information gain: ticket class¶

In [7]:

Nsurvived_first, Ndied_first, Nfirst = perform_splitting(data['Pclass']==1)
H_first = entropy(Nsurvived_first, Ndied_first, Nfirst)
print("Ndied_first={}, Nsurvived_first={}, Nfirst={}".format(Ndied_first,Nsurvived_first,Nfirst))
print("H_first={}".format(H_first))

Ndied_first=80, Nsurvived_first=136, Nfirst=216
H_first=0.9509560484549725

In [8]:

Nsurvived_second, Ndied_second, Nsecond = perform_splitting(data['Pclass']==2)
H_second = entropy(Nsurvived_second, Ndied_second, Nsecond)
print("Ndied_second={}, Nsurvived_second={}, Nsecond={}".format(Ndied_second,Nsurvived_second,Nsecond))
print("H_second={}".format(H_second))

Ndied_second=97, Nsurvived_second=87, Nsecond=184
H_second=0.9978683156711936

In [9]:

Nsurvived_third, Ndied_third, Nthird = perform_splitting(data['Pclass']==3)
H_third = entropy(Nsurvived_third, Ndied_third, Nthird)
print("Ndied_third={}, Nsurvived_third={}, Nthird={}".format(Ndied_third,Nsurvived_third,Nthird))
print("H_third={}".format(H_third))

Ndied_third=372, Nsurvived_third=119, Nthird=491
H_third=0.7989470522661535

In [10]:

H_class = Nfirst/Ntot*H_first + Nsecond/Ntot*H_second + Nthird/Ntot*H_third
IG_class = H_parent - H_class
print("IG_class={}".format(IG_class))

IG_class=0.0838310452960116

Information gain: sex¶

In [11]:

Nsurvived_male, Ndied_male, Nmale = perform_splitting(data['Sex']==b'male')
H_male = entropy(Nsurvived_male, Ndied_male, Nmale)
print("Ndied_male={}, Nsurvived_male={}, Nmale={}".format(Ndied_male,Nsurvived_male,Nmale))
print("H_male={}".format(H_male))

Ndied_male=468, Nsurvived_male=109, Nmale=577
H_male=0.6991817891208407

In [12]:

Nsurvived_female, Ndied_female, Nfemale = perform_splitting(data['Sex']==b'female')
H_female = entropy(Nsurvived_female, Ndied_female, Nfemale)
print("Ndied_female={}, Nsurvived_female={}, Nfemale={}".format(Ndied_female,Nsurvived_female,Nfemale))
print("H_female={}".format(H_female))

Ndied_female=81, Nsurvived_female=233, Nfemale=314
H_female=0.8236550739295191

In [13]:

H_sex = Nmale/Ntot*H_male + Nfemale/Ntot*H_female
IG_sex = H_parent - H_sex
print("IG_sex={}".format(IG_sex))

IG_sex=0.2176601066606142

Information gain: port of embarkation¶

In [14]:

Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg = perform_splitting(data['Embarked']==b'C')
H_cherbourg = entropy(Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg)
print("Ndied_cherbourg={}, Nsurvived_cherbourg={}, Ncherbourg={}"
      .format(Ndied_cherbourg,Nsurvived_cherbourg,Ncherbourg))
print("H_cherbourg={}".format(H_cherbourg))

Ndied_cherbourg=75, Nsurvived_cherbourg=93, Ncherbourg=168
H_cherbourg=0.9917033083725818

In [15]:

Nsurvived_queenstown, Ndied_queenstown, Nqueenstown = perform_splitting(data['Embarked']==b'Q')
H_queenstown = entropy(Nsurvived_queenstown, Ndied_queenstown, Nqueenstown)
print("Ndied_queenstown={}, Nsurvived_queenstown={}, Nqueenstown={}"
      .format(Ndied_queenstown,Nsurvived_queenstown,Nqueenstown))
print("H_queenstown={}".format(H_queenstown))

Ndied_queenstown=47, Nsurvived_queenstown=30, Nqueenstown=77
H_queenstown=0.9645476589143234

In [16]:

Nsurvived_southampton, Ndied_southampton, Nsouthampton = perform_splitting(data['Embarked']==b'S')
H_southampton = entropy(Nsurvived_southampton, Ndied_southampton, Nsouthampton)
print("Ndied_southampton={}, Nsurvived_southampton={}, Nsouthampton={}"
      .format(Ndied_southampton,Nsurvived_southampton,Nsouthampton))
print("H_southampton={}".format(H_southampton))

Ndied_southampton=427, Nsurvived_southampton=217, Nsouthampton=644
H_southampton=0.921876486346913

In [17]:

H_embarked = Ncherbourg/Ntot*H_cherbourg + Nqueenstown/Ntot*H_queenstown + Nsouthampton/Ntot*H_southampton
IG_embarked = H_parent - H_embarked
print("IG_embarked={}".format(IG_embarked))

IG_embarked=0.024047090707960517

Off-the-shelf machine learning algorithm¶

In [18]:

data_Sex=np.zeros(len(data),dtype=np.int)
data_Sex[np.where(data['Sex']==b'male')]=1
data_Sex[np.where(data['Sex']==b'female')]=2
data_Embarked=3*np.ones(len(data),dtype=np.int)
data_Embarked[np.where(data['Embarked']==b'C')]=0
data_Embarked[np.where(data['Embarked']==b'Q')]=1
data_Embarked[np.where(data['Embarked']==b'S')]=2
data_Embarked[np.where((data_Embarked!=0)*(data_Embarked!=1)*(data_Embarked!=2))]=3

In [19]:

features = np.array((data['Pclass'],data_Sex,data_Embarked),dtype=np.int).T
label = data['Survived']

In [20]:

model = RandomForestClassifier(n_estimators=10)
model.fit(features, label)

Out[20]:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:

# prediction for a woman in first class, embarked in Southampton
ans = model.predict([[1, 2, 0]])
survival = "survived" if ans==1 else "died"
print(survival)

survived

In [22]:

# prediction for a man in third class, embarked in Cherbourg
ans = model.predict([[3, 1, 0]])
survival = "survived" if ans==1 else "died"
print(survival)

died