Supervised Machine Learning basics: Titanic example

Florent Leclercq,
Institute of Cosmology and Gravitation, University of Portsmouth,
[email protected]

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

Load training data set

In [2]:
# data set available at this address: https://www.kaggle.com/c/titanic/data
# (version slightly modified to be conveniently loaded with numpy)
dtype = {'names':('PassengerId','Survived','Pclass','Name','Sex',
                        'Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked'),
               'formats': ('i4','i4','i4','S20','S6','S20','i4','i4','S20','f8','S20','S20')}
data = np.loadtxt("data/titanic.csv", dtype=dtype, delimiter=";", comments="#")

Data dictionary

Variable Definition Key
Survived Survival 0 = No, 1 = Yes
Pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
Sex Sex
Age Age in years
SibSp # of siblings / spouses aboard the Titanic
Parch # of parents / children aboard the Titanic
Ticket Ticket number
Fare Passenger fare
Cabin Cabin number
Embarked Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
In [3]:
data[500]
Out[3]:
(501, 0, 3, '"Calic, Mr. Petar"', 'male', '17', 0, 0, '315086', 8.6625, '', 'S')
In [4]:
data[600]
Out[4]:
(601, 1, 2, '"Jacobsohn, Mrs. Sid', 'female', '24', 2, 1, '243847', 27.0, '', 'S')

Parent Entropy

In [5]:
Ntot=float(data['Survived'].size)
Nsurvived=float(np.sum(data['Survived']==1))
Ndied=float(np.sum(data['Survived']==0))
H_parent = -Nsurvived/Ntot*np.log2(Nsurvived/Ntot) -Ndied/Ntot*np.log2(Ndied/Ntot)
print "Ndied={}, Nsurvived={}, Ntot={}".format(int(Ndied),int(Nsurvived),int(Ntot))
print "H_parent={}".format(H_parent)
Ndied=549, Nsurvived=342, Ntot=891
H_parent=0.960707901876

Information gain: ticket class

In [6]:
first_class_passengers=data[np.where(data['Pclass']==1)]
Nfirst=float(first_class_passengers.size)
Nsurvived_first=float(np.sum(first_class_passengers['Survived']==1))
Ndied_first=float(np.sum(first_class_passengers['Survived']==0))
H_first = -Nsurvived_first/Nfirst*np.log2(Nsurvived_first/Nfirst) -Ndied_first/Nfirst*np.log2(Ndied_first/Nfirst)
print "Ndied_first={}, Nsurvived_first={}, Nfirst={}".format(int(Ndied_first),int(Nsurvived_first),int(Nfirst))
print "H_first={}".format(H_first)
Ndied_first=80, Nsurvived_first=136, Nfirst=216
H_first=0.950956048455
In [7]:
second_class_passengers=data[np.where(data['Pclass']==2)]
Nsecond=float(second_class_passengers.size)
Nsurvived_second=float(np.sum(second_class_passengers['Survived']==1))
Ndied_second=float(np.sum(second_class_passengers['Survived']==0))
H_second = -Nsurvived_second/Nsecond*np.log2(Nsurvived_second/Nsecond) -Ndied_second/Nsecond*np.log2(Ndied_second/Nsecond)
print "Ndied_second={}, Nsurvived_second={}, Nsecond={}".format(int(Ndied_second),int(Nsurvived_second),int(Nsecond))
print "H_second={}".format(H_second)
Ndied_second=97, Nsurvived_second=87, Nsecond=184
H_second=0.997868315671
In [8]:
third_class_passengers=data[np.where(data['Pclass']==3)]
Nthird=float(third_class_passengers.size)
Nsurvived_third=float(np.sum(third_class_passengers['Survived']==1))
Ndied_third=float(np.sum(third_class_passengers['Survived']==0))
H_third = -Nsurvived_third/Nthird*np.log2(Nsurvived_third/Nthird) -Ndied_third/Nthird*np.log2(Ndied_third/Nthird)
print "Ndied_third={}, Nsurvived_third={}, Nthird={}".format(int(Ndied_third),int(Nsurvived_third),int(Nthird))
print "H_third={}".format(H_third)
Ndied_third=372, Nsurvived_third=119, Nthird=491
H_third=0.798947052266
In [9]:
H_class = Nfirst/Ntot*H_first + Nsecond/Ntot*H_second + Nthird/Ntot*H_third
IG_class = H_parent - H_class
print "IG_class={}".format(IG_class)
IG_class=0.083831045296

Information gain: sex

In [10]:
male_passengers=data[np.where(data['Sex']=='male')]
Nmale=float(male_passengers.size)
Nsurvived_male=float(np.sum(male_passengers['Survived']==1))
Ndied_male=float(np.sum(male_passengers['Survived']==0))
H_male = -Nsurvived_male/Nmale*np.log2(Nsurvived_male/Nmale) \
          -Ndied_male/Nmale*np.log2(Ndied_male/Nmale)
print "Ndied_male={}, Nsurvived_male={}, Nmale={}".format(int(Ndied_male),int(Nsurvived_male),int(Nmale))
print "H_male={}".format(H_male)
Ndied_male=468, Nsurvived_male=109, Nmale=577
H_male=0.699181789121
In [11]:
female_passengers=data[np.where(data['Sex']=='female')]
Nfemale=float(female_passengers.size)
Nsurvived_female=float(np.sum(female_passengers['Survived']==1))
Ndied_female=float(np.sum(female_passengers['Survived']==0))
H_female = -Nsurvived_female/Nfemale*np.log2(Nsurvived_female/Nfemale) \
          -Ndied_female/Nfemale*np.log2(Ndied_female/Nfemale)
print "Ndied_female={}, Nsurvived_female={}, Nfemale={}".format(int(Ndied_female),int(Nsurvived_female),int(Nfemale))
print "H_female={}".format(H_female)
Ndied_female=81, Nsurvived_female=233, Nfemale=314
H_female=0.82365507393
In [12]:
H_sex = Nmale/Ntot*H_male + Nfemale/Ntot*H_female
IG_sex = H_parent - H_sex
print "IG_sex={}".format(IG_sex)
IG_sex=0.217660106661

Information gain: port of embarkation

In [13]:
cherbourg_passengers=data[np.where(data['Embarked']=='C')]
Ncherbourg=float(cherbourg_passengers.size)
Nsurvived_cherbourg=float(np.sum(cherbourg_passengers['Survived']==1))
Ndied_cherbourg=float(np.sum(cherbourg_passengers['Survived']==0))
H_cherbourg = -Nsurvived_cherbourg/Ncherbourg*np.log2(Nsurvived_cherbourg/Ncherbourg) \
          -Ndied_cherbourg/Ncherbourg*np.log2(Ndied_cherbourg/Ncherbourg)
print "Ndied_cherbourg={}, Nsurvived_cherbourg={}, Ncherbourg={}".format(int(Ndied_cherbourg),int(Nsurvived_cherbourg),int(Ncherbourg))
print "H_cherbourg={}".format(H_cherbourg)
Ndied_cherbourg=75, Nsurvived_cherbourg=93, Ncherbourg=168
H_cherbourg=0.991703308373
In [14]:
queenstown_passengers=data[np.where(data['Embarked']=='Q')]
Nqueenstown=float(queenstown_passengers.size)
Nsurvived_queenstown=float(np.sum(queenstown_passengers['Survived']==1))
Ndied_queenstown=float(np.sum(queenstown_passengers['Survived']==0))
H_queenstown = -Nsurvived_queenstown/Nqueenstown*np.log2(Nsurvived_queenstown/Nqueenstown) \
          -Ndied_queenstown/Nqueenstown*np.log2(Ndied_queenstown/Nqueenstown)
print "Ndied_queenstown={}, Nsurvived_queenstown={}, Nqueenstown={}".format(int(Ndied_queenstown),int(Nsurvived_queenstown),int(Nqueenstown))
print "H_queenstown={}".format(H_queenstown)
Ndied_queenstown=47, Nsurvived_queenstown=30, Nqueenstown=77
H_queenstown=0.964547658914
In [15]:
southampton_passengers=data[np.where(data['Embarked']=='S')]
Nsouthampton=float(southampton_passengers.size)
Nsurvived_southampton=float(np.sum(southampton_passengers['Survived']==1))
Ndied_southampton=float(np.sum(southampton_passengers['Survived']==0))
H_southampton = -Nsurvived_southampton/Nsouthampton*np.log2(Nsurvived_southampton/Nsouthampton) \
          -Ndied_southampton/Nsouthampton*np.log2(Ndied_southampton/Nsouthampton)
print "Ndied_southampton={}, Nsurvived_southampton={}, Nsouthampton={}".format(int(Ndied_southampton),int(Nsurvived_southampton),int(Nsouthampton))
print "H_southampton={}".format(H_southampton)
Ndied_southampton=427, Nsurvived_southampton=217, Nsouthampton=644
H_southampton=0.921876486347
In [16]:
H_embarked = Ncherbourg/Ntot*H_cherbourg + Nqueenstown/Ntot*H_queenstown + Nsouthampton/Ntot*H_southampton
IG_embarked = H_parent - H_embarked
print "IG_embarked={}".format(IG_embarked)
IG_embarked=0.024047090708

Off-the-shelf machine learning algorithm

In [12]:
data['Sex'][np.where(data['Sex']=='male')]=1
data['Sex'][np.where(data['Sex']=='female')]=2
data_Sex=data['Sex'].astype(int)
data['Embarked'][np.where(data['Embarked']=='C')]=0
data['Embarked'][np.where(data['Embarked']=='Q')]=1
data['Embarked'][np.where(data['Embarked']=='S')]=2
data_Embarked=data['Embarked'].astype(int)
data_Embarked[np.where((data_Embarked!=0)*(data_Embarked!=1)*(data_Embarked!=2))]=3
In [13]:
features = np.array((data['Pclass'],data_Sex,data_Embarked),dtype=np.int).T
label = data['Survived']
In [14]:
model = RandomForestClassifier()
model.fit(features, label)
Out[14]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [15]:
# prediction for a man in third class, embarked in Cherbourg
model.predict([[3, 1, 0]])
Out[15]:
array([0], dtype=int32)
In [16]:
# prediction for a woman in first class, embarked in Southampton
model.predict([[1, 2, 0]])
Out[16]:
array([1], dtype=int32)