#!/usr/bin/env python
# coding: utf-8

# # Supervised Machine Learning basics: Titanic example
# Florent Leclercq,
# Imperial Centre for Inference and Cosmology, Imperial College London,
# florent.leclercq@polytechnique.org

# In[1]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# ## Load training data set

# In[2]:

# data set available at this address: https://www.kaggle.com/c/titanic/data
# (version slightly modified to be conveniently loaded with numpy)
dtype = {'names': ('PassengerId', 'Survived', 'Pclass', 'Name', 'Sex',
                   'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'),
         'formats': ('i4', 'i4', 'i4', 'S20', 'S6',
                     'S20', 'i4', 'i4', 'S20', 'f8', 'S20', 'S20')}
# The string columns are loaded as fixed-width byte strings ('S..' formats),
# which is why later comparisons use bytes literals such as b'male'.
data = np.loadtxt("data/titanic.csv", dtype=dtype, delimiter=";", comments="#")

# Data dictionary
#
# | Variable | Definition                                 | Key                                            |
# |----------|--------------------------------------------|------------------------------------------------|
# | Survived | Survival                                   | 0 = No, 1 = Yes                                |
# | Pclass   | Ticket class                               | 1 = 1st, 2 = 2nd, 3 = 3rd                      |
# | Sex      | Sex                                        |                                                |
# | Age      | Age in years                               |                                                |
# | SibSp    | # of siblings / spouses aboard the Titanic |                                                |
# | Parch    | # of parents / children aboard the Titanic |                                                |
# | Ticket   | Ticket number                              |                                                |
# | Fare     | Passenger fare                             |                                                |
# | Cabin    | Cabin number                               |                                                |
# | Embarked | Port of Embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |

# In[3]:

data[500]

# In[4]:

data[600]

# In[5]:


def perform_splitting(condition):
    """Count survivors and deaths among the passengers selected by a mask.

    Parameters
    ----------
    condition : array of bool
        Mask over the rows of the module-level ``data`` array; the rows
        where it is true form the split.

    Returns
    -------
    tuple
        ``(Nsurvived_split, Ndied_split, Nsplit)``: the number of selected
        rows with ``Survived == 1``, with ``Survived == 0``, and the total
        number of selected rows.
    """
    selected_passengers = data[np.where(condition)]
    Nsplit = selected_passengers.size
    Nsurvived_split = np.sum(selected_passengers['Survived'] == 1)
    Ndied_split = np.sum(selected_passengers['Survived'] == 0)
    return Nsurvived_split, Ndied_split, Nsplit


def entropy(Nsurvived, Ndied, Ntot):
    """Return the binary Shannon entropy, in bits, of a survived/died split.

    Computes ``-p_s*log2(p_s) - p_d*log2(p_d)`` with ``p_s = Nsurvived/Ntot``
    and ``p_d = Ndied/Ntot``.

    Parameters
    ----------
    Nsurvived, Ndied : int
        Number of members of each class.
    Ntot : int
        Total number of members; must equal ``Nsurvived + Ndied``.

    Returns
    -------
    float
        Entropy in bits. A pure split (one class empty) and an empty split
        (``Ntot == 0``) both have entropy 0.

    Raises
    ------
    AssertionError
        If the two class counts do not add up to ``Ntot``.
    """
    assert Nsurvived + Ndied == Ntot, "class counts must add up to Ntot"
    if Ntot == 0:
        # An empty split carries no uncertainty, and dividing by Ntot would
        # otherwise fail or produce nan.
        return 0.0
    H = 0.0
    for count in (Nsurvived, Ndied):
        # By convention 0*log2(0) = 0; evaluating it numerically would give
        # 0 * -inf = nan, so empty classes are skipped.
        if count > 0:
            p = count / Ntot
            H -= p * np.log2(p)
    return H


# ## Parent Entropy

# In[6]:

Ntot = data['Survived'].size
Nsurvived = np.sum(data['Survived'] == 1)
Ndied = np.sum(data['Survived'] == 0)
H_parent = entropy(Nsurvived, Ndied, Ntot)
print("Ndied={}, Nsurvived={}, Ntot={}".format(Ndied, Nsurvived, Ntot))
print("H_parent={}".format(H_parent))

# ## Information gain: ticket class

# In[7]:
# Each information gain below is the parent entropy minus the average of the
# child entropies, weighted by the fraction of passengers in each child.
Nsurvived_first, Ndied_first, Nfirst = perform_splitting(data['Pclass'] == 1)
H_first = entropy(Nsurvived_first, Ndied_first, Nfirst)
print("Ndied_first={}, Nsurvived_first={}, Nfirst={}".format(Ndied_first, Nsurvived_first, Nfirst))
print("H_first={}".format(H_first))

# In[8]:

Nsurvived_second, Ndied_second, Nsecond = perform_splitting(data['Pclass'] == 2)
H_second = entropy(Nsurvived_second, Ndied_second, Nsecond)
print("Ndied_second={}, Nsurvived_second={}, Nsecond={}".format(Ndied_second, Nsurvived_second, Nsecond))
print("H_second={}".format(H_second))

# In[9]:

Nsurvived_third, Ndied_third, Nthird = perform_splitting(data['Pclass'] == 3)
H_third = entropy(Nsurvived_third, Ndied_third, Nthird)
print("Ndied_third={}, Nsurvived_third={}, Nthird={}".format(Ndied_third, Nsurvived_third, Nthird))
print("H_third={}".format(H_third))

# In[10]:

H_class = Nfirst/Ntot*H_first + Nsecond/Ntot*H_second + Nthird/Ntot*H_third
IG_class = H_parent - H_class
print("IG_class={}".format(IG_class))

# ## Information gain: sex

# In[11]:

Nsurvived_male, Ndied_male, Nmale = perform_splitting(data['Sex'] == b'male')
H_male = entropy(Nsurvived_male, Ndied_male, Nmale)
print("Ndied_male={}, Nsurvived_male={}, Nmale={}".format(Ndied_male, Nsurvived_male, Nmale))
print("H_male={}".format(H_male))

# In[12]:

Nsurvived_female, Ndied_female, Nfemale = perform_splitting(data['Sex'] == b'female')
H_female = entropy(Nsurvived_female, Ndied_female, Nfemale)
print("Ndied_female={}, Nsurvived_female={}, Nfemale={}".format(Ndied_female, Nsurvived_female, Nfemale))
print("H_female={}".format(H_female))

# In[13]:

H_sex = Nmale/Ntot*H_male + Nfemale/Ntot*H_female
IG_sex = H_parent - H_sex
print("IG_sex={}".format(IG_sex))

# ## Information gain: port of embarkation

# In[14]:

Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg = perform_splitting(data['Embarked'] == b'C')
H_cherbourg = entropy(Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg)
print("Ndied_cherbourg={}, Nsurvived_cherbourg={}, Ncherbourg={}"
      .format(Ndied_cherbourg, Nsurvived_cherbourg, Ncherbourg))
print("H_cherbourg={}".format(H_cherbourg))

# In[15]:

Nsurvived_queenstown, Ndied_queenstown, Nqueenstown = perform_splitting(data['Embarked'] == b'Q')
H_queenstown = entropy(Nsurvived_queenstown, Ndied_queenstown, Nqueenstown)
print("Ndied_queenstown={}, Nsurvived_queenstown={}, Nqueenstown={}"
      .format(Ndied_queenstown, Nsurvived_queenstown, Nqueenstown))
print("H_queenstown={}".format(H_queenstown))

# In[16]:

Nsurvived_southampton, Ndied_southampton, Nsouthampton = perform_splitting(data['Embarked'] == b'S')
H_southampton = entropy(Nsurvived_southampton, Ndied_southampton, Nsouthampton)
print("Ndied_southampton={}, Nsurvived_southampton={}, Nsouthampton={}"
      .format(Ndied_southampton, Nsurvived_southampton, Nsouthampton))
print("H_southampton={}".format(H_southampton))

# In[17]:

# NOTE: only the C, Q and S splits enter this sum; rows whose 'Embarked'
# value is anything else contribute nothing to H_embarked.
H_embarked = Ncherbourg/Ntot*H_cherbourg + Nqueenstown/Ntot*H_queenstown + Nsouthampton/Ntot*H_southampton
IG_embarked = H_parent - H_embarked
print("IG_embarked={}".format(IG_embarked))

# ## Off-the-shelf machine learning algorithm

# In[18]:

# Integer encodings of the categorical columns. Keeping them in one place
# lets the hand-written feature vectors further down refer to categories by
# name instead of by magic number.
SEX_CODES = {b'male': 1, b'female': 2}
SEX_UNKNOWN = 0
EMBARKED_CODES = {b'C': 0, b'Q': 1, b'S': 2}
EMBARKED_UNKNOWN = 3

# The builtin `int` is used as dtype: the `np.int` alias was removed from
# NumPy and raises AttributeError on current releases.
data_Sex = np.full(len(data), SEX_UNKNOWN, dtype=int)
for raw_value, code in SEX_CODES.items():
    data_Sex[data['Sex'] == raw_value] = code

# Every row starts as "unknown", so values other than C/Q/S keep that code
# without needing a separate pass.
data_Embarked = np.full(len(data), EMBARKED_UNKNOWN, dtype=int)
for raw_value, code in EMBARKED_CODES.items():
    data_Embarked[data['Embarked'] == raw_value] = code

# In[19]:

# One row per passenger, columns = (Pclass, encoded Sex, encoded Embarked).
features = np.array((data['Pclass'], data_Sex, data_Embarked), dtype=int).T
label = data['Survived']

# In[20]:

model = RandomForestClassifier(n_estimators=10)
model.fit(features, label)

# In[21]:

# prediction for a woman in first class, embarked in Southampton
ans = model.predict([[1, SEX_CODES[b'female'], EMBARKED_CODES[b'S']]])
# predict() returns one label per input row; a single row was passed.
survival = "survived" if ans[0] == 1 else "died"
print(survival)

# In[22]:

# prediction for a man in third class, embarked in Cherbourg
ans = model.predict([[3, SEX_CODES[b'male'], EMBARKED_CODES[b'C']]])
survival = "survived" if ans[0] == 1 else "died"
print(survival)