Random forest

I use 128 trees regarding this publication

In [123]:
import numpy as np
import pandas as pd
from sklearn import tree

input_file = "/Users/fede/development/data-science/titanic/titanic/train.csv"
df = pd.read_csv(input_file, header = 0)
In [124]:
df.head()
Out[124]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Mapping values

In [125]:
d = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(d)
#(C=Cherbourg, Q=Queenstown, S=Southampton)
c = {'C': 0,'Q':1,'S':2}
df['Embarked'] = df['Embarked'].map(c)

Remove unnecessary values

In [126]:
df = df.drop('Name', 1)
df = df.drop('Ticket', 1)
df = df.drop('Cabin', 1)
df = df.drop('PassengerId', 1)
In [127]:
df = df[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare', 'Survived']]

df.head(10)
Out[127]:
Pclass Sex Age SibSp Parch Fare Survived
0 3 1 22.0 1 0 7.2500 0
1 1 0 38.0 1 0 71.2833 1
2 3 0 26.0 0 0 7.9250 1
3 1 0 35.0 1 0 53.1000 1
4 3 1 35.0 0 0 8.0500 0
5 3 1 NaN 0 0 8.4583 0
6 1 1 54.0 0 0 51.8625 0
7 3 1 2.0 3 1 21.0750 0
8 3 0 27.0 0 2 11.1333 1
9 2 0 14.0 1 0 30.0708 1

Build random forest trees

In [128]:
from sklearn.ensemble import RandomForestClassifier

df1 = df.astype(object).replace(np.nan, '-1')

y = df1["Survived"]
X = df1[features]

clf = RandomForestClassifier(n_estimators = 128)
clf = clf.fit(X, y)

Compare with test cases

In [129]:
#Read train file

input_file_test = "/Users/fede/development/data-science/titanic/titanic/test.csv"
df_test = pd.read_csv(input_file, header = 0)

# map values
df_test['Sex'] = df_test['Sex'].map(d)
df_test['Embarked'] = df_test['Embarked'].map(c)

#remove unnecessary values
df_test = df_test.drop('Name', 1)
df_test = df_test.drop('Ticket', 1)
df_test = df_test.drop('Cabin', 1)
df_test = df_test.drop('PassengerId', 1)


df_test = df_test[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare', 'Survived']]

df_test_survived = df_test.loc[df_test['Survived'] == 1]
df_test_victims = df_test.loc[df_test['Survived'] == 0]


df_test_survived = df_test_survived[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare']]
df_test_victims = df_test_victims[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare']]

Survivors

In [130]:
df_test_survived = df_test_survived.astype(object).replace(np.nan, '-1')

survivors = clf.predict(df_test_survived)
In [131]:
from collections import Counter

total = Counter(survivors)
percetange_of_error_survivors =  total[0] * 100 / total[1]
percetange_of_error_survivors
Out[131]:
3.323262839879154

Victims

In [132]:
df_test_victims = df_test_victims.astype(object).replace(np.nan, '-1')

victims = clf.predict(df_test_victims)

total = Counter(victims)
percetange_of_error_victims =  total[1] * 100 / total[0]
percetange_of_error_victims
Out[132]:
0.9191176470588235