Titanic with Decision Tree

In [30]:
import numpy as np
import pandas as pd

Setup

In [70]:
DATA_HOME_DIR = "/home/tsu-nera/repo/kaggle/input/titanic/"
raw_data = pd.read_csv(DATA_HOME_DIR + 'train.csv', index_col=0)
test_data = pd.read_csv(DATA_HOME_DIR + 'test.csv', index_col=0)

Preprocess

In [71]:
test_ind = test_data.index

train_X = raw_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']]
train_y = raw_data[['Survived']]
test_X = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']]

all_data = pd.concat([train_X, test_X])

all_data.shape, train_y.shape
Out[71]:
((1309, 6), (891, 1))
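Before encoding, it can help to see which of these columns actually contain missing values, since Age and Cabin are imputed below. A minimal check:

In [ ]:
# count missing values per column in the combined data
all_data.isnull().sum()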
In [72]:
## One-hot encode passenger class
Pclass = pd.get_dummies(all_data['Pclass'])
Pclass.columns = ['1st', '2nd', '3rd']
In [73]:
## Split passengers into female, male, and child
def male_female_child(passenger):
    age, sex = passenger
    if np.isnan(age):
        age = 30  # impute missing ages with 30
    if age < 16:
        return 'child'
    else:
        return sex

Person = all_data[['Age', 'Sex']].apply(male_female_child, axis=1)
Person = pd.get_dummies(Person)
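As a quick sanity check on the encoding, the size of each category can be inspected:

In [ ]:
# number of passengers per Person category (dummy columns are 0/1)
Person.sum()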
In [74]:
# Flag whether the passenger is traveling alone
Alone = all_data.Parch + all_data.SibSp

def is_alone(alone):
    if alone > 0:
        return 0
    else:
        return 1

Alone = Alone.apply(is_alone)
Alone = pd.DataFrame(Alone)
Alone.columns = ['Alone']
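The same flag can be computed without a helper function. A vectorized sketch that should produce identical values (Alone_alt is just an illustrative name):

In [ ]:
# 1 if the passenger has no relatives aboard, else 0
Alone_alt = ((all_data.SibSp + all_data.Parch) == 0).astype(int).to_frame('Alone')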
In [75]:
def get_level(deck):
    # missing cabins get the placeholder 'CXX', so they fall into deck 'C'
    if pd.isnull(deck):
        deck = 'CXX'
    return deck[0]  # the deck is the first letter of the cabin

Level = all_data.Cabin.apply(get_level)
Level = pd.get_dummies(Level)
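To confirm which deck letters occur (and that missing cabins end up under 'C' with this placeholder), the dummy columns can be listed:

In [ ]:
# deck letters present after the mapping above
Level.columns.tolist()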
In [76]:
merge_data = pd.concat([Alone, Pclass, Person, Level], axis=1)

X = merge_data.iloc[:train_X.shape[0]]
y = train_y.values.ravel()

test_X = merge_data.iloc[train_X.shape[0]:]

X.shape, y.shape, test_X.shape
Out[76]:
((891, 15), (891,), (418, 15))
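Because pd.concat preserves the PassengerId index, the positional split can be sanity-checked against the original frames. A minimal check:

In [ ]:
# the split rows should line up with the original train/test indices
assert (X.index == raw_data.index).all()
assert (test_X.index == test_data.index).all()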

Build Model

In [66]:
# create model
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=17)

Training

In [67]:
clf.fit(X, y)
Out[67]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=17, splitter='best')
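With the tree fitted, its feature importances show which engineered columns drive the splits. A short sketch, assuming the column order of merge_data:

In [ ]:
# rank features by (Gini) importance
pd.Series(clf.feature_importances_, index=merge_data.columns).sort_values(ascending=False)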

Evaluate

In [68]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
In [69]:
scores.mean()
Out[69]:
0.80248958520625846
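The mean alone hides the fold-to-fold spread; the raw scores and their standard deviation are worth a look:

In [ ]:
# per-fold accuracies and their spread
scores, scores.std()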

Evaluate with KFold

In [52]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=17)
In [53]:
# running sums of the per-fold accuracies
score_train_tmp = 0
score_test_tmp = 0
In [54]:
# convert to NumPy arrays for positional indexing inside the folds
X = np.array(X)
y = np.array(y)
In [55]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit the model on the training fold
    clf.fit(X_train, y_train)

    # predictions on the training fold
    pred_train = clf.predict(X_train)

    # accuracy on the training fold
    accuracy = accuracy_score(y_train, pred_train)

    # accumulate training accuracy
    score_train_tmp += accuracy

    # predictions on the validation fold
    pred_test = clf.predict(X_test)

    # accuracy on the validation fold
    accuracy = accuracy_score(y_test, pred_test)

    # accumulate validation accuracy
    score_test_tmp += accuracy
In [56]:
score_train_tmp/K
Out[56]:
0.82463676190176005
In [57]:
score_test_tmp/K
Out[57]:
0.80247944259619608
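An equivalent, slightly tidier pattern collects the per-fold scores in lists instead of running sums. A sketch that should give the same averages:

In [ ]:
train_scores, test_scores = [], []
for train_index, test_index in kf.split(X):
    clf.fit(X[train_index], y[train_index])
    train_scores.append(accuracy_score(y[train_index], clf.predict(X[train_index])))
    test_scores.append(accuracy_score(y[test_index], clf.predict(X[test_index])))
np.mean(train_scores), np.mean(test_scores)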

GridSearch

In [58]:
from sklearn.model_selection import GridSearchCV

# search a full grid over these parameters
param_grid = {"max_depth": [2, 4, 6, 8, 10],
              "max_features": ['log2', 'sqrt', 'auto'],
              "min_samples_split": [2, 3, 5],
              "min_samples_leaf": [1, 5, 8],
              "criterion": ["gini", "entropy"]}

tree_grid = GridSearchCV(estimator=clf,
                         param_grid=param_grid,
                         scoring="accuracy",  # metric
                         cv=K,                # number of cross-validation folds
                         n_jobs=-1)           # number of cores

tree_grid.fit(X, y)

tree_grid_best = tree_grid.best_estimator_  # best estimator
print("Best Model Parameter: ", tree_grid.best_params_)
print("Best Model Score    : ", tree_grid.best_score_)
Best Model Parameter:  {'criterion': 'gini', 'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 8, 'min_samples_split': 2}
Best Model Score    :  0.812570145903
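The tuned estimator can also be re-scored directly with cross_val_score instead of re-typing the parameters:

In [ ]:
# cross-validate the best estimator found by the search
cross_val_score(tree_grid_best, X, y, cv=K, scoring="accuracy").mean()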

Testing

In [84]:
# rebuild the classifier with the best parameters found by the grid search;
# no random_state is set, so the fitted tree may vary between runs
model = tree.DecisionTreeClassifier(criterion="gini", max_depth=6, max_features="log2",
                                    min_samples_leaf=8, min_samples_split=2)
In [85]:
model.fit(X, y)
Out[85]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=8,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
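With max_depth=6 the tree is small enough to inspect visually. A minimal sketch using export_graphviz (the output file name 'tree.dot' is arbitrary):

In [ ]:
from sklearn.tree import export_graphviz

# write the fitted tree to a Graphviz .dot file for inspection
export_graphviz(model, out_file='tree.dot',
                feature_names=merge_data.columns,
                class_names=['Died', 'Survived'],
                filled=True)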
In [86]:
p_survived = model.predict(test_X.values)
In [87]:
submission = pd.DataFrame()
submission['PassengerId'] = test_ind
submission['Survived'] = p_survived
In [88]:
submission.to_csv('submission_1001_1.csv', index=False)
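A final check before uploading: the file should contain one row per test passenger (418) plus the header:

In [ ]:
# confirm the submission shape and a few rows
submission.shape, submission.head()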