import numpy as np
import pandas as pd
from pandas import DataFrame
# Directory that holds the Titanic train/test CSV files.
DATA_HOME_DIR = "/home/tsu-nera/repo/kaggle/input/titanic/"

# Both CSVs are read with their first column as the row index. The test index
# is kept because it is written back out as the PassengerId column of the
# submission file.
train_csv_path = DATA_HOME_DIR + 'train.csv'
test_csv_path = DATA_HOME_DIR + 'test.csv'
row_data = pd.read_csv(train_csv_path, index_col=0)
test_data = pd.read_csv(test_csv_path, index_col=0)
test_ind = test_data.index

# The same raw feature columns are taken from both splits; the label column
# is only read from the training data.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']
train_X = row_data[feature_columns]
train_y = row_data[['Survived']]
test_X = test_data[feature_columns]

# Stack train on top of test so the encodings below are built over both.
all_data = pd.concat([train_X, test_X])

# Notebook-style shape check; recorded output: ((1309, 6), (891, 1))
all_data.shape, train_y.shape
# Split by passenger class: one indicator column per class value.
# Assigning exactly three labels assumes three distinct Pclass values.
Pclass = pd.get_dummies(all_data.Pclass)
Pclass.columns = ['1st', '2nd', '3rd']

# Split by sex; the female / male / child variant is built further below.
Sex = pd.get_dummies(all_data.Sex)
def male_female_child(passenger, child_age=16):
    """Label a passenger as ``'child'`` or by their sex.

    Args:
        passenger: An ``(age, sex)`` pair, for example one row of
            ``all_data[['Age', 'Sex']]``.
        child_age: Ages strictly below this value count as a child.
            Defaults to 16.

    Returns:
        ``'child'`` when the age is known and below ``child_age``; otherwise
        ``sex`` is returned unchanged.
    """
    age, sex = passenger
    # A missing age is treated as an adult. pd.isnull is used instead of
    # np.isnan because np.isnan raises TypeError when the age is None.
    if pd.isnull(age):
        return sex
    return 'child' if age < child_age else sex
# Derive the female / male / child label for every row, then one-hot encode it.
person_labels = all_data[['Age', 'Sex']].apply(male_female_child, axis=1)
Person = pd.get_dummies(person_labels)

# Classify passengers as travelling alone or not; start from the sum of the
# Parch and SibSp columns.
Alone = all_data['Parch'] + all_data['SibSp']
def is_alone(alone):
    """Convert a relatives count into a 0/1 "travelling alone" flag.

    Args:
        alone: The count to test; the caller passes ``Parch + SibSp``.

    Returns:
        ``0`` if ``alone`` is greater than zero, otherwise ``1``.
    """
    return 0 if alone > 0 else 1
# Map every count to its 0/1 flag and wrap the result in a one-column frame
# named 'Alone' so it can be merged with the other feature frames.
Alone = pd.DataFrame({'Alone': Alone.apply(is_alone)})
def get_level(deck, default='CXX'):
    """Return the deck letter (first character) of a cabin code.

    Args:
        deck: Cabin string such as ``'C85'``, or a missing value.
        default: Cabin code substituted when ``deck`` is missing or empty.
            The default ``'CXX'`` places unknown cabins on level ``'C'``.

    Returns:
        The first character of ``deck``, or of ``default`` when ``deck`` is
        missing or empty.
    """
    # NOTE(review): with the default placeholder, unknown cabins cannot be
    # told apart from real deck-C cabins -- confirm this is intended.
    # Empty strings are handled like missing values so deck[0] cannot raise
    # IndexError.
    if pd.isnull(deck) or deck == '':
        deck = default
    return deck[0]
# One-hot encode the deck letter taken from each cabin code.
deck_letters = all_data.Cabin.apply(get_level)
Level = pd.get_dummies(deck_letters)

# Join the encoded feature frames column-wise on their shared row index.
merge_data = Alone
for feature_frame in (Pclass, Person, Level):
    merge_data = pd.merge(merge_data, feature_frame,
                          left_index=True, right_index=True)

# all_data was built as train rows followed by test rows, so the two splits
# are recovered by position.
n_train = train_X.shape[0]
X = merge_data.iloc[:n_train]
y = np.ravel(train_y.values)
test_X = merge_data.iloc[n_train:]

# Notebook-style shape check; recorded output: ((891, 15), (891,), (418, 15))
X.shape, y.shape, test_X.shape
# Create and fit the model.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=10)
model.fit(X, y)
# The notebook printed the fitted estimator's repr here. It is kept as a
# comment: as code it built a throwaway estimator, and some of its keyword
# arguments (e.g. multi_class) are deprecated in recent scikit-learn.
# LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
#                    intercept_scaling=1, max_iter=100, multi_class='ovr',
#                    n_jobs=1, penalty='l2', random_state=None,
#                    solver='liblinear', tol=0.0001, verbose=0,
#                    warm_start=False)

# Predict on the DataFrame itself, matching the input type used for fit();
# mixing a DataFrame fit with an ndarray predict triggers a feature-name
# warning in newer scikit-learn.
p_survived = model.predict(test_X)

# Write one row per test passenger, keyed by the index read from test.csv.
submission = pd.DataFrame({'PassengerId': test_ind, 'Survived': p_survived})
submission.to_csv('submission.csv', index=False)