#!/usr/bin/env python
# coding: utf-8

# # Kaggle: Titanic: Machine Learning from Disaster
# https://www.kaggle.com/c/titanic

# In[8]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

np.random.seed(123)
from tensorflow import set_random_seed  # TensorFlow 1.x API (tf.random.set_seed in TF 2.x)
set_random_seed(123)

train = pd.read_csv('titanic/train.csv', index_col=0)
test = pd.read_csv('titanic/test.csv', index_col=0)


# In[3]:

train.head()


# ### Drop Survived and Ticket, then combine train with test

# In[4]:

train_tmp = train.drop(['Survived', 'Ticket'], axis=1)
test_tmp = test.drop(['Ticket'], axis=1)
df = pd.concat([train_tmp, test_tmp])
df.info()


# ### Name --> Title --> Number

# In[5]:

# Name to Title: extract the word before the period (e.g. "Braund, Mr. Owen Harris" -> "Mr")
df = df.assign(Title=df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False))
title_list = df.Title.unique()
print(title_list)


# In[6]:

# Title to number (0-17)
df.Title = df.Title.replace(df.Title.unique(), np.arange(len(df.Title.unique())))
# Drop Name column
df = df.drop(['Name'], axis=1)
df.head()


# ### Sex --> male:0, female:1

# In[1501]:

df.Sex = df.Sex.replace({'male': 0, 'female': 1})


# ### Cabin --> Number: nan:0, C:1, E:2, G:3, D:4, A:5, B:6, F:7, T:8

# In[1502]:

# Keep only the deck letter (first character of Cabin), then map each deck to a number
df = df.assign(Cabin=df.Cabin.str[0])
cabin_list = df.Cabin.unique()
df.Cabin = df.Cabin.replace(cabin_list, np.arange(len(cabin_list)))
print(cabin_list)
print(df.Cabin.unique())


# ### Embarked --> S:0, C:1, Q:2, nan

# In[1503]:

df.Embarked.unique()


# In[1504]:

df.Embarked = df.Embarked.replace({'S': 0, 'C': 1, 'Q': 2})


# ## zscore or normalization:
# * Age: including NaN
# * Fare: including NaN
#
# Z = (x - x.mean) / x.std
# N = (x - x.min) / (x.max - x.min)
#
# sklearn.preprocessing.MinMaxScaler raises an error on columns containing NaN, so plain pandas operations are used instead.

# In[1505]:

# Normalization (min-max scaling)
def normalize(df_col):
    df_col = (df_col - df_col.min()) / (df_col.max() - df_col.min())
    return df_col


# In[1506]:

# Standardization (z-score)
def zscore(df_col):
    df_col = (df_col - df_col.mean()) / df_col.std()
    return df_col


# In[1507]:

df.Age = zscore(df.Age)
df.Fare = zscore(df.Fare)
# df.Age = normalize(df.Age)
# df.Fare = normalize(df.Fare)
# for col in df.columns:
#     df[col] = zscore(df[col])
df.describe()


# ## Separate Notnull data from Null data
#
# Make a copy of df first: df0 = df.copy()
# * Age
# * Embarked
# * Fare

# In[1508]:

df0 = df.copy()
df0.info()


# In[1509]:

Age_null = df[df.Age.isnull()]
df = df[df.Age.notnull()]
Embarked_null = df[df.Embarked.isnull()]
df = df[df.Embarked.notnull()]
Fare_null = df[df.Fare.isnull()]
df = df[df.Fare.notnull()]


# ## Notnull Data: df.shape = (1043, 9)

# In[1510]:

print(df.shape)
df.info()
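
# ### (Optional) Baseline imputation with median / mode
#
# A minimal sketch of a simpler alternative to the model-based imputation below:
# fill Age and Fare with the column median and Embarked with the mode. This is not
# part of the original pipeline; `df_simple` is a hypothetical copy used only for
# comparison, and the notebook continues with the neural-network fill in the next cells.

# In[ ]:

df_simple = df0.copy()  # work on a copy so the pipeline below is unaffected
df_simple.Age = df_simple.Age.fillna(df_simple.Age.median())
df_simple.Fare = df_simple.Fare.fillna(df_simple.Fare.median())
df_simple.Embarked = df_simple.Embarked.fillna(df_simple.Embarked.mode()[0])
df_simple.info()
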
# ## Model to fill NaN in Fare, Embarked, Age

# In[1511]:

from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, BatchNormalization

# Model to fill missing values in Fare, Embarked, Age
def fill_data(col):
    n_cols = len(df.columns) - 1          # number of input features (all columns except the target)
    num = len(df[col].unique())           # number of classes, used for the Embarked classifier

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(n_cols,)))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    if col == 'Embarked':
        # Classification: 3 classes (S, C, Q)
        model.add(Dense(num, activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
    else:  # 'Fare', 'Age'
        # Regression
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

    # Train on the rows where no value is missing
    data = df.drop([col], axis=1)
    epochs = 100
    hist = model.fit(data, df[col], epochs=epochs, batch_size=32)

    # Predict the missing values from df0 (the copy that still contains NaN)
    null_data = df0[df0[col].isnull()]
    null_data = null_data.drop([col], axis=1)
    pred = model.predict(null_data)

    if col == 'Embarked':
        pred = pred.argmax(axis=1)
        plt.plot(hist.history['acc'], 'b-', label='acc')
        plt.plot(hist.history['loss'], 'r-', label='loss')
        plt.xlabel('epochs')
        plt.legend()
        plt.show()

    # Write the predictions back into df0
    pred = pred.reshape(-1, )
    idx = df0[df0[col].isnull()].index.values
    for n, i in enumerate(idx):
        df0.loc[i, col] = pred[n]


# In[1512]:

fill_data('Embarked')  # id: 62, 830


# In[1513]:

fill_data('Fare')  # id: 1044


# In[1514]:

fill_data('Age')  # id: 6, 18, 20, 27, 29, 30


# In[1517]:

train0 = df0[0:891].copy()
test0 = df0[891:].copy()


# ## Model to estimate Survived for submission

# In[1518]:

df0_cols = len(df0.columns)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(df0_cols,)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

epochs = 300
hist = model.fit(train0, train.Survived, epochs=epochs, batch_size=5)
pred = model.predict(test0)


# In[1519]:

# print(model.metrics_names)
plt.plot(hist.history['acc'], 'b-', label='acc')
plt.plot(hist.history['loss'], 'r-', label='loss')
plt.xlabel('epochs')
plt.legend()
plt.show()


# In[ ]:

result = pred.argmax(axis=1)


# ## Submission file:

# In[1523]:

submission = pd.DataFrame({'PassengerId': test.index, 'Survived': result})
submission.to_csv('titanic/submission.csv', index=False)
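

# ### (Optional) Hold-out validation check
#
# A minimal sketch, not part of the original pipeline: the Survived model above is fit on
# all 891 training rows, so the plotted accuracy is training accuracy only. Keras'
# `validation_split` can hold out part of train0 to estimate generalization before
# submitting. `model_val`, `hist_val`, the 0.2 split, epochs=50 and batch_size=32 are
# illustrative assumptions; the history keys ('acc'/'val_acc') follow the old Keras
# naming already used above.

# In[ ]:

model_val = Sequential()
model_val.add(Dense(64, activation='relu', input_shape=(df0_cols,)))
model_val.add(Dropout(0.5))
model_val.add(Dense(32, activation='relu'))
model_val.add(Dropout(0.5))
model_val.add(Dense(2, activation='softmax'))
model_val.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

# Hold out the last 20% of train0 as a validation set (assumed split ratio)
hist_val = model_val.fit(train0, train.Survived, epochs=50, batch_size=32,
                         validation_split=0.2, verbose=0)

plt.plot(hist_val.history['acc'], 'b-', label='train acc')
plt.plot(hist_val.history['val_acc'], 'g-', label='val acc')
plt.xlabel('epochs')
plt.legend()
plt.show()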