#!/usr/bin/env python
# coding: utf-8

# # Kaggle: Titanic: Machine Learning from Disaster
# https://www.kaggle.com/c/titanic

# In[8]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

np.random.seed(123)
from tensorflow import set_random_seed  # TensorFlow 1.x API (tf.random.set_seed in TF 2.x)
set_random_seed(123)

train = pd.read_csv('titanic/train.csv', index_col=0)
test = pd.read_csv('titanic/test.csv', index_col=0)


# In[3]:

train.head()


# ### Drop Survived and Ticket, then combine train with test

# In[4]:

train_tmp = train.drop(['Survived', 'Ticket'], axis=1)
test_tmp = test.drop(['Ticket'], axis=1)
df = pd.concat([train_tmp, test_tmp])
df.info()


# ### Name --> Title --> Number

# In[5]:

# Name to Title: extract the word before the period (e.g. "Braund, Mr. Owen Harris" -> "Mr")
df = df.assign(Title=df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False))
title_list = df.Title.unique()
print(title_list)


# In[6]:

# Title to number (0-17)
df.Title = df.Title.replace(df.Title.unique(), np.arange(len(df.Title.unique())))
# Drop Name column
df = df.drop(['Name'], axis=1)
df.head()


# ### Sex --> male:0, female:1

# In[1501]:

df.Sex = df.Sex.replace({'male': 0, 'female': 1})


# ### Cabin --> Number: nan:0, C:1, E:2, G:3, D:4, A:5, B:6, F:7, T:8

# In[1502]:

# Keep only the deck letter (first character of Cabin), then map each deck to a number
df = df.assign(Cabin=df.Cabin.str[0])
cabin_list = df.Cabin.unique()
df.Cabin = df.Cabin.replace(cabin_list, np.arange(len(cabin_list)))
print(cabin_list)
print(df.Cabin.unique())


# ### Embarked --> S:0, C:1, Q:2, nan

# In[1503]:

df.Embarked.unique()


# In[1504]:

df.Embarked = df.Embarked.replace({'S': 0, 'C': 1, 'Q': 2})


# ## zscore or normalization:
# * Age: including NaN
# * Fare: including NaN
#
# Z = (x - x.mean) / x.std
# N = (x - x.min) / (x.max - x.min)
#
# sklearn.preprocessing.MinMaxScaler raises an error on columns containing NaN, so plain pandas operations are used instead.

# In[1505]:

# Normalization (min-max scaling)
def normalize(df_col):
    df_col = (df_col - df_col.min()) / (df_col.max() - df_col.min())
    return df_col


# In[1506]:

# Standardization (z-score)
def zscore(df_col):
    df_col = (df_col - df_col.mean()) / df_col.std()
    return df_col


# In[1507]:

df.Age = zscore(df.Age)
df.Fare = zscore(df.Fare)
# df.Age = normalize(df.Age)
# df.Fare = normalize(df.Fare)
# for col in df.columns:
#     df[col] = zscore(df[col])
df.describe()


# ## Separate Notnull data from Null data
#
# Make a copy of df first: df0 = df.copy()
# * Age
# * Embarked
# * Fare

# In[1508]:

df0 = df.copy()
df0.info()


# In[1509]:

Age_null = df[df.Age.isnull()]
df = df[df.Age.notnull()]
Embarked_null = df[df.Embarked.isnull()]
df = df[df.Embarked.notnull()]
Fare_null = df[df.Fare.isnull()]
df = df[df.Fare.notnull()]


# ## Notnull Data: df.shape = (1043, 9)

# In[1510]:

print(df.shape)
df.info()
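
# ### (Optional) Baseline imputation with median / mode
#
# A minimal sketch of a simpler alternative to the model-based imputation below:
# fill Age and Fare with the column median and Embarked with the mode. This is not
# part of the original pipeline; `df_simple` is a hypothetical copy used only for
# comparison, and the notebook continues with the neural-network fill in the next cells.

# In[ ]:

df_simple = df0.copy()  # work on a copy so the pipeline below is unaffected
df_simple.Age = df_simple.Age.fillna(df_simple.Age.median())
df_simple.Fare = df_simple.Fare.fillna(df_simple.Fare.median())
df_simple.Embarked = df_simple.Embarked.fillna(df_simple.Embarked.mode()[0])
df_simple.info()
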
# ## Model to fill NaN in Fare, Embarked, Age

# In[1511]:

from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, BatchNormalization

# Model to fill missing values in Fare, Embarked, Age
def fill_data(col):
    n_cols = len(df.columns) - 1          # number of input features (all columns except the target)
    num = len(df[col].unique())           # number of classes, used for the Embarked classifier

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(n_cols,)))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    if col == 'Embarked':
        # Classification: 3 classes (S, C, Q)
        model.add(Dense(num, activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
    else:  # 'Fare', 'Age'
        # Regression
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

    # Train on the rows where no value is missing
    data = df.drop([col], axis=1)
    epochs = 100
    hist = model.fit(data, df[col], epochs=epochs, batch_size=32)

    # Predict the missing values from df0 (the copy that still contains NaN)
    null_data = df0[df0[col].isnull()]
    null_data = null_data.drop([col], axis=1)
    pred = model.predict(null_data)

    if col == 'Embarked':
        pred = pred.argmax(axis=1)
        plt.plot(hist.history['acc'], 'b-', label='acc')
        plt.plot(hist.history['loss'], 'r-', label='loss')
        plt.xlabel('epochs')
        plt.legend()
        plt.show()

    # Write the predictions back into df0
    pred = pred.reshape(-1, )
    idx = df0[df0[col].isnull()].index.values
    for n, i in enumerate(idx):
        df0.loc[i, col] = pred[n]


# In[1512]:

fill_data('Embarked')  # id: 62, 830


# In[1513]:

fill_data('Fare')  # id: 1044


# In[1514]:

fill_data('Age')  # id: 6, 18, 20, 27, 29, 30


# In[1517]:

train0 = df0[0:891].copy()
test0 = df0[891:].copy()


# ## Model to estimate Survived for submission

# In[1518]:

df0_cols = len(df0.columns)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(df0_cols,)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

epochs = 300
hist = model.fit(train0, train.Survived, epochs=epochs, batch_size=5)
pred = model.predict(test0)


# In[1519]:

# print(model.metrics_names)
plt.plot(hist.history['acc'], 'b-', label='acc')
plt.plot(hist.history['loss'], 'r-', label='loss')
plt.xlabel('epochs')
plt.legend()
plt.show()


# In[ ]:

result = pred.argmax(axis=1)


# ## Submission file:

# In[1523]:

submission = pd.DataFrame({'PassengerId': test.index, 'Survived': result})
submission.to_csv('titanic/submission.csv', index=False)
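

# ### (Optional) Hold-out validation check
#
# A minimal sketch, not part of the original pipeline: the Survived model above is fit on
# all 891 training rows, so the plotted accuracy is training accuracy only. Keras'
# `validation_split` can hold out part of train0 to estimate generalization before
# submitting. `model_val`, `hist_val`, the 0.2 split, epochs=50 and batch_size=32 are
# illustrative assumptions; the history keys ('acc'/'val_acc') follow the old Keras
# naming already used above.

# In[ ]:

model_val = Sequential()
model_val.add(Dense(64, activation='relu', input_shape=(df0_cols,)))
model_val.add(Dropout(0.5))
model_val.add(Dense(32, activation='relu'))
model_val.add(Dropout(0.5))
model_val.add(Dense(2, activation='softmax'))
model_val.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

# Hold out the last 20% of train0 as a validation set (assumed split ratio)
hist_val = model_val.fit(train0, train.Survived, epochs=50, batch_size=32,
                         validation_split=0.2, verbose=0)

plt.plot(hist_val.history['acc'], 'b-', label='train acc')
plt.plot(hist_val.history['val_acc'], 'g-', label='val acc')
plt.xlabel('epochs')
plt.legend()
plt.show()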