使用 TensorFlow 搭建神经网络预测泰坦尼克号乘客生存率

比赛地址:Titanic: Machine Learning from Disaster

本文将从以下几方面介绍解决方案:

  1. 添加库
  2. 定义全局变量
  3. 加载数据文件
  4. 清洗数据
  5. 特征工程
  6. 搭建神经网络(前向传播)
  7. 训练(反向传播)
  8. 测试模型效果 && 生成提交结果

1. 添加库

In [1]:
import tensorflow as tf
import model
import pandas as pd
import numpy as np

2. 定义全局变量

In [2]:
# File locations: training/test CSVs, Kaggle's sample labels, the model
# checkpoint prefix, and the submission output file.
train_data_file = r'./data/train.csv'
test_data_file = r'./data/test.csv'
test_label_file = r'./data/gender_submission.csv'
model_save_path = r'./ckpt/model'
output_file = r'./result.csv'

# Optimizer step size and mini-batch size.
learning_rate = 0.001
BATCH_SIZE = 20

# Network dimensions: 11 engineered input features, 20 hidden units,
# 2 output classes (Dead / Survived one-hot).
input_size = 11
hidden_size = 20
output_size = 2

# Number of training epochs and the dropout keep probability used while training.
epoch = 2000
KEEP_PROB = 0.5

3. 加载数据文件

In [3]:
# Load the training CSV; dtypes are pinned explicitly so numeric features
# arrive as float32 and the label as int64.
column_dtypes = {
    'Name': 'str',
    'Survived': 'int64',
    'Pclass': 'float32',
    'Sex': 'str',
    'Age': 'float32',
    'SibSp': 'float32',
    'Parch': 'float32',
    'Fare': 'float32',
    'Embarked': 'str',
}
data = pd.read_csv(train_data_file, sep=',', dtype=column_dtypes)
In [4]:
data.head()
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3.0 Braund, Mr. Owen Harris male 22.0 1.0 0.0 A/5 21171 7.250000 NaN S
1 2 1 1.0 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1.0 0.0 PC 17599 71.283302 C85 C
2 3 1 3.0 Heikkinen, Miss. Laina female 26.0 0.0 0.0 STON/O2. 3101282 7.925000 NaN S
3 4 1 1.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1.0 0.0 113803 53.099998 C123 S
4 5 0 3.0 Allen, Mr. William Henry male 35.0 0.0 0.0 373450 8.050000 NaN S
In [5]:
data.describe()
Out[5]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102744 0.806057 49.693428
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329224

4. 清洗数据

In [6]:
# Build one-hot training labels y_ with columns [Dead, Survived]
# from the integer Survived column.
survived = data.loc[:, 'Survived']
dead = survived.map(lambda s: 0 if s == 1 else 1)
labels = pd.concat([dead, survived], axis=1)
labels.columns = ['Dead', 'Survived']
# Convert to a float32 NumPy matrix for TensorFlow consumption.
y_ = labels.astype('float32').values
In [7]:
# Select the candidate feature columns as the raw training matrix x.
x = data.loc[:,['Name','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
In [8]:
# 清洗训练集数据前
x.head(10)
Out[8]:
Name Pclass Sex Age SibSp Parch Fare Embarked
0 Braund, Mr. Owen Harris 3.0 male 22.0 1.0 0.0 7.250000 S
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1.0 female 38.0 1.0 0.0 71.283302 C
2 Heikkinen, Miss. Laina 3.0 female 26.0 0.0 0.0 7.925000 S
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1.0 female 35.0 1.0 0.0 53.099998 S
4 Allen, Mr. William Henry 3.0 male 35.0 0.0 0.0 8.050000 S
5 Moran, Mr. James 3.0 male NaN 0.0 0.0 8.458300 Q
6 McCarthy, Mr. Timothy J 1.0 male 54.0 0.0 0.0 51.862499 S
7 Palsson, Master. Gosta Leonard 3.0 male 2.0 3.0 1.0 21.075001 S
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 3.0 female 27.0 0.0 2.0 11.133300 S
9 Nasser, Mrs. Nicholas (Adele Achem) 2.0 female 14.0 1.0 0.0 30.070801 C
In [9]:
# --- Clean the training features ---

# Sex: encode female -> 0, male -> 1.
x['Sex'] = x['Sex'].replace({'female': 0, 'male': 1}).astype('int32')

# Embarked: fill missing ports with 'S', then encode C/Q/S as 0/1/2.
x['Embarked'] = x['Embarked'].fillna('S')
x['Embarked'] = x['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Fare: fill missing values with the median fare.
x['Fare'] = x['Fare'].fillna(x['Fare'].median())

# Impute missing Age values by regressing Age on the other numeric
# columns with a random forest.
from sklearn.ensemble import RandomForestRegressor
age_cols = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
known_age = age_cols.loc[data.Age.notnull()]
unknown_age = age_cols.loc[data.Age.isnull()]
rfr = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
rfr.fit(known_age.values[:, 1:], known_age.values[:, 0])
x.loc[x.Age.isnull(), 'Age'] = rfr.predict(unknown_age.values[:, 1:])
c:\python36\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d
In [10]:
x.head(10)
Out[10]:
Name Pclass Sex Age SibSp Parch Fare Embarked
0 Braund, Mr. Owen Harris 3.0 1 22.000000 1.0 0.0 7.250000 2
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1.0 0 38.000000 1.0 0.0 71.283302 0
2 Heikkinen, Miss. Laina 3.0 0 26.000000 0.0 0.0 7.925000 2
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1.0 0 35.000000 1.0 0.0 53.099998 2
4 Allen, Mr. William Henry 3.0 1 35.000000 0.0 0.0 8.050000 2
5 Moran, Mr. James 3.0 1 23.601349 0.0 0.0 8.458300 1
6 McCarthy, Mr. Timothy J 1.0 1 54.000000 0.0 0.0 51.862499 2
7 Palsson, Master. Gosta Leonard 3.0 1 2.000000 3.0 1.0 21.075001 2
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 3.0 0 27.000000 0.0 2.0 11.133300 2
9 Nasser, Mrs. Nicholas (Adele Achem) 2.0 0 14.000000 1.0 0.0 30.070801 0

5. 特征工程

In [11]:
# --- Derive engineered features ---

# Child: 1 for passengers aged 16 or younger, else 0.
# (The original comment said 12, but the code has always used 16.)
x['Child'] = x['Age'].apply(lambda a: 1 if a <= 16 else 0).astype('int32')

# FamilySize: siblings/spouses + parents/children + the passenger.
x['FamilySize'] = (x['SibSp'] + x['Parch'] + 1).astype('int32')

# IsAlone: 1 when the passenger travels without family (FamilySize == 1).
x['IsAlone'] = x['FamilySize'].apply(lambda size: 1 if size == 1 else 0)

# Age_bin: coarse age bands, encoded 0-3.
x['Age_bin'] = pd.cut(x['Age'], bins=[0,16,32,48,1200],
                    labels=['Children','Teenage','Adult','Elder'])
x['Age_bin'] = x['Age_bin'].map({'Children': 0, 'Teenage': 1,
                                 'Adult': 2, 'Elder': 3})

# Fare_bin: fare bands (roughly quartiles), encoded 0-3.
x['Fare_bin'] = pd.cut(x['Fare'], bins=[-1,7.91,14.45,31,12000],
                    labels=['Low_fare','median_fare','Average_fare','high_fare'])
x['Fare_bin'] = x['Fare_bin'].map({'Low_fare': 0, 'median_fare': 1,
                                   'Average_fare': 2, 'high_fare': 3})

# 处理 Name 特征
import re
# Define function to extract titles from passenger names
def get_title(name):
    """Extract the honorific title (e.g. "Mr", "Mrs") from a passenger name.

    Returns the first word that is followed by a period, or "" when the
    name contains no such title.
    """
    # Raw string: the original ' ([A-Za-z]+)\.' is an invalid escape
    # sequence and raises a SyntaxWarning on modern Python.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
# Title: encode the honorific extracted from each passenger's name.
x['Title'] = x['Name'].apply(get_title)
# Collapse uncommon titles into a single "Rare" bucket.
x['Title'] = x['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                                 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Normalise French / alternative spellings to their common equivalents.
x['Title'] = x['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
# Map titles to integers; anything unmapped becomes 0.
x['Title'] = x['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5})
x['Title'] = x['Title'].fillna(0)

# Drop the raw columns that engineered features have replaced.
x.drop(["Name", "Age", "Fare"], axis=1, inplace=True)
In [12]:
x.head(10)
Out[12]:
Pclass Sex SibSp Parch Embarked Child FamilySize IsAlone Age_bin Fare_bin Title
0 3.0 1 1.0 0.0 2 0 2 0 1 0 1
1 1.0 0 1.0 0.0 0 0 2 0 2 3 3
2 3.0 0 0.0 0.0 2 0 1 1 1 1 2
3 1.0 0 1.0 0.0 2 0 2 0 2 3 3
4 3.0 1 0.0 0.0 2 0 1 1 2 1 1
5 3.0 1 0.0 0.0 1 0 1 1 1 1 1
6 1.0 1 0.0 0.0 2 0 1 1 3 3 1
7 3.0 1 3.0 1.0 2 1 5 0 0 2 4
8 3.0 0 0.0 2.0 2 0 3 0 1 1 3
9 2.0 0 1.0 0.0 0 1 2 0 0 2 3
In [13]:
# Convert the feature DataFrame to a NumPy matrix for TensorFlow.
x = x.values

6. 搭建神经网络(前向传播)

In [14]:
def input_placeholder(input_size, output_size):
    """Create feed placeholders: features, one-hot labels, dropout keep-prob."""
    features = tf.placeholder(tf.float32, [None, input_size])
    labels = tf.placeholder(tf.float32, [None, output_size])
    keep_prob = tf.placeholder(tf.float32)
    return features, labels, keep_prob

def forward(x, w1, w2, b1, b2, keep_prob=1.0):
    """Forward pass of a one-hidden-layer MLP; returns raw (un-softmaxed) logits.

    Layer sizes are fixed by the shapes of w1/w2 supplied by the caller.
    NOTE(review): dropout is applied to the pre-activation values, *before*
    ReLU — unusual ordering, preserved exactly as in the original design.
    """
    pre_act = tf.matmul(x, w1) + b1
    hidden = tf.nn.relu(tf.nn.dropout(pre_act, keep_prob=keep_prob))
    logits = tf.matmul(hidden, w2) + b2
    return logits

def loss(y, y_):
    """Mean softmax cross-entropy between logits ``y`` and one-hot labels ``y_``."""
    per_sample = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
    return tf.reduce_mean(per_sample)

def accuary(y, y_):
    """Fraction of rows whose argmax prediction matches the label.

    (Name is a historical misspelling of "accuracy", kept because callers
    reference ``model.accuary``.)
    """
    hits = tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)), tf.float32)
    return tf.reduce_mean(hits)

7. 训练(反向传播)

In [15]:
def Train():
    """Train the two-layer MLP on the prepared training data and save a checkpoint.

    Reads the module-level feature matrix ``x`` and one-hot labels ``y_``
    (NumPy arrays) plus the hyper-parameter globals, and writes the trained
    weights to ``model_save_path``.
    """
    # Bind the prepared NumPy data to local names. Unlike the original, we
    # do NOT declare `global x, y_`, so the module-level data arrays are no
    # longer clobbered with placeholder tensors as a side effect.
    X, Y_ = x, y_

    # Model parameters: input_size -> hidden_size -> output_size.
    w1 = tf.Variable(tf.random_normal([input_size, hidden_size], stddev=1.0, seed=2.0))
    w2 = tf.Variable(tf.random_normal([hidden_size, output_size], stddev=1.0, seed=2.0))
    b1 = tf.Variable(tf.zeros([hidden_size]), name='bias1')
    b2 = tf.Variable(tf.zeros([output_size]), name='bias2')

    x_ph, y_ph, keep_prob = model.input_placeholder(input_size, output_size)
    logits = model.forward(x_ph, w1, w2, b1, b2, keep_prob=keep_prob)
    loss = model.loss(logits, y_ph)
    y = tf.nn.softmax(logits)
    accuary = model.accuary(y, y_ph)

    # Backpropagation: Adam on the cross-entropy loss.
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # Saver for checkpointing the trained variables.
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        print('Train start...')
        # Ceiling division: the final partial batch is used, but no empty
        # batch is fed (the original's `len // BATCH_SIZE + 1` fed an empty
        # slice whenever the data size was an exact multiple of BATCH_SIZE).
        num_batches = (len(Y_) + BATCH_SIZE - 1) // BATCH_SIZE
        for i in range(epoch):
            for j in range(num_batches):
                start = j * BATCH_SIZE
                end = start + BATCH_SIZE
                # Run only the train op per step; the original also computed
                # loss/probabilities/accuracy per batch and discarded them.
                sess.run(train_op, feed_dict={x_ph: X[start:end],
                                              y_ph: Y_[start:end],
                                              keep_prob: KEEP_PROB})
            # Periodically report loss and accuracy on the full training set
            # (dropout disabled via keep_prob=1.0).
            if i % 400 == 0 or i + 1 == epoch:
                print(i, sess.run([loss, accuary],
                                  feed_dict={x_ph: X, y_ph: Y_, keep_prob: 1.0}))
        print('Train end.')

        # Persist the trained weights for later restoration in Test().
        print('Saving model...')
        saver.save(sess, model_save_path)
        print('Save finally.')


Train()
WARNING:tensorflow:From G:\workspace\kaggle_titanic\model.py:30: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

Train start...
0 [9.188999, 0.38271606]
400 [0.38498607, 0.83501685]
800 [0.36183944, 0.84511787]
1200 [0.35613248, 0.8552188]
1600 [0.35285753, 0.85409653]
1999 [0.35182995, 0.8552188]
Train end.
Saving model...
Save finally.

8. 测试模型效果 && 生成提交结果

In [16]:
# 导入测试数据加载函数,处理方式和步骤3-5相似,详情请见代码
from train import get_test_data
In [17]:
# 定义测试过程 && 生成提交结果
def Test():
    """Restore the trained model, evaluate it on the test set, and write
    the Kaggle submission CSV to ``output_file``."""
    # Test features, one-hot labels, and passenger ids (see train.get_test_data).
    X, Y_, PassengerId = get_test_data()

    # Rebuild the graph with the identical architecture and variable
    # creation order as Train(), so the checkpoint restores cleanly.
    w1 = tf.Variable(tf.random_normal([input_size, hidden_size], stddev=1.0, seed=2.0))
    w2 = tf.Variable(tf.random_normal([hidden_size, output_size], stddev=1.0, seed=2.0))
    b1 = tf.Variable(tf.zeros([hidden_size]), name='bias1')
    b2 = tf.Variable(tf.zeros([output_size]), name='bias2')

    x_ph, y_ph, keep_prob = model.input_placeholder(input_size, output_size)
    logits = model.forward(x_ph, w1, w2, b1, b2, keep_prob=keep_prob)
    loss_op = model.loss(logits, y_ph)
    probs = tf.nn.softmax(logits)
    acc_op = model.accuary(probs, y_ph)
    pred_op = tf.argmax(probs, 1)

    # Saver used to restore the checkpoint written by Train().
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, model_save_path)
        loss_val, preds, acc_val = sess.run(
            [loss_op, pred_op, acc_op],
            feed_dict={x_ph: X, y_ph: Y_, keep_prob: 1.0})
        print('loss:', loss_val)
        print('accuary:', acc_val)

    # Assemble the submission: one row per passenger, predicted Survived class.
    # NOTE(review): dtype of the written columns depends on PassengerId's
    # dtype from get_test_data — verify the CSV contains integers as Kaggle
    # expects.
    Survived = preds.reshape((-1, 1))
    result = np.hstack((PassengerId, Survived))
    result = pd.DataFrame(result, columns=['PassengerId', 'Survived'])
    result.to_csv(output_file, sep=',', encoding='utf-8', index=False)

# Clear the default graph stack (Train() already built a graph) so that
# Test() constructs a fresh one, then evaluate and write the submission.
tf.reset_default_graph() 
Test()
INFO:tensorflow:Restoring parameters from ./ckpt/model
loss: 0.9090207
accuary: 0.8732057