#!/usr/bin/env python # coding: utf-8 # ## 使用 TensorFlow 搭建神经网络预测泰坦尼克号乘客生存率 # # 比赛地址:[Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic) # # 比赛内容:给定大量泰坦尼克号的乘客信息数据,要求预测乘客是否生存 # # 比赛思路:典型的二分类问题,本文搭建多层感知机(MLP)解决 # # 本文将从以下几方面介绍解决方案: # 1. 添加库 # 2. 定义全局变量 # 3. 加载数据文件 # 4. **数据可视化** # 5. **清洗数据** # 6. **特征工程** # 7. **搭建神经网络(前向传播)** # 8. **训练(反向传播)** # 9. **测试模型效果 && 生成提交结果** # # ## 1. 添加库 # In[7]: import tensorflow as tf import model import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns # ## 2. 定义全局变量 # In[8]: train_data_file = r'./data/train.csv' test_data_file = r'./data/test.csv' test_label_file = r'./data/gender_submission.csv' model_save_path = r'./ckpt/model' output_file = r'./result.csv' learning_rate = 0.001 BATCH_SIZE = 20 input_size = 11 hidden_size = 20 output_size = 2 epoch = 2000 KEEP_PROB = 0.5 # ## 3. 加载数据文件 # In[9]: data = pd.read_csv(train_data_file, sep=',', dtype={ 'Name' : 'str', 'Survived' : 'int64', 'Pclass' : 'float32', 'Sex' : 'str', 'Age' : 'float32', 'SibSp' : 'float32', 'Parch' : 'float32', 'Fare' : 'float32', 'Embarked' : 'str', } ) # In[10]: data.head(10) # 乘客信息说明: # * PassengerId => 乘客ID # * Pclass => 客舱等级(1/2/3等舱位) # * Name => 乘客姓名 # * Sex => 性别 # * Age => 年龄 # * SibSp => 兄弟姐妹数/配偶数 # * Parch => 父母数/子女数 # * Ticket => 船票编号 # * Fare => 船票价格 # * Cabin => 客舱号 # * Embarked => 登船港口 # In[11]: # 生成简要的统计信息 # count - 数量 # mean - 均值 # std - 标准差 # min - 最小值 # 25% - 下四分位 # 50% - 中位数 # 75% - 上四分位 # max - 最大值 data.describe() # ## 4. 数据可视化 # In[12]: # 性别与是否生存的关系 Sex data.groupby(['Sex','Survived'])['Survived'].count() # In[13]: data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar() # In[14]: # 船舱等级和生存与否的关系 Pclass data.groupby(['Pclass','Survived'])['Pclass'].count() # In[15]: data[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar() # In[16]: # 船舱等级 && 性别 和生存与否的关系 Pclass && Sex data[['Sex','Pclass','Survived']].groupby(['Pclass','Sex']).mean().plot.bar() # In[17]: # 不同年龄下的是否生存的分布情况: facet = sns.FacetGrid(data, hue="Survived",aspect=4) facet.map(sns.kdeplot,'Age',shade= True) facet.set(xlim=(0, data['Age'].max())) facet.add_legend() # In[18]: # 年龄 && 船舱等级 && 性别 与存活与否的关系 Age && Pclass && Sex fig, ax = plt.subplots(1, 2, figsize = (18, 8)) sns.violinplot("Pclass", "Age", hue="Survived", data=data, split=True, ax=ax[0]) ax[0].set_title('Pclass and Age vs Survived') ax[0].set_yticks(range(0, 110, 10)) sns.violinplot("Sex", "Age", hue="Survived", data=data, split=True, ax=ax[1]) ax[1].set_title('Sex and Age vs Survived') ax[1].set_yticks(range(0, 110, 10)) plt.show() # In[19]: # 按照年龄,将乘客划分为儿童、少年、成年和老年,分析四个群体的生还情况: bins = [0, 12, 18, 65, 100] data['Age_group'] = pd.cut(data['Age'], bins) by_age = data.groupby('Age_group')['Survived'].mean() by_age.plot(kind = 'bar') # In[20]: # 亲友的人数和存活与否的关系 data['Family_Size'] = data['Parch'] + data['SibSp'] + 1 data[['Family_Size','Survived']].groupby(['Family_Size']).mean().plot.bar() # ## 5. 清洗数据 # In[21]: # 首先进行预处理 # 提取标签,构造训练集标签 y_ y_ = data.loc[:,'Survived'] y_0 = y_.map(lambda x: 0 if x==1 else 1) y_1 = y_ y_ = pd.concat([y_0, y_1], axis=1) # 重命名列名 y_.columns = ['Dead','Survived'] # 转换数据类型 y_ = y_.astype('float32') # 获取 Numpy 格式的矩阵,便于 TensorFlow 处理 y_ = y_.values # In[22]: # 提取需要考虑的字段作为特征,构造训练集数据 x x = data.loc[:,['Name','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']] # In[23]: # 清洗训练集数据前 x.head(10) # In[24]: # 清洗数据 # 将 Sex 列中的性别字符串替换为数字 x['Sex'] = x['Sex'].replace(['female', 'male'],[0,1]).astype('int32') # 将 Embarked 出发地的字符替换为数字,同时将列中 NaN 值替换为 0 x['Embarked'] = x['Embarked'].fillna('S') mapping = {'C':0,'Q':1,'S':2} x['Embarked'] = x['Embarked'].map(mapping) # 将 Fare 收入中为 NaN 的填充为中位数 x['Fare'] = x['Fare'].fillna(x['Fare'].median()) # 使用 随机森林 预测 Age 字段的缺失值 from sklearn.ensemble import RandomForestRegressor age = data[['Age','Survived','Fare','Parch','SibSp','Pclass']] age_notnull = age.loc[(data.Age.notnull())] age_isnull = age.loc[(data.Age.isnull())] X = age_notnull.values[:,1:] Y = age_notnull.values[:,0] rfr = RandomForestRegressor(n_estimators=1000,n_jobs=-1) rfr.fit(X,Y) predictAges = rfr.predict(age_isnull.values[:,1:]) x.loc[(x.Age.isnull()),'Age'] = predictAges # In[25]: x.head(10) # ## 6. 特征工程 # In[26]: # 构造新的特征 # 添加 Child 特征,<=16 的为小孩子,设为1,否则为0 x['Child'] = x.Age.apply(lambda x: 1 if x<=16 else 0).astype('int32') # 添加 FamilySize 特征,表示家族大小 x['FamilySize'] = x['SibSp'] + x['Parch'] + 1 x['FamilySize'] = x['FamilySize'].astype('int32') # 添加 IsAlone 特征,表示是否独身一人。如果 FamilySize==1,则为1,否则为0 x['IsAlone'] = x.FamilySize.apply(lambda x: 1 if x==1 else 0) # 添加 Age_bin 特征,划分年龄区间 x['Age_bin'] = pd.cut(x['Age'], bins=[0,16,32,48,1200], labels=['Children','Teenage','Adult','Elder']) mapping = {'Children':0,'Teenage':1,'Adult':2,'Elder':3} x['Age_bin'] = x['Age_bin'].map(mapping) # 添加 Fare_bin 特征,划分收入区间 x['Fare_bin'] = pd.cut(x['Fare'], bins=[-1,7.91,14.45,31,12000], labels=['Low_fare','median_fare','Average_fare','high_fare']) mapping = {'Low_fare':0,'median_fare':1,'Average_fare':2,'high_fare':3} x['Fare_bin'] = x['Fare_bin'].map(mapping) # 处理 Name 特征 import re # Define function to extract titles from passenger names def get_title(name): title_search = re.search(' ([A-Za-z]+)\.', name) # If the title exists, extract and return it. if title_search: return title_search.group(1) return "" # Create a new feature Title, containing the titles of passenger names x['Title'] = x['Name'].apply(get_title) # Group all non-common titles into one single grouping "Rare" x['Title'] = x['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') x['Title'] = x['Title'].replace('Mlle', 'Miss') x['Title'] = x['Title'].replace('Ms', 'Miss') x['Title'] = x['Title'].replace('Mme', 'Mrs') mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} x['Title'] = x['Title'].map(mapping) x['Title'] = x['Title'].fillna(0) # 丢弃无用字段 x.drop(["Name", "Age", "Fare"], axis=1, inplace=True) # In[27]: x.head(10) # In[28]: # 获取 Numpy 格式的矩阵,便于 TensorFlow 处理 x = x.values # ## 7. 搭建神经网络(前向传播) # In[29]: def input_placeholder(input_size, output_size): # 输入占位符 x = tf.placeholder(dtype=tf.float32, shape=[None, input_size]) y_ = tf.placeholder(dtype=tf.float32, shape=[None, output_size]) keep_prob = tf.placeholder(tf.float32) return x, y_, keep_prob def forward(x, w1, w2, b1, b2, keep_prob=1.0): # 模型结构 # 定义一个多层感知机(MLP),最后加一个softmax归一化进行二分类 # 输入定义9个神经元,隐藏层定义100个神经元,输出层定义两个神经元(二分类),然后做一个softmax a = tf.matmul(x, w1) + b1 a = tf.nn.dropout(a, keep_prob=keep_prob) a = tf.nn.relu(a) y = tf.matmul(a, w2) + b2 return y def loss(y, y_): # 交叉熵 损失 loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)) return loss def accuary(y, y_): # 预测准确率 correct_pred = tf.equal(tf.argmax(y_, 1),tf.argmax(y,1)) acc = tf.reduce_mean(tf.cast(correct_pred,tf.float32)) return acc # ## 8.训练(反向传播) # In[30]: def Train(): global x, y_ # 训练过程 X, Y_ = x, y_ # 模型 w1 = tf.Variable(tf.random_normal([input_size, hidden_size], stddev=1.0, seed=2.0)) w2 = tf.Variable(tf.random_normal([hidden_size, output_size], stddev=1.0, seed=2.0)) b1 = tf.Variable(tf.zeros([hidden_size]), name='bias1') b2 = tf.Variable(tf.zeros([output_size]), name='bias2') x, y_, keep_prob = model.input_placeholder(input_size, output_size) y = model.forward(x, w1, w2, b1, b2, keep_prob=keep_prob) loss = model.loss(y, y_) y = tf.nn.softmax(y) accuary = model.accuary(y, y_) # 定义训练(反向传播)过程 train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) # tf saver saver = tf.train.Saver() with tf.Session() as sess: # 变量初始化 init_op = tf.global_variables_initializer() sess.run(init_op) # 训练 print('Train start...') for i in range(epoch): for j in range(len(Y_) // BATCH_SIZE + 1): start = j * BATCH_SIZE end = start + BATCH_SIZE # 反向传播 _, loss_result, y_result, acc_result = sess.run([train_op, loss, y, accuary], feed_dict={x:X[start:end], y_:Y_[start:end], keep_prob: KEEP_PROB}) # 输出每个 epoch 之后的 loss 和 准确率 if i%400==0 or i==0 or i+1==epoch: print(i, sess.run([loss, accuary], feed_dict={x:X, y_:Y_, keep_prob: 1.0})) print('Train end.') # 保存模型到本地 print('Saving model...') saver.save(sess, model_save_path) print('Save finally.') Train() # ## 9. 测试模型效果 && 生成提交结果 # In[25]: # 导入测试数据加载函数,处理方式和步骤3-5相似,详情请见代码 from train import get_test_data # In[26]: # 定义测试过程 && 生成提交结果 def Test(): # 测试 # 提取测试数据 X, Y_, PassengerId = get_test_data() # 模型 w1 = tf.Variable(tf.random_normal([input_size, hidden_size], stddev=1.0, seed=2.0)) w2 = tf.Variable(tf.random_normal([hidden_size, output_size], stddev=1.0, seed=2.0)) b1 = tf.Variable(tf.zeros([hidden_size]), name='bias1') b2 = tf.Variable(tf.zeros([output_size]), name='bias2') x, y_, keep_prob = model.input_placeholder(input_size, output_size) y = model.forward(x, w1, w2, b1, b2, keep_prob=keep_prob) loss = model.loss(y, y_) y = tf.nn.softmax(y) accuary = model.accuary(y, y_) y = tf.argmax(y, 1) #保存模型对象saver saver = tf.train.Saver() with tf.Session() as sess: # 变量初始化 saver.restore(sess, model_save_path) loss, y, acc_result = sess.run([loss, y, accuary], feed_dict={x:X, y_:Y_, keep_prob: 1.0}) print('loss:',loss) print('accuary:',acc_result) # 生成输出文件 # 准备输出数据 Survived = y.reshape((-1,1)) result = np.hstack((PassengerId, Survived)) result = pd.DataFrame(result, columns=['PassengerId', 'Survived']) result.to_csv(output_file, sep=',', encoding='utf-8', index=False) # 清除默认图的堆栈,并设置全局图为默认图 tf.reset_default_graph() Test()