#!/usr/bin/env python
# coding: utf-8

# In[1]:


import warnings

# Install a blanket "ignore" filter so that no warning of any category is
# printed for the remainder of the run (keeps notebook output clean, but it
# also hides deprecation notices raised by the libraries used below).
warnings.simplefilter('ignore')


# In[2]:


import urllib.request
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing


# ### 1. Download the Titanic passenger dataset

# In[3]:


url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'
file_path = 'data/titanic3.xls'
# Only download when the spreadsheet is not already cached locally.
if not os.path.isfile(file_path):
    # urlretrieve opens the target path for writing but does not create
    # missing parent directories, so make sure the folder exists first.
    # (`or '.'` keeps this valid if file_path ever has no directory part.)
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    result = urllib.request.urlretrieve(url, file_path)
    print('download:', result)


# ### 2. Read the data with pandas and preprocess it

# #### 2.1 Load titanic3.xls into a DataFrame

# In[4]:


all_df = pd.read_excel(file_path)


# #### 2.2 Look at the first two rows

# In[5]:


all_df.head(2)


# #### 2.3 Keep only the columns that are needed

# In[6]:


# 'survived' has to stay first: later code takes column 0 as the label.
cols = [
    'survived',
    'name',
    'pclass',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
    'embarked',
]
all_df = all_df[cols]


# In[7]:


all_df.head(2)


# #### 2.4 Count the null values in each column

# In[8]:


all_df.isna().sum()


# #### 2.5 Drop the name column

# In[9]:


df = all_df.drop(labels=['name'], axis='columns')


# #### 2.6 Replace null age and fare values with the column mean

# In[10]:


# Compute both means up front; the two columns do not affect each other.
age_mean = df['age'].mean()
fare_mean = df['fare'].mean()

df['age'] = df['age'].fillna(age_mean)
df['fare'] = df['fare'].fillna(fare_mean)


# In[11]:


df.head(2)


# #### 2.7 Encode the sex column as 0 and 1

# In[12]:


# female -> 0, male -> 1
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)


# In[13]:


df.head(2)


# #### 2.8 One-hot encode the embarked column

# In[14]:


x_one_hot_df = pd.get_dummies(df, columns=['embarked'])


# In[15]:


x_one_hot_df.head(2)


# ### 3. Convert the DataFrame to an array

# #### 3.1 DataFrame to array

# In[16]:


# Cast explicitly to float: get_dummies can emit bool indicator columns (the
# default in pandas >= 2.0), and mixing bool with numeric columns makes
# `.values` an object array, which would leave `label` non-numeric.
ndarray = x_one_hot_df.values.astype(float)


# #### 3.2 Check the shape of ndarray and its first two rows

# In[17]:


ndarray.shape


# In[18]:


ndarray[:2]


# #### 3.3 Extract features and label

# In[19]:


# Column 0 is 'survived' (the label); every remaining column is a feature.
label = ndarray[:, 0]
features = ndarray[:, 1:]


# #### 3.4 Check the shapes of features and label, and their first two rows

# In[20]:


label.shape


# In[21]:


label[:2]


# In[22]:


features.shape


# In[23]:


features[:2]


# ### 4. Normalize the array

# In[24]:


# Rescale every feature column so its values fall in the range 0 to 1.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scale.fit(features)
scaled_features = minmax_scale.transform(features)


# In[25]:


scaled_features[:2]


# ### 5. Split the data into training and test sets

# #### 5.1 Randomly split the data into training and test sets

# In[26]:


# One uniform draw per row: rows whose draw is below 0.8 go to the training
# set, the rest to the test set. No seed is set in this script, so the
# split differs from run to run.
mask = np.random.rand(all_df.shape[0]) < 0.8
train_df, test_df = all_df[mask], all_df[~mask]


# In[27]:


print('total size:', all_df.shape[0])
print('train size:', train_df.shape[0])
print('test size:', test_df.shape[0])


# #### 5.2 Define a function that preprocesses the data

# In[28]:


def preprocess_data(raw_df):
    """Convert a raw Titanic DataFrame into scaled features and labels.

    The steps are: drop the ``name`` column, fill missing ``age`` and
    ``fare`` values with the mean of that column in ``raw_df``, encode
    ``sex`` as female=0 / male=1, one-hot encode ``embarked``, and min-max
    scale every feature column into the range 0 to 1.

    Note that the fill means and the scaler are computed from ``raw_df``
    itself on every call, so two separate calls (e.g. one for a training
    split and one for a test split) are scaled independently of each other.

    Args:
        raw_df: DataFrame containing ``name``, ``age``, ``fare``, ``sex``
            and ``embarked`` columns. After ``name`` is dropped, the first
            remaining column is used as the label. ``raw_df`` is not
            modified.

    Returns:
        A ``(scaled_features, label)`` tuple of NumPy arrays: the scaled
        feature matrix (all columns but the first) and the label vector
        (the first column).
    """
    # drop() returns a new frame, so the assignments below do not touch raw_df.
    df = raw_df.drop(['name'], axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Cast explicitly to float: get_dummies can emit bool indicator columns
    # (the default in pandas >= 2.0), and mixing bool with numeric columns
    # makes `.values` an object array, which would leave the label
    # non-numeric.
    ndarray = x_one_hot_df.values.astype(float)
    label = ndarray[:, 0]
    features = ndarray[:, 1:]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_features = minmax_scale.fit_transform(features)

    return scaled_features, label


# In[29]:


# Run the same preprocessing on each split, training split first.
(train_features, train_label), (test_features, test_label) = (
    preprocess_data(split) for split in (train_df, test_df)
)


# ### 5.3 Inspect the features and labels of the preprocessed training data

# In[30]:


train_features[:2]


# In[31]:


train_label[:2]