#!/usr/bin/env python
# coding: utf-8

# In[1]:

import warnings

# Silence every warning category so library notices do not clutter the output.
warnings.simplefilter(action='ignore')


# In[2]:

import urllib.request
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing


# ### 1. Download the Titanic passenger dataset

# In[3]:

url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'
file_path = 'data/titanic3.xls'
if not os.path.isfile(file_path):
    # urlretrieve does not create missing parent directories, so make sure
    # the target directory exists before downloading into it.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    result = urllib.request.urlretrieve(url, file_path)
    print('download:', result)


# ### 2. Read the data with pandas and preprocess it

# #### 2.1 Read titanic3.xls into a DataFrame

# In[4]:

all_df = pd.read_excel(file_path)


# #### 2.2 Look at the first two rows

# In[5]:

all_df[:2]


# #### 2.3 Keep only the columns we need

# In[6]:

cols = ['survived', 'name', 'pclass', 'sex', 'age',
        'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]


# In[7]:

all_df[:2]


# #### 2.4 Find the columns that contain null values

# In[8]:

all_df.isnull().sum()


# #### 2.5 Drop the name column

# In[9]:

df = all_df.drop(['name'], axis=1)


# #### 2.6 Replace null age and fare values with the column mean

# In[10]:

age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)


# In[11]:

df[:2]


# #### 2.7 Convert the sex column to 0 and 1

# In[12]:

df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)


# In[13]:

df[:2]


# #### 2.8 One-hot encode the embarked column

# In[14]:

x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])


# In[15]:

x_one_hot_df[:2]


# ### 3. Convert the DataFrame to an array

# #### 3.1 DataFrame to array

# In[16]:

# Cast explicitly to float64: newer pandas emits bool dummy columns, and
# mixing bool with numeric columns would otherwise give an object array.
ndarray = x_one_hot_df.values.astype(np.float64)


# #### 3.2 Inspect the shape of ndarray and its first two rows

# In[17]:

ndarray.shape


# In[18]:

ndarray[:2]


# #### 3.3 Extract features and label

# In[19]:

# Column 0 is 'survived' (first entry of cols); the rest are features.
label = ndarray[:, 0]
features = ndarray[:, 1:]


# #### 3.4 Inspect the shape and first two rows of features and label

# In[20]:

label.shape


# In[21]:

label[:2]


# In[22]:

features.shape


# In[23]:

features[:2]


# ### 4. Scale the array

# In[24]:

# After scaling, every feature lies in the range 0 to 1.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled_features = minmax_scale.fit_transform(features)


# In[25]:

scaled_features[:2]


# ### 5. Split the data into training and test sets

# #### 5.1 Randomly split the data into training and test sets

# In[26]:

# Each row lands in the training set with probability 0.8. No seed is set
# in this script, so the split differs from run to run.
mask = np.random.rand(len(all_df)) < 0.8
train_df = all_df[mask]
test_df = all_df[~mask]


# In[27]:

print('total size:', len(all_df))
print('train size:', len(train_df))
print('test size:', len(test_df))


# #### 5.2 Create a function that preprocesses the data

# In[28]:

def preprocess_data(raw_df):
    """Convert a raw passenger DataFrame into scaled features and labels.

    Performs the same steps as sections 2.5 to 4 of this script: drops
    'name', fills null 'age' and 'fare' values with the column mean, maps
    'sex' to 0/1, one-hot encodes 'embarked', and min-max scales the
    features to the range 0 to 1.

    The imputation means and the scaler are computed from ``raw_df``
    itself on every call, so separate calls (for example on the training
    and test frames) are imputed and scaled independently of each other.

    Args:
        raw_df: DataFrame containing 'name', 'age', 'fare', 'sex' and
            'embarked' columns. After 'name' is dropped, the first
            remaining column is used as the label (that is 'survived'
            for frames built from ``cols``).

    Returns:
        A ``(scaled_features, label)`` tuple of NumPy arrays: the scaled
        feature matrix and the label column.
    """
    df = raw_df.drop(['name'], axis=1)

    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)

    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

    x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Cast explicitly to float64: newer pandas emits bool dummy columns, and
    # mixing bool with numeric columns would otherwise give an object array
    # (and therefore an object-dtype label).
    ndarray = x_one_hot_df.values.astype(np.float64)
    label = ndarray[:, 0]
    features = ndarray[:, 1:]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_features = minmax_scale.fit_transform(features)

    return scaled_features, label


# In[29]:

train_features, train_label = preprocess_data(train_df)
test_features, test_label = preprocess_data(test_df)


# ### 5.3 Inspect the features and labels of the preprocessed training data

# In[30]:

train_features[:2]


# In[31]:

train_label[:2]