import warnings
warnings.simplefilter(action='ignore')
import urllib.request
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
# Remote location of the Titanic spreadsheet and the local path it is cached at.
url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'
file_path = 'data/titanic3.xls'
# Download only when there is no local copy yet.
if not os.path.isfile(file_path):
    # urlretrieve does not create missing parent directories, so make sure
    # the target directory exists before writing into it.
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    result = urllib.request.urlretrieve(url, file_path)
    print('download:', result)
# Load the cached spreadsheet into a DataFrame.
all_df = pd.read_excel(file_path)
# Notebook display: first two rows.
all_df[:2]
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
# Keep only the label and the columns used below; 'survived' is listed first,
# so it becomes column 0 of the frame.
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]
# Notebook display: first two rows after the column selection.
all_df[:2]
survived | name | pclass | sex | age | sibsp | parch | fare | embarked | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Allen, Miss. Elisabeth Walton | 1 | female | 29.0000 | 0 | 0 | 211.3375 | S |
1 | 1 | Allison, Master. Hudson Trevor | 1 | male | 0.9167 | 1 | 2 | 151.5500 | S |
# Notebook display: number of missing values in each column.
all_df.isnull().sum()
survived 0 name 0 pclass 0 sex 0 age 263 sibsp 0 parch 0 fare 1 embarked 2 dtype: int64
# Work on a copy without the 'name' column.
df = all_df.drop(['name'], axis=1)
# Replace missing ages with the mean of the known ages.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
# Replace missing fares with the mean of the known fares.
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
# Notebook display: first two rows after imputation.
df[:2]
survived | pclass | sex | age | sibsp | parch | fare | embarked | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | female | 29.0000 | 0 | 0 | 211.3375 | S |
1 | 1 | 1 | male | 0.9167 | 1 | 2 | 151.5500 | S |
# Encode sex as an integer: female -> 0, male -> 1.
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
# Notebook display: first two rows after encoding.
df[:2]
survived | pclass | sex | age | sibsp | parch | fare | embarked | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 29.0000 | 0 | 0 | 211.3375 | S |
1 | 1 | 1 | 1 | 0.9167 | 1 | 2 | 151.5500 | S |
# One-hot encode 'embarked' into one indicator column per value present in df.
x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])
# Notebook display: first two rows of the encoded frame.
x_one_hot_df[:2]
survived | pclass | sex | age | sibsp | parch | fare | embarked_C | embarked_Q | embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 29.0000 | 0 | 0 | 211.3375 | 0 | 0 | 1 |
1 | 1 | 1 | 1 | 0.9167 | 1 | 2 | 151.5500 | 0 | 0 | 1 |
# Extract the underlying NumPy array from the encoded frame.
ndarray = x_one_hot_df.values
# Notebook display: (rows, columns) of the array.
ndarray.shape
(1309, 10)
# Notebook display: first two rows of the array.
ndarray[:2]
array([[ 1. , 1. , 0. , 29. , 0. , 0. , 211.3375, 0. , 0. , 1. ], [ 1. , 1. , 1. , 0.9167, 1. , 2. , 151.55 , 0. , 0. , 1. ]])
# Column 0 holds the label; every remaining column is a feature.
label = ndarray[:, 0]
features = ndarray[:, 1:]
# Notebook display: shape of the label vector.
label.shape
(1309,)
# Notebook display: first two labels.
label[:2]
array([1., 1.])
# Notebook display: shape of the feature matrix.
features.shape
(1309, 9)
# Notebook display: first two feature rows.
features[:2]
array([[ 1. , 0. , 29. , 0. , 0. , 211.3375, 0. , 0. , 1. ], [ 1. , 1. , 0.9167, 1. , 2. , 151.55 , 0. , 0. , 1. ]])
# Min-max scaler configured for the target range (0, 1).
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))  # after scaling, values lie between 0 and 1
# Fit the scaler on the feature matrix and transform it in one step.
scaled_features = minmax_scale.fit_transform(features)
# Notebook display: first two scaled feature rows.
scaled_features[:2]
array([[0. , 0. , 0.36116884, 0. , 0. , 0.41250333, 0. , 0. , 1. ], [0. , 1. , 0.00939458, 0.125 , 0.22222222, 0.2958059 , 0. , 0. , 1. ]])
# Boolean mask with one random draw per row; a row goes to the training set
# when its draw is below 0.8, otherwise to the test set.
# NOTE(review): no random seed is set in the visible code, so the split (and
# the sizes printed below) will differ between runs -- confirm this is intended.
mask = np.random.rand(len(all_df)) < 0.8
train_df = all_df[mask]
test_df = all_df[~mask]
print('total size:', len(all_df))
print('train size:', len(train_df))
print('test size:', len(test_df))
total size: 1309 train size: 1043 test size: 266
def preprocess_data(raw_df, embarked_categories=('C', 'Q', 'S')):
    """Convert a raw Titanic frame into scaled features and a label vector.

    Steps: drop ``name``, fill missing ``age`` / ``fare`` with the column
    mean, encode ``sex`` as 0/1, one-hot encode ``embarked``, then min-max
    scale every feature column to the range 0..1.

    Args:
        raw_df: DataFrame containing the columns ``name``, ``age``, ``fare``,
            ``sex`` and ``embarked``. After ``name`` is dropped, the first
            remaining column is taken as the label.
        embarked_categories: The ``embarked`` values to emit indicator
            columns for, in output order. Fixing this set keeps the feature
            width identical across different subsets of the data.

    Returns:
        A tuple ``(scaled_features, label)`` of NumPy arrays: the scaled
        feature matrix and the label column as floats.

    Note:
        The imputation means and the scaler's min/max are computed from
        ``raw_df`` itself, so separate calls on separate frames use
        separate statistics.
    """
    df = raw_df.drop(['name'], axis=1)
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

    # get_dummies only emits columns for values it sees; a subset that lacks
    # one port would otherwise yield fewer feature columns than another
    # subset. A categorical with a fixed category list always produces one
    # indicator column per listed category.
    df['embarked'] = pd.Categorical(df['embarked'],
                                    categories=list(embarked_categories))
    x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Cast to float before extracting the array: recent pandas versions
    # return boolean dummies, and a bool/number mix would otherwise come
    # back as an object array.
    ndarray = x_one_hot_df.astype('float64').values
    label = ndarray[:, 0]
    features = ndarray[:, 1:]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_features = minmax_scale.fit_transform(features)
    return scaled_features, label
# Preprocess each split with its own call to preprocess_data.
# NOTE(review): preprocess_data computes the fill-in means and fits the
# min-max scaler on whatever frame it receives, so the test split is imputed
# and scaled with its own statistics rather than the training split's --
# confirm this is intended.
train_features, train_label = preprocess_data(train_df)
test_features, test_label = preprocess_data(test_df)
# Notebook display: first two training feature rows.
train_features[:2]
array([[0. , 0. , 0.36116884, 0. , 0. , 0.41250333, 0. , 0. , 1. ], [0. , 0. , 0.31106443, 0.125 , 0.22222222, 0.2958059 , 0. , 0. , 1. ]])
# Notebook display: first two training labels.
train_label[:2]
array([1., 0.])