import pandas as pd

# Load train/test, tag the test rows with a sentinel label, and stack them so
# the preprocessing below (imputation, scaling, encoding) is applied
# consistently to both splits.
data_train = pd.read_csv(r"E:\MLdata\kaggle_titanic\train.csv")
data_predict = pd.read_csv(r"E:\MLdata\kaggle_titanic\test.csv")
data_predict['Survived'] = -1  # sentinel: -1 marks rows to predict later
# ignore_index=True re-numbers rows 0..1308; without it the two frames keep
# their own 0-based indexes and the result has duplicate index labels
# (the info() output below showed "1309 entries, 0 to 417").
data_train = pd.concat([data_train, data_predict], axis=0, ignore_index=True)
data_train.info()  # check for missing data: Age, Cabin, Embarked, Fare have NaNs
<class 'pandas.core.frame.DataFrame'> Int64Index: 1309 entries, 0 to 417 Data columns (total 12 columns): Age 1046 non-null float64 Cabin 295 non-null object Embarked 1307 non-null object Fare 1308 non-null float64 Name 1309 non-null object Parch 1309 non-null int64 PassengerId 1309 non-null int64 Pclass 1309 non-null int64 Sex 1309 non-null object SibSp 1309 non-null int64 Survived 1309 non-null int64 Ticket 1309 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 132.9+ KB
# Check class balance of the target (-1 = to-predict rows; 0 vs 1 is roughly balanced).
data_train.groupby("Survived").count()
Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | Ticket | |
---|---|---|---|---|---|---|---|---|---|---|---|
Survived | |||||||||||
-1 | 332 | 91 | 418 | 417 | 418 | 418 | 418 | 418 | 418 | 418 | 418 |
0 | 424 | 68 | 549 | 549 | 549 | 549 | 549 | 549 | 549 | 549 | 549 |
1 | 290 | 136 | 340 | 342 | 342 | 342 | 342 | 342 | 342 | 342 | 342 |
# Columns: PassengerId, Survived, Pclass (ticket class), Name, Sex, Age,
# SibSp (siblings/spouses aboard), Parch (parents/children aboard),
# Ticket, Fare, Cabin, Embarked (port of embarkation).
data_train.head(3)
Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | Survived | Ticket | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22.0 | NaN | S | 7.2500 | Braund, Mr. Owen Harris | 0 | 1 | 3 | male | 1 | 0 | A/5 21171 |
1 | 38.0 | C85 | C | 71.2833 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 2 | 1 | female | 1 | 1 | PC 17599 |
2 | 26.0 | NaN | S | 7.9250 | Heikkinen, Miss. Laina | 0 | 3 | 3 | female | 0 | 1 | STON/O2. 3101282 |
### Start preprocessing

# Impute missing Age with the column mean. Series.mean() already skips NaN,
# so the previous `.dropna().mean()` was redundant; fillna is the idiomatic
# replacement for the `.loc[isnull] = ...` assignment.
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].mean())
# Impute the (single) missing Fare the same way.
data_train['Fare'] = data_train['Fare'].fillna(data_train['Fare'].mean())

# Standardize Age and Fare to zero mean / unit variance so they are on a
# comparable scale for the model.
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
data_train['Age'] = scaler.fit_transform(data_train['Age'].values.reshape(-1, 1))
data_train['Fare'] = scaler.fit_transform(data_train['Fare'].values.reshape(-1, 1))
data_train.head(3)
Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | Survived | Ticket | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.611972 | NaN | S | -0.503595 | Braund, Mr. Owen Harris | 0 | 1 | 3 | male | 1 | 0 | A/5 21171 |
1 | 0.630431 | C85 | C | 0.734503 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 2 | 1 | female | 1 | 1 | PC 17599 |
2 | -0.301371 | NaN | S | -0.490544 | Heikkinen, Miss. Laina | 0 | 3 | 3 | female | 0 | 1 | STON/O2. 3101282 |
# Cabin is mostly missing and its raw codes are high-cardinality; inspect the
# distinct values before collapsing it to a has-cabin / no-cabin flag.
set(data_train['Cabin'])
{nan, 'A24', 'A18', 'B61', 'E46', 'F2', 'B18', 'D34', 'E50', 'B57 B59 B63 B66', 'C23 C25 C27', 'B35', 'E34', 'C53', 'F G73', 'D43', 'D49', 'B38', 'C46', 'A26', 'C28', 'A32', 'D26', 'C106', 'B96 B98', 'E58', 'C45', 'A10', 'C97', 'B71', 'F E46', 'E36', 'B52 B54 B56', 'B11', 'C105', 'C92', 'C6', 'A23', 'C31', 'D15', 'D22', 'E44', 'D37', 'D46', 'E60', 'E39 E41', 'A11', 'C99', 'D40', 'D28', 'E68', 'E38', 'C49', 'C110', 'A16', 'D45', 'C54', 'B78', 'B73', 'C89', 'C7', 'C2', 'E52', 'C86', 'T', 'C80', 'B20', 'E24', 'E77', 'B24', 'B19', 'E31', 'B86', 'E67', 'D36', 'C124', 'A7', 'B10', 'B39', 'B51 B53 B55', 'D35', 'E10', 'D21', 'B101', 'B49', 'C148', 'D33', 'B36', 'F38', 'C22 C26', 'D10 D12', 'C132', 'C125', 'B77', 'C91', 'B79', 'D30', 'B3', 'C55 C57', 'D11', 'B94', 'C128', 'C123', 'A6', 'B80', 'A19', 'D19', 'E121', 'A21', 'A20', 'C90', 'E33', 'B42', 'D47', 'C101', 'F', 'E25', 'C39', 'C118', 'C87', 'B22', 'C82', 'C111', 'C130', 'E40', 'C83', 'C93', 'B41', 'E45', 'B82 B84', 'F4', 'C104', 'B30', 'D20', 'C68', 'A9', 'E12', 'A14', 'C47', 'D17', 'C116', 'C51', 'C70', 'A34', 'D48', 'E8', 'C52', 'D56', 'B58 B60', 'B102', 'D', 'F E57', 'C85', 'C50', 'B37', 'A31', 'B26', 'B4', 'D38', 'F33', 'E49', 'A5', 'C78', 'E17', 'B5', 'A36', 'D6', 'C95', 'D50', 'B69', 'G6', 'F E69', 'A29', 'C126', 'E101', 'D9', 'B28', 'E63', 'C62 C64', 'C30', 'F G63', 'C103', 'B45', 'D7', 'B50', 'C32', 'C65'}
# Collapse Cabin to a binary indicator: "Yes" where a cabin code exists,
# "No" where it was missing (one pass instead of two .loc assignments).
data_train['Cabin'] = data_train['Cabin'].notnull().map({True: "Yes", False: "No"})
set(data_train['Cabin'].values)
{'No', 'Yes'}
data_train['Cabin'].count()  # sanity check: all 1309 rows should be non-null now
1309
data_train.head(3)  # inspect: Cabin is now the Yes/No indicator
Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | Survived | Ticket | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.611972 | No | S | -0.503595 | Braund, Mr. Owen Harris | 0 | 1 | 3 | male | 1 | 0 | A/5 21171 |
1 | 0.630431 | Yes | C | 0.734503 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 2 | 1 | female | 1 | 1 | PC 17599 |
2 | -0.301371 | No | S | -0.490544 | Heikkinen, Miss. Laina | 0 | 3 | 3 | female | 0 | 1 | STON/O2. 3101282 |
# One-hot encode the categorical features.
# Name/Ticket are dropped as irrelevant features (could also be filtered out
# later); PassengerId is kept for labelling rows, so it is not dropped yet.
data_train = data_train.drop(["Name","Ticket"], axis=1)
# pd.get_dummies with `columns=` encodes each listed column and drops the
# original in a single pass, replacing the per-column loop + concat + drop.
# Column layout is identical: untouched columns first, then the dummy groups
# in the order listed. NOTE: rows with Embarked NaN get all-zero Embarked_*
# dummies (default dummy_na=False), same as before.
data_train = pd.get_dummies(data_train, columns=['Pclass', 'Sex', 'Embarked', "Cabin"])
data_train.head(3)
Age | Fare | Parch | PassengerId | SibSp | Survived | Pclass_1 | Pclass_2 | Pclass_3 | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | Cabin_No | Cabin_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.611972 | -0.503595 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
1 | 0.630431 | 0.734503 | 0 | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
2 | -0.301371 | -0.490544 | 0 | 3 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
# Move the target column to the end: pop removes it, and assigning a new
# column appends it as the last column (equivalent to insert at columns.size).
survived_col = data_train.pop("Survived")
data_train["Survived"] = survived_col
data_train.head(3)
Age | Fare | Parch | PassengerId | SibSp | Pclass_1 | Pclass_2 | Pclass_3 | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | Cabin_No | Cabin_Yes | Survived | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.611972 | -0.503595 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
1 | 0.630431 | 0.734503 | 0 | 2 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 |
2 | -0.301371 | -0.490544 | 0 | 3 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
data_train.to_csv(r"E:\MLdata\kaggle_titanic\processing.csv")  # persist the preprocessed combined train+test set