import pandas as pd
# Read just the first six rows of the CSV behind the short link
# (presumably the Kaggle Titanic training data -- requires network access).
df = pd.read_csv('http://bit.ly/kaggletrain', nrows=6)

# Feature columns used in this example, in the order they should appear in X.
cols = ['Fare', 'Embarked', 'Sex', 'Age']

# Label-based selection of every row for the chosen columns.
X = df.loc[:, cols]

# Bare expression: shows the frame when run as a notebook cell.
X
| | Fare | Embarked | Sex | Age |
---|---|---|---|---|
0 | 7.2500 | S | male | 22.0 |
1 | 71.2833 | C | female | 38.0 |
2 | 7.9250 | S | female | 26.0 |
3 | 53.1000 | S | female | 35.0 |
4 | 8.0500 | S | male | 35.0 |
5 | 8.4583 | Q | male | NaN |
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
ohe = OneHotEncoder()
imp = SimpleImputer()
ct = make_column_transformer(
(ohe, ['Embarked', 'Sex']), # apply OneHotEncoder to Embarked and Sex
(imp, ['Age']), # apply SimpleImputer to Age
remainder='passthrough') # include remaining column (Fare) in the output
# column order: Embarked (3 columns), Sex (2 columns), Age (1 column), Fare (1 column)
ct.fit_transform(X)
array([[ 0. , 0. , 1. , 0. , 1. , 22. , 7.25 ], [ 1. , 0. , 0. , 1. , 0. , 38. , 71.2833], [ 0. , 0. , 1. , 1. , 0. , 26. , 7.925 ], [ 0. , 0. , 1. , 1. , 0. , 35. , 53.1 ], [ 0. , 0. , 1. , 0. , 1. , 35. , 8.05 ], [ 0. , 1. , 0. , 0. , 1. , 31.2 , 8.4583]])
© 2020 Data School. All rights reserved.