import pandas as pd
import numpy as np
# Toy labeled training set: two numeric features (feat1 contains one missing
# value) plus the class label to predict
train = pd.DataFrame(
    {
        'feat1': [10, 20, np.nan, 2],
        'feat2': [25., 20, 5, 3],
        'label': ['A', 'A', 'B', 'B'],
    }
)
# Unlabeled samples to predict on (feat2 contains one missing value)
test = pd.DataFrame(
    {
        'feat1': [30., 5, 15],
        'feat2': [12, 10, np.nan],
    }
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
# Build the two estimators (both with default settings), then chain them:
# step 1 fills in missing feature values, step 2 classifies the imputed data
imputer, clf = SimpleImputer(), LogisticRegression()
pipe = make_pipeline(imputer, clf)
# display the training data (rendered as the table below)
train
| | feat1 | feat2 | label |
---|---|---|---|
0 | 10.0 | 25.0 | A |
1 | 20.0 | 20.0 | A |
2 | NaN | 5.0 | B |
3 | 2.0 | 3.0 | B |
# display the new, unlabeled data (rendered as the table below)
test
| | feat1 | feat2 |
---|---|---|
0 | 30.0 | 12.0 |
1 | 5.0 | 10.0 |
2 | 15.0 | NaN |
# Columns fed to the pipeline, for both fitting and predicting
features = ['feat1', 'feat2']
# Training inputs and the target labels
X = train[features]
y = train['label']
# New samples to predict on, restricted to the same feature columns
X_new = test[features]
# Fit the whole pipeline on the training data: the imputer learns its fill
# values from X and fills in the missing entries, then the classifier is
# fit on the imputed result
pipe.fit(X, y)
# Predict labels for the new samples: the pipeline imputes X_new first, then
# passes the result to the classifier
# note: pipeline uses imputation values learned during the "fit" step
# (they are not re-learned from X_new)
pipe.predict(X_new)
array(['A', 'B', 'A'], dtype=object)
© 2020 Data School. All rights reserved.