import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
# Load the training CSV, then separate the target from the feature columns.
df = pd.read_csv('http://bit.ly/kaggletrain')
y = df['Survived']
X = df[['Parch', 'Fare', 'Embarked', 'Sex', 'Name', 'Age']]

# Ask scikit-learn to render estimators as diagrams rather than plain text.
from sklearn import set_config
set_config(display='diagram')

# Individual transformers used by the column transformer below.
imp = SimpleImputer()
vect = CountVectorizer()
ohe = OneHotEncoder()
imp_constant = SimpleImputer(strategy='constant')
# Constant-fill imputation followed by one-hot encoding, chained together.
imp_ohe = make_pipeline(imp_constant, ohe)

# Pipeline step 1: send each group of columns to its own transformer.
# 'Name' is given as a plain string (not a list) so the vectorizer receives
# a one-dimensional column of text; 'Parch' is forwarded unchanged.
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    ('passthrough', ['Parch']),
)

# Pipeline step 2: chi-squared feature selection at the 50th percentile.
selection = SelectPercentile(score_func=chi2, percentile=50)

# Pipeline step 3: the classifier.
logreg = LogisticRegression(solver='liblinear')

# Chain the three steps; the bare expression displays the pipeline when this
# is the last statement of a notebook cell.
pipe = make_pipeline(ct, selection, logreg)
pipe
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('pipeline', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='constant')), ('onehotencoder', OneHotEncoder())]), ['Embarked', 'Sex']), ('countvectorizer', CountVectorizer(), 'Name'), ('simpleimputer', SimpleImputer(), ['Age', 'Fare']), ('passthrough', 'passthrough', ['Parch'])])), ('selectpercentile', SelectPercentile(percentile=50, score_func=<function chi2 at 0x...>)), ('logisticregression', LogisticRegression(solver='liblinear'))])
ColumnTransformer(transformers=[('pipeline', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='constant')), ('onehotencoder', OneHotEncoder())]), ['Embarked', 'Sex']), ('countvectorizer', CountVectorizer(), 'Name'), ('simpleimputer', SimpleImputer(), ['Age', 'Fare']), ('passthrough', 'passthrough', ['Parch'])])
['Embarked', 'Sex']
SimpleImputer(strategy='constant')
OneHotEncoder()
Name
CountVectorizer()
['Age', 'Fare']
SimpleImputer()
['Parch']
passthrough
SelectPercentile(percentile=50, score_func=<function chi2 at 0x...>)
LogisticRegression(solver='liblinear')
# Export the pipeline diagram to a standalone HTML file.
from sklearn.utils import estimator_html_repr
# Write as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (the generated HTML may contain non-ASCII characters).
with open('pipeline.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(pipe))
© 2020 Data School. All rights reserved.