import pandas as pd
# Download the training CSV from the short link (requires network access).
df = pd.read_csv('http://bit.ly/kaggletrain')

# Columns fed to the model as features; 'Survived' is the prediction target.
cols = [
    'Sex',
    'Name',
    'Age',
]
X, y = df[cols], df['Survived']
from sklearn import set_config
# Switch scikit-learn's estimator display mode to 'diagram', so that evaluating
# an estimator such as `pipe` as the last expression of a notebook cell shows
# its structure as a diagram (presumably an HTML rendering -- see the
# set_config documentation for the exact behaviour).
set_config(display='diagram')
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Preprocessing: one transformer per column selection.
#   ['Sex'] -> OneHotEncoder
#   'Name'  -> CountVectorizer (column given as a bare string, not a list)
#   ['Age'] -> SimpleImputer
ct = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(), ['Sex']),
        ('vectorizer', CountVectorizer(), 'Name'),
        ('imputer', SimpleImputer(), ['Age']),
    ]
)

# Feature selection driven by the chi2 score function, with percentile=50.
fs = SelectPercentile(score_func=chi2, percentile=50)

# Final estimator; random_state is fixed to 1.
clf = LogisticRegression(random_state=1, solver='liblinear')

# create Pipeline: preprocessing, then feature selection, then the classifier
pipe = Pipeline(
    steps=[
        ('preprocessor', ct),
        ('feature selector', fs),
        ('classifier', clf),
    ]
)
pipe
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('ohe', OneHotEncoder(), ['Sex']), ('vectorizer', CountVectorizer(), 'Name'), ('imputer', SimpleImputer(), ['Age'])])), ('feature selector', SelectPercentile(percentile=50, score_func=<function chi2 at 0x...>)), ('classifier', LogisticRegression(random_state=1, solver='liblinear'))])
ColumnTransformer(transformers=[('ohe', OneHotEncoder(), ['Sex']), ('vectorizer', CountVectorizer(), 'Name'), ('imputer', SimpleImputer(), ['Age'])])
['Sex']
OneHotEncoder()
Name
CountVectorizer()
['Age']
SimpleImputer()
SelectPercentile(percentile=50, score_func=<function chi2 at 0x...>)
LogisticRegression(random_state=1, solver='liblinear')
# access step 0 (preprocessor): indexing the Pipeline with an integer selects
# that single step -- here the ColumnTransformer `ct` -- which is then fitted
# on X and used to transform X on its own, without the later steps
pipe[0].fit_transform(X)
<891x1512 sparse matrix of type '<class 'numpy.float64'>' with 5348 stored elements in Compressed Sparse Row format>
# access steps 0 and 1 (preprocessor and feature selector): the slice 0:2
# selects the first two steps and leaves out the classifier.
# y is supplied here, unlike when running step 0 alone -- presumably because
# the chi2-based selector scores each feature against the target.
# In the recorded output the result has 756 columns, half of the 1512 columns
# produced by the preprocessor alone, consistent with percentile=50.
pipe[0:2].fit_transform(X, y)
<891x756 sparse matrix of type '<class 'numpy.float64'>' with 4128 stored elements in Compressed Sparse Row format>
# access step 1 (feature selector) and ask which features it kept; the
# recorded result is a boolean array.
# NOTE(review): this relies on the selector having been fitted by the sliced
# fit_transform call above -- presumably the slice shares its estimator objects
# with `pipe`; confirm against the Pipeline documentation.
pipe[1].get_support()
array([ True, True, True, ..., True, False, True])
© 2020 Data School. All rights reserved.