import pandas as pd
# Load the Titanic training set and split it into features and target.
df = pd.read_csv('http://bit.ly/kaggletrain')
# Features cover three kinds of data: categorical (Sex), text (Name), numeric (Age).
X = df[['Sex', 'Name', 'Age']]
y = df['Survived']
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Preprocessing: one transformer per column type. This ColumnTransformer
# will be the first Pipeline step.
transformers = [
    # One-hot encode the categorical 'Sex' column.
    ('ohe', OneHotEncoder(), ['Sex']),
    # 'Name' is passed as a bare string (not a list) because
    # CountVectorizer expects a 1-D input of documents.
    ('vectorizer', CountVectorizer(), 'Name'),
    # Fill in missing 'Age' values (mean imputation by default).
    ('imputer', SimpleImputer(), ['Age']),
]
ct = ColumnTransformer(transformers)

# Candidate models: each will take a turn as the second Pipeline step.
clf1 = LogisticRegression(solver='liblinear', random_state=1)
clf2 = RandomForestClassifier(random_state=1)

# Two-step Pipeline: preprocessing, then a classifier placeholder
# (GridSearchCV will swap in each candidate model via the params grids).
pipe = Pipeline(steps=[('preprocessor', ct), ('classifier', clf1)])
# Parameter grid for the logistic regression candidate.
# The 'classifier' entry swaps clf1 into the Pipeline's second step;
# double-underscore keys reach into the named Pipeline/transformer steps.
params1 = {
    'preprocessor__vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.1, 1, 10],
    'classifier': [clf1],
}
# Parameter grid for the random forest candidate.
# The 'classifier' entry swaps clf2 into the Pipeline's second step.
params2 = {
    'preprocessor__vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__n_estimators': [100, 200],
    'classifier__min_samples_leaf': [1, 2],
    'classifier': [clf2],
}
# create a list of parameter dictionaries
params = [params1, params2]
# this will search every parameter combination within each dictionary
grid = GridSearchCV(pipe, params)
grid.fit(X, y)
grid.best_params_
{'classifier': LogisticRegression(C=10, penalty='l1', random_state=1, solver='liblinear'), 'classifier__C': 10, 'classifier__penalty': 'l1', 'preprocessor__vectorizer__ngram_range': (1, 2)}
# © 2020 Data School. All rights reserved.