"""Grid-search hyperparameters for a Titanic survival classifier.

Builds a Pipeline = (column transformer: one-hot 'Sex', bag-of-words 'Name',
passthrough 'Pclass') -> logistic regression, then exhaustively searches
C and penalty with 5-fold cross-validated accuracy and prints the ranked
results.
"""
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Titanic training data (Kaggle), hosted by Data School
df = pd.read_csv('http://bit.ly/kaggletrain')
X = df[['Pclass', 'Sex', 'Name']]
y = df['Survived']

# Preprocessing: one-hot encode 'Sex', count-vectorize 'Name' text,
# pass 'Pclass' through unchanged (remainder='passthrough')
ohe = OneHotEncoder()
vect = CountVectorizer()
clf = LogisticRegression(solver='liblinear', random_state=1)
ct = make_column_transformer((ohe, ['Sex']), (vect, 'Name'), remainder='passthrough')
pipe = Pipeline([('preprocessor', ct), ('model', clf)])

# specify parameter values to search
# ('model__' prefixes target the pipeline step named 'model';
#  liblinear supports both 'l1' and 'l2' penalties)
params = {
    'model__C': [0.1, 1, 10],
    'model__penalty': ['l1', 'l2'],
}

# try all possible combinations of those parameter values
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X, y)

# convert results into a DataFrame, sort by test score, and display
# (sort_values returns a new frame — it must be assigned, not discarded)
results = pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
results = results.sort_values('rank_test_score')
print(results)
# Output (sorted by rank_test_score):
# |   | params                                    | mean_test_score | rank_test_score |
# |---|-------------------------------------------|-----------------|-----------------|
# | 2 | {'model__C': 1, 'model__penalty': 'l1'}   | 0.821512        | 1               |
# | 4 | {'model__C': 10, 'model__penalty': 'l1'}  | 0.820413        | 2               |
# | 5 | {'model__C': 10, 'model__penalty': 'l2'}  | 0.817055        | 3               |
# | 3 | {'model__C': 1, 'model__penalty': 'l2'}   | 0.812573        | 4               |
# | 1 | {'model__C': 0.1, 'model__penalty': 'l2'} | 0.791225        | 5               |
# | 0 | {'model__C': 0.1, 'model__penalty': 'l1'} | 0.788984        | 6               |
# © 2020 Data School. All rights reserved.