"""Ensemble three classifiers on Titanic data and tune the VotingClassifier.

Adapted from a notebook session: bare REPL output lines have been converted
to comments, and results are now printed explicitly so the script is
meaningful when run non-interactively.
"""
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Load the Titanic training set and select four numeric, NaN-free features.
df = pd.read_csv('http://bit.ly/kaggletrain')
cols = ['Pclass', 'Parch', 'SibSp', 'Fare']
X = df[cols]
y = df['Survived']

# Three diverse base models; random_state fixed for reproducibility.
lr = LogisticRegression(solver='liblinear', random_state=1)
rf = RandomForestClassifier(max_features=None, random_state=1)
nb = MultinomialNB()

# create an ensemble of 3 classifiers
vc = VotingClassifier([('clf1', lr), ('clf2', rf), ('clf3', nb)])
print(cross_val_score(vc, X, y).mean())
# expected output: 0.6970560542338836

# define VotingClassifier parameters to search:
# hard vs. soft voting, and up-weighting each classifier in turn
params = {'voting': ['hard', 'soft'],
          'weights': [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]}

# find the best set of parameters
grid = GridSearchCV(vc, params)
grid.fit(X, y)
print(grid.best_params_)
# expected output: {'voting': 'soft', 'weights': (1, 2, 1)}

# accuracy has improved over the untuned ensemble
print(grid.best_score_)
# expected output: 0.7262820915196786
# © 2020 Data School. All rights reserved.