# Baseline model: bag-of-words features extracted from passenger names,
# fed into a logistic regression, scored with cross-validated accuracy.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Titanic training data; the 'Name' column is the only feature used here.
df = pd.read_csv('http://bit.ly/kaggletrain')
X = df['Name']
y = df['Survived']

# Chain vectorizer and classifier so cross-validation fits the
# vocabulary on each training fold only (no leakage into the test fold).
vect = CountVectorizer()
clf = LogisticRegression()
pipe = make_pipeline(vect, clf)
cross_val_score(pipe, X, y, scoring='accuracy').mean()
# Output: 0.7957190383528967  (baseline cross-validated accuracy)
from sklearn.feature_selection import SelectPercentile, chi2

# Insert univariate feature selection between vectorization and the
# classifier: keep only the top 50% of token features ranked by their
# chi-squared score against the target.
select_best = SelectPercentile(chi2, percentile=50)
pipe = make_pipeline(vect, select_best, clf)
cross_val_score(pipe, X, y, scoring='accuracy').mean()
# Output: 0.8147824995292197  (accuracy improves with chi-squared feature selection)
# © 2020 Data School. All rights reserved.