In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups

cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
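As a quick sanity check on what was fetched (a minimal sketch; target_names, data, and target are standard attributes of the bunch returned by fetch_20newsgroups):

print(newsgroups_train.target_names)   # ['alt.atheism', 'sci.space']
# number of training and test documents
print(len(newsgroups_train.data), len(newsgroups_test.data))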
In [2]:
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
In [3]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC())
])

# define the parameter values for GridSearchCV to search over;
# keys follow the '<step>__<param>' convention built from the
# Pipeline step names above

# the grid is a list of two dicts because LinearSVC does not
# support penalty='l1' together with dual=True
param_grid = [
    {
        'vect__max_df': [0.8, 0.9, 1.0],
        'clf__penalty': ['l2'],
        'clf__dual': [True, False]
    },
    {
        'vect__max_df': [0.8, 0.9, 1.0],
        'clf__penalty': ['l1'],
        'clf__dual': [False]
    }
]

# do 3-fold cross-validation for each of the 9 parameter
# combinations above: 3 x 2 from the first dict plus 3 x 1 from
# the second (see the ParameterGrid check after this cell)
grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid, scoring='f1_micro')
grid.fit(X_train, y_train)
Out[3]:
GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__max_df': [0.8, 0.9, 1.0], 'clf__dual': [True, False], 'clf__penalty': ['l2']}, {'vect__max_df': [0.8, 0.9, 1.0], 'clf__dual': [False], 'clf__penalty': ['l1']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1_micro', verbose=0)
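The two dicts expand to 3 x 2 + 3 x 1 = 9 candidate settings, matching the 9 result rows printed below. As a quick check (a minimal sketch using sklearn's ParameterGrid, which accepts the same list-of-dicts format):

from sklearn.model_selection import ParameterGrid

print(len(ParameterGrid(param_grid)))  # 9 candidate parameter settings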
In [4]:
# summarize results
print("Best: %f using %s" % (grid.best_score_, 
    grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.992544 using {'vect__max_df': 0.9, 'clf__dual': True, 'clf__penalty': 'l2'}
0.990680 (0.001312) with: {'vect__max_df': 0.8, 'clf__dual': True, 'clf__penalty': 'l2'}
0.992544 (0.001309) with: {'vect__max_df': 0.9, 'clf__dual': True, 'clf__penalty': 'l2'}
0.990680 (0.001312) with: {'vect__max_df': 1.0, 'clf__dual': True, 'clf__penalty': 'l2'}
0.990680 (0.001312) with: {'vect__max_df': 0.8, 'clf__dual': False, 'clf__penalty': 'l2'}
0.992544 (0.001309) with: {'vect__max_df': 0.9, 'clf__dual': False, 'clf__penalty': 'l2'}
0.990680 (0.001312) with: {'vect__max_df': 1.0, 'clf__dual': False, 'clf__penalty': 'l2'}
0.971109 (0.003459) with: {'vect__max_df': 0.8, 'clf__dual': False, 'clf__penalty': 'l1'}
0.972041 (0.008199) with: {'vect__max_df': 0.9, 'clf__dual': False, 'clf__penalty': 'l1'}
0.971109 (0.004717) with: {'vect__max_df': 1.0, 'clf__dual': False, 'clf__penalty': 'l1'}
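An alternative way to browse the full results table (a sketch assuming pandas is installed; cv_results_ is a plain dict of arrays, so it loads directly into a DataFrame):

import pandas as pd

results = pd.DataFrame(grid.cv_results_)
# one row per candidate, ordered best-first
print(results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score'))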
In [5]:
# refit on the full training set with the best configuration
# found by the grid search, then predict the test instances
pipeline.set_params(vect__max_df=0.9, clf__penalty='l2', clf__dual=True)
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
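Since GridSearchCV defaults to refit=True, the best configuration has already been refit on the full training set inside grid, so an equivalent shortcut is to predict through the search object itself:

y_preds = grid.predict(X_test)  # delegates to grid.best_estimator_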
In [6]:
# micro-averaged F1 on the test set (the same metric used for model selection)
f1_score(y_test, y_preds, average='micro')
Out[6]:
0.97615708274894808
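For a per-class breakdown rather than a single aggregate number, classification_report from sklearn.metrics works on the same predictions (a sketch; target_names just labels the rows):

from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds,
                            target_names=newsgroups_test.target_names))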