from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.decomposition import PCA
# Load a binary text-classification task from 20 Newsgroups:
# atheism vs. space posts, using the canonical train/test split.
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target
class DenseTransformer(BaseEstimator, TransformerMixin):
    """Convert a scipy sparse matrix into a dense numpy array.

    PCA cannot consume sparse input, so this adapter sits between
    TfidfVectorizer (sparse output) and PCA in the pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op: the transformer is stateless."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Uses ``toarray()`` instead of the original ``todense()``:
        ``todense`` returns the deprecated ``numpy.matrix`` type, which
        some downstream estimators reject or handle incorrectly.
        """
        return X.toarray()
# Processing chain: tf-idf features -> densify -> PCA -> decision tree.
steps = [
    ('tfidf', TfidfVectorizer()),
    ('to_dense', DenseTransformer()),
    ('pca', PCA()),
    ('clf', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps)
# Parameter grid for GridSearchCV: sweep the tf-idf document-frequency
# cutoff.  Each dict in the list is an independent sub-grid of settings.
param_grid = [
    {
        'tfidf__max_df': [0.8, 0.9, 1.0],
    },
]
# 3-fold cross-validation for each of the 3 parameter combinations,
# scored with micro-averaged F1.  (The original comment said 6
# combinations; the grid above has 3.)
grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid, scoring='f1_micro')
grid.fit(X_train, y_train)
# Summarize the search: best configuration, then mean/std of the CV
# score for every candidate tried.
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# Retrain on the full training set with the best configuration found by
# the grid search, then evaluate on the held-out test set.
# BUGFIX: the original called
#   pipeline.set_params(clf__penalty='l2', vect__max_df=0.9, clf__dual=True)
# which raises ValueError: the vectorizer step is named 'tfidf', not
# 'vect', and DecisionTreeClassifier has no 'penalty'/'dual' parameters
# (those belong to LinearSVC).  Apply the winning parameters instead.
pipeline.set_params(**grid.best_params_)
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
# Report micro-averaged F1 on the test set.
print(f1_score(y_test, y_preds, average='micro'))