In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups

cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
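The object returned by fetch_20newsgroups is a bunch exposing data, target, and target_names, so a quick sanity check on what was fetched looks like this (output not shown):

# number of documents per split and the category names
print(len(newsgroups_train.data), 'training documents')
print(len(newsgroups_test.data), 'test documents')
print(newsgroups_train.target_names)  # ['alt.atheism', 'sci.space']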
In [7]:
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
In [21]:
# CountVectorizer builds the vocabulary and produces a vector
# of raw term counts for each document
vect = CountVectorizer()

# TfidfTransformer re-weights those counts by inverse document
# frequency, down-weighting terms that appear in many documents
# (and L2-normalizes each document vector by default)
tfidf = TfidfTransformer()

# a linear support vector machine classifier
clf = LinearSVC()

pipeline = Pipeline([
    ('vect', vect),
    ('tfidf', tfidf),
    ('clf', clf)
])

scores = cross_val_score(pipeline, X_train, y_train, cv=3,
    scoring='f1_micro')
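Because the three steps live in a Pipeline, their hyperparameters can be tuned jointly with the step__parameter naming convention. A minimal sketch with illustrative, untuned grid values (output not shown):

from sklearn.model_selection import GridSearchCV

param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams vs. unigrams + bigrams
    'clf__C': [0.1, 1.0, 10.0],             # SVM regularization strength
}
grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_micro')
# grid.fit(X_train, y_train)
# grid.best_params_, grid.best_score_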
In [22]:
scores
Out[22]:
array([ 0.99162011,  0.98882682,  0.99159664])
In [23]:
scores.mean()
Out[23]:
0.99068118867658794
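The fold-to-fold spread can be checked the same way (output not shown):

# standard deviation across the three folds; a small value
# indicates the score is stable across splits
scores.std()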
In [26]:
# fit the pipeline on the full training set and predict the test set
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
In [27]:
# micro-averaged F1 on the held-out test set
f1_score(y_test, y_preds, average='micro')
Out[27]:
0.97475455820476853
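For a per-class breakdown of the test results, classification_report and confusion_matrix work on the same predictions (output not shown):

from sklearn.metrics import classification_report, confusion_matrix

# per-class precision, recall, and F1, labelled with the category names
print(classification_report(y_test, y_preds,
                            target_names=newsgroups_test.target_names))

# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, y_preds))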