from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.decomposition import PCA
# Load a binary text-classification task from 20 Newsgroups:
# atheism vs. space posts, using the canonical train/test split.
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target
class DenseTransformer(BaseEstimator, TransformerMixin):
    """Convert a scipy sparse matrix into a dense numpy array.

    PCA cannot consume sparse input, so this adapter sits between
    TfidfVectorizer (sparse output) and PCA in the pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op: the transformer is stateless."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Uses ``toarray()`` instead of the original ``todense()``:
        ``todense`` returns the deprecated ``numpy.matrix`` type, which
        some downstream estimators reject or handle incorrectly.
        """
        return X.toarray()
# Processing chain: tf-idf features -> densify -> PCA -> decision tree.
steps = [
    ('tfidf', TfidfVectorizer()),
    ('to_dense', DenseTransformer()),
    ('pca', PCA()),
    ('clf', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps)
# Parameter grid for GridSearchCV: sweep the tf-idf document-frequency
# cutoff.  Each dict in the list is an independent sub-grid of settings.
param_grid = [
    {
        'tfidf__max_df': [0.8, 0.9, 1.0],
    },
]
# 3-fold cross-validation for each of the 3 parameter combinations,
# scored with micro-averaged F1.  (The original comment said 6
# combinations; the grid above has 3.)
grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid, scoring='f1_micro')
grid.fit(X_train, y_train)
# Summarize the search: best configuration, then mean/std of the CV
# score for every candidate tried.
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# Retrain on the full training set with the best configuration found by
# the grid search, then evaluate on the held-out test set.
# BUGFIX: the original called
#   pipeline.set_params(clf__penalty='l2', vect__max_df=0.9, clf__dual=True)
# which raises ValueError: the vectorizer step is named 'tfidf', not
# 'vect', and DecisionTreeClassifier has no 'penalty'/'dual' parameters
# (those belong to LinearSVC).  Apply the winning parameters instead.
pipeline.set_params(**grid.best_params_)
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
# Report micro-averaged F1 on the test set.
print(f1_score(y_test, y_preds, average='micro'))