In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.decomposition import TruncatedSVD

import matplotlib.pyplot as plt

import math

# Restrict the corpus to two newsgroups so the task is binary classification.
cats = ['alt.atheism', 'sci.space']

# Same category filter for both splits; the generator yields train first, then test.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, categories=cats)
    for split_name in ('train', 'test')
)
In [2]:
# Pull the documents (.data) and labels (.target) out of each split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target
In [131]:
class SkippableTruncatedSVD(TruncatedSVD):
    """TruncatedSVD that can be turned into a no-op with a ``skip`` flag.

    When ``skip`` is True the estimator is a pass-through: ``fit`` does
    nothing and ``transform`` / ``fit_transform`` return their input
    unchanged. When ``skip`` is False every call is delegated to
    ``TruncatedSVD``. Exposing the flag as a constructor parameter lets it be
    set through ``set_params`` (e.g. ``svd__skip`` inside a pipeline).

    Parameters
    ----------
    skip : bool, default=False
        If True, bypass the decomposition entirely.
    n_components, algorithm, n_iter, random_state, tol
        Forwarded unchanged to ``TruncatedSVD``.
    """

    def __init__(self,skip=False,n_components=2, algorithm="randomized", n_iter=5,
                 random_state=None, tol=0.):
        self.skip = skip

        # Forward by keyword, not by position: the parent's parameters after
        # n_components are keyword-only in current scikit-learn, and its
        # parameter list has grown over time, so positional forwarding either
        # raises TypeError or binds values to the wrong parameter.
        super().__init__(n_components=n_components, algorithm=algorithm,
                         n_iter=n_iter, random_state=random_state, tol=tol)

    def fit(self, X, y=None):
        """Fit the SVD on X unless skipping; returns self when skipping."""
        if self.skip:
            return self
        return super().fit(X,y)

    def fit_transform(self, X, y=None):
        """Fit and reduce X, or return X untouched when skipping."""
        if self.skip:
            return X
        return super().fit_transform(X,y)

    def transform(self, X):
        """Reduce X with the fitted SVD, or return X untouched when skipping."""
        if self.skip:
            return X
        return super().transform(X)
In [125]:
# Search space for the pipeline; keys follow the <step name>__<parameter> form.
# NOTE: when 'svd__skip' is True the SVD step returns its input unchanged, so
# 'svd__n_components' has no effect for those combinations.
param_grid = [
    dict(
        tfidf__max_features=[100, 200, 500, 1000],
        svd__skip=[True, False],
        svd__n_components=[2, 5, 10, 20],
    )
]
In [129]:
# Number of parameter combinations the grid expands to.
len(ParameterGrid(param_grid))
Out[129]:
32
In [127]:
# TF-IDF features -> optional SVD reduction -> logistic regression.
pipeline = Pipeline([
    ('tfidf',TfidfVectorizer()),
    # fixed seed so the randomized SVD produces the same curves on every run
    ('svd',SkippableTruncatedSVD(random_state=0)),
    ('clf',LogisticRegression())
])

# expand the grid once so the subplot layout and the loop use the same list
param_sets = list(ParameterGrid(param_grid))

num_cols = 3
num_rows = math.ceil(len(param_sets) / num_cols)

# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so the [row, col] indexing below always works
fig,axes = plt.subplots(num_rows,num_cols,sharey=True,squeeze=False)
fig.set_size_inches(num_cols*5,num_rows*5)

for i,g in enumerate(param_sets):

    # re-configure and re-fit the same pipeline for this combination
    pipeline.set_params(**g)
    pipeline.fit(X_train,y_train)

    y_preds = pipeline.predict_proba(X_test)

    # take the second column because the classifier outputs scores for
    # the 0 class as well
    preds = y_preds[:,1]

    # fpr means false-positive-rate
    # tpr means true-positive-rate
    fpr, tpr, _ = metrics.roc_curve(y_test, preds)

    auc_score = metrics.auc(fpr, tpr)

    # fill the subplot grid row by row
    ax = axes[i // num_cols, i % num_cols]

    ax.set_title(str(g),fontsize=8)
    ax.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc_score))
    ax.legend(loc='lower right')

    # it's helpful to add a diagonal to indicate where chance
    # scores lie (i.e. just flipping a coin)
    ax.plot([0,1],[0,1],'r--')

    ax.set_xlim([-0.1,1.1])
    ax.set_ylim([-0.1,1.1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')

# the layout can have more cells than parameter sets; hide the leftovers
for unused_ax in axes.ravel()[len(param_sets):]:
    unused_ax.set_visible(False)

plt.show()
<matplotlib.figure.Figure at 0x7fdbff28dac8>
<matplotlib.figure.Figure at 0x7fdbff297c50>
In [ ]: