from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
import numpy as np

news_data = fetch_20newsgroups(shuffle=True, random_state=0)
texts = news_data.data
labels = news_data.target
print(len(texts), np.unique(labels))

## different vectorizers
## hasher alone - l2 normalization is advisable before k-means
hasher = HashingVectorizer(stop_words='english', n_features=10000,
                           alternate_sign=True, norm='l2', binary=False)

## tf-idf does its own normalization, so the hashing step here only
## needs to extract term counts (alternate_sign=False keeps them non-negative)
idf_hasher = Pipeline(steps=[
    ('hasher', HashingVectorizer(stop_words='english', n_features=10000,
                                 alternate_sign=False, norm=None, binary=False)),
    ('tf_idf', TfidfTransformer())
])

## use tf-idf vectorization directly
tfidf = TfidfVectorizer(max_df=0.5, stop_words='english',
                        use_idf=True, norm='l2')

vectorizers = {'hasher': hasher, 'idf_hasher': idf_hasher, 'tfidf': tfidf}

## LSA dimensionality reduction - applied after vectorization.
## The vectorizer outputs are normalized, which makes k-means behave
## as expected. The LSA/SVD output is NOT normalized, so we
## have to redo the normalization.
lsa = TruncatedSVD(n_components=500)
normalizer = Normalizer(copy=True, norm='l2')
lsa_normalizer = Pipeline(steps=[
    ('lsa', lsa),
    ('normalizer', normalizer)
])

## clustering algorithm
km = MiniBatchKMeans(n_clusters=20, init_size=1000, batch_size=1000, verbose=0)

from sklearn.base import BaseEstimator, TransformerMixin

class OptionalStep(BaseEstimator, TransformerMixin):
    """A pipeline step that can be switched off; when off, X passes through unchanged."""
    def __init__(self, estimator=None, on=True):
        self.estimator = estimator
        self.on = on
    def fit(self, X, y=None):
        if self.on:
            self.estimator.fit(X, y)
        return self
    def transform(self, X):
        if self.on:
            return self.estimator.transform(X)
        return X

## compare each vectorizer with and without LSA dimensionality reduction
for vectorizer_name, vectorizer in vectorizers.items():
    print('=============================================')
    print(vectorizer_name)
    %time X = vectorizer.fit_transform(texts)
    ## create a fresh LSA pipeline each iteration to avoid sign indeterminacy
    lsa_normalizer = Pipeline(steps=[
        ('lsa', TruncatedSVD(n_components=500)),
        ('normalizer', Normalizer(copy=True, norm='l2'))])
    %time Z = lsa_normalizer.fit_transform(X)
    %time km.fit(X)
    clustered_X = km.labels_
    print('clustering result without lsa:',
          metrics.adjusted_mutual_info_score(labels, clustered_X))
    %time km.fit(Z)
    clustered_Z = km.labels_
    print('clustering result with lsa:',
          metrics.adjusted_mutual_info_score(labels, clustered_Z))
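
## The OptionalStep helper above is defined but never exercised in the loop.
## Below is a minimal sketch of how it could fold the with/without-LSA
## comparison into a single pipeline; the step names ('vect', 'lsa', 'svd',
## 'norm') and the reuse of the tfidf settings from above are illustrative
## assumptions, not part of the original experiment.
for use_lsa in (False, True):
    pipe = Pipeline(steps=[
        ('vect', TfidfVectorizer(max_df=0.5, stop_words='english',
                                 use_idf=True, norm='l2')),
        ## when on=False, OptionalStep passes the sparse tf-idf matrix through
        ('lsa', OptionalStep(on=use_lsa, estimator=Pipeline(steps=[
            ('svd', TruncatedSVD(n_components=500)),
            ('norm', Normalizer(copy=True, norm='l2'))])))
    ])
    Z = pipe.fit_transform(texts)
    km.fit(Z)
    print('lsa on:' if use_lsa else 'lsa off:',
          metrics.adjusted_mutual_info_score(labels, km.labels_))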