from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
import numpy as np

news_data = fetch_20newsgroups(shuffle=True, random_state=0)
texts = news_data.data
labels = news_data.target
print(len(texts), np.unique(labels))

## different vectorizers
## hasher alone - l2 normalization is advisable before k-means
hasher = HashingVectorizer(stop_words='english', n_features=10000,
                           alternate_sign=True, norm='l2', binary=False)

## tf-idf does its own normalization, so the hashing step here only
## needs to extract term counts (alternate_sign=False keeps them non-negative)
idf_hasher = Pipeline(steps=[
    ('hasher', HashingVectorizer(stop_words='english', n_features=10000,
                                 alternate_sign=False, norm=None, binary=False)),
    ('tf_idf', TfidfTransformer())
])

## use tf-idf vectorization directly
tfidf = TfidfVectorizer(max_df=0.5, stop_words='english',
                        use_idf=True, norm='l2')

vectorizers = {'hasher': hasher, 'idf_hasher': idf_hasher, 'tfidf': tfidf}

## LSA dimensionality reduction - applied after vectorization.
## The vectorizer outputs are normalized, which makes k-means behave
## as expected. The LSA/SVD output is NOT normalized, so we
## have to redo the normalization.
lsa = TruncatedSVD(n_components=500)
normalizer = Normalizer(copy=True, norm='l2')
lsa_normalizer = Pipeline(steps=[
    ('lsa', lsa),
    ('normalizer', normalizer)
])

## clustering algorithm
km = MiniBatchKMeans(n_clusters=20, init_size=1000, batch_size=1000, verbose=0)

from sklearn.base import BaseEstimator, TransformerMixin

class OptionalStep(BaseEstimator, TransformerMixin):
    """A pipeline step that can be switched off; when off, X passes through unchanged."""
    def __init__(self, estimator=None, on=True):
        self.estimator = estimator
        self.on = on
    def fit(self, X, y=None):
        if self.on:
            self.estimator.fit(X, y)
        return self
    def transform(self, X):
        if self.on:
            return self.estimator.transform(X)
        return X

## compare each vectorizer with and without LSA dimensionality reduction
for vectorizer_name, vectorizer in vectorizers.items():
    print('=============================================')
    print(vectorizer_name)
    %time X = vectorizer.fit_transform(texts)
    ## create a fresh LSA pipeline each iteration to avoid sign indeterminacy
    lsa_normalizer = Pipeline(steps=[
        ('lsa', TruncatedSVD(n_components=500)),
        ('normalizer', Normalizer(copy=True, norm='l2'))])
    %time Z = lsa_normalizer.fit_transform(X)
    %time km.fit(X)
    clustered_X = km.labels_
    print('clustering result without lsa:',
          metrics.adjusted_mutual_info_score(labels, clustered_X))
    %time km.fit(Z)
    clustered_Z = km.labels_
    print('clustering result with lsa:',
          metrics.adjusted_mutual_info_score(labels, clustered_Z))
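
## The OptionalStep helper above is defined but never exercised in the loop.
## Below is a minimal sketch of how it could fold the with/without-LSA
## comparison into a single pipeline; the step names ('vect', 'lsa', 'svd',
## 'norm') and the reuse of the tfidf settings from above are illustrative
## assumptions, not part of the original experiment.
for use_lsa in (False, True):
    pipe = Pipeline(steps=[
        ('vect', TfidfVectorizer(max_df=0.5, stop_words='english',
                                 use_idf=True, norm='l2')),
        ## when on=False, OptionalStep passes the sparse tf-idf matrix through
        ('lsa', OptionalStep(on=use_lsa, estimator=Pipeline(steps=[
            ('svd', TruncatedSVD(n_components=500)),
            ('norm', Normalizer(copy=True, norm='l2'))])))
    ])
    Z = pipe.fit_transform(texts)
    km.fit(Z)
    print('lsa on:' if use_lsa else 'lsa off:',
          metrics.adjusted_mutual_info_score(labels, km.labels_))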