from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
import numpy as np
news_data = fetch_20newsgroups(shuffle=True, random_state = 0)
texts = news_data.data
labels = news_data.target
print len(texts), np.unique(labels)
11314 [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]
## different vectorizer
## hasher alone - better perform l2 normalization for kmeans
hasher = HashingVectorizer(stop_words='english', n_features = 10000,
non_negative=False,
norm = 'l2', binary = False)
## since tfidf will do the normalization, so
## the previous hasher step only need to extract the counts of words
idf_hasher = Pipeline(steps = [
('hasher', HashingVectorizer(stop_words='english',
n_features = 10000,
non_negative=True,
norm = None,
binary = False)),
('tf_idf', TfidfTransformer())
])
## use tfidf vectorization directly
tfidf = TfidfVectorizer(max_df=0.5, stop_words='english',
use_idf = True, norm = 'l2')
vectorizers = {'hasher': hasher,
'idf_hasher': idf_hasher,
'tfidf': tfidf}
## LSA dimensionality reduction - after vectorization
## vectorizer results are normalized, which makes kmeans behave
## as expected. Since LSA/SVD results are NOT normalized, we
## have to redo the normalization
lsa = TruncatedSVD(n_components=500)
normalizer = Normalizer(copy = True, norm='l2')
lsa_normalizer = Pipeline(steps = [
('lsa', lsa),
('normalizer', normalizer)
])
## clustering algorithms
km = MiniBatchKMeans(n_clusters = 20, init_size=1000,
batch_size = 1000, verbose = 0)
from sklearn.base import BaseEstimator, TransformerMixin
class OptionalStep(BaseEstimator, TransformerMixin):
def __init__(self, estimator = None, on = True):
self.estimator = estimator
self.on = on
def fit(self, X, y = None):
if self.on:
self.estimator.fit(X, y)
return self
def transform(self, X):
if self.on:
return self.estimator.transform(X)
else:
return X
## testing vectorizer without lsa dimensionality reduction
for vectorizer_name, vectorizer in vectorizers.items():
print '============================================='
print vectorizer_name
%time X = vectorizer.fit_transform(texts)
## create new lsa to avoid sign indeterminancy
lsa_normalizer = Pipeline(steps = [
('lsa', TruncatedSVD(n_components=500)),
('normalizer', Normalizer(copy = True, norm='l2'))])
%time Z = lsa_normalizer.fit_transform(X)
%time km.fit(X)
clustered_X = km.labels_
print 'clustering result without lsa:',
print metrics.adjusted_mutual_info_score(labels, clustered_X)
%time km.fit(Z)
clustered_Z = km.labels_
print 'clustering result with lsa:',
print metrics.adjusted_mutual_info_score(labels, clustered_Z)
============================================= tfidf CPU times: user 4.7 s, sys: 76 ms, total: 4.78 s Wall time: 4.72 s CPU times: user 48.1 s, sys: 12.4 s, total: 1min Wall time: 42.2 s CPU times: user 2.59 s, sys: 12 ms, total: 2.6 s Wall time: 2.6 s clustering result without lsa: 0.267173201537 CPU times: user 1.94 s, sys: 424 ms, total: 2.36 s Wall time: 1.96 s clustering result with lsa: 0.310096714648 ============================================= idf_hasher CPU times: user 4.73 s, sys: 24 ms, total: 4.76 s Wall time: 4.74 s CPU times: user 20.1 s, sys: 3.25 s, total: 23.4 s Wall time: 18 s CPU times: user 1.06 s, sys: 0 ns, total: 1.06 s Wall time: 1.05 s clustering result without lsa: 0.288526525903 CPU times: user 2.49 s, sys: 392 ms, total: 2.88 s Wall time: 2.48 s clustering result with lsa: 0.30267315394 ============================================= hasher CPU times: user 4.59 s, sys: 24 ms, total: 4.61 s Wall time: 4.6 s CPU times: user 20.5 s, sys: 3.37 s, total: 23.8 s Wall time: 18.2 s CPU times: user 1.76 s, sys: 4 ms, total: 1.77 s Wall time: 1.77 s clustering result without lsa: 0.176192306413 CPU times: user 2.42 s, sys: 404 ms, total: 2.82 s Wall time: 2.42 s clustering result with lsa: 0.171964867561
Results with high n_features in hasher
============================================= tfidf CPU times: user 5.45 s, sys: 56 ms, total: 5.51 s Wall time: 5.48 s CPU times: user 47.7 s, sys: 12.5 s, total: 1min Wall time: 42.1 s CPU times: user 2.98 s, sys: 32 ms, total: 3.01 s Wall time: 3.01 s clustering result without lsa: 0.289170149848 CPU times: user 1.92 s, sys: 436 ms, total: 2.35 s Wall time: 1.95 s clustering result with lsa: 0.348944495858
============================================= idf_hasher CPU times: user 5.1 s, sys: 64 ms, total: 5.17 s Wall time: 5.14 s CPU times: user 5min 59s, sys: 44 s, total: 6min 43s Wall time: 5min 9s CPU times: user 12.4 s, sys: 216 ms, total: 12.6 s Wall time: 12.7 s clustering result without lsa: 0.303035797432 CPU times: user 2.31 s, sys: 448 ms, total: 2.76 s Wall time: 2.35 s clustering result with lsa: 0.312858769047
============================================= hasher CPU times: user 4.65 s, sys: 32 ms, total: 4.68 s Wall time: 4.65 s CPU times: user 5min 35s, sys: 46.8 s, total: 6min 21s Wall time: 4min 48s CPU times: user 13.5 s, sys: 192 ms, total: 13.7 s Wall time: 13.7 s clustering result without lsa: 0.165880219367 CPU times: user 2.1 s, sys: 424 ms, total: 2.52 s Wall time: 2.12 s clustering result with lsa: 0.175990937104