Notebook

In [ ]:

## Differnt types of

In [1]:

import numpy as np
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics

In [2]:

train_newsgroup = fetch_20newsgroups(subset='train')
test_newsgroup = fetch_20newsgroups(subset='test')
train_texts, train_labels = train_newsgroup.data, train_newsgroup.target
test_texts, test_labels = test_newsgroup.data, test_newsgroup.target
categories = train_newsgroup.target_names
print len(train_texts)
print len(test_texts)
print len(categories)

11314
7532
20

In [14]:

## vectorizer - hasher or tfidf

## POSITIVE = TRUE for hasher because later chi2 selector
## requires all the features to be positive
hasher = HashingVectorizer(stop_words = 'english', 
                           non_negative = True,
                           n_features = 100000)
vectorizer = TfidfVectorizer(max_df = 0.5, 
                             stop_words='english')
%time hashed_train_X = hasher.fit_transform(train_texts)
%time hashed_test_X = hasher.transform(test_texts)
%time vectorized_train_X = vectorizer.fit_transform(train_texts)
%time vectorized_test_X = vectorizer.transform(test_texts)
print hashed_train_X.shape, hashed_test_X.shape
print vectorized_train_X.shape, vectorized_test_X.shape

CPU times: user 5.1 s, sys: 32 ms, total: 5.13 s
Wall time: 5.11 s
CPU times: user 2.94 s, sys: 72 ms, total: 3.01 s
Wall time: 3.01 s
CPU times: user 4.68 s, sys: 80 ms, total: 4.76 s
Wall time: 4.75 s
CPU times: user 2.68 s, sys: 44 ms, total: 2.72 s
Wall time: 2.72 s
(11314, 100000) (7532, 100000)
(11314, 129792) (7532, 129792)

In [15]:

## feature selection by chi-squared test
ch2 = SelectKBest(chi2, k = 1000)
%time ch2_hashed_train_X = ch2.fit_transform(hashed_train_X, train_labels)
%time ch2_hashed_test_X = ch2.transform(hashed_test_X)
%time ch2_vectorized_train_X = ch2.fit_transform(vectorized_train_X, train_labels)
%time ch2_vectorized_test_X = ch2.transform(vectorized_test_X)

print ch2_hashed_train_X.shape, ch2_hashed_test_X.shape
print ch2_vectorized_train_X.shape, ch2_vectorized_test_X.shape

CPU times: user 216 ms, sys: 0 ns, total: 216 ms
Wall time: 217 ms
CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 31.9 ms
CPU times: user 264 ms, sys: 4 ms, total: 268 ms
Wall time: 269 ms
CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 31.7 ms
(11314, 1000) (7532, 1000)
(11314, 1000) (7532, 1000)

In [71]:

## feature extraction lsa
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=1000)
lsa2 = TruncatedSVD(n_components=1000)
%time lsa_hashed_train_X = lsa.fit_transform(hashed_train_X)
%time lsa_hashed_test_X = lsa.transform(hashed_test_X)
%time lsa_vectorized_train_X = lsa2.fit_transform(vectorized_train_X)
%time lsa_vectorized_test_X = lsa2.transform(vectorized_test_X)

print lsa_hashed_train_X.shape, lsa_hashed_test_X.shape
print lsa_vectorized_train_X.shape, lsa_vectorized_test_X.shape

CPU times: user 1min 54s, sys: 29.1 s, total: 2min 23s
Wall time: 1min 46s
CPU times: user 1.4 s, sys: 480 ms, total: 1.88 s
Wall time: 1.79 s
CPU times: user 2min 8s, sys: 31.7 s, total: 2min 40s
Wall time: 1min 55s
CPU times: user 1.61 s, sys: 632 ms, total: 2.24 s

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-71-f53aacea25ce> in <module>()
      9 
     10 print lsa_hashed_train_X.shape, lsa_hashed_test_X.shape
---> 11 print lsa_vectorized_train_X.shape, lsa_vectorized_test_X.shape

NameError: name 'lsa_vectorized_test_X' is not defined

Wall time: 2.14 s
(11314, 1000) (7532, 1000)
(11314, 1000)

In [73]:

datasets = {
    'hashed': ((hashed_train_X, train_labels), (hashed_test_X, test_labels)),
    'vectorized': ((vectorized_train_X, train_labels), (vectorized_test_X, test_labels)),
    'ch2_hashed': ((ch2_hashed_train_X, train_labels), (ch2_hashed_test_X, test_labels)),
    'ch2_vectorized': ((ch2_vectorized_train_X, train_labels), (ch2_vectorized_test_X, test_labels)),
    'lsa_hashed': ((lsa_hashed_train_X, train_labels), (lsa_hashed_test_X, test_labels)),
    'lsa_vectorized': ((lsa_vectorized_train_X, train_labels), (lsa_vectorized_test_X, test_labels))
}

In [74]:

## classifiers
classifiers = {
    'pa': PassiveAggressiveClassifier(n_iter = 50),
    'knn': KNeighborsClassifier(n_neighbors=10),
    'linearsvcl2': LinearSVC(penalty='l2', ),
    ## SVC can use l1 penalty only when dual is False
    'linearsvcl1': LinearSVC(penalty='l1', dual=False),
    'sgdl12': SGDClassifier(penalty='elasticnet', n_iter=50),
    'sgdl1': SGDClassifier(penalty='l1', n_iter=50),
    'sgdl2': SGDClassifier(penalty='l2', n_iter=50),
    'rocchio': NearestCentroid(),
    'mnb': MultinomialNB(alpha = 0.01),
    'bnb': BernoulliNB(alpha = 0.01)
}

In [75]:

## Persist data files
from os import path
from sklearn.externals import joblib
def persist_data(data_name, data, folder = 'data/temp'):
    data_file = path.join(path.abspath(folder), data_name+'.pkl')
    files = joblib.dump(data, data_file)
    return data_file

data_files = [(name, persist_data(name, data)) 
              for name, data in datasets.items()]

In [76]:

from itertools import product

model_data_infor = list(product(classifiers.items(), data_files))
(model_name, model), (data_name, data_file) = model_data_infor[0]
print model_name
print model
print data_name
print data_file
print len(model_data_infor)

knn
KNeighborsClassifier(algorithm=auto, leaf_size=30, metric=minkowski,
           n_neighbors=10, p=2, weights=uniform)
ch2_vectorized
/home/ce/workspace/x/sklearn-examples/temp/ch2_vectorized.pkl
60

In [84]:

## benchmark methods
def benchmark_model(infor):
    from sklearn.externals import joblib
    (model_name, model), (data_name, data_file) = infor
    train_data, test_data = joblib.load(data_file)
    try:
        model.fit(*train_data)
        train_score = model.score(*train_data)
        test_score = model.score(*test_data)
    except:
        return (model_name, data_name, 0, 0)
    return (model_name, data_name, train_score, test_score)

In [85]:

from IPython.parallel import Client
client = Client()
dv = client[:]
lb_view = client.load_balanced_view()
print len(client)

In [86]:

results = lb_view.map(benchmark_model, model_data_infor, block = True)

In [87]:

len(results)

Out[87]:

In [90]:

results = [(r[0]+'_on_'+r[1], r[3]) for r in results]

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-90-6d082f0c34ac> in <module>()
----> 1 results = [(r[0]+'_on_'+r[1], r[3]) for r in results]

TypeError: cannot concatenate 'str' and 'numpy.float64' objects

In [91]:

## PLOT RESULTS
index = range(len(results))
models, scores = zip(*results)
figure(figsize=(50, 16))
bar(index, scores,)
_ = xticks(np.asarray(index)+0.5, models)
locs, labels = xticks()
_ = setp(labels, rotation = 90, fontsize = 30)

In [92]:

from pprint import pprint
pprint(sorted(results, key = lambda x: x[1], reverse = True))

[('linearsvcl2_on_vectorized', 0.85143388210302706),
 ('sgdl2_on_vectorized', 0.85130111524163565),
 ('pa_on_vectorized', 0.84784917684545935),
 ('mnb_on_hashed', 0.83404142326075414),
 ('mnb_on_vectorized', 0.83390865639936274),
 ('linearsvcl2_on_hashed', 0.83284652150823157),
 ('linearsvcl2_on_lsa_vectorized', 0.83178438661710041),
 ('sgdl2_on_hashed', 0.8264737121614445),
 ('sgdl2_on_lsa_vectorized', 0.82328730748805101),
 ('linearsvcl1_on_lsa_vectorized', 0.81956983536909189),
 ('pa_on_hashed', 0.81638343069569841),
 ('linearsvcl1_on_vectorized', 0.81558682952734995),
 ('sgdl12_on_lsa_vectorized', 0.80071694105151359),
 ('linearsvcl1_on_hashed', 0.80018587360594795),
 ('pa_on_lsa_vectorized', 0.79739776951672858),
 ('linearsvcl2_on_lsa_hashed', 0.79646840148698883),
 ('sgdl12_on_vectorized', 0.79434413170472651),
 ('sgdl1_on_lsa_vectorized', 0.79182156133828996),
 ('sgdl2_on_lsa_hashed', 0.78770578863515661),
 ('linearsvcl1_on_lsa_hashed', 0.7874402549123739),
 ('sgdl1_on_vectorized', 0.78491768454593736),
 ('sgdl12_on_hashed', 0.7772172065852363),
 ('bnb_on_vectorized', 0.77110993096123204),
 ('sgdl1_on_hashed', 0.76712692511949021),
 ('bnb_on_hashed', 0.76367498672331391),
 ('linearsvcl2_on_ch2_vectorized', 0.76101964949548595),
 ('linearsvcl2_on_ch2_hashed', 0.75862984599044081),
 ('sgdl12_on_lsa_hashed', 0.75716941051513542),
 ('sgdl2_on_ch2_vectorized', 0.75491237387148169),
 ('linearsvcl1_on_ch2_vectorized', 0.75477960701009028),
 ('pa_on_ch2_vectorized', 0.75424853956452464),
 ('mnb_on_ch2_vectorized', 0.75318640467339348),
 ('mnb_on_ch2_hashed', 0.75318640467339348),
 ('linearsvcl1_on_ch2_hashed', 0.75066383430695693),
 ('rocchio_on_vectorized', 0.75013276686139141),
 ('bnb_on_ch2_vectorized', 0.75013276686139141),
 ('sgdl1_on_lsa_hashed', 0.74734466277217204),
 ('sgdl2_on_ch2_hashed', 0.74548592671269254),
 ('pa_on_ch2_hashed', 0.74349442379182151),
 ('pa_on_lsa_hashed', 0.74349442379182151),
 ('sgdl12_on_ch2_vectorized', 0.7399097185342538),
 ('bnb_on_lsa_vectorized', 0.73911311736590546),
 ('bnb_on_ch2_hashed', 0.73659054699946891),
 ('sgdl12_on_ch2_hashed', 0.73592671269251198),
 ('rocchio_on_lsa_vectorized', 0.73313860860329261),
 ('sgdl1_on_ch2_vectorized', 0.73287307488050979),
 ('sgdl1_on_ch2_hashed', 0.73167817312798722),
 ('knn_on_vectorized', 0.68215613382899631),
 ('bnb_on_lsa_hashed', 0.6785714285714286),
 ('knn_on_hashed', 0.6258629845990441),
 ('knn_on_ch2_vectorized', 0.61404673393520981),
 ('rocchio_on_hashed', 0.61139139670738185),
 ('rocchio_on_ch2_vectorized', 0.60847052575677107),
 ('rocchio_on_lsa_hashed', 0.60223048327137552),
 ('rocchio_on_ch2_hashed', 0.52522570366436538),
 ('knn_on_lsa_hashed', 0.49097185342538502),
 ('knn_on_ch2_hashed', 0.4549920339883165),
 ('knn_on_lsa_vectorized', 0.43813064259160911),
 ('mnb_on_lsa_vectorized', 0),
 ('mnb_on_lsa_hashed', 0)]

Linear models (sgd, passive-agressive, svc, multinomial-nb) work superisingly well with text data, especially with a l2 norm penalty

l2 penalty works better than l1, so it is like that correlations of different terms are NOT very big - and the text classification task is better done by using all words

** Tf-idf is still one of the best for text terms normalization, sometimes LSA (TruncatedSVD) is also useful, hashing is useful for speed-up, but not as accurate as tfidf (specially the idf) part**

In [ ]: