## Differnt types of
import numpy as np
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
train_newsgroup = fetch_20newsgroups(subset='train')
test_newsgroup = fetch_20newsgroups(subset='test')
train_texts, train_labels = train_newsgroup.data, train_newsgroup.target
test_texts, test_labels = test_newsgroup.data, test_newsgroup.target
categories = train_newsgroup.target_names
print len(train_texts)
print len(test_texts)
print len(categories)
11314 7532 20
## vectorizer - hasher or tfidf
## POSITIVE = TRUE for hasher because later chi2 selector
## requires all the features to be positive
hasher = HashingVectorizer(stop_words = 'english',
non_negative = True,
n_features = 100000)
vectorizer = TfidfVectorizer(max_df = 0.5,
stop_words='english')
%time hashed_train_X = hasher.fit_transform(train_texts)
%time hashed_test_X = hasher.transform(test_texts)
%time vectorized_train_X = vectorizer.fit_transform(train_texts)
%time vectorized_test_X = vectorizer.transform(test_texts)
print hashed_train_X.shape, hashed_test_X.shape
print vectorized_train_X.shape, vectorized_test_X.shape
CPU times: user 5.1 s, sys: 32 ms, total: 5.13 s Wall time: 5.11 s CPU times: user 2.94 s, sys: 72 ms, total: 3.01 s Wall time: 3.01 s CPU times: user 4.68 s, sys: 80 ms, total: 4.76 s Wall time: 4.75 s CPU times: user 2.68 s, sys: 44 ms, total: 2.72 s Wall time: 2.72 s (11314, 100000) (7532, 100000) (11314, 129792) (7532, 129792)
## feature selection by chi-squared test
ch2 = SelectKBest(chi2, k = 1000)
%time ch2_hashed_train_X = ch2.fit_transform(hashed_train_X, train_labels)
%time ch2_hashed_test_X = ch2.transform(hashed_test_X)
%time ch2_vectorized_train_X = ch2.fit_transform(vectorized_train_X, train_labels)
%time ch2_vectorized_test_X = ch2.transform(vectorized_test_X)
print ch2_hashed_train_X.shape, ch2_hashed_test_X.shape
print ch2_vectorized_train_X.shape, ch2_vectorized_test_X.shape
CPU times: user 216 ms, sys: 0 ns, total: 216 ms Wall time: 217 ms CPU times: user 32 ms, sys: 0 ns, total: 32 ms Wall time: 31.9 ms CPU times: user 264 ms, sys: 4 ms, total: 268 ms Wall time: 269 ms CPU times: user 32 ms, sys: 0 ns, total: 32 ms Wall time: 31.7 ms (11314, 1000) (7532, 1000) (11314, 1000) (7532, 1000)
## feature extraction lsa
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=1000)
lsa2 = TruncatedSVD(n_components=1000)
%time lsa_hashed_train_X = lsa.fit_transform(hashed_train_X)
%time lsa_hashed_test_X = lsa.transform(hashed_test_X)
%time lsa_vectorized_train_X = lsa2.fit_transform(vectorized_train_X)
%time lsa_vectorized_test_X = lsa2.transform(vectorized_test_X)
print lsa_hashed_train_X.shape, lsa_hashed_test_X.shape
print lsa_vectorized_train_X.shape, lsa_vectorized_test_X.shape
CPU times: user 1min 54s, sys: 29.1 s, total: 2min 23s Wall time: 1min 46s CPU times: user 1.4 s, sys: 480 ms, total: 1.88 s Wall time: 1.79 s CPU times: user 2min 8s, sys: 31.7 s, total: 2min 40s Wall time: 1min 55s CPU times: user 1.61 s, sys: 632 ms, total: 2.24 s
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-71-f53aacea25ce> in <module>() 9 10 print lsa_hashed_train_X.shape, lsa_hashed_test_X.shape ---> 11 print lsa_vectorized_train_X.shape, lsa_vectorized_test_X.shape NameError: name 'lsa_vectorized_test_X' is not defined
Wall time: 2.14 s (11314, 1000) (7532, 1000) (11314, 1000)
datasets = {
'hashed': ((hashed_train_X, train_labels), (hashed_test_X, test_labels)),
'vectorized': ((vectorized_train_X, train_labels), (vectorized_test_X, test_labels)),
'ch2_hashed': ((ch2_hashed_train_X, train_labels), (ch2_hashed_test_X, test_labels)),
'ch2_vectorized': ((ch2_vectorized_train_X, train_labels), (ch2_vectorized_test_X, test_labels)),
'lsa_hashed': ((lsa_hashed_train_X, train_labels), (lsa_hashed_test_X, test_labels)),
'lsa_vectorized': ((lsa_vectorized_train_X, train_labels), (lsa_vectorized_test_X, test_labels))
}
## classifiers
classifiers = {
'pa': PassiveAggressiveClassifier(n_iter = 50),
'knn': KNeighborsClassifier(n_neighbors=10),
'linearsvcl2': LinearSVC(penalty='l2', ),
## SVC can use l1 penalty only when dual is False
'linearsvcl1': LinearSVC(penalty='l1', dual=False),
'sgdl12': SGDClassifier(penalty='elasticnet', n_iter=50),
'sgdl1': SGDClassifier(penalty='l1', n_iter=50),
'sgdl2': SGDClassifier(penalty='l2', n_iter=50),
'rocchio': NearestCentroid(),
'mnb': MultinomialNB(alpha = 0.01),
'bnb': BernoulliNB(alpha = 0.01)
}
## Persist data files
from os import path
from sklearn.externals import joblib
def persist_data(data_name, data, folder = 'data/temp'):
data_file = path.join(path.abspath(folder), data_name+'.pkl')
files = joblib.dump(data, data_file)
return data_file
data_files = [(name, persist_data(name, data))
for name, data in datasets.items()]
from itertools import product
model_data_infor = list(product(classifiers.items(), data_files))
(model_name, model), (data_name, data_file) = model_data_infor[0]
print model_name
print model
print data_name
print data_file
print len(model_data_infor)
knn KNeighborsClassifier(algorithm=auto, leaf_size=30, metric=minkowski, n_neighbors=10, p=2, weights=uniform) ch2_vectorized /home/ce/workspace/x/sklearn-examples/temp/ch2_vectorized.pkl 60
## benchmark methods
def benchmark_model(infor):
from sklearn.externals import joblib
(model_name, model), (data_name, data_file) = infor
train_data, test_data = joblib.load(data_file)
try:
model.fit(*train_data)
train_score = model.score(*train_data)
test_score = model.score(*test_data)
except:
return (model_name, data_name, 0, 0)
return (model_name, data_name, train_score, test_score)
from IPython.parallel import Client
client = Client()
dv = client[:]
lb_view = client.load_balanced_view()
print len(client)
24
results = lb_view.map(benchmark_model, model_data_infor, block = True)
len(results)
60
results = [(r[0]+'_on_'+r[1], r[3]) for r in results]
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-90-6d082f0c34ac> in <module>() ----> 1 results = [(r[0]+'_on_'+r[1], r[3]) for r in results] TypeError: cannot concatenate 'str' and 'numpy.float64' objects
## PLOT RESULTS
index = range(len(results))
models, scores = zip(*results)
figure(figsize=(50, 16))
bar(index, scores,)
_ = xticks(np.asarray(index)+0.5, models)
locs, labels = xticks()
_ = setp(labels, rotation = 90, fontsize = 30)
from pprint import pprint
pprint(sorted(results, key = lambda x: x[1], reverse = True))
[('linearsvcl2_on_vectorized', 0.85143388210302706), ('sgdl2_on_vectorized', 0.85130111524163565), ('pa_on_vectorized', 0.84784917684545935), ('mnb_on_hashed', 0.83404142326075414), ('mnb_on_vectorized', 0.83390865639936274), ('linearsvcl2_on_hashed', 0.83284652150823157), ('linearsvcl2_on_lsa_vectorized', 0.83178438661710041), ('sgdl2_on_hashed', 0.8264737121614445), ('sgdl2_on_lsa_vectorized', 0.82328730748805101), ('linearsvcl1_on_lsa_vectorized', 0.81956983536909189), ('pa_on_hashed', 0.81638343069569841), ('linearsvcl1_on_vectorized', 0.81558682952734995), ('sgdl12_on_lsa_vectorized', 0.80071694105151359), ('linearsvcl1_on_hashed', 0.80018587360594795), ('pa_on_lsa_vectorized', 0.79739776951672858), ('linearsvcl2_on_lsa_hashed', 0.79646840148698883), ('sgdl12_on_vectorized', 0.79434413170472651), ('sgdl1_on_lsa_vectorized', 0.79182156133828996), ('sgdl2_on_lsa_hashed', 0.78770578863515661), ('linearsvcl1_on_lsa_hashed', 0.7874402549123739), ('sgdl1_on_vectorized', 0.78491768454593736), ('sgdl12_on_hashed', 0.7772172065852363), ('bnb_on_vectorized', 0.77110993096123204), ('sgdl1_on_hashed', 0.76712692511949021), ('bnb_on_hashed', 0.76367498672331391), ('linearsvcl2_on_ch2_vectorized', 0.76101964949548595), ('linearsvcl2_on_ch2_hashed', 0.75862984599044081), ('sgdl12_on_lsa_hashed', 0.75716941051513542), ('sgdl2_on_ch2_vectorized', 0.75491237387148169), ('linearsvcl1_on_ch2_vectorized', 0.75477960701009028), ('pa_on_ch2_vectorized', 0.75424853956452464), ('mnb_on_ch2_vectorized', 0.75318640467339348), ('mnb_on_ch2_hashed', 0.75318640467339348), ('linearsvcl1_on_ch2_hashed', 0.75066383430695693), ('rocchio_on_vectorized', 0.75013276686139141), ('bnb_on_ch2_vectorized', 0.75013276686139141), ('sgdl1_on_lsa_hashed', 0.74734466277217204), ('sgdl2_on_ch2_hashed', 0.74548592671269254), ('pa_on_ch2_hashed', 0.74349442379182151), ('pa_on_lsa_hashed', 0.74349442379182151), ('sgdl12_on_ch2_vectorized', 0.7399097185342538), ('bnb_on_lsa_vectorized', 0.73911311736590546), ('bnb_on_ch2_hashed', 0.73659054699946891), ('sgdl12_on_ch2_hashed', 0.73592671269251198), ('rocchio_on_lsa_vectorized', 0.73313860860329261), ('sgdl1_on_ch2_vectorized', 0.73287307488050979), ('sgdl1_on_ch2_hashed', 0.73167817312798722), ('knn_on_vectorized', 0.68215613382899631), ('bnb_on_lsa_hashed', 0.6785714285714286), ('knn_on_hashed', 0.6258629845990441), ('knn_on_ch2_vectorized', 0.61404673393520981), ('rocchio_on_hashed', 0.61139139670738185), ('rocchio_on_ch2_vectorized', 0.60847052575677107), ('rocchio_on_lsa_hashed', 0.60223048327137552), ('rocchio_on_ch2_hashed', 0.52522570366436538), ('knn_on_lsa_hashed', 0.49097185342538502), ('knn_on_ch2_hashed', 0.4549920339883165), ('knn_on_lsa_vectorized', 0.43813064259160911), ('mnb_on_lsa_vectorized', 0), ('mnb_on_lsa_hashed', 0)]
Linear models (sgd, passive-agressive, svc, multinomial-nb) work superisingly well with text data, especially with a l2 norm penalty
l2 penalty works better than l1, so it is like that correlations of different terms are NOT very big - and the text classification task is better done by using all words
** Tf-idf is still one of the best for text terms normalization, sometimes LSA (TruncatedSVD) is also useful, hashing is useful for speed-up, but not as accurate as tfidf (specially the idf) part**