import os, json
from boto.s3.connection import S3Connection
import pandas as pd
from IPython.parallel import Client
# Category names; make_folders() creates one directory per entry.
cats = [
    'advertising', 'analytics', 'automotive', 'biotech', 'cleantech',
    'consulting', 'design', 'ecommerce', 'education', 'enterprise',
    'fashion', 'finance', 'games_video', 'government', 'hardware',
    'health', 'hospitality', 'legal', 'local', 'manufacturing',
    'medical', 'messaging', 'mobile', 'music', 'nanotech',
    'network_hosting', 'news', 'nonprofit', 'other', 'pets',
    'photo_video', 'public_relations', 'real_estate', 'search', 'security',
    'semiconductor', 'social', 'software', 'sports', 'transportation',
    'travel', 'web',
]
# Root data directory; it is pushed to the engines below because
# build_output() reads `path` as a global there.
path = 'C:\\Users\\gkand_000\\data'

# Connect to the IPython.parallel cluster and get a view over its engines.
c = Client()
d = c.direct_view()

# build_output() refers to these modules by name, so they have to be
# imported in every engine's namespace as well as locally.
with d.sync_imports():
    import time
    import json
    import os
    import pandas as pd
    from boto.s3.connection import S3Connection

# Load the AWS credentials *before* pushing them.  The push used to reference
# `cred` although it was only assigned further down the file, which raises
# NameError when the file is executed top to bottom.
cred = pd.read_csv(''.join((path, '\\0setup\\credentials.csv')))
d.push(dict(path=path, cred=cred))
def build_output(companies_split):
    """Fetch the CrunchBase record of each company from S3 and save its text.

    For every company in ``companies_split`` the JSON document stored under
    ``crunchbase/<permalink>.json`` in the ``kandlikards`` bucket is
    downloaded, the ``name``, ``overview``, ``tag_list`` and ``description``
    fields are joined with newlines (empty fields dropped) and the result is
    written UTF-8 encoded to
    ``<path>\\cbproject\\<category_code>\\<permalink>.json``.
    Companies without a category code go to the ``test`` folder.  Files that
    already exist are not downloaded again.

    The function is dispatched to IPython.parallel engines, so it relies on
    names living in the engine namespace: the globals ``path`` and ``cred``
    and the modules ``os``, ``json``, ``time`` and ``S3Connection``.  For the
    same reason it is kept self-contained (no module-level helpers).

    Parameters
    ----------
    companies_split : iterable of dict
        Company records; ``permalink`` is required, ``category_code`` is
        optional.

    Returns
    -------
    None
    """
    s3conn = S3Connection(cred['Access Key Id'][0], cred['Secret Access Key'][0])
    s3bucket = s3conn.get_bucket('kandlikards')
    sections = ['name', 'overview', 'tag_list', 'description']

    for company in companies_split:
        # Check the permalink first: it is needed for both the S3 key and the
        # local file name.  The old code indexed it while building the path,
        # before its KeyError guard was reached.
        permalink = company.get('permalink')
        if not permalink:
            continue

        # Missing or empty category codes fall back to the 'test' folder.
        code = company.get('category_code') or 'test'

        folder = ''.join((path, '\\cbproject\\', code))
        comppath = ''.join((folder, '\\', permalink, '.json'))
        if os.path.exists(comppath):
            continue

        # Bucket.get_key returns None for a key that does not exist.
        s3key = s3bucket.get_key('crunchbase/' + permalink + '.json')
        if s3key is None:
            continue
        text = s3key.get_contents_as_string()

        # Best effort: a malformed or incomplete document is skipped.
        # `except Exception` (not a bare except) keeps KeyboardInterrupt and
        # SystemExit able to stop the engine.
        try:
            json_doc = json.loads(text, strict=False)
            fields = [json_doc[section] for section in sections]
            output = '\n'.join(filter(None, fields)).encode('utf-8')
        except Exception:
            continue

        # Make sure the target folder exists (the 'test' fallback, for one,
        # is not among the pre-created category folders).  Several engines
        # run this concurrently, so tolerate losing the creation race.
        if not os.path.isdir(folder):
            try:
                os.makedirs(folder)
            except OSError:
                if not os.path.isdir(folder):
                    raise

        # `output` is already UTF-8 encoded bytes, so write in binary mode.
        with open(comppath, 'wb') as outputfile:
            outputfile.write(output)

        # Short pause after each download -- presumably to throttle the
        # request rate against S3.
        time.sleep(0.10)
def make_folders():
    """Create a sub-directory of ``path`` for every name in ``cats``.

    Directories that already exist are left alone.
    """
    for folder in (path + '\\' + name for name in cats):
        if os.path.exists(folder):
            continue
        os.mkdir(folder)
# AWS credentials and a bucket handle for the controlling process.
cred = pd.read_csv(path + '\\0setup\\credentials.csv')
access_key = cred['Access Key Id'][0]
secret_key = cred['Secret Access Key'][0]
s3conn = S3Connection(access_key, secret_key)
s3bucket = s3conn.get_bucket('kandlikards')

# Company index; strict=False lets the parser accept control characters
# inside JSON strings.
with open(path + '\\companies.json') as f:
    companies_all = json.loads(f.read(), strict=False)

# Deal the companies round-robin into 8 interleaved slices.
n_slices = 8
companies_split = [companies_all[start::n_slices] for start in range(n_slices)]

make_folders()

# Hand the slices to the engines; `results` is polled later.
results = d.map_async(build_output, companies_split)
importing time on engine(s) importing json on engine(s) importing os on engine(s) importing pandas on engine(s) importing S3Connection from boto.s3.connection on engine(s)
import time

# Check every three minutes whether the asynchronous map has finished and
# log a GMT timestamp while it has not.
while True:
    if results.ready():
        break
    stamp = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    print('not just yet. %s' % stamp)
    time.sleep(180)

# Collect what the engines returned.
valid = results.get()
not just yet. 31 Mar 2014 18:42:33 not just yet. 31 Mar 2014 18:45:33 not just yet. 31 Mar 2014 18:48:33 not just yet. 31 Mar 2014 18:51:33 not just yet. 31 Mar 2014 18:54:33 not just yet. 31 Mar 2014 18:57:33 not just yet. 31 Mar 2014 19:00:33 not just yet. 31 Mar 2014 19:03:33 not just yet. 31 Mar 2014 19:06:33 not just yet. 31 Mar 2014 19:09:33 not just yet. 31 Mar 2014 19:12:33 not just yet. 31 Mar 2014 19:15:33 not just yet. 31 Mar 2014 19:18:33 not just yet. 31 Mar 2014 19:21:33 not just yet. 31 Mar 2014 19:24:33 not just yet. 31 Mar 2014 19:27:33 not just yet. 31 Mar 2014 19:30:33 not just yet. 31 Mar 2014 19:33:33 not just yet. 31 Mar 2014 19:36:33 not just yet. 31 Mar 2014 19:39:33 not just yet. 31 Mar 2014 19:42:33 not just yet. 31 Mar 2014 19:45:33 not just yet. 31 Mar 2014 19:48:33 not just yet. 31 Mar 2014 19:51:33 not just yet. 31 Mar 2014 19:54:33 not just yet. 31 Mar 2014 19:57:33 not just yet. 31 Mar 2014 20:00:33 not just yet. 31 Mar 2014 20:03:33 not just yet. 31 Mar 2014 20:06:33 not just yet. 31 Mar 2014 20:09:33 not just yet. 31 Mar 2014 20:12:33 not just yet. 31 Mar 2014 20:15:33 not just yet. 31 Mar 2014 20:18:33 not just yet. 31 Mar 2014 20:21:33 not just yet. 31 Mar 2014 20:24:33 not just yet. 31 Mar 2014 20:27:33 not just yet. 31 Mar 2014 20:30:33 not just yet. 31 Mar 2014 20:33:33 not just yet. 31 Mar 2014 20:36:33 not just yet. 31 Mar 2014 20:39:33 not just yet. 31 Mar 2014 20:42:33 not just yet. 31 Mar 2014 20:45:33 not just yet. 31 Mar 2014 20:48:33 not just yet. 31 Mar 2014 20:51:33 not just yet. 31 Mar 2014 20:54:33 not just yet. 31 Mar 2014 20:57:33 not just yet. 31 Mar 2014 21:00:33 not just yet. 31 Mar 2014 21:03:33 not just yet. 31 Mar 2014 21:06:33 not just yet. 31 Mar 2014 21:09:33 not just yet. 31 Mar 2014 21:12:33 not just yet. 31 Mar 2014 21:15:33 not just yet. 31 Mar 2014 21:18:33 not just yet. 31 Mar 2014 21:21:33 not just yet. 31 Mar 2014 21:24:33 not just yet. 31 Mar 2014 21:27:33
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-89-6d610f450753> in <module>() 3 while not results.ready(): 4 print 'not just yet. %s' % (time.strftime("%d %b %Y %H:%M:%S", time.gmtime())) ----> 5 time.sleep(180) 6 7 valid=results.get() KeyboardInterrupt:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the text data
# Only these four category folders are loaded.
categories = ['ecommerce', 'education', 'software', 'biotech']

# NOTE: rebinds the module-level `path` to the project sub-folder.
path = 'C:\\Users\\gkand_000\\data\\cbproject'

# Training and test documents, decoded as UTF-8.
train_dir = path + '\\inittrain\\'
test_dir = path + '\\inittest\\'
cb_train_small = load_files(train_dir, categories=categories, encoding='utf-8')
cb_test_small = load_files(test_dir, categories=categories, encoding='utf-8')
'''
I wanted to play around with the parameters a little more, but ran out of time.
In particular, I wanted to strip out any HTML tags and considered using BeautifulSoup for that.
'''
# Class labels of the training documents.
cat_train = cb_train_small.target

# Represent each training document as a TF-IDF weighted term vector;
# min_df=2 drops terms that occur in fewer than two documents.
vectorizer = TfidfVectorizer(min_df=2)
text_train = vectorizer.fit_transform(cb_train_small.data)
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Plot defaults: grid on, 10 x 7.5 inch figures, gray colormap.
plt.rcParams['axes.grid'] = True
plt.rcParams['figure.figsize'] = (10, 7.5)
plt.gray()
from sklearn.feature_extraction.text import TfidfVectorizer
# Re-vectorize the training text keeping every term (min_df=1) for the
# two-dimensional projection below.
vectorizer = TfidfVectorizer(min_df=1)
cb_train_x_vector = vectorizer.fit_transform(cb_train_small.data)
cb_train_y_vector = cat_train

# Figuring out the principal components:
# are the like records roughly grouped together?
from sklearn.decomposition import TruncatedSVD
cb_train_x_vector_pca = TruncatedSVD(n_components=2).fit_transform(cb_train_x_vector)

from itertools import cycle
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

# One scatter series per class.  The loop variables are deliberately not
# called `c` / `i`: `c` is the IPython.parallel Client created at the top of
# the file, and reusing the name here silently replaced it with a colour
# string.
for class_id, color in zip(np.unique(cb_train_y_vector), cycle(colors)):
    in_class = cb_train_y_vector == class_id
    plt.scatter(cb_train_x_vector_pca[in_class, 0],
                cb_train_x_vector_pca[in_class, 1],
                c=color, label=cb_train_small.target_names[class_id], alpha=0.5)
_ = plt.legend(loc='best')
# Shapes of the training matrix and of its label vector.
for arr in (cb_train_x_vector, cb_train_y_vector):
    print(arr.shape)
(22238, 73593) (22238L,)
# Sizes of the test matrix, test labels and raw test data.
# NOTE: text_test / cat_test are assigned further down in this file.
print(text_test.shape)
print(cat_test.shape)
for seq in (cb_test_small.data, cb_test_small.target):
    print(len(seq))
(11241, 47244) (11241L,) 11241 11241
from sklearn.naive_bayes import MultinomialNB
# TF-IDF features (terms in at least two documents) and labels for training.
vectorizer = TfidfVectorizer(min_df=2)
cat_train = cb_train_small.target
text_train = vectorizer.fit_transform(cb_train_small.data)

# One label per feature row?
same_rows = cat_train.shape[0] == text_train.shape[0]
print(same_rows)

# Train multinomial naive Bayes and report the accuracy on the training set.
classifier = MultinomialNB()
classifier.fit(text_train, cat_train)
train_pct = classifier.score(text_train, cat_train) * 100
print("Training score: {0:.1f}%".format(train_pct))

# Project the test documents onto the training vocabulary and score them.
cat_test = cb_test_small.target
text_test = vectorizer.transform(cb_test_small.data)
test_pct = classifier.score(text_test, cat_test) * 100
print("Testing score: {0:.1f}%".format(test_pct))
True Training score: 81.4% Testing score: 75.0%
from sklearn.cross_validation import cross_val_score
from scipy.stats import sem
from sklearn.pipeline import Pipeline
# Vectorizer and classifier chained into one estimator, so cross-validation
# is run on the raw documents.
tfidf_step = ('vectorizer', TfidfVectorizer(max_df=0.8, use_idf=True, stop_words='english'))
nb_step = ('classifier', MultinomialNB(alpha=1.0))
pipe = Pipeline((tfidf_step, nb_step))

# 4-fold cross-validation on the training documents.
scores = cross_val_score(pipe, cb_train_small.data, cb_train_small.target, cv=4)

print(scores)
print("")
print("\n Mean: " + str(scores.mean()))
print("\n Standard Error: " + str(sem(scores)))
[ 0.72464029 0.71852518 0.7222522 0.72333153] Mean: 0.722187301427 Standard Error: 0.00131471663303
from sklearn.grid_search import GridSearchCV
# Search grid over the pipeline steps: 2 * 2 * 2 * 4 = 32 combinations.
params = dict(
    vectorizer__max_df=[0.75, 0.8],
    vectorizer__use_idf=[True, False],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    classifier__alpha=[0.1, 1, 10, 100],
)

# verbose=2 logs every individual fit; refit=False skips the final refit on
# the full training set.
grid_search = GridSearchCV(pipe, params, verbose=2, refit=False)
_ = grid_search.fit(cb_train_small.data, cb_train_small.target)

best_score_text = str(grid_search.best_score_)
best_params_text = str(grid_search.best_params_)
print("Best Score: " + best_score_text)
print("Best Params: " + best_params_text)
Fitting 3 folds for each of 32 candidates, totalling 96 fits [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 - 2.1s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 - 2.0s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 - 2.0s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 - 7.0s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), 
vectorizer__max_df=0.75, classifier__alpha=0.1 - 7.1s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 - 7.1s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=0.1 - 6.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 - 2.0s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), 
vectorizer__max_df=0.8, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=0.1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 - 7.0s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 - 7.1s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 - 7.1s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 - 6.8s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=0.1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, 
classifier__alpha=0.1 - 6.7s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 - 7.1s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 - 7.0s [GridSearchCV] 
vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 - 7.1s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=1 - 6.7s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 
1), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 - 1.9s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 2.1s [Parallel(n_jobs=1)]: Done 41 jobs | elapsed: 2.9min
[GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=1 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 - 8.4s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 - 7.4s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 - 7.2s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 - 6.8s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=1 - 6.8s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 - 1.9s [GridSearchCV] vectorizer__use_idf=True, 
vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=10 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 - 7.4s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 - 7.7s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 - 7.1s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), 
vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 - 6.7s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=10 - 7.4s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 - 2.1s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 - 2.2s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 - 2.0s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 - 2.0s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, 
classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=10 - 2.1s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 - 7.7s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 - 7.6s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 - 8.1s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 - 6.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 - 6.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=10 - 6.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 - 2.0s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] 
vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 - 2.0s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 - 2.2s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=100 - 2.4s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 - 8.0s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 - 8.4s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 - 7.3s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] 
vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 - 6.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 - 6.8s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.75, classifier__alpha=100 - 6.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 - 2.0s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 - 2.0s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 - 2.0s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 - 1.9s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] 
vectorizer__use_idf=False, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.8, classifier__alpha=100 - 1.9s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 - 7.2s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 - 7.1s [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 - 8.4s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 - 8.1s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 - 8.5s [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 [GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 - 7.4s Best Score: 0.837170608868 Best Params: {'vectorizer__use_idf': False, 'vectorizer__ngram_range': (1, 1), 'vectorizer__max_df': 0.75, 'classifier__alpha': 0.1}
[Parallel(n_jobs=1)]: Done 96 out of 96 | elapsed: 7.6min finished
Best Score: 0.837170608868
compare to 0.722187301427
Best Params: {'vectorizer__use_idf': False,
'vectorizer__ngram_range': (1, 1),
'vectorizer__max_df': 0.75,
'classifier__alpha': 0.1}
[Parallel(n_jobs=1)]: Done 96 out of 96 | elapsed: 7.6min finished
'''Performance'''
# Rebuild the pipeline with max_df=0.75, use_idf=False and alpha=0.1 and
# train it on the whole training set.
best_tfidf = TfidfVectorizer(max_df=0.75, use_idf=False, stop_words='english')
best_nb = MultinomialNB(alpha=0.1)
pipe = Pipeline((
    ('vectorizer', best_tfidf),
    ('classifier', best_nb),
))
_ = pipe.fit(cb_train_small.data, cb_train_small.target)

# Pull the two fitted steps (and their names) back out of the pipeline.
(vectorizer_nm, vectorizer), (classifier_nm, classifier) = pipe.steps[0], pipe.steps[1]

# Vocabulary terms, class names and the classifier's per-class coefficients.
targets = cb_train_small.target_names
features = vectorizer.get_feature_names()
feature_weights = classifier.coef_
feature_weights.shape
(4L, 73294L)
def _feature_label(token):
    """Return *token* in a form that is safe to interpolate into a native ``str``."""
    # On Python 2 ``str`` is ``bytes``: unicode tokens must be encoded before
    # they reach ``str.format``, otherwise non-ASCII features raise
    # UnicodeEncodeError. On Python 3 the token is already text, and encoding
    # it would print a ``b'...'`` repr instead of the word itself.
    if str is bytes and not isinstance(token, str):
        return token.encode('utf-8', 'replace')
    return token


def _format_feature_weights(features, weights, row, indices):
    """Render ``name: weight`` pairs for the given column *indices* of *row*."""
    return ", ".join(
        "{0}: {1:.4f}".format(_feature_label(features[j]), weights[row, j])
        for j in indices
    )


def display_important_features(features, targets, weights, n_top=15):
    """Print the highest- and lowest-weighted features of every class.

    For each class the features are ranked by descending weight; the first
    ``n_top`` are listed under "Most Important" and the last ``n_top`` under
    "Least Important".

    Args:
        features: sequence of feature names, indexed by column of ``weights``.
        targets: sequence of class names, indexed by row of ``weights``.
        weights: 2-D array supporting ``weights[i].argsort()`` and
            ``weights[i, j]`` indexing (e.g. a NumPy array).
        n_top: number of features to list at each end of the ranking.
            With ``n_top=0`` both lists are empty.
    """
    for i, target_name in enumerate(targets):
        print("")
        print('===================================')
        print("Class: " + target_name)
        print("")

        # argsort() is ascending; reverse it so the largest weight comes first.
        ranked = weights[i].argsort()[::-1]
        most_important = ranked[:n_top]
        # ``ranked[-0:]`` is the *whole* array, so n_top == 0 must be handled
        # explicitly or every feature would be listed as "least important".
        least_important = ranked[-n_top:] if n_top > 0 else ranked[:0]

        print("Most Important: \n")
        print(_format_feature_weights(features, weights, i, most_important))
        print("...")
        print("Least Important: \n")
        print(_format_feature_weights(features, weights, i, least_important))
        print("")
# Show, per class, the features at both ends of the weight ranking (default n_top=15 each).
display_important_features(features, targets, feature_weights)
=================================== Class: biotech Most Important: company: -4.1491, based: -5.0161, development: -5.0201, products: -5.0750, technology: -5.4037, treatment: -5.4217, therapeutics: -5.5356, research: -5.5392, medical: -5.5583, drug: -5.5832, clinical: -5.6171, pharmaceuticals: -5.6357, founded: -5.6486, cancer: -5.6587, diseases: -5.6923 ... Least Important: oma: -12.2871, om: -12.2871, olympus: -12.2871, olympics: -12.2871, olympic: -12.2871, olympians: -12.2871, olympiads: -12.2871, olympiad: -12.2871, olymp: -12.2871, olx: -12.2871, oluåÿturur: -12.2871, olur: -12.2871, olup: -12.2871, oltremare: -12.2871, 00: -12.2871 =================================== Class: ecommerce Most Important: online: -4.2340, com: -4.9231, products: -5.1555, ecommerce: -5.1792, commerce: -5.2778, company: -5.2939, web: -5.3134, 8217: -5.3399, shopping: -5.3467, social: -5.5032, business: -5.5494, services: -5.5840, design: -5.6834, website: -5.7058, service: -5.7117 ... Least Important: micropreneur: -12.8304, microplates: -12.8304, microplate: -12.8304, microphones: -12.8304, microphone: -12.8304, microphilanthropy: -12.8304, microphage: -12.8304, micropattern: -12.8304, microparticles: -12.8304, microparticle: -12.8304, micropact: -12.8304, microorganisms: -12.8304, micronutrients: -12.8304, microns: -12.8304, first: -12.8304 =================================== Class: education Most Important: education: -4.6225, learning: -4.9933, online: -5.0966, students: -5.1166, training: -5.2445, company: -5.4757, school: -5.5423, mobile: -5.6069, apps: -5.7941, 8217: -5.8101, applications: -5.9127, educational: -5.9134, based: -5.9322, platform: -5.9356, business: -5.9858 ... 
Least Important: navigable: -12.0903, naurex: -12.0903, navideck: -12.0903, navidad: -12.0903, naves: -12.0903, naver: -12.0903, naveen: -12.0903, navayuga: -12.0903, navarra: -12.0903, naval: -12.0903, nautilus: -12.0903, nautical: -12.0903, nausea: -12.0903, naurexâ: -12.0903, first: -12.0903 =================================== Class: software Most Important: software: -3.9011, development: -4.5728, web: -4.6324, company: -4.7479, management: -4.7845, solutions: -4.8339, mobile: -4.9307, services: -5.0105, business: -5.0564, based: -5.1040, technology: -5.2855, data: -5.3205, online: -5.4612, 8217: -5.5069, design: -5.5137 ... Least Important: företag: -13.5012, företaget: -13.5012, företagsvärdering: -13.5012, führen: -13.5012, führungskräfte: -13.5012, verhuur: -13.5012, g12e: -13.5012, g2: -13.5012, g28: -13.5012, quarries: -13.5012, g31p: -13.5012, quarkâ: -13.5012, g8: -13.5012, wishcast: -13.5012, intelligenti: -13.5012
from sklearn.metrics import classification_report
# Predict labels for cb_test_small with the fitted pipeline; predicted_vals is
# reused by the confusion-matrix cell further down.
predicted_vals = pipe.predict(cb_test_small.data)
# Print the per-class precision / recall / f1-score / support table, labelled by class name.
print(classification_report(cb_test_small.target, predicted_vals,
target_names=cb_test_small.target_names))
precision recall f1-score support biotech 0.98 0.77 0.86 1829 ecommerce 0.93 0.71 0.81 3406 education 0.93 0.49 0.64 1003 software 0.73 0.97 0.83 5003 avg / total 0.85 0.82 0.81 11241
from sklearn.metrics import confusion_matrix
# Bare expression kept for notebook display. In the recorded output each row
# sums to that class's support in the report above (e.g. 1404+22+3+400 = 1829
# for biotech), so rows are the true classes and columns the predictions.
confusion_matrix(cb_test_small.target, predicted_vals)
array([[1404, 22, 3, 400], [ 15, 2431, 10, 950], [ 5, 54, 487, 457], [ 10, 98, 26, 4869]])