Extract useful text

In [88]:
import os, json
from boto.s3.connection import S3Connection
import pandas as pd
from IPython.parallel import Client

cats =['advertising',
      'analytics',
      'automotive',
      'biotech',
      'cleantech',
      'consulting',
      'design',
      'ecommerce',
      'education',
      'enterprise',
      'fashion',
      'finance',
      'games_video',
      'government',
      'hardware',
      'health',
      'hospitality',
      'legal',
      'local',
      'manufacturing',
      'medical',
      'messaging',
      'mobile',
      'music',
      'nanotech',
      'network_hosting',
      'news',
      'nonprofit',
      'other',
      'pets',
      'photo_video',
      'public_relations',
      'real_estate',
      'search',
      'security',
      'semiconductor',
      'social',
      'software',
      'sports',
      'transportation',
      'travel',
      'web']

path='C:\\Users\\gkand_000\\data'
c=Client()
d=c.direct_view()

with d.sync_imports():
    import time, json, os
    import pandas as pd
    from boto.s3.connection import S3Connection
    
# load the AWS credentials before pushing them to the engines (build_output uses cred)
cred = pd.read_csv(''.join((path,'\\0setup\\credentials.csv')))
d.push(dict(path=path, cred=cred))

def build_output(companies_split):
    #cred = pd.read_csv(''.join((path,'\\setup\\credentials.csv')))
    s3conn=S3Connection(cred['Access Key Id'][0], cred['Secret Access Key'][0])
    s3bucket = s3conn.get_bucket('kandlikards')
    sections = ['name','overview','tag_list','description']
    for company in companies_split:
        try: 
            code = 'test' if not(company['category_code']) else company['category_code']
        except KeyError:
            code = 'test'
        comppath=''.join((path,'\\cbproject\\',code,'\\',company['permalink'],'.json'))
        if os.path.exists(comppath):
            continue
        try:
            keyname='crunchbase/'+company['permalink']+'.json'
        except KeyError:
            continue
        key = s3bucket.get_key(keyname)
        if key is None:
            continue
        text = key.get_contents_as_string()
        
        try: 
            json_doc = json.loads(text, strict = False)
            output = '\n'.join(filter(None,[json_doc[key] for key in sections])).encode('utf-8')
        except Exception:
            # skip documents that fail to parse or lack the expected sections
            continue
        with open(comppath, 'w') as outputfile:
            outputfile.write(output)
        time.sleep(0.10)


def make_folders():
    # create one output folder per category (plus 'test' for uncategorized companies),
    # matching the comppath used in build_output
    for cat in cats + ['test']:
        catpath = ''.join((path,'\\cbproject\\',cat))
        if not os.path.exists(catpath):
            os.makedirs(catpath)
            
s3conn=S3Connection(cred['Access Key Id'][0], cred['Secret Access Key'][0])
s3bucket = s3conn.get_bucket('kandlikards')

with open(path+'\\companies.json') as f:
    companies_all=json.loads(f.read(), strict = False)

companies_split=[]
for i in range(8):
    companies_split.append(companies_all[i::8])
    
make_folders()
results = d.map_async(build_output, companies_split)
        
importing time on engine(s)
importing json on engine(s)
importing os on engine(s)
importing pandas on engine(s)
importing S3Connection from boto.s3.connection on engine(s)
In [89]:
import time

while not results.ready():
    print 'not just yet. %s' % (time.strftime("%d %b %Y %H:%M:%S", time.gmtime()))
    time.sleep(180)
    
valid=results.get()
not just yet. 31 Mar 2014 18:42:33
not just yet. 31 Mar 2014 18:45:33
...
not just yet. 31 Mar 2014 21:24:33
not just yet. 31 Mar 2014 21:27:33
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-89-6d610f450753> in <module>()
      3 while not results.ready():
      4     print 'not just yet. %s' % (time.strftime("%d %b %Y %H:%M:%S", time.gmtime()))
----> 5     time.sleep(180)
      6 
      7 valid=results.get()

KeyboardInterrupt: 

After that is done, manually (randomly) assign docs to test/train folders.
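
A sketch of that manual split, assuming the extracted docs live under cbproject\<category> and get copied into inittrain/inittest subfolders (the folder names match the load_files calls below; the 70/30 ratio is an assumption, since the real split was done by hand):

import os, random, shutil

base = 'C:\\Users\\gkand_000\\data\\cbproject'

for cat in os.listdir(base):
    src = os.path.join(base, cat)
    if not os.path.isdir(src) or cat in ('inittrain', 'inittest'):
        continue
    files = os.listdir(src)
    random.shuffle(files)
    cutoff = int(0.7 * len(files))  # assumed 70/30 train/test split
    for split, names in (('inittrain', files[:cutoff]), ('inittest', files[cutoff:])):
        dest = os.path.join(base, split, cat)
        if not os.path.exists(dest):
            os.makedirs(dest)
        for name in names:
            shutil.copy(os.path.join(src, name), os.path.join(dest, name))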

Scikit Learn!

In [117]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the text data
categories = [
    'ecommerce',
    'education',
    'software',
    'biotech',
]
path='C:\\Users\\gkand_000\\data\\cbproject'

cb_train_small = load_files(''.join((path,'\\inittrain\\')),
    categories=categories, encoding='utf-8')
cb_test_small = load_files(''.join((path, '\\inittest\\')),
    categories=categories, encoding='utf-8')


'''
I wanted to experiment with the vectorizer parameters a little more, but ran out of time.
In particular, stripping out any HTML tags would help; I considered BeautifulSoup
(a sketch follows this cell).
'''

# Turn the text documents into vectors of word frequencies
vectorizer = TfidfVectorizer(min_df=2)
text_train = vectorizer.fit_transform(cb_train_small.data)
cat_train = cb_train_small.target
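
The HTML stripping was never applied; a minimal sketch of what it could look like, assuming BeautifulSoup 4 (bs4) is installed (the strip_html helper is hypothetical):

from bs4 import BeautifulSoup

def strip_html(doc):
    # keep only the visible text, dropping any markup
    return BeautifulSoup(doc, 'html.parser').get_text(separator=' ')

# could replace cb_train_small.data / cb_test_small.data before vectorizing
clean_train = [strip_html(doc) for doc in cb_train_small.data]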

Use the TF-IDF vectorizer to transform the data.

Run a truncated SVD to see whether reducing the data to 2 dimensions is informative.

In [119]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1)

cb_train_x_vector = vectorizer.fit_transform(cb_train_small.data)
cb_train_y_vector = cat_train

# Figure out the principal components.
# Are similar records roughly grouped together?

from sklearn.decomposition import TruncatedSVD

cb_train_x_vector_pca = TruncatedSVD(n_components=2).fit_transform(cb_train_x_vector)

from itertools import cycle

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, c in zip(np.unique(cb_train_y_vector), cycle(colors)):
    plt.scatter(cb_train_x_vector_pca[cb_train_y_vector == i, 0],
               cb_train_x_vector_pca[cb_train_y_vector == i, 1],
               c=c, label=cb_train_small.target_names[i], alpha=0.5)
    
_ = plt.legend(loc='best')

Recheck Dimensions

In [120]:
print cb_train_x_vector.shape
print cb_train_y_vector.shape
(22238, 73593)
(22238L,)
In [121]:
print text_test.shape
print cat_test.shape

print len(cb_test_small.data)
print len(cb_test_small.target)
(11241, 47244)
(11241L,)
11241
11241
In [127]:
from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer(min_df=2)
text_train = vectorizer.fit_transform(cb_train_small.data)
cat_train = cb_train_small.target

print cat_train.shape[0] == text_train.shape[0]

# Fit a classifier on the training set
classifier = MultinomialNB().fit(text_train, cat_train)
print("Training score: {0:.1f}%".format(
    classifier.score(text_train, cat_train) * 100))

# Evaluate the classifier on the testing set
text_test = vectorizer.transform(cb_test_small.data)
cat_test = cb_test_small.target

text_test

print("Testing score: {0:.1f}%".format(
    classifier.score(text_test, cat_test) * 100))
True
Training score: 81.4%
Testing score: 75.0%

Cross Validate - Results are not great

In [187]:
from sklearn.cross_validation import cross_val_score
from scipy.stats import sem
from sklearn.pipeline import Pipeline

pipe = Pipeline((
    ('vectorizer', TfidfVectorizer(max_df=0.8, use_idf=True, stop_words='english')),
    ('classifier', MultinomialNB(alpha=1.0)),
))

scores = cross_val_score(pipe, cb_train_small.data, cb_train_small.target, cv = 4)

print scores
print ""
print "\n Mean: " + str(scores.mean())
print "\n Standard Error: " + str(sem(scores))
[ 0.72464029  0.71852518  0.7222522   0.72333153]


 Mean: 0.722187301427

 Standard Error: 0.00131471663303

GridSearch to the Rescue.

In [179]:
from sklearn.grid_search import GridSearchCV

params = {
    'vectorizer__max_df': [0.75, 0.8],
    'vectorizer__use_idf': [True, False],
    'vectorizer__ngram_range':[(1, 1), (1, 2)],
    'classifier__alpha': [0.1, 1, 10, 100]
}

grid_search = GridSearchCV(pipe, params, verbose=2, refit=False)
_ = grid_search.fit(cb_train_small.data, cb_train_small.target)


print "Best Score:  " + str(grid_search.best_score_)
print "Best Params: " + str(grid_search.best_params_)
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[GridSearchCV] vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 
[GridSearchCV]  vectorizer__use_idf=True, vectorizer__ngram_range=(1, 1), vectorizer__max_df=0.75, classifier__alpha=0.1 -   2.1s
...
[GridSearchCV] vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 
[GridSearchCV]  vectorizer__use_idf=False, vectorizer__ngram_range=(1, 2), vectorizer__max_df=0.8, classifier__alpha=100 -   7.4s
Best Score:  0.837170608868
Best Params: {'vectorizer__use_idf': False, 'vectorizer__ngram_range': (1, 1), 'vectorizer__max_df': 0.75, 'classifier__alpha': 0.1}
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:  7.6min finished
In [ ]:
Best Score:  0.837170608868  (compare to the cross-validated mean of 0.722187301427 above)

Best Params: {'vectorizer__use_idf': False,
              'vectorizer__ngram_range': (1, 1),
              'vectorizer__max_df': 0.75,
              'classifier__alpha': 0.1}

[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:  7.6min finished

Fit a new model

In [190]:
# Refit the pipeline with the best parameters found by the grid search
pipe = Pipeline((
    ('vectorizer', TfidfVectorizer(max_df=0.75, use_idf=False, stop_words='english')),
    ('classifier', MultinomialNB(alpha=0.1)),
))

_ = pipe.fit(cb_train_small.data, cb_train_small.target)

vectorizer_nm, vectorizer = pipe.steps[0]
classifier_nm, classifier = pipe.steps[1]

features = vectorizer.get_feature_names()
targets = cb_train_small.target_names

feature_weights = classifier.coef_
In [191]:
feature_weights.shape
Out[191]:
(4L, 73294L)

What are the important distinguishing words for each category?

In [192]:
def display_important_features(features, targets, weights, n_top=15):
    for i, target_name in enumerate(targets):
        print ""
        print '==================================='
        print("Class: " + target_name)
        print("")
        
        sorted_features_indices = weights[i].argsort()[::-1]
        
        most_important = sorted_features_indices[:n_top]
        print ("Most Important: \n")
        print(", ".join("{0}: {1:.4f}".format(features[j].encode('utf-8', 'replace'), weights[i, j])
                        for j in most_important))
        print("...")
        
        print ("Least Important: \n")
        least_important = sorted_features_indices[-n_top:]
        print(", ".join("{0}: {1:.4f}".format(features[j].encode('utf-8', 'replace'), weights[i, j])
                        for j in least_important))
        print("")
        
display_important_features(features, targets, feature_weights)
===================================
Class: biotech

Most Important: 

company: -4.1491, based: -5.0161, development: -5.0201, products: -5.0750, technology: -5.4037, treatment: -5.4217, therapeutics: -5.5356, research: -5.5392, medical: -5.5583, drug: -5.5832, clinical: -5.6171, pharmaceuticals: -5.6357, founded: -5.6486, cancer: -5.6587, diseases: -5.6923
...
Least Important: 

oma: -12.2871, om: -12.2871, olympus: -12.2871, olympics: -12.2871, olympic: -12.2871, olympians: -12.2871, olympiads: -12.2871, olympiad: -12.2871, olymp: -12.2871, olx: -12.2871, oluåÿturur: -12.2871, olur: -12.2871, olup: -12.2871, oltremare: -12.2871, 00: -12.2871


===================================
Class: ecommerce

Most Important: 

online: -4.2340, com: -4.9231, products: -5.1555, ecommerce: -5.1792, commerce: -5.2778, company: -5.2939, web: -5.3134, 8217: -5.3399, shopping: -5.3467, social: -5.5032, business: -5.5494, services: -5.5840, design: -5.6834, website: -5.7058, service: -5.7117
...
Least Important: 

micropreneur: -12.8304, microplates: -12.8304, microplate: -12.8304, microphones: -12.8304, microphone: -12.8304, microphilanthropy: -12.8304, microphage: -12.8304, micropattern: -12.8304, microparticles: -12.8304, microparticle: -12.8304, micropact: -12.8304, microorganisms: -12.8304, micronutrients: -12.8304, microns: -12.8304, first: -12.8304


===================================
Class: education

Most Important: 

education: -4.6225, learning: -4.9933, online: -5.0966, students: -5.1166, training: -5.2445, company: -5.4757, school: -5.5423, mobile: -5.6069, apps: -5.7941, 8217: -5.8101, applications: -5.9127, educational: -5.9134, based: -5.9322, platform: -5.9356, business: -5.9858
...
Least Important: 

navigable: -12.0903, naurex: -12.0903, navideck: -12.0903, navidad: -12.0903, naves: -12.0903, naver: -12.0903, naveen: -12.0903, navayuga: -12.0903, navarra: -12.0903, naval: -12.0903, nautilus: -12.0903, nautical: -12.0903, nausea: -12.0903, naurexâ: -12.0903, first: -12.0903


===================================
Class: software

Most Important: 

software: -3.9011, development: -4.5728, web: -4.6324, company: -4.7479, management: -4.7845, solutions: -4.8339, mobile: -4.9307, services: -5.0105, business: -5.0564, based: -5.1040, technology: -5.2855, data: -5.3205, online: -5.4612, 8217: -5.5069, design: -5.5137
...
Least Important: 

företag: -13.5012, företaget: -13.5012, företagsvärdering: -13.5012, führen: -13.5012, führungskräfte: -13.5012, verhuur: -13.5012, g12e: -13.5012, g2: -13.5012, g28: -13.5012, quarries: -13.5012, g31p: -13.5012, quarkâ: -13.5012, g8: -13.5012, wishcast: -13.5012, intelligenti: -13.5012

In [193]:
from sklearn.metrics import classification_report

predicted_vals = pipe.predict(cb_test_small.data)

Numbers!

In [194]:
print(classification_report(cb_test_small.target, predicted_vals,
                            target_names=cb_test_small.target_names))
             precision    recall  f1-score   support

    biotech       0.98      0.77      0.86      1829
  ecommerce       0.93      0.71      0.81      3406
  education       0.93      0.49      0.64      1003
   software       0.73      0.97      0.83      5003

avg / total       0.85      0.82      0.81     11241

In [185]:
from sklearn.metrics import confusion_matrix

confusion_matrix(cb_test_small.target, predicted_vals)
Out[185]:
array([[1404,   22,    3,  400],
       [  15, 2431,   10,  950],
       [   5,   54,  487,  457],
       [  10,   98,   26, 4869]])
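
The raw array is easier to read with labeled rows and columns; a small sketch using pandas (rows are the true categories, columns the predictions, assuming the label order matches cb_test_small.target_names):

import pandas as pd

cm = confusion_matrix(cb_test_small.target, predicted_vals)
print pd.DataFrame(cm, index=cb_test_small.target_names, columns=cb_test_small.target_names)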