import os, json
import pandas as pd
from boto.s3.connection import S3Connection
from IPython.parallel import Client

cats = ['advertising', 'analytics', 'automotive', 'biotech', 'cleantech',
        'consulting', 'design', 'ecommerce', 'education', 'enterprise',
        'fashion', 'finance', 'games_video', 'government', 'hardware',
        'health', 'hospitality', 'legal', 'local', 'manufacturing',
        'medical', 'messaging', 'mobile', 'music', 'nanotech',
        'network_hosting', 'news', 'nonprofit', 'other', 'pets',
        'photo_video', 'public_relations', 'real_estate', 'search',
        'security', 'semiconductor', 'social', 'software', 'sports',
        'transportation', 'travel', 'web']

path = 'C:\\Users\\gkand_000\\data'

# Read the AWS credentials before they are pushed to the parallel engines.
cred = pd.read_csv(''.join((path, '\\0setup\\credentials.csv')))

c = Client()
d = c.direct_view()
with d.sync_imports():
    import time, json, os
    import pandas as pd
    from boto.s3.connection import S3Connection
d.push(dict(path=path, cred=cred))

def build_output(companies_split):
    """Pull each company's CrunchBase JSON from S3 and write its text fields to disk."""
    s3conn = S3Connection(cred['Access Key Id'][0], cred['Secret Access Key'][0])
    s3bucket = s3conn.get_bucket('kandlikards')
    sections = ['name', 'overview', 'tag_list', 'description']
    for company in companies_split:
        try:
            code = 'test' if not company['category_code'] else company['category_code']
        except KeyError:
            code = 'test'
        comppath = ''.join((path, '\\cbproject\\', code, '\\', company['permalink'], '.json'))
        if os.path.exists(comppath):
            continue
        try:
            keyname = 'crunchbase/' + company['permalink'] + '.json'
        except KeyError:
            continue
        text = s3bucket.get_key(keyname).get_contents_as_string()
        try:
            json_doc = json.loads(text, strict=False)
            output = '\n'.join(filter(None, [json_doc[key] for key in sections])).encode('utf-8')
        except Exception:
            # Skip documents that fail to parse or lack the expected fields.
            continue
        with open(comppath, 'w') as outputfile:
            outputfile.write(output)
        time.sleep(0.10)

def make_folders():
    for cat in cats:
        if not os.path.exists(''.join((path, '\\', cat))):
            os.mkdir(''.join((path, '\\', cat)))

s3conn = S3Connection(cred['Access Key Id'][0], cred['Secret Access Key'][0])
s3bucket = s3conn.get_bucket('kandlikards')

with open(path + '\\companies.json') as f:
    companies_all = json.loads(f.read(), strict=False)

# Split the company list into 8 interleaved chunks, one per engine.
companies_split = []
for i in range(8):
    companies_split.append(companies_all[i::8])

make_folders()
results = d.map_async(build_output, companies_split)

import time
while not results.ready():
    print 'not just yet. %s' % (time.strftime("%d %b %Y %H:%M:%S", time.gmtime()))
    time.sleep(180)
valid = results.get()

After that is done, manually (randomly) assign docs to test/train folders.

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the text data
categories = [
    'ecommerce',
    'education',
    'software',
    'biotech',
]
path = 'C:\\Users\\gkand_000\\data\\cbproject'
cb_train_small = load_files(''.join((path, '\\inittrain\\')),
                            categories=categories, encoding='utf-8')
cb_test_small = load_files(''.join((path, '\\inittest\\')),
                           categories=categories, encoding='utf-8')

'''
I wanted to play around with the parameters a little more, but ran out of time.
In particular, stripping out any HTML tags. Considered BeautifulSoup.
'''
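A minimal sketch of that HTML stripping, assuming BeautifulSoup (bs4) is available; strip_html is a hypothetical helper and is not used in the runs below:

from bs4 import BeautifulSoup

def strip_html(doc):
    # get_text() drops the tags and keeps only the visible text.
    return BeautifulSoup(doc, 'html.parser').get_text()

# TfidfVectorizer accepts a preprocessor callable, so the cleaner could be
# plugged in like this (again, not part of the runs below):
# vectorizer = TfidfVectorizer(min_df=2, preprocessor=strip_html)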
# Turn the text documents into vectors of word frequencies
vectorizer = TfidfVectorizer(min_df=2)
text_train = vectorizer.fit_transform(cb_train_small.data)
cat_train = cb_train_small.target

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
cb_train_x_vector = vectorizer.fit_transform(cb_train_small.data)
cb_train_y_vector = cat_train

'''Figuring out the principal components: are like records roughly grouped together?'''
from sklearn.decomposition import TruncatedSVD
cb_train_x_vector_pca = TruncatedSVD(n_components=2).fit_transform(cb_train_x_vector)

from itertools import cycle
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, c in zip(np.unique(cb_train_y_vector), cycle(colors)):
    plt.scatter(cb_train_x_vector_pca[cb_train_y_vector == i, 0],
                cb_train_x_vector_pca[cb_train_y_vector == i, 1],
                c=c, label=cb_train_small.target_names[i], alpha=0.5)
_ = plt.legend(loc='best')

print cb_train_x_vector.shape
print cb_train_y_vector.shape
print len(cb_test_small.data)
print len(cb_test_small.target)

from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer(min_df=2)
text_train = vectorizer.fit_transform(cb_train_small.data)
cat_train = cb_train_small.target
print cat_train.shape[0] == text_train.shape[0]

# Fit a classifier on the training set
classifier = MultinomialNB().fit(text_train, cat_train)
print("Training score: {0:.1f}%".format(
    classifier.score(text_train, cat_train) * 100))

# Evaluate the classifier on the testing set
text_test = vectorizer.transform(cb_test_small.data)
cat_test = cb_test_small.target
print text_test.shape
print cat_test.shape
text_test  # notebook display of the sparse test matrix
print("Testing score: {0:.1f}%".format(
    classifier.score(text_test, cat_test) * 100))

from sklearn.cross_validation import cross_val_score
from scipy.stats import sem
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.8, use_idf=True, stop_words='english')),
    ('classifier', MultinomialNB(alpha=1.0)),
])
scores = cross_val_score(pipe, cb_train_small.data, cb_train_small.target, cv=4)
print scores
print ""
print "\n Mean: " + str(scores.mean())
print "\n Standard Error: " + str(sem(scores))

from sklearn.grid_search import GridSearchCV
params = {
    'vectorizer__max_df': [0.75, 0.8],
    'vectorizer__use_idf': [True, False],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__alpha': [0.1, 1, 10, 100],
}
grid_search = GridSearchCV(pipe, params, verbose=2, refit=False)
_ = grid_search.fit(cb_train_small.data, cb_train_small.target)
print "Best Score: " + str(grid_search.best_score_)
print "Best Params: " + str(grid_search.best_params_)

Best Score: 0.837170608868 (compare to the 0.722187301427 cross-validation mean above)
Best Params: {'vectorizer__use_idf': False, 'vectorizer__ngram_range': (1, 1), 'vectorizer__max_df': 0.75, 'classifier__alpha': 0.1}
[Parallel(n_jobs=1)]: Done 96 out of 96 | elapsed: 7.6min finished

'''Performance'''
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.75, use_idf=False, stop_words='english')),
    ('classifier', MultinomialNB(alpha=0.1)),
])
_ = pipe.fit(cb_train_small.data, cb_train_small.target)
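As a quick, hypothetical sanity check (not part of the original analysis), the refit pipeline accepts raw text directly, so a single made-up description can be classified in one call:

# Hypothetical example: sample_doc is an invented description, not CrunchBase data.
sample_doc = ["Online storefront platform with hosted checkout and order tracking."]
predicted = pipe.predict(sample_doc)
print cb_train_small.target_names[predicted[0]]  # prints the predicted category name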
vectorizer_nm, vectorizer = pipe.steps[0]
classifier_nm, classifier = pipe.steps[1]

features = vectorizer.get_feature_names()
targets = cb_train_small.target_names

# classifier.coef_ holds one row of per-class feature weights
# (log probabilities for MultinomialNB).
feature_weights = classifier.coef_
feature_weights.shape

def display_important_features(features, targets, weights, n_top=15):
    for i, target_name in enumerate(targets):
        print ""
        print '==================================='
        print("Class: " + target_name)
        print("")
        sorted_features_indices = weights[i].argsort()[::-1]
        most_important = sorted_features_indices[:n_top]
        print("Most Important: \n")
        print(", ".join("{0}: {1:.4f}".format(features[j].encode('utf-8', 'replace'), weights[i, j])
                        for j in most_important))
        print("...")
        print("Least Important: \n")
        least_important = sorted_features_indices[-n_top:]
        print(", ".join("{0}: {1:.4f}".format(features[j].encode('utf-8', 'replace'), weights[i, j])
                        for j in least_important))
        print("")

display_important_features(features, targets, feature_weights)

from sklearn.metrics import classification_report
predicted_vals = pipe.predict(cb_test_small.data)
print(classification_report(cb_test_small.target, predicted_vals,
                            target_names=cb_test_small.target_names))

from sklearn.metrics import confusion_matrix
confusion_matrix(cb_test_small.target, predicted_vals)
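The raw confusion matrix is easier to read as a heat map. A minimal sketch, reusing the matplotlib setup from earlier (not part of the original notebook):

cm = confusion_matrix(cb_test_small.target, predicted_vals)
plt.matshow(cm)  # one cell per (true, predicted) class pair
plt.colorbar()
plt.xticks(range(len(cb_test_small.target_names)), cb_test_small.target_names, rotation=45)
plt.yticks(range(len(cb_test_small.target_names)), cb_test_small.target_names)
plt.xlabel('Predicted')
plt.ylabel('True')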