import os, json
import pandas as pd
from boto.s3.connection import S3Connection
from IPython.parallel import Client

cats = ['advertising', 'analytics', 'automotive', 'biotech', 'cleantech',
        'consulting', 'design', 'ecommerce', 'education', 'enterprise',
        'fashion', 'finance', 'games_video', 'government', 'hardware',
        'health', 'hospitality', 'legal', 'local', 'manufacturing',
        'medical', 'messaging', 'mobile', 'music', 'nanotech',
        'network_hosting', 'news', 'nonprofit', 'other', 'pets',
        'photo_video', 'public_relations', 'real_estate', 'search',
        'security', 'semiconductor', 'social', 'software', 'sports',
        'transportation', 'travel', 'web']

path = 'C:\\Users\\gkand_000\\data'

# Read the AWS credentials before they are pushed to the parallel engines.
cred = pd.read_csv(''.join((path, '\\0setup\\credentials.csv')))

c = Client()
d = c.direct_view()
with d.sync_imports():
    import time, json, os
    import pandas as pd
    from boto.s3.connection import S3Connection
d.push(dict(path=path, cred=cred))

def build_output(companies_split):
    """Pull each company's CrunchBase JSON from S3 and write its text fields to disk."""
    s3conn = S3Connection(cred['Access Key Id'][0], cred['Secret Access Key'][0])
    s3bucket = s3conn.get_bucket('kandlikards')
    sections = ['name', 'overview', 'tag_list', 'description']
    for company in companies_split:
        try:
            code = 'test' if not company['category_code'] else company['category_code']
        except KeyError:
            code = 'test'
        comppath = ''.join((path, '\\cbproject\\', code, '\\', company['permalink'], '.json'))
        if os.path.exists(comppath):
            continue
        try:
            keyname = 'crunchbase/' + company['permalink'] + '.json'
        except KeyError:
            continue
        text = s3bucket.get_key(keyname).get_contents_as_string()
        try:
            json_doc = json.loads(text, strict=False)
            output = '\n'.join(filter(None, [json_doc[key] for key in sections])).encode('utf-8')
        except Exception:
            # Skip documents that fail to parse or lack the expected fields.
            continue
        with open(comppath, 'w') as outputfile:
            outputfile.write(output)
        time.sleep(0.10)

def make_folders():
    for cat in cats:
        if not os.path.exists(''.join((path, '\\', cat))):
            os.mkdir(''.join((path, '\\', cat)))

s3conn = S3Connection(cred['Access Key Id'][0], cred['Secret Access Key'][0])
s3bucket = s3conn.get_bucket('kandlikards')

with open(path + '\\companies.json') as f:
    companies_all = json.loads(f.read(), strict=False)

# Split the company list into 8 interleaved chunks, one per engine.
companies_split = []
for i in range(8):
    companies_split.append(companies_all[i::8])

make_folders()
results = d.map_async(build_output, companies_split)

import time
while not results.ready():
    print 'not just yet. %s' % (time.strftime("%d %b %Y %H:%M:%S", time.gmtime()))
    time.sleep(180)
valid = results.get()

After that is done, manually (randomly) assign docs to test/train folders.

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the text data
categories = [
    'ecommerce',
    'education',
    'software',
    'biotech',
]
path = 'C:\\Users\\gkand_000\\data\\cbproject'
cb_train_small = load_files(''.join((path, '\\inittrain\\')),
                            categories=categories, encoding='utf-8')
cb_test_small = load_files(''.join((path, '\\inittest\\')),
                           categories=categories, encoding='utf-8')

'''
I wanted to play around with the parameters a little more, but ran out of time.
In particular, stripping out any HTML tags. Considered BeautifulSoup.
'''
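A minimal sketch of that HTML stripping, assuming BeautifulSoup (bs4) is available; strip_html is a hypothetical helper and is not used in the runs below:

from bs4 import BeautifulSoup

def strip_html(doc):
    # get_text() drops the tags and keeps only the visible text.
    return BeautifulSoup(doc, 'html.parser').get_text()

# TfidfVectorizer accepts a preprocessor callable, so the cleaner could be
# plugged in like this (again, not part of the runs below):
# vectorizer = TfidfVectorizer(min_df=2, preprocessor=strip_html)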
# Turn the text documents into vectors of word frequencies
vectorizer = TfidfVectorizer(min_df=2)
text_train = vectorizer.fit_transform(cb_train_small.data)
cat_train = cb_train_small.target

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
cb_train_x_vector = vectorizer.fit_transform(cb_train_small.data)
cb_train_y_vector = cat_train

'''Figuring out the principal components: are like records roughly grouped together?'''
from sklearn.decomposition import TruncatedSVD
cb_train_x_vector_pca = TruncatedSVD(n_components=2).fit_transform(cb_train_x_vector)

from itertools import cycle
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, c in zip(np.unique(cb_train_y_vector), cycle(colors)):
    plt.scatter(cb_train_x_vector_pca[cb_train_y_vector == i, 0],
                cb_train_x_vector_pca[cb_train_y_vector == i, 1],
                c=c, label=cb_train_small.target_names[i], alpha=0.5)
_ = plt.legend(loc='best')

print cb_train_x_vector.shape
print cb_train_y_vector.shape
print len(cb_test_small.data)
print len(cb_test_small.target)

from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer(min_df=2)
text_train = vectorizer.fit_transform(cb_train_small.data)
cat_train = cb_train_small.target
print cat_train.shape[0] == text_train.shape[0]

# Fit a classifier on the training set
classifier = MultinomialNB().fit(text_train, cat_train)
print("Training score: {0:.1f}%".format(
    classifier.score(text_train, cat_train) * 100))

# Evaluate the classifier on the testing set
text_test = vectorizer.transform(cb_test_small.data)
cat_test = cb_test_small.target
print text_test.shape
print cat_test.shape
text_test  # notebook display of the sparse test matrix
print("Testing score: {0:.1f}%".format(
    classifier.score(text_test, cat_test) * 100))

from sklearn.cross_validation import cross_val_score
from scipy.stats import sem
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.8, use_idf=True, stop_words='english')),
    ('classifier', MultinomialNB(alpha=1.0)),
])
scores = cross_val_score(pipe, cb_train_small.data, cb_train_small.target, cv=4)
print scores
print ""
print "\n Mean: " + str(scores.mean())
print "\n Standard Error: " + str(sem(scores))

from sklearn.grid_search import GridSearchCV
params = {
    'vectorizer__max_df': [0.75, 0.8],
    'vectorizer__use_idf': [True, False],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__alpha': [0.1, 1, 10, 100],
}
grid_search = GridSearchCV(pipe, params, verbose=2, refit=False)
_ = grid_search.fit(cb_train_small.data, cb_train_small.target)
print "Best Score: " + str(grid_search.best_score_)
print "Best Params: " + str(grid_search.best_params_)

Best Score: 0.837170608868 (compare to the 0.722187301427 cross-validation mean above)
Best Params: {'vectorizer__use_idf': False, 'vectorizer__ngram_range': (1, 1), 'vectorizer__max_df': 0.75, 'classifier__alpha': 0.1}
[Parallel(n_jobs=1)]: Done 96 out of 96 | elapsed: 7.6min finished

'''Performance'''
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.75, use_idf=False, stop_words='english')),
    ('classifier', MultinomialNB(alpha=0.1)),
])
_ = pipe.fit(cb_train_small.data, cb_train_small.target)
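As a quick, hypothetical sanity check (not part of the original analysis), the refit pipeline accepts raw text directly, so a single made-up description can be classified in one call:

# Hypothetical example: sample_doc is an invented description, not CrunchBase data.
sample_doc = ["Online storefront platform with hosted checkout and order tracking."]
predicted = pipe.predict(sample_doc)
print cb_train_small.target_names[predicted[0]]  # prints the predicted category name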
vectorizer_nm, vectorizer = pipe.steps[0]
classifier_nm, classifier = pipe.steps[1]

features = vectorizer.get_feature_names()
targets = cb_train_small.target_names

# classifier.coef_ holds one row of per-class feature weights
# (log probabilities for MultinomialNB).
feature_weights = classifier.coef_
feature_weights.shape

def display_important_features(features, targets, weights, n_top=15):
    for i, target_name in enumerate(targets):
        print ""
        print '==================================='
        print("Class: " + target_name)
        print("")
        sorted_features_indices = weights[i].argsort()[::-1]
        most_important = sorted_features_indices[:n_top]
        print("Most Important: \n")
        print(", ".join("{0}: {1:.4f}".format(features[j].encode('utf-8', 'replace'), weights[i, j])
                        for j in most_important))
        print("...")
        print("Least Important: \n")
        least_important = sorted_features_indices[-n_top:]
        print(", ".join("{0}: {1:.4f}".format(features[j].encode('utf-8', 'replace'), weights[i, j])
                        for j in least_important))
        print("")

display_important_features(features, targets, feature_weights)

from sklearn.metrics import classification_report
predicted_vals = pipe.predict(cb_test_small.data)
print(classification_report(cb_test_small.target, predicted_vals,
                            target_names=cb_test_small.target_names))

from sklearn.metrics import confusion_matrix
confusion_matrix(cb_test_small.target, predicted_vals)
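The raw confusion matrix is easier to read as a heat map. A minimal sketch, reusing the matplotlib setup from earlier (not part of the original notebook):

cm = confusion_matrix(cb_test_small.target, predicted_vals)
plt.matshow(cm)  # one cell per (true, predicted) class pair
plt.colorbar()
plt.xticks(range(len(cb_test_small.target_names)), cb_test_small.target_names, rotation=45)
plt.yticks(range(len(cb_test_small.target_names)), cb_test_small.target_names)
plt.xlabel('Predicted')
plt.ylabel('True')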