# unzip 'em

```python
!unzip /Users/max/Downloads/max.mautner@gmail.com-20131218T185235Z-Mail.zip -d ./data/
!ls -l ./data/max.mautner@gmail.com-20131218T185235Z-Mail/Mail/
```

```python
from glob import glob

mailboxes = glob('./data/max.mautner@gmail.com-20131218T185235Z-Mail/Mail/*')
mailboxes
```

```python
!head ./data/max.mautner@gmail.com-20131218T185235Z-Mail/Mail/Inbox.mbox
```

```python
interesting_mboxes = ['./data/max.mautner@gmail.com-20131218T185235Z-Mail/Mail/Inbox.mbox']
```

# build a labeled corpus from the mbox

```python
import mailbox
import email
import sys

from nltk import clean_html

%time len(mailbox.mbox(interesting_mboxes[0]).items())
```

Each non-Sent message gets a binary label: 1 if Gmail tagged it `Important`, 0 otherwise. The document text is the message body (HTML stripped with `clean_html`, or the plain-text part) plus the names of the message headers.

```python
corpus = []
labels = []

def create_corpus():
    for fname in interesting_mboxes:
        print fname
        sys.stdout.flush()  # make sure to flush to output
        category = fname.split('/')[-1].split('.')[0].lower()
        mbox = mailbox.mbox(fname)
        for msg_id, email_obj in mbox.items():
            # skip messages I sent; label the rest by Gmail's "Important" flag
            if 'Sent' not in email_obj['X-Gmail-Labels']:
                category = 1 if 'Important' in email_obj['X-Gmail-Labels'].split(',') else 0
            else:
                continue
            # pull out a text body: prefer stripped HTML, fall back to plain text
            body = ''
            for part in email_obj.walk():
                if part.get_content_type() == 'text/html':
                    body = clean_html(part.get_payload())
                    break
                elif part.get_content_type() == 'text/plain':
                    body = part.get_payload()
                else:
                    continue
            # append the header names as extra features
            body += ' ' + ' '.join(email_obj.keys())
            corpus.append(body)
            labels.append(category)

%time create_corpus()
```

```python
print labels[0]
print corpus[0]
```

```python
len(corpus), len(labels)
```

```python
import pandas as pd

d = pd.DataFrame(labels, columns=['labels'])
print d.labels.value_counts() / float(d.shape[0])  # class balance
```

# vectorize the email bodies

```python
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(tokenizer=nltk.word_tokenize,
                             stop_words='english',
                             max_features=6000,
                             ngram_range=(1, 1))
#vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)  # bigrams
#vectorizer = TfidfTransformer()  # tf-idf vectorizer
```

```python
CountVectorizer?
```

```python
%time vectors = vectorizer.fit_transform(corpus)
```

# train & evaluate a Naive Bayes classifier

Ten random 90/10 train/test splits, scoring overall accuracy plus per-class accuracy so the minority class doesn't get hidden by the majority.

```python
import numpy as np
from collections import defaultdict

from sklearn import metrics
from sklearn.cross_validation import ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

X = vectors
y = np.array(labels)

label_train_scores = defaultdict(list)
label_test_scores = defaultdict(list)
train_scores = []
test_scores = []

cv = ShuffleSplit(len(corpus), n_iter=10, test_size=0.1, random_state=0)
for cv_index, (train, test) in enumerate(cv):
    print cv_index
    sys.stdout.flush()
    gnb = MultinomialNB().fit(X[train], y[train])
    # per-class accuracy on this split
    for label in d.labels.unique():
        train_special = [a for a in d.index[d.labels == label] if a in train]
        test_special = [a for a in d.index[d.labels == label] if a in test]
        label_train_scores[label].append(gnb.score(X[train_special], y[train_special]))
        label_test_scores[label].append(gnb.score(X[test_special], y[test_special]))
    train_scores.append(gnb.score(X[train], y[train]))
    test_scores.append(gnb.score(X[test], y[test]))
```

```python
from pprint import pprint

for l in d.labels.unique():
    print l
    print "Training:\t %.1f%%" % (np.multiply(np.average(label_train_scores[l]), 100))
    print "Test:\t\t*%.1f%%*" % (np.multiply(np.average(label_test_scores[l]), 100))
```

```python
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

LEMMATIZER = WordNetLemmatizer()
STOP_SET = set(stopwords.words('english'))

words = 'run runs running ran'
for word in words.split(' '):
    print LEMMATIZER.lemmatize(word.lower())
```

```python
# train on entire dataset:
gnb = MultinomialNB().fit(X, y)
```

```python
vectorizer.transform?
```
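The lemmatizer and stopword set above are only demonstrated, never wired into the feature pipeline. A minimal sketch of what that could look like, plugging a lemmatizing tokenizer into `CountVectorizer` (the names `lemma_tokenize` and `lemma_vectorizer` are illustrative, not from the notebook):

```python
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

LEMMATIZER = WordNetLemmatizer()
STOP_SET = set(stopwords.words('english'))

def lemma_tokenize(text):
    # tokenize, lowercase, lemmatize, and drop stopwords
    return [LEMMATIZER.lemmatize(tok.lower())
            for tok in word_tokenize(text)
            if tok.lower() not in STOP_SET]

# hypothetical alternative to the vectorizer used above
lemma_vectorizer = CountVectorizer(tokenizer=lemma_tokenize,
                                   max_features=6000,
                                   ngram_range=(1, 1))
# lemma_vectors = lemma_vectorizer.fit_transform(corpus)
```

Dropping stopwords inside the tokenizer (instead of passing `stop_words='english'`) keeps the filter consistent with the lemmatized, lowercased tokens.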
# persist the model & ship it to EC2

```python
from sklearn.externals import joblib

joblib.dump(gnb, 'email_importance.pkl', compress=9)
joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
```

```python
!du -hc *.pkl
```

Sanity check that the pickles round-trip:

```python
model_clone = joblib.load('email_importance.pkl')
vectorizer_clone = joblib.load('vectorizer.pkl')
type(model_clone), type(vectorizer_clone)
```

```python
model_clone.predict(X[0]), y[0]
```

```python
!scp -i /Users/max/.ssh/keys/ivendorz.pem email_importance.pkl ubuntu@ec2-54-202-114-193.us-west-2.compute.amazonaws.com:~/
!scp -i /Users/max/.ssh/keys/ivendorz.pem vectorizer.pkl ubuntu@ec2-54-202-114-193.us-west-2.compute.amazonaws.com:~/
```
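A minimal sketch of how the EC2 host might use the two shipped artifacts; the `score_email` function and the sample input are illustrative, not part of the notebook:

```python
from sklearn.externals import joblib

# load the artifacts scp'd above
model = joblib.load('email_importance.pkl')
vectorizer = joblib.load('vectorizer.pkl')

def score_email(body_text):
    # vectorize a single raw email body with the same vectorizer used at training time
    features = vectorizer.transform([body_text])
    label = model.predict(features)[0]
    probs = dict(zip(model.classes_, model.predict_proba(features)[0]))
    return label, probs[1]  # probability of the 'Important' class (label 1)

print score_email('Your AWS invoice for December is attached')
```

Because the pickled `CountVectorizer` references `nltk.word_tokenize`, the server also needs nltk (and its punkt tokenizer data) installed before the vectorizer will unpickle and transform correctly.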