import mailbox
import email
import json

MBOX = 'resources/ch06-mailboxes/data/northpole.mbox'

# A routine that makes a ton of simplifying assumptions
# about converting an mbox message into a Python object
# given the nature of the northpole.mbox file in order
# to demonstrate the basic parsing of an mbox with mail
# utilities

def objectify_message(msg):
    
    # Map in fields from the message
    o_msg = dict([ (k, v) for (k,v) in msg.items() ])
    
    # Assume one part to the message and get its content
    # and its content type
    
    part = [p for p in msg.walk()][0]
    o_msg['contentType'] = part.get_content_type()
    o_msg['content'] = part.get_payload()
    
    return o_msg

# Create an mbox that can be iterated over and transform each of its
# messages to a convenient JSON representation

mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

messages = []

while 1:
    msg = mbox.next()
    
    if msg is None: break
        
    messages.append(objectify_message(msg))
    
print json.dumps(messages, indent=1)

import sys
import urllib2
import time
import os
import envoy # pip install envoy

URL = "http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz"
DOWNLOAD_DIR = "resources/ch06-mailboxes/data"

# Downloads a file and displays a download status every 5 seconds

def download(url, download_dir):    
    file_name = url.split('/')[-1]
    u = urllib2.urlopen(url)
    f = open(os.path.join(download_dir, file_name), 'wb')
    meta = u.info()
    file_size = int(meta.getheaders("Content-Length")[0])
    print "Downloading: %s Bytes: %s" % (file_name, file_size)

    file_size_dl = 0
    block_sz = 8192
    last_update = time.time()
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break

        file_size_dl += len(buffer)
        f.write(buffer)
        download_status = r"%10d MB  [%3.2f%%]" % (file_size_dl / 1000000.0, file_size_dl * 100.0 / file_size)
        download_status = download_status + chr(8)*(len(download_status)+1)
        if time.time() - last_update > 5:
            print download_status,
            sys.stdout.flush()
            last_update = time.time()
    f.close()
    return f.name

# Extracts a gzipped tarfile. e.g. "$ tar xzf filename.tgz"

def tar_xzf(f):
    # Call out to the shell for a faster decompression.
    # This will still take a while because Vagrant synchronizes
    # thousands of files that are extracted to the host machine
    r = envoy.run("tar xzf %s -C %s" % (f, DOWNLOAD_DIR))
    print r.std_out
    print r.std_err

f = download(URL, DOWNLOAD_DIR)
print "Download complete: %s" % (f,)
tar_xzf(f)
print "Decompression complete"
print "Data is ready"

import re
import email
from time import asctime
import os
import sys
from dateutil.parser import parse # pip install python_dateutil

# XXX: Download the Enron corpus to resources/ch06-mailboxes/data
# and unarchive it there.

MAILDIR = 'resources/ch06-mailboxes/data/enron_mail_20110402/' + \
          'enron_data/maildir' 

# Where to write the converted mbox
MBOX = 'resources/ch06-mailboxes/data/enron.mbox'

# Create a file handle that we'll be writing into...
mbox = open(MBOX, 'w')

# Walk the directories and process any folder named 'inbox'

for (root, dirs, file_names) in os.walk(MAILDIR):

    if root.split(os.sep)[-1].lower() != 'inbox':
        continue

    # Process each message in 'inbox'

    for file_name in file_names:
        file_path = os.path.join(root, file_name)
        message_text = open(file_path).read()

        # Compute fields for the From_ line in a traditional mbox message

        _from = re.search(r"From: ([^\r]+)", message_text).groups()[0]
        _date = re.search(r"Date: ([^\r]+)", message_text).groups()[0]

        # Convert _date to the asctime representation for the From_ line

        _date = asctime(parse(_date).timetuple())

        msg = email.message_from_string(message_text)
        msg.set_unixfrom('From %s %s' % (_from, _date))

        mbox.write(msg.as_string(unixfrom=True) + "\n\n")
    
mbox.close()

import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse

MBOX = 'resources/ch06-mailboxes/data/enron.mbox'
OUT_FILE = MBOX + '.json'
 
def cleanContent(msg):

    # Decode message from "quoted printable" format, but first
    # re-encode, since decodestring will try to do a decode of its own
    msg = quopri.decodestring(msg.encode('utf-8'))

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    try:
        soup = BeautifulSoup(msg)
    except:
        return ''
    return ''.join(soup.findAll(text=True))

# There's a lot of data to process, and the Pythonic way to do it is with a 
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object 
# serialization.

class Encoder(json.JSONEncoder):
    def default(self, o): return  list(o)

# The generator itself...
def gen_json_msgs(mb):
    while 1:
        msg = mb.next()
        if msg is None:
            break

        yield jsonifyMessage(msg)

def jsonifyMessage(msg):
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.

    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
                                 .replace(' ', '').decode('utf-8', 'ignore').split(',')

    for part in msg.walk():
        json_part = {}

        if part.get_content_maintype() != 'text':
            print >> sys.stderr, "Skipping MIME content in JSONification ({0})".format(part.get_content_maintype())
            continue

        json_part['contentType'] = part.get_content_type()
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_part['content'] = cleanContent(content)
        json_msg['parts'].append(json_part)

    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    then = parse(json_msg['Date'])
    millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
    json_msg['Date'] = {'$date' : millis}

    return json_msg

mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

# Write each message out as a JSON object on a separate line
# for easy import into MongoDB via mongoimport

f = open(OUT_FILE, 'w')
for msg in gen_json_msgs(mbox):
    if msg != None:
        f.write(json.dumps(msg, cls=Encoder) + '\n')
f.close()

print "All done"

import envoy

# This data is checked-in to the repository and is a compressed 
# version of the output from Example 3

F = 'resources/ch06-mailboxes/data/enron.mbox.json.bz2'

r = envoy.run("bunzip2 %s" % (F,))
print r.std_out
print r.std_err

import envoy # pip install envoy

r = envoy.run('mongoimport')
print r.std_out
print r.std_err

import os
import sys
import envoy

data_file = os.path.join(os.getcwd(), 'resources/ch06-mailboxes/data/enron.mbox.json')

# Run a command just as you would in a terminal on the virtual machine to 
# import the data file into MongoDB.
r = envoy.run('mongoimport --db enron --collection mbox ' + \
              '--file %s' % data_file)

# Print its standard output
print r.std_out
print sys.stderr.write(r.std_err)

# We can even simulate a MongoDB shell using envoy to execute commands.
# For example, let's get some stats out of MongoDB just as though we were working 
# in a shell by passing it the command and wrapping it in a printjson function to 
# display it for us.

def mongo(db, cmd):
    r = envoy.run("mongo %s --eval 'printjson(%s)'" % (db, cmd,))
    print r.std_out
    if r.std_err: print r.std_err
    
mongo('enron', 'db.mbox.stats()')

import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo

# Connects to the MongoDB server running on 
# localhost:27017 by default

client = pymongo.MongoClient()

# Get a reference to the enron database

db = client.enron

# Reference the mbox collection in the Enron database

mbox = db.mbox

# The number of messages in the collection

print "Number of messages in mbox:"
print mbox.count()
print

# Pick a message to look at...

msg = mbox.find_one()

# Display the message as pretty-printed JSON. The use of
# the custom serializer supplied by PyMongo is necessary in order
# to handle the date field that is provided as a datetime.datetime 
# tuple.

print "A message:"
print json.dumps(msg, indent=1, default=json_util.default)

import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
from datetime import datetime as dt

client = pymongo.MongoClient()

db = client.enron

mbox = db.mbox

# Create a small date range here of one day

start_date = dt(2001, 4, 1) # Year, Month, Day
end_date = dt(2001, 4, 2) # Year, Month, Day

# Query the database with the highly versatile "find" command,
# just like in the MongoDB shell.

msgs = [ msg 
         for msg in mbox.find({"Date" : 
                                  {
                                   "$lt" : end_date, 
                                   "$gt" : start_date
                                  }
                              }).sort("date")]

# Create a convenience function to make pretty-printing JSON a little
# less cumbersome

def pp(o, indent=1):
    print json.dumps(msgs, indent=indent, default=json_util.default)

print "Messages from a query by date range:"
pp(msgs)

import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo

client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox

senders = [ i for i in mbox.distinct("From") ]

receivers = [ i for i in mbox.distinct("To") ]

cc_receivers = [ i for i in mbox.distinct("Cc") ]

bcc_receivers = [ i for i in mbox.distinct("Bcc") ]

print "Num Senders:", len(senders)
print "Num Receivers:", len(receivers)
print "Num CC Receivers:", len(cc_receivers)
print "Num BCC Receivers:", len(bcc_receivers)

senders = set(senders)
receivers = set(receivers)
cc_receivers = set(cc_receivers)
bcc_receivers = set(bcc_receivers)

# Find the number of senders who were also direct receivers

senders_intersect_receivers = senders.intersection(receivers)

# Find the senders that didn't receive any messages

senders_diff_receivers = senders.difference(receivers)

# Find the receivers that didn't send any messages

receivers_diff_senders = receivers.difference(senders)

# Find the senders who were any kind of receiver by
# first computing the union of all types of receivers

all_receivers = receivers.union(cc_receivers, bcc_receivers)
senders_all_receivers = senders.intersection(all_receivers)

print "Num senders in common with receivers:", len(senders_intersect_receivers)
print "Num senders who didn't receive:", len(senders_diff_receivers)
print "Num receivers who didn't send:", len(receivers_diff_senders)
print "Num senders in common with *all* receivers:", len(senders_all_receivers)

# In a Mongo shell, you could try this query for the same effect:
# db.mbox.find({"To" : {"$regex" : /.*enron.com.*/i} }, 
#              {"To" : 1, "_id" : 0})

senders = [ i 
            for i in mbox.distinct("From") 
                if i.lower().find("@enron.com") > -1 ]

receivers = [ i 
              for i in mbox.distinct("To") 
                  if i.lower().find("@enron.com") > -1 ]

cc_receivers = [ i 
                 for i in mbox.distinct("Cc") 
                     if i.lower().find("@enron.com") > -1 ]

bcc_receivers = [ i 
                  for i in mbox.distinct("Bcc") 
                      if i.lower().find("@enron.com") > -1 ]

print "Num Senders:", len(senders)
print "Num Receivers:", len(receivers)
print "Num CC Receivers:", len(cc_receivers)
print "Num BCC Receivers:", len(bcc_receivers)

import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo

client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox

aliases = ["kenneth.lay@enron.com", "ken_lay@enron.com", "ken.lay@enron.com", 
           "kenneth_lay@enron.net", "klay@enron.com"] # More possibilities?

to_msgs = [ msg 
            for msg in mbox.find({"To" : { "$in" : aliases } })]

from_msgs = [ msg 
         for msg in mbox.find({"From" : { "$in" : aliases } })]

print "Number of message sent to:", len(to_msgs)
print "Number of messages sent from:", len(from_msgs)

import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
import re

# The basis of our query
FROM = "kenneth.lay@enron.com"

client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox

# Get the recipient lists for each message

recipients_per_message = db.mbox.aggregate([
  {"$match" : {"From" : re.compile(r".*{0}.*".format(FROM), re.IGNORECASE)}}, 
  {"$project" : {"From" : 1, "To" : 1} }, 
  {"$group" : {"_id" : "$From", "recipients" : {"$addToSet" : "$To" } } }                    
])['result'][0]['recipients']

# Collapse the lists of recipients into a single list

all_recipients = [recipient
                  for message in recipients_per_message
                      for recipient in message]

# Calculate the number of recipients per sent message and sort

recipients_per_message_totals = \
  sorted([len(recipients) 
   for recipients in recipients_per_message])

# Demonstrate how to use $unwind followed by $group to collapse
# the recipient lists into a single list (with no duplicates
# per the $addToSet operator)

unique_recipients = db.mbox.aggregate([
  {"$match" : {"From" : re.compile(r".*{0}.*".format(FROM), re.IGNORECASE)}}, 
  {"$project" : {"From" : 1, "To" : 1} }, 
  {"$unwind" : "$To"}, 
  {"$group" : {"_id" : "From", "recipients" : {"$addToSet" : "$To"}} }
])['result'][0]['recipients']

print all_recipients
print "Num total recipients on all messages:", len(all_recipients)
print "Num recipients for each message:", recipients_per_message_totals
print "Num unique recipients", len(unique_recipients)

import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo

client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox

# Create an index if it doesn't already exist
mbox.ensure_index([("$**", "text")], name="TextIndex")

# Get the collection stats (collstats) on a collection
# named "mbox"
print json.dumps(db.command("collstats", "mbox"), indent=1)

# Use the db.command method to issue a "text" command
# on collection "mbox" with parameters, remembering that
# we need to use json_util to handle serialization of our JSON
print json.dumps(db.command("text", "mbox",  
                            search="raptor", 
                            limit=1), 
                 indent=1, default=json_util.default)

import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo

client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox

results = mbox.aggregate([
{
  # Create a subdocument called DateBucket with each date component projected
  # so that these fields can be grouped on in the next stage of the pipeline
  "$project" :
  {
     "_id" : 0,
     "DateBucket" : 
     {
       "year" : {"$year" : "$Date"}, 
       "month" : {"$month" : "$Date"}, 
       "day" : {"$dayOfMonth" : "$Date"},
       "hour" : {"$hour" : "$Date"},
     }
  }
},
{
  "$group" : 
  {
    # Group by year and date by using these fields for the key.
    "_id" : {"year" : "$DateBucket.year", "month" : "$DateBucket.month"},
    
    # Increment the sum for each group by 1 for every document that's in it
    "num_msgs" : {"$sum" : 1}
  }
},
{
  "$sort" : {"_id.year" : 1, "_id.month" : 1} 
}
])

print results

from prettytable import PrettyTable


pt = PrettyTable(field_names=['Year', 'Month', 'Num Msgs'])
pt.align['Num Msgs'], pt.align['Month'] = 'r', 'r'
[ pt.add_row([ result['_id']['year'], result['_id']['month'], result['num_msgs'] ]) 
  for result in results['result'] ]

print pt

import sys
import oauth2 as oauth
import oauth2.clients.imap as imaplib

# See http://code.google.com/p/google-mail-xoauth-tools/wiki/
# XoauthDotPyRunThrough for details on obtaining and
# running xoauth.py to get the credentials

OAUTH_TOKEN = ''  # XXX: Obtained with xoauth.py
OAUTH_TOKEN_SECRET = ''  # XXX: Obtained with xoauth.py
GMAIL_ACCOUNT = ''  # XXX: Your Gmail address - example@gmail.com

url = 'https://mail.google.com/mail/b/%s/imap/' % (GMAIL_ACCOUNT, )

# Standard values for Gmail's Xoauth
consumer = oauth.Consumer('anonymous', 'anonymous')  
token = oauth.Token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

conn = imaplib.IMAP4_SSL('imap.googlemail.com')
conn.debug = 4  # Set to the desired debug level
conn.authenticate(url, consumer, token)

conn.select('INBOX')

# Access your INBOX data

import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse

# What you'd like to search for in the subject of your mail.
# See Section 6.4.4 of http://www.faqs.org/rfcs/rfc3501.html
# for more SEARCH options.

Q = "Alaska" # XXX

# Recycle some routines from Example 6-3 so that you arrive at the
# very same data structure you've been using throughout this chapter

def cleanContent(msg):

    # Decode message from "quoted printable" format
    msg = quopri.decodestring(msg)
        
    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    try:
        soup = BeautifulSoup(msg)
    except:
        return ''
    return ''.join(soup.findAll(text=True))

def jsonifyMessage(msg):
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.

    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '')\
                                 .replace('\r', '').replace(' ', '')\
                                 .decode('utf-8', 'ignore').split(',')

    for part in msg.walk():
        json_part = {}
        if part.get_content_maintype() == 'multipart':
            continue
            
        json_part['contentType'] = part.get_content_type()
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_part['content'] = cleanContent(content)
           
        json_msg['parts'].append(json_part)
        
    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB.
    then = parse(json_msg['Date'])
    millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
    json_msg['Date'] = {'$date' : millis}

    return json_msg

# Consume a query from the user. This example illustrates searching by subject.

(status, data) = conn.search(None, '(SUBJECT "%s")' % (Q, ))
ids = data[0].split()

messages = []
for i in ids:
    try:
        (status, data) = conn.fetch(i, '(RFC822)')
        messages.append(email.message_from_string(data[0][1]))
    except Exception, e:
        print e
        print 'Print error fetching message %s. Skipping it.' % (i, )

print len(messages)
jsonified_messages = [jsonifyMessage(m) for m in messages]

# Separate out the text content from each message so that it can be analyzed.

content = [p['content'] for m in jsonified_messages for p in m['parts']]

# Content can still be quite messy and contain line breaks and other quirks.

filename = os.path.join('resources/ch06-mailboxes/data', 
                        GMAIL_ACCOUNT.split("@")[0] + '.gmail.json')
f = open(filename, 'w')
f.write(json.dumps(jsonified_messages))
f.close()

print >> sys.stderr, "Data written out to", f.name