'''
-------------------------------------------------------------------------------
Filename   : ch4.ipynb
Date       :  2012-09-17
Author     : C. Vogel
Purpose    : Replicate the naive Bayes e-mail classifier in Chapter 3 of 
           : _Machine Learning for Hackers_.
Input Data : e-mail files, split into spam and ham (non-spam) folders are available 
           : at the book's github repository at https://github.com/johnmyleswhite/
           : ML_for_Hackers.git. 
Libraries  : dateutil, Matplotlib, NLTK, textmining, Numpy 1.6.1, pandas 0.9.0
           : statsmodels
-------------------------------------------------------------------------------

This notebook is a Python port of the R code in Chapter 4 of _Machine Learning
for Hackers_ by D. Conway and J.M. White.

E-mail files, split into folders classified as spam or ham (non-spam), should be located 
in a /data/ subfolder of the working directory. See the paths defined just after the import
statements below to see what directory structure this script requires. Copying complete
data folder from the book's github repository should be sufficient.

For a detailed description of the analysis and the process of porting it
to Python, see: slendrmeans.wordpress.com/will-it-python.
'''
%pylab inline

import os
import re
import math
import random
import numpy as np
import datetime as dt
import dateutil.parser as dtp
import matplotlib.pyplot as plt
import textmining as txtm
import string
from pandas import *
import sys
from statsmodels.nonparametric import KDE
from tdm_df import tdm_df  # Term-document matrix builder from CH3

# Define paths to the data. We'll be prioritizing e-mails in the "Easy Ham" folder
data_path = os.path.abspath(os.path.join('..','ch3', 'data'))
easyham_path = os.path.abspath(os.path.join(data_path, 'easy_ham'))   

def parse_email_list(file_list):
    '''
    Create a generator that iterates through the e-mails in
    a folder, and parses them as the generator is called.
    '''
    for f in file_list:
        yield parse_email(os.path.join(easyham_path, f))

def parse_email(path):
    '''
    Get important elements of an e-mail file (date, sender's
    e-mail address, subject, and message contents.
    '''
    filename        = os.path.split(path)[-1]
    header, message = get_header_and_message(path)
    date            = get_date(header)
    sender          = get_sender(header)
    subj            = get_subject(header)
    return date, sender, subj, message, filename

def get_header_and_message(path):
    '''
    Split e-mail's header and its message into two
    separate strings.
    '''
    with open(path, 'rU') as con:
        email = con.readlines()
        first_blank_index = email.index('\n')
        header = email[:first_blank_index]
        message = ''.join(email[(first_blank_index + 1): ])
    return header, message

# Regex pattern that finds the date line
# (it's a line that starts with 'Date')
date_pattern = re.compile('^Date')

def get_date(header):
    '''
    Find the date line of the e-mail's header, and parse
    the date into a datetime object.
    '''
    # Get the first line that matches the date-line
    # regex pattern
    dateline = [l for l in header if 
                re.search(date_pattern, l) != None][0]
    
    # Grab the text after 'Date: ' (6 chars)
    dateline = dateline[6:].strip()
    
    # Return the date as parsed by dateutil.parser.parse()
    return dtp.parse(dateline)

# Characters that may separate words in the From line
splitfrom_pattern = re.compile('[\"\:<> ]')

def get_sender(header):
    '''
    Find the 'From:' line in the e-mail's header data, and 
    extract the sender's e-mail address from it.
    '''
    # Find line in header that contains 'From: '
    sender = filter(lambda s: s.find('From: ') != -1, header)[0]
    
    # Get 'words' in From line
    sender = re.split(splitfrom_pattern, sender)
    sender = filter(lambda s: s != ' ' and s != '', sender)
    
    # Find the first word that is an e-mail address (contains @)
    sender = filter(lambda s: '@' in s, sender)[0]
    sender = re.sub('\\[a-z]', '', sender)
    
    return sender.lower().rstrip()

def get_subject(header):
    '''
    Find the subject line of the e-mail's header, and extract
    the subject text.
    '''
    subject = filter(lambda s: s.find('Subject: ') != -1, header)
    if len(subject) > 0:
        subject_start = subject[0].find('Subject: ') + 9
        subject = subject[0][subject_start:]
        return subject.lower()
    else:
        return ''

def make_email_df(email_dir):
    '''
    Parse each e-mail in a directory and return a dataframe
     of each e-mail's date, sender, subject, and message
    '''
    
    email_dict = {'date'     : [],
                  'sender'   : [], 
                  'subject'  : [], 
                  'message'  : [],
                  'filename' : []}

    # Get a list of e-mails to parse
    file_list = os.listdir(email_dir)
    file_list = [f for f in file_list if  f != 'cmds'] 
    
    # A generator that that returns the parsed components
    # of each subsequent e-mail as called
    parsed_emails = parse_email_list(file_list)
    
    # Fill up the dictionary with the generator
    for pe in parsed_emails:
        date, sender, subject, message, filename = pe
        email_dict['date'].append(date)
        email_dict['sender'].append(sender)
        email_dict['subject'].append(subject)
        email_dict['message'].append(message)
        email_dict['filename'].append(filename)
    
    # Make the dictionary a data frame.
    email_df = DataFrame(email_dict, 
                         columns = ['date', 'sender', 'subject', 
                                    'message', 'filename'])  
    return email_df

# Make a dataframe of information in the
# 'Easy Ham' emails
email_df = make_email_df(easyham_path)

def train_test_split(df, train_fraction = .5, shuffle = True, 
                     preserve_index = True, seed = None):
    '''
    Split a dataframe into training and test sets by randomly assigning
    its observations to one set or the other.

    Parameters
    ----------
    df: a DataFrame
    train_fraction: the fraction of `df` to be assigned to the 
        training set (1-train_fraction will go to the test set).
    preserve_index: If True, the split dataframes keep their index
        values from `df`. If False, each set gets a new integer index
        of arange(# rows).
    seed: the random number generator seed for row shuffling.
    '''
    if seed: 
        random.seed(seed)
        
    nrows = df.shape[0]
    split_point = int(train_fraction * nrows)
    
    rows = range(nrows)
    if shuffle: 
        random.shuffle(rows)
    
    train_rows = rows[:split_point]
    test_rows  = rows[split_point:]
    
    train_index = df.index[train_rows]
    test_index  = df.index[test_rows]
    
    train_df = df.ix[train_index, :]
    test_df  = df.ix[test_index, :]
    
    if not preserve_index:
        train_df.index = arange(train_df.shape[0])
        test_df.index  = arange(test_df.shape[0])
    
    return train_df, test_df

email_df['sort_date'] = email_df['date'].map(lambda d: 
    dt.datetime(d.year, d.month, d.day, d.hour, d.minute, d.second))
email_df = email_df.sort('sort_date')
train_df, test_df = train_test_split(email_df, shuffle = False,
                                     preserve_index = False, 
                                     seed = 224)

# Each sender address get's a weight equal to the log of 
# 1 + the number of e-mails from that address.
def get_sender_weights(email_df):
    freq = email_df['sender'].value_counts()
    freq.sort()
    sender_weights = DataFrame({'freq'   : freq,
                                'weight' : np.log(1.0 + freq)})
    sender_weights = sender_weights.sort('weight', ascending = False)
    return sender_weights

sender_weights = get_sender_weights(train_df)
sender_weights_test = get_sender_weights(test_df)

nshow = 30
top_emails = sender_weights[:nshow].index

plt.figure(figsize = (6, 14))
plt.barh(np.arange(nshow), sender_weights['freq'][top_emails], 
         align = 'center', 
         label = 'Training')
plt.barh(np.arange(nshow), sender_weights_test['freq'][top_emails], 
         align = 'center', 
         left  = sender_weights['freq'][top_emails].fillna(0),
         fc    = 'orange', 
         alpha = .8,
         label = 'Test')
plt.ylim((0 - .5, nshow - .5))
plt.title('Frequency of top %i sender addresses' % nshow)
plt.yticks(np.arange(nshow), top_emails)
plt.legend(frameon = False)

reply_pattern   = '(re:|re\[\d\]:)'
fwd_pattern = '(fw:|fw[\d]:)'

def thread_flag(s):
    '''
    Returns True if string s matches the thread patterns.
    If s is a pandas Series, returns a Series of booleans.
    '''
    if isinstance(s, basestring):
        return re.search(reply_pattern, s) is not None
    else:
        return s.str.contains(reply_pattern, re.I)

def clean_subject(s):
    '''
    Removes all the reply and forward labeling from a 
    string (an e-mail subject) s. 
    If s is a pandas Series, returns a Series of cleaned
    strings.
    This will help find the initial message in the thread
    (which won't have any of the reply/forward labeling.
    '''
    if isinstance(s, basestring):
        s_clean = re.sub(reply_pattern, '', s, re.I)
        s_clean = re.sub(fwd_pattern, '', s_clean, re.I)
        s_clean = s_clean.strip()
    else:
        s_clean = s.str.replace(reply_pattern, '', re.I)
        s_clean = s_clean.str.replace(fwd_pattern, '', re.I)
        s_clean = s_clean.str.strip()
    
    return s_clean

def get_thread_df(email_df):
    '''
    Identify threads in an e-mail DataFrame, and extract
    them into a new DataFrame.
    '''
    
    # Find threads by e-mails with reply patterns in their subjects.
    # Then get a set of thread subjects.
    is_thread = thread_flag(email_df['subject'])
    thread_subj = email_df['subject'][is_thread]
    
    # Clean up the subjects by removing reply and forward labels
    thread_subj = clean_subject(thread_subj) 
    thread_subj = thread_subj.unique()
    
    # Search for these thread subjects in the original
    # e-mail DataFrame (so we pick up the original e-mail in the
    # thread (which won't have a reply pattern in its subject)
    
    # Prepare the DataFrame for searching by cleaning up the 
    # subjects.
    search_df = email_df[['date', 'sender', 'subject']]
    search_df['subject'] = clean_subject(search_df['subject'])
    
    # Find subject matches
    thread_matches = [subj in thread_subj for subj in search_df['subject']]
    match_df = search_df.ix[thread_matches, :]
    
    return match_df 

# Get a DataFrame of threads in the training e-mails;
# compute sender weights within this subset of threads.
# To do the latter, we can use the same function we
# used on the whole DataFrame above.
thread_df = get_thread_df(train_df)
thread_sender_weights = get_sender_weights(thread_df)

def get_thread_activity(thread_df):
    '''
    Compute 'activity' statistics on threads in a DataFrame:
    frequency: Number of e-mails observed in the thread
    span: Time before the first and last e-mail observed (seconds)
    weight: Number e-mails-per-second in the thread (log-scale)
    '''
    clean_times = thread_df['date'].map(lambda t: t.tzinfo is not None)
    thread_df_clean = thread_df.ix[clean_times, :]
    
    freq_by_thread = thread_df['subject'].value_counts()
    
    # NB: I'm not sure if total_seconds() is daylight-savings aware
    # so this may be mildly buggy.
    seconds_span = lambda x: (
        (np.max(x) - np.min(x)).total_seconds())
    
    span_by_thread = thread_df_clean.groupby('subject')
    span_by_thread = span_by_thread['date'].aggregate(seconds_span)
    
    activity = DataFrame({'freq'   : freq_by_thread,
                          'span'   : span_by_thread,
                          'weight' : 10 + np.log10(freq_by_thread/span_by_thread)})
    
    # Restricting to threads with more than 2 e-mails may be unecessary, but we 
    # could get 1-e-mail threads for a couple of reasons: misclassification (it's 
    # not really a thread), and left-censored threads -- where we found the last 
    # reply in the thread, but the earlier messages were recieved before the data 
    # was collected.
    activity = activity[activity['freq'] >= 2]
    activity = activity[notnull(activity['weight'])]
    return activity

thread_activity_df = get_thread_activity(thread_df)

threads_to_check = ['please help a newbie compile mplayer :-)',
                    'prob. w/ install/uninstall',
                    'http://apt.nixia.no/']

print thread_activity_df.ix[threads_to_check, :]

# Stopwords from R. NLTK has a set of stopwords we could use as 
# well, but this minimizes discrepancies with the book.
rsw = read_csv('../ch3/r_stopwords.csv')['x'].values.tolist()

def get_thread_subject_term_weights(thread_activity_df):
    '''
    Creates a term->weight map based on a DataFrame containing
    thread subjects and their activity weights
    '''
    thread_subjects = thread_activity_df.index
    thread_tdm = tdm_df(thread_subjects, remove_punctuation = False,
                                         remove_digits = False,
                                         stopwords = rsw)
    
    def calc_term_weight(term): 
        threads_with_term = np.where(thread_tdm[term] > 0.0)
        weight_vec = thread_activity_df['weight'].ix[threads_with_term] 
        return weight_vec.mean()
   
    term_weights = Series({t: calc_term_weight(t) for t in thread_tdm})
    return term_weights

thread_subject_terms_weights = \
    get_thread_subject_term_weights(thread_activity_df)

# Example term weights
print thread_subject_terms_weights.head(10)

def get_message_term_weights(email_df):
    '''
    Creates a term->weight map for terms in the messages of the training
    e-mail DataFrame
    '''
    messages = email_df['message']
    term_freq = tdm_df(messages, stopwords = rsw).sum()
    
    term_weight_df = DataFrame({'freq'   : term_freq,
                                'weight' : np.log10(term_freq)})
    
    return term_weight_df   

message_terms_weights = get_message_term_weights(train_df)

def get_weight_from_terms(term_list, weight_df, subject = False):
    '''
    Given a term-list from an e-mail's message, and a term->weights
    map contained in a DataFrame, calculate the e-mail's message or
    subject term-weight. (default is message)
    '''
    if isinstance(term_list, basestring):
        term_list = [term_list]
    
    if subject:
        weights = weight_df
    else:
        weights = weight_df['weight']
    
    term_list_weight = weights[term_list].mean()
    
    if np.isnan(term_list_weight):
        term_list_weight = 1.0
    
    return term_list_weight

def rank_email(email_df, row):
    '''
    Ranks an e-mail (as contained in the row of a DataFrame) by
    computing and combining its five weights.
    '''
    email   = email_df.ix[row, :]
    date    = email['date']
    sender  = email['sender']
    subject = email['subject']
    message = email['message']
    
    # 1. Get sender weights (all messages)
    sender_weight = (sender_weights['weight']
                     .get(sender) or 1.0)
    
    # 2. Get sender weights (threads)
    thread_sender_weight = (thread_sender_weights['weight']
                           .get(sender) or 1.0)
    
    # 3. Get thread activity weight
    is_thread = thread_flag(subject)
    subject = clean_subject(subject)
    if is_thread:
        activity_weight = (thread_activity_df['weight']
                           .get(subject) or 1.0)
    else:
        activity_weight = 1.0
    
    # 4. Get subject line weight via thread-subject term weights
    subj_terms = tdm_df(subject, remove_punctuation = False,
                                 remove_digits = False,
                                 stopwords = rsw).columns
    subj_term_weight = get_weight_from_terms(subj_terms, 
                            thread_subject_terms_weights,
                            subject = True)
    
    # 5. Message term weight
    message_terms = tdm_df(message, stopwords = rsw).columns
    message_term_weight = get_weight_from_terms(message_terms,
                               message_terms_weights)
    weights = [sender_weight, 
               thread_sender_weight,
               activity_weight,
               subj_term_weight,
               message_term_weight]
    
    # The e-mail's final rank is just the product of the weights.
    rank = np.array(weights).prod()
   
    return rank

def make_rank_df(email_df):
    '''
    Rank each e-mail in a DataFrame.
    '''
    n_emails = email_df.shape[0]
    
    sender_weight_results = []
    thread_sender_weight_results = []
    activity_weight_results = []
    subj_term_weight_results = []
    message_term_weight_results = []
    
    rank_results = []
    rank_df = email_df.copy()
    
    for e in xrange(n_emails):
        weights_rank = rank_email(email_df, e)
        rank_results.append(weights_rank)
     
    rank_df['rank'] = rank_results
    
    return rank_df

train_ranks = make_rank_df(train_df)

threshold = train_ranks['rank'].median()

test_ranks = make_rank_df(test_df)

train_kde = KDE(train_ranks['rank'])
train_kde.fit()
test_kde = KDE(test_ranks['rank'])
test_kde.fit()

plt.figure(figsize(8, 6))
plt.fill(train_kde.support, train_kde.density, color = 'steelblue', alpha = .7, 
         label = 'Train')
plt.fill(test_kde.support, test_kde.density, color = 'red', alpha = .7, 
         label = 'Test')
plt.xlim(0, 400)
plt.ylim(0, np.max(test_kde.density))
plt.axvline(threshold, linestyle = '--', label = 'Priority threshold')
plt.xlabel('Rank')
plt.ylabel('Density')
plt.title('Distribution of ranks for training and test e-mails')
plt.legend(frameon = False)

(test_ranks['rank'] > threshold).sum()

test_ranks[test_ranks['rank'] > threshold]['sender'].unique()

maxrank_email = train_ranks.ix[train_ranks['rank'].idxmax(), :]
print 'Maximum rank in training set: ', train_ranks['rank'].max()

thread_activity_df.ix[clean_subject(maxrank_email['subject']), :]

thread_df[thread_df['subject'].str.contains('spam_level_char')]['date']

correct_weight = 10 + np.log10(2 / 4.)
incorrect_weight = 10 + np.log10(2 / (2* 3600 + 59 * 60 + 56.))
print 'Activity weight correct time-zone: ', correct_weight
print 'Activity weight incorrect time-zone: ', incorrect_weight
print 'Rank difference: ' , '{:2.1%}'.format(1 - incorrect_weight / correct_weight)