''' ------------------------------------------------------------------------------- Filename : ch4.ipynb Date : 2012-09-17 Author : C. Vogel Purpose : Replicate the naive Bayes e-mail classifier in Chapter 3 of : _Machine Learning for Hackers_. Input Data : e-mail files, split into spam and ham (non-spam) folders are available : at the book's github repository at https://github.com/johnmyleswhite/ : ML_for_Hackers.git. Libraries : dateutil, Matplotlib, NLTK, textmining, Numpy 1.6.1, pandas 0.9.0 : statsmodels ------------------------------------------------------------------------------- This notebook is a Python port of the R code in Chapter 4 of _Machine Learning for Hackers_ by D. Conway and J.M. White. E-mail files, split into folders classified as spam or ham (non-spam), should be located in a /data/ subfolder of the working directory. See the paths defined just after the import statements below to see what directory structure this script requires. Copying complete data folder from the book's github repository should be sufficient. For a detailed description of the analysis and the process of porting it to Python, see: slendrmeans.wordpress.com/will-it-python. ''' %pylab inline import os import re import math import random import numpy as np import datetime as dt import dateutil.parser as dtp import matplotlib.pyplot as plt import textmining as txtm import string from pandas import * import sys from statsmodels.nonparametric import KDE from tdm_df import tdm_df # Term-document matrix builder from CH3 # Define paths to the data. We'll be prioritizing e-mails in the "Easy Ham" folder data_path = os.path.abspath(os.path.join('..','ch3', 'data')) easyham_path = os.path.abspath(os.path.join(data_path, 'easy_ham')) def parse_email_list(file_list): ''' Create a generator that iterates through the e-mails in a folder, and parses them as the generator is called. ''' for f in file_list: yield parse_email(os.path.join(easyham_path, f)) def parse_email(path): ''' Get important elements of an e-mail file (date, sender's e-mail address, subject, and message contents. ''' filename = os.path.split(path)[-1] header, message = get_header_and_message(path) date = get_date(header) sender = get_sender(header) subj = get_subject(header) return date, sender, subj, message, filename def get_header_and_message(path): ''' Split e-mail's header and its message into two separate strings. ''' with open(path, 'rU') as con: email = con.readlines() first_blank_index = email.index('\n') header = email[:first_blank_index] message = ''.join(email[(first_blank_index + 1): ]) return header, message # Regex pattern that finds the date line # (it's a line that starts with 'Date') date_pattern = re.compile('^Date') def get_date(header): ''' Find the date line of the e-mail's header, and parse the date into a datetime object. ''' # Get the first line that matches the date-line # regex pattern dateline = [l for l in header if re.search(date_pattern, l) != None][0] # Grab the text after 'Date: ' (6 chars) dateline = dateline[6:].strip() # Return the date as parsed by dateutil.parser.parse() return dtp.parse(dateline) # Characters that may separate words in the From line splitfrom_pattern = re.compile('[\"\:<> ]') def get_sender(header): ''' Find the 'From:' line in the e-mail's header data, and extract the sender's e-mail address from it. ''' # Find line in header that contains 'From: ' sender = filter(lambda s: s.find('From: ') != -1, header)[0] # Get 'words' in From line sender = re.split(splitfrom_pattern, sender) sender = filter(lambda s: s != ' ' and s != '', sender) # Find the first word that is an e-mail address (contains @) sender = filter(lambda s: '@' in s, sender)[0] sender = re.sub('\\[a-z]', '', sender) return sender.lower().rstrip() def get_subject(header): ''' Find the subject line of the e-mail's header, and extract the subject text. ''' subject = filter(lambda s: s.find('Subject: ') != -1, header) if len(subject) > 0: subject_start = subject[0].find('Subject: ') + 9 subject = subject[0][subject_start:] return subject.lower() else: return '' def make_email_df(email_dir): ''' Parse each e-mail in a directory and return a dataframe of each e-mail's date, sender, subject, and message ''' email_dict = {'date' : [], 'sender' : [], 'subject' : [], 'message' : [], 'filename' : []} # Get a list of e-mails to parse file_list = os.listdir(email_dir) file_list = [f for f in file_list if f != 'cmds'] # A generator that that returns the parsed components # of each subsequent e-mail as called parsed_emails = parse_email_list(file_list) # Fill up the dictionary with the generator for pe in parsed_emails: date, sender, subject, message, filename = pe email_dict['date'].append(date) email_dict['sender'].append(sender) email_dict['subject'].append(subject) email_dict['message'].append(message) email_dict['filename'].append(filename) # Make the dictionary a data frame. email_df = DataFrame(email_dict, columns = ['date', 'sender', 'subject', 'message', 'filename']) return email_df # Make a dataframe of information in the # 'Easy Ham' emails email_df = make_email_df(easyham_path) def train_test_split(df, train_fraction = .5, shuffle = True, preserve_index = True, seed = None): ''' Split a dataframe into training and test sets by randomly assigning its observations to one set or the other. Parameters ---------- df: a DataFrame train_fraction: the fraction of `df` to be assigned to the training set (1-train_fraction will go to the test set). preserve_index: If True, the split dataframes keep their index values from `df`. If False, each set gets a new integer index of arange(# rows). seed: the random number generator seed for row shuffling. ''' if seed: random.seed(seed) nrows = df.shape[0] split_point = int(train_fraction * nrows) rows = range(nrows) if shuffle: random.shuffle(rows) train_rows = rows[:split_point] test_rows = rows[split_point:] train_index = df.index[train_rows] test_index = df.index[test_rows] train_df = df.ix[train_index, :] test_df = df.ix[test_index, :] if not preserve_index: train_df.index = arange(train_df.shape[0]) test_df.index = arange(test_df.shape[0]) return train_df, test_df email_df['sort_date'] = email_df['date'].map(lambda d: dt.datetime(d.year, d.month, d.day, d.hour, d.minute, d.second)) email_df = email_df.sort('sort_date') train_df, test_df = train_test_split(email_df, shuffle = False, preserve_index = False, seed = 224) # Each sender address get's a weight equal to the log of # 1 + the number of e-mails from that address. def get_sender_weights(email_df): freq = email_df['sender'].value_counts() freq.sort() sender_weights = DataFrame({'freq' : freq, 'weight' : np.log(1.0 + freq)}) sender_weights = sender_weights.sort('weight', ascending = False) return sender_weights sender_weights = get_sender_weights(train_df) sender_weights_test = get_sender_weights(test_df) nshow = 30 top_emails = sender_weights[:nshow].index plt.figure(figsize = (6, 14)) plt.barh(np.arange(nshow), sender_weights['freq'][top_emails], align = 'center', label = 'Training') plt.barh(np.arange(nshow), sender_weights_test['freq'][top_emails], align = 'center', left = sender_weights['freq'][top_emails].fillna(0), fc = 'orange', alpha = .8, label = 'Test') plt.ylim((0 - .5, nshow - .5)) plt.title('Frequency of top %i sender addresses' % nshow) plt.yticks(np.arange(nshow), top_emails) plt.legend(frameon = False) reply_pattern = '(re:|re\[\d\]:)' fwd_pattern = '(fw:|fw[\d]:)' def thread_flag(s): ''' Returns True if string s matches the thread patterns. If s is a pandas Series, returns a Series of booleans. ''' if isinstance(s, basestring): return re.search(reply_pattern, s) is not None else: return s.str.contains(reply_pattern, re.I) def clean_subject(s): ''' Removes all the reply and forward labeling from a string (an e-mail subject) s. If s is a pandas Series, returns a Series of cleaned strings. This will help find the initial message in the thread (which won't have any of the reply/forward labeling. ''' if isinstance(s, basestring): s_clean = re.sub(reply_pattern, '', s, re.I) s_clean = re.sub(fwd_pattern, '', s_clean, re.I) s_clean = s_clean.strip() else: s_clean = s.str.replace(reply_pattern, '', re.I) s_clean = s_clean.str.replace(fwd_pattern, '', re.I) s_clean = s_clean.str.strip() return s_clean def get_thread_df(email_df): ''' Identify threads in an e-mail DataFrame, and extract them into a new DataFrame. ''' # Find threads by e-mails with reply patterns in their subjects. # Then get a set of thread subjects. is_thread = thread_flag(email_df['subject']) thread_subj = email_df['subject'][is_thread] # Clean up the subjects by removing reply and forward labels thread_subj = clean_subject(thread_subj) thread_subj = thread_subj.unique() # Search for these thread subjects in the original # e-mail DataFrame (so we pick up the original e-mail in the # thread (which won't have a reply pattern in its subject) # Prepare the DataFrame for searching by cleaning up the # subjects. search_df = email_df[['date', 'sender', 'subject']] search_df['subject'] = clean_subject(search_df['subject']) # Find subject matches thread_matches = [subj in thread_subj for subj in search_df['subject']] match_df = search_df.ix[thread_matches, :] return match_df # Get a DataFrame of threads in the training e-mails; # compute sender weights within this subset of threads. # To do the latter, we can use the same function we # used on the whole DataFrame above. thread_df = get_thread_df(train_df) thread_sender_weights = get_sender_weights(thread_df) def get_thread_activity(thread_df): ''' Compute 'activity' statistics on threads in a DataFrame: frequency: Number of e-mails observed in the thread span: Time before the first and last e-mail observed (seconds) weight: Number e-mails-per-second in the thread (log-scale) ''' clean_times = thread_df['date'].map(lambda t: t.tzinfo is not None) thread_df_clean = thread_df.ix[clean_times, :] freq_by_thread = thread_df['subject'].value_counts() # NB: I'm not sure if total_seconds() is daylight-savings aware # so this may be mildly buggy. seconds_span = lambda x: ( (np.max(x) - np.min(x)).total_seconds()) span_by_thread = thread_df_clean.groupby('subject') span_by_thread = span_by_thread['date'].aggregate(seconds_span) activity = DataFrame({'freq' : freq_by_thread, 'span' : span_by_thread, 'weight' : 10 + np.log10(freq_by_thread/span_by_thread)}) # Restricting to threads with more than 2 e-mails may be unecessary, but we # could get 1-e-mail threads for a couple of reasons: misclassification (it's # not really a thread), and left-censored threads -- where we found the last # reply in the thread, but the earlier messages were recieved before the data # was collected. activity = activity[activity['freq'] >= 2] activity = activity[notnull(activity['weight'])] return activity thread_activity_df = get_thread_activity(thread_df) threads_to_check = ['please help a newbie compile mplayer :-)', 'prob. w/ install/uninstall', 'http://apt.nixia.no/'] print thread_activity_df.ix[threads_to_check, :] # Stopwords from R. NLTK has a set of stopwords we could use as # well, but this minimizes discrepancies with the book. rsw = read_csv('../ch3/r_stopwords.csv')['x'].values.tolist() def get_thread_subject_term_weights(thread_activity_df): ''' Creates a term->weight map based on a DataFrame containing thread subjects and their activity weights ''' thread_subjects = thread_activity_df.index thread_tdm = tdm_df(thread_subjects, remove_punctuation = False, remove_digits = False, stopwords = rsw) def calc_term_weight(term): threads_with_term = np.where(thread_tdm[term] > 0.0) weight_vec = thread_activity_df['weight'].ix[threads_with_term] return weight_vec.mean() term_weights = Series({t: calc_term_weight(t) for t in thread_tdm}) return term_weights thread_subject_terms_weights = \ get_thread_subject_term_weights(thread_activity_df) # Example term weights print thread_subject_terms_weights.head(10) def get_message_term_weights(email_df): ''' Creates a term->weight map for terms in the messages of the training e-mail DataFrame ''' messages = email_df['message'] term_freq = tdm_df(messages, stopwords = rsw).sum() term_weight_df = DataFrame({'freq' : term_freq, 'weight' : np.log10(term_freq)}) return term_weight_df message_terms_weights = get_message_term_weights(train_df) def get_weight_from_terms(term_list, weight_df, subject = False): ''' Given a term-list from an e-mail's message, and a term->weights map contained in a DataFrame, calculate the e-mail's message or subject term-weight. (default is message) ''' if isinstance(term_list, basestring): term_list = [term_list] if subject: weights = weight_df else: weights = weight_df['weight'] term_list_weight = weights[term_list].mean() if np.isnan(term_list_weight): term_list_weight = 1.0 return term_list_weight def rank_email(email_df, row): ''' Ranks an e-mail (as contained in the row of a DataFrame) by computing and combining its five weights. ''' email = email_df.ix[row, :] date = email['date'] sender = email['sender'] subject = email['subject'] message = email['message'] # 1. Get sender weights (all messages) sender_weight = (sender_weights['weight'] .get(sender) or 1.0) # 2. Get sender weights (threads) thread_sender_weight = (thread_sender_weights['weight'] .get(sender) or 1.0) # 3. Get thread activity weight is_thread = thread_flag(subject) subject = clean_subject(subject) if is_thread: activity_weight = (thread_activity_df['weight'] .get(subject) or 1.0) else: activity_weight = 1.0 # 4. Get subject line weight via thread-subject term weights subj_terms = tdm_df(subject, remove_punctuation = False, remove_digits = False, stopwords = rsw).columns subj_term_weight = get_weight_from_terms(subj_terms, thread_subject_terms_weights, subject = True) # 5. Message term weight message_terms = tdm_df(message, stopwords = rsw).columns message_term_weight = get_weight_from_terms(message_terms, message_terms_weights) weights = [sender_weight, thread_sender_weight, activity_weight, subj_term_weight, message_term_weight] # The e-mail's final rank is just the product of the weights. rank = np.array(weights).prod() return rank def make_rank_df(email_df): ''' Rank each e-mail in a DataFrame. ''' n_emails = email_df.shape[0] sender_weight_results = [] thread_sender_weight_results = [] activity_weight_results = [] subj_term_weight_results = [] message_term_weight_results = [] rank_results = [] rank_df = email_df.copy() for e in xrange(n_emails): weights_rank = rank_email(email_df, e) rank_results.append(weights_rank) rank_df['rank'] = rank_results return rank_df train_ranks = make_rank_df(train_df) threshold = train_ranks['rank'].median() test_ranks = make_rank_df(test_df) train_kde = KDE(train_ranks['rank']) train_kde.fit() test_kde = KDE(test_ranks['rank']) test_kde.fit() plt.figure(figsize(8, 6)) plt.fill(train_kde.support, train_kde.density, color = 'steelblue', alpha = .7, label = 'Train') plt.fill(test_kde.support, test_kde.density, color = 'red', alpha = .7, label = 'Test') plt.xlim(0, 400) plt.ylim(0, np.max(test_kde.density)) plt.axvline(threshold, linestyle = '--', label = 'Priority threshold') plt.xlabel('Rank') plt.ylabel('Density') plt.title('Distribution of ranks for training and test e-mails') plt.legend(frameon = False) (test_ranks['rank'] > threshold).sum() test_ranks[test_ranks['rank'] > threshold]['sender'].unique() maxrank_email = train_ranks.ix[train_ranks['rank'].idxmax(), :] print 'Maximum rank in training set: ', train_ranks['rank'].max() thread_activity_df.ix[clean_subject(maxrank_email['subject']), :] thread_df[thread_df['subject'].str.contains('spam_level_char')]['date'] correct_weight = 10 + np.log10(2 / 4.) incorrect_weight = 10 + np.log10(2 / (2* 3600 + 59 * 60 + 56.)) print 'Activity weight correct time-zone: ', correct_weight print 'Activity weight incorrect time-zone: ', incorrect_weight print 'Rank difference: ' , '{:2.1%}'.format(1 - incorrect_weight / correct_weight)