from pandas import *
import numpy as np
import os
import re
from nltk import NaiveBayesClassifier
import nltk.classify
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

data_path = os.path.abspath(os.path.join('.', 'data'))
spam_path = os.path.join(data_path, 'spam')
spam2_path = os.path.join(data_path, 'spam_2')
easyham_path = os.path.join(data_path, 'easy_ham')
easyham2_path = os.path.join(data_path, 'easy_ham_2')
hardham_path = os.path.join(data_path, 'hard_ham')
hardham2_path = os.path.join(data_path, 'hard_ham_2')


def get_msgdir(path):
    '''
    Read all messages from files in a directory into a list where each
    item is the text of a message.

    Simply gets a list of e-mail files in a directory, and iterates
    get_msg() over them.

    Returns a list of strings.
    '''
    filelist = os.listdir(path)
    filelist = [f for f in filelist if f != 'cmds']
    all_msgs = [get_msg(os.path.join(path, f)) for f in filelist]
    return all_msgs


def get_msg(path):
    '''
    Read in the 'message' portion of an e-mail, given its file path. The
    'message' text begins after the first blank line; above that is header
    information.

    Returns a string.
    '''
    # latin-1 maps every byte to a character, so the raw corpus files
    # read without decode errors.
    with open(path, 'r', encoding='latin-1') as con:
        msg = con.readlines()
        first_blank_index = msg.index('\n')
        msg = msg[(first_blank_index + 1):]
        return ''.join(msg)


train_spam_messages = get_msgdir(spam_path)
train_easyham_messages = get_msgdir(easyham_path)
# Only keep the first 500 to balance with the number of spam messages.
train_easyham_messages = train_easyham_messages[:500]
train_hardham_messages = get_msgdir(hardham_path)

test_spam_messages = get_msgdir(spam2_path)
test_easyham_messages = get_msgdir(easyham2_path)
test_hardham_messages = get_msgdir(hardham2_path)


def get_msg_words(msg, stopwords=[], strip_html=False):
    '''
    Returns a list of the unique words contained in an e-mail message,
    excluding any that appear in an optionally-provided stopwords list.

    NLTK's 'wordpunct' tokenizer is used, and this will break
    contractions. For example, don't -> (don, ', t). Therefore, it's
    advisable to supply a stopwords list that includes contraction parts,
    like 'don' and 't'.
    '''
    # Strip out the '3D' artefacts left behind by quoted-printable
    # encoding ('=' is encoded as '=3D').
    msg = re.sub('3D', '', msg)

    # Strip out HTML tags and attributes and HTML character codes,
    # like &nbsp; and &lt;.
    if strip_html:
        msg = re.sub(r'<(.|\n)*?>', ' ', msg)
        msg = re.sub(r'&\w+;', ' ', msg)

    # wordpunct_tokenize doesn't split on underscores. We don't want to
    # strip them, since a token like 'first_name' may be more informative
    # than 'first' and 'name' apart. But there are tokens with long
    # underscore strings (e.g. 'name_________'). We'll just replace the
    # multiple underscores with a single one, since 'name_____' is
    # probably not distinct from 'name___' or 'name_' in identifying spam.
    msg = re.sub('_+', '_', msg)

    # Remove '=\n' sequences before tokenizing, since '=' symbols
    # sometimes occur within words to indicate, e.g., line-wrapping.
    msg_words = set(wordpunct_tokenize(msg.replace('=\n', '').lower()))

    # Get rid of stopwords.
    msg_words = msg_words.difference(stopwords)

    # Get rid of punctuation tokens, numbers, and single letters.
    msg_words = [w for w in msg_words
                 if re.search('[a-zA-Z]', w) and len(w) > 1]

    return msg_words


sw = stopwords.words('english')
sw.extend(['ll', 've'])
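
# A quick illustration (not part of the original script; the sample
# sentence is made up) of what get_msg_words() returns: the contraction
# "don't" splits into stopword fragments, the HTML tag is stripped, and
# punctuation-only tokens like '$$$' are dropped.
sample = "Don't click <a href='http://example.com'>here</a> to WIN $$$ now!"
print(sorted(get_msg_words(sample, stopwords=sw, strip_html=True)))
# -> ['click', 'win'] (with NLTK's standard English stopword list)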

def features_from_messages(messages, label, feature_extractor, **kwargs):
    '''
    Make a (features, label) tuple for each message in a list of e-mails
    with a given label ('spam' or 'ham'), and return the list of these
    tuples. Note every e-mail in 'messages' should have the same label.
    '''
    features_labels = []
    for msg in messages:
        features = feature_extractor(msg, **kwargs)
        features_labels.append((features, label))
    return features_labels


def word_indicator(msg, **kwargs):
    '''
    Create a dictionary of entries {word: True} for every unique word
    in a message. Note **kwargs are options for the word-set creator,
    get_msg_words().
    '''
    features = defaultdict(list)
    msg_words = get_msg_words(msg, **kwargs)
    for w in msg_words:
        features[w] = True
    return features


def make_train_test_sets(feature_extractor, **kwargs):
    '''
    Make (features, label) lists for each of the training and testing
    lists.
    '''
    train_spam = features_from_messages(train_spam_messages, 'spam',
                                        feature_extractor, **kwargs)
    train_ham = features_from_messages(train_easyham_messages, 'ham',
                                       feature_extractor, **kwargs)
    train_set = train_spam + train_ham

    test_spam = features_from_messages(test_spam_messages, 'spam',
                                       feature_extractor, **kwargs)
    test_ham = features_from_messages(test_easyham_messages, 'ham',
                                      feature_extractor, **kwargs)
    test_hardham = features_from_messages(test_hardham_messages, 'ham',
                                          feature_extractor, **kwargs)

    return train_set, test_spam, test_ham, test_hardham


def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its
    accuracy on the test data, and show the classifier's most
    informative features.
    '''
    # Make training and testing sets of (features, label) data.
    train_set, test_spam, test_ham, test_hardham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train the classifier on the training set.
    classifier = NaiveBayesClassifier.train(train_set)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'
          .format(100 * nltk.classify.accuracy(classifier, test_spam)))
    print('Test Ham accuracy: {0:.2f}%'
          .format(100 * nltk.classify.accuracy(classifier, test_ham)))
    print('Test Hard Ham accuracy: {0:.2f}%'
          .format(100 * nltk.classify.accuracy(classifier, test_hardham)))

    # Show the top 20 most informative features.
    classifier.show_most_informative_features(20)


check_classifier(word_indicator, stopwords=sw)
check_classifier(word_indicator, stopwords=sw, strip_html=True)
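
# A minimal sketch (not in the original analysis) of scoring a single new
# message with a trained model; the message text here is hypothetical.
train_set, _, _, _ = make_train_test_sets(word_indicator, stopwords=sw)
classifier = NaiveBayesClassifier.train(train_set)
new_msg = "Congratulations!!! You have been selected for a FREE prize. Click now!"
print(classifier.classify(word_indicator(new_msg, stopwords=sw)))
# Prints 'spam' or 'ham'; a message like this one should land on 'spam'.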