labelsfile = 'data/csv/fiftyshades_labeled.txt'

def get_documents_csv(filename):
    """
    Read in the labeled chunks and classifications.
    Assume label is cell 1, doc text is cell 2, classification is cell 3.
    """
    labels = []
    documents = []
    classif = []
    for line in open(filename):
        fields = line.split("\t")
        if fields[0].strip() != 'label':  # skip the header row
            documents.append(fields[1].strip())
            labels.append(fields[0].strip())
            if fields[2]:
                classif.append(fields[2].strip("\n"))
    print "Got", len(documents), "chunks"
    return (documents, labels, classif)

docs, labels, classes = get_documents_csv(labelsfile)

# Hmm, in my classes I did categorize "maybes." These can either be used as "no" or as a third class.
classes[25:40]
labels[25:40]

# These are the original file chunks for reference.
# This text can't be used as is in the classifier -- and notice this was a "maybe":
docs[25]

class Document:
    def __init__(self):
        self.words = []
        self.original = ""
        self.clean = ""
        self.label = ""
        self.classif = ""

def clean_doc(doc):
    from nltk import corpus
    import re
    stopwords = corpus.stopwords.words('english')
    new = Document()
    new.original = doc
    sentence = doc.lower()
    # Note that I'm tokenizing with a simple \w+ pattern; this differs from sklearn's
    # default tokenizer (a short comparison appears just before the train/test split below).
    words = re.findall(r'\w+', sentence, flags=re.UNICODE)
    new.clean = " ".join(words)
    words = [word for word in words if word not in stopwords]
    new.words = words
    return new

# Example output:
clean_doc(docs[25]).clean

# Clean them all...
clean_docs = [clean_doc(x) for x in docs]

# Fix up with more info on each object:
def add_ids_classes(doc_objs, labels, classes):
    # Go through the objects we just made and add the corresponding class and label.
    for i, x in enumerate(doc_objs):
        x.label = labels[i]
        x.id = i
        x.classif = classes[i].strip("\r")  # stripping may be necessary; it was for me
    return doc_objs

clean_docs = add_ids_classes(clean_docs, labels, classes)
clean_docs[0]
clean_docs[0].classif

# We will consider the "maybe" as "no", for now:
neg_docs = [doc for doc in clean_docs if doc.classif == 'no' or doc.classif == 'maybe']
pos_docs = [doc for doc in clean_docs if doc.classif == 'yes']
print len(neg_docs), len(pos_docs)

# Bag of Words - just a True for each word's presence in a document. Later we'll use TF-IDF weights.
def word_feats(words):
    return dict([(word, True) for word in words])

neg_words = [(word_feats(doc.words), 'neg') for doc in neg_docs]
pos_words = [(word_feats(doc.words), 'pos') for doc in pos_docs]

# These are lists of (feature dict, label) pairs, one for each text. Here's the first "no" text:
neg_words[0]

# Let's make a cut point at 3/4 of each list, so we can do separate training and test runs.
negcutoff = len(neg_words) * 3 / 4
poscutoff = len(pos_words) * 3 / 4

# Now split up the lists into training and testing.
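# Side note -- a minimal sketch of the tokenizer difference mentioned in clean_doc above.
# This is not part of the original pipeline; it assumes scikit-learn is installed and
# uses a made-up sample sentence. sklearn's CountVectorizer default token_pattern is
# r"(?u)\b\w\w+\b", which drops single-character tokens that \w+ keeps.
import re
from sklearn.feature_extraction.text import CountVectorizer

sample = "i love tea, he said."
re.findall(r'\w+', sample, flags=re.UNICODE)
# -> ['i', 'love', 'tea', 'he', 'said']   (keeps single-character tokens like 'i')
CountVectorizer().build_tokenizer()(sample)
# -> ['love', 'tea', 'he', 'said']        (default pattern drops 1-character tokens)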
import random
random.shuffle(neg_words)
random.shuffle(pos_words)

train_fic = neg_words[:negcutoff] + pos_words[:poscutoff]
test_fic = neg_words[negcutoff:] + pos_words[poscutoff:]
print 'train on %d docs, test on %d docs' % (len(train_fic), len(test_fic))

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_fic)
print 'accuracy:', nltk.classify.util.accuracy(classifier, test_fic)
classifier.show_most_informative_features(15)

# Same features, but with scikit-learn's multinomial Naive Bayes via NLTK's wrapper:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.util import accuracy

sk_classifier = SklearnClassifier(MultinomialNB())
sk_classifier.train(train_fic)
accuracy(sk_classifier, test_fic)
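# A minimal sketch of the TF-IDF idea mentioned earlier -- one possible approach, not
# necessarily the exact setup used later. SklearnClassifier can wrap a scikit-learn
# Pipeline, so the word features get re-weighted by TF-IDF before the multinomial
# Naive Bayes step. With True/False features the "term frequency" part is just
# presence/absence, so this amounts mostly to IDF weighting; a count-based word_feats
# would supply real term frequencies.
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.util import accuracy
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

tfidf_pipeline = Pipeline([('tfidf', TfidfTransformer()),
                           ('nb', MultinomialNB())])
tfidf_classifier = SklearnClassifier(tfidf_pipeline)
tfidf_classifier.train(train_fic)
accuracy(tfidf_classifier, test_fic)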