labelsfile = 'data/csv/fiftyshades_labeled.txt'

def get_documents_csv(filename):
    """
    Read in the labeled chunks and classifications.
    Assume label is cell 1, doc text is cell 2, classification is cell 3.
    """
    labels = []
    documents = []
    classif = []
    for line in open(filename):
        fields = line.split("\t")
        if fields[0].strip() != 'label':  # skip the header row
            documents.append(fields[1].strip())
            labels.append(fields[0].strip())
            if fields[2]:
                classif.append(fields[2].strip("\n"))
    print "Got", len(documents), "chunks"
    return (documents, labels, classif)

docs, labels, classes = get_documents_csv(labelsfile)

# Hmm, in my classes I did categorize "maybes." These can either be used as "no" or as a third class.
classes[25:40]
labels[25:40]

# These are the original file chunks for reference.
# This text can't be used as is in the classifier -- and notice this was a "maybe":
docs[25]

class Document:
    def __init__(self):
        self.words = []
        self.original = ""
        self.clean = ""
        self.label = ""
        self.classif = ""

def clean_doc(doc):
    from nltk import corpus
    import re
    stopwords = corpus.stopwords.words('english')
    new = Document()
    new.original = doc
    sentence = doc.lower()
    # Note that I'm tokenizing with a simple \w+ pattern; this differs from sklearn's
    # default tokenizer (a short comparison appears just before the train/test split below).
    words = re.findall(r'\w+', sentence, flags=re.UNICODE)
    new.clean = " ".join(words)
    words = [word for word in words if word not in stopwords]
    new.words = words
    return new

# Example output:
clean_doc(docs[25]).clean

# Clean them all...
clean_docs = [clean_doc(x) for x in docs]

# Fix up with more info on each object:
def add_ids_classes(doc_objs, labels, classes):
    # Go through the objects we just made and add the corresponding class and label.
    for i, x in enumerate(doc_objs):
        x.label = labels[i]
        x.id = i
        x.classif = classes[i].strip("\r")  # stripping may be necessary; it was for me
    return doc_objs

clean_docs = add_ids_classes(clean_docs, labels, classes)
clean_docs[0]
clean_docs[0].classif

# We will consider the "maybe" as "no", for now:
neg_docs = [doc for doc in clean_docs if doc.classif == 'no' or doc.classif == 'maybe']
pos_docs = [doc for doc in clean_docs if doc.classif == 'yes']
print len(neg_docs), len(pos_docs)

# Bag of Words - just a True for each word's presence in a document. Later we'll use TF-IDF weights.
def word_feats(words):
    return dict([(word, True) for word in words])

neg_words = [(word_feats(doc.words), 'neg') for doc in neg_docs]
pos_words = [(word_feats(doc.words), 'pos') for doc in pos_docs]

# These are lists of (feature dict, label) pairs, one for each text. Here's the first "no" text:
neg_words[0]

# Let's make a cut point at 3/4 of each list, so we can do separate training and test runs.
negcutoff = len(neg_words) * 3 / 4
poscutoff = len(pos_words) * 3 / 4

# Now split up the lists into training and testing.
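# Side note -- a minimal sketch of the tokenizer difference mentioned in clean_doc above.
# This is not part of the original pipeline; it assumes scikit-learn is installed and
# uses a made-up sample sentence. sklearn's CountVectorizer default token_pattern is
# r"(?u)\b\w\w+\b", which drops single-character tokens that \w+ keeps.
import re
from sklearn.feature_extraction.text import CountVectorizer

sample = "i love tea, he said."
re.findall(r'\w+', sample, flags=re.UNICODE)
# -> ['i', 'love', 'tea', 'he', 'said']   (keeps single-character tokens like 'i')
CountVectorizer().build_tokenizer()(sample)
# -> ['love', 'tea', 'he', 'said']        (default pattern drops 1-character tokens)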
import random
random.shuffle(neg_words)
random.shuffle(pos_words)

train_fic = neg_words[:negcutoff] + pos_words[:poscutoff]
test_fic = neg_words[negcutoff:] + pos_words[poscutoff:]
print 'train on %d docs, test on %d docs' % (len(train_fic), len(test_fic))

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_fic)
print 'accuracy:', nltk.classify.util.accuracy(classifier, test_fic)
classifier.show_most_informative_features(15)

# Same features, but with scikit-learn's multinomial Naive Bayes via NLTK's wrapper:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.util import accuracy

sk_classifier = SklearnClassifier(MultinomialNB())
sk_classifier.train(train_fic)
accuracy(sk_classifier, test_fic)
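# A minimal sketch of the TF-IDF idea mentioned earlier -- one possible approach, not
# necessarily the exact setup used later. SklearnClassifier can wrap a scikit-learn
# Pipeline, so the word features get re-weighted by TF-IDF before the multinomial
# Naive Bayes step. With True/False features the "term frequency" part is just
# presence/absence, so this amounts mostly to IDF weighting; a count-based word_feats
# would supply real term frequencies.
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.util import accuracy
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

tfidf_pipeline = Pipeline([('tfidf', TfidfTransformer()),
                           ('nb', MultinomialNB())])
tfidf_classifier = SklearnClassifier(tfidf_pipeline)
tfidf_classifier.train(train_fic)
accuracy(tfidf_classifier, test_fic)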