import nltk
from nltk.corpus import names
from pylab import *
import random as pyrandom
# Working sample: at most the first 10,000 posts returned by the NLTK
# nps_chat corpus reader.  Each post is later read via `.text` and
# `.get('class')`.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def features(post):
    """Build a bag-of-words feature dict for the text of one chat post.

    The text is split with ``nltk.word_tokenize`` and every token is
    lower-cased, so the resulting features are case-insensitive.

    Args:
        post: The post text as a string.  An empty string or ``None`` is
            accepted and produces no features.

    Returns:
        A dict mapping each distinct lower-cased token to ``True``.
    """
    # An XML element with no body reports its text as None; treat that the
    # same as an empty post instead of letting the tokenizer raise.
    if not post:
        return {}
    return dict.fromkeys((w.lower() for w in nltk.word_tokenize(post)), True)
# Peek at one post and its label.  In an interactive session these two
# expressions echo the values shown; when run as a script they are no-ops.
posts[333].text           # 'wouldnt let her date'
posts[333].get('class')   # 'Emotion'

# Show every distinct class label present in the sample.
print(set(p.get('class') for p in posts))
# Output recorded from the original session:
#   set(['Emotion', 'ynQuestion', 'yAnswer', 'Continuer', 'whQuestion',
#        'System', 'Accept', 'Clarify', 'Emphasis', 'nAnswer', 'Greet',
#        'Statement', 'Reject', 'Bye', 'Other'])

# Pair each post's bag-of-words features with its class label.
featuresets = [(features(p.text), p.get('class')) for p in posts]

# Hold out the first 1000 labelled posts for evaluation; train on the rest.
training_set = featuresets[1000:]
test_set = featuresets[:1000]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print(nltk.classify.accuracy(classifier, test_set))
# Output recorded from the original session: 0.66