"""Sentence-boundary detection (NLTK book, ch. 6).

Flattens the Penn Treebank raw corpus into one token stream, trains a
Naive Bayes classifier to decide whether a '.', '?' or '!' token ends a
sentence, and uses it to segment a flat word list back into sentences.
"""

import nltk
from nltk.corpus import names  # NOTE(review): appears unused here — kept in case a later chunk needs it
from pylab import *  # NOTE(review): appears unused here — kept for compatibility
import random as pyrandom

# Keep only real sentences: longer than 3 tokens and without the
# corpus's artificial "START" marker.
sents = nltk.corpus.treebank_raw.sents()
sents = [s for s in sents if len(s) > 3]
sents = [s for s in sents if "START" not in s]

# Flatten into one token stream; `boundaries` holds the index of each
# sentence-final token. A set gives O(1) membership tests in the
# training loop below (the original list made that loop O(n^2)).
tokens = []
boundaries = set()
for s in sents:
    tokens += s
    boundaries.add(len(tokens) - 1)

print(tokens[:200])


def features(s, i):
    """Return classification features for the punctuation token s[i].

    Looks one token behind and one ahead, so callers must ensure
    1 <= i <= len(s) - 2.

    Bug fix: the original body read the global `tokens` instead of the
    `s` argument, so classifying any word list other than the training
    corpus silently used corpus data.
    """
    return dict(
        current=s[i],
        prev=s[i - 1],
        next=s[i + 1],
        upper=s[i + 1][0].isupper(),
        plen=len(s[i - 1]),
        nlen=len(s[i + 1]),
    )


# Build (features, is-boundary) pairs for every candidate punctuation
# token in the corpus.
data = []
for i in range(1, len(tokens) - 1):
    if tokens[i] not in [".", "?", "!"]:
        continue
    data.append((features(tokens, i), i in boundaries))

pyrandom.shuffle(data)
n = len(data)
print(n)

# 90/10 train/test split.
training_set = data[n // 10:]
test_set = data[: n // 10]
classifier = nltk.NaiveBayesClassifier.train(training_set)
print(nltk.classify.accuracy(classifier, test_set))
print(classifier.classify(features("The quick . brown".split(), 2)))


def segment_sentences(words):
    """Split the flat token list `words` into a list of sentences.

    A new sentence starts after any '.', '?' or '!' token that the
    classifier labels as a boundary.
    """
    sentences = [[words[0]]]
    for i in range(1, len(words)):
        sentences[-1].append(words[i])
        # features() looks one token ahead, so the final token can
        # never be classified; stream-final punctuation simply ends
        # the last sentence anyway.
        is_boundary = (
            words[i] in [".", "?", "!"]
            and i + 1 < len(words)
            and classifier.classify(features(words, i))
        )
        if is_boundary:
            sentences.append([])
    # Drop a trailing empty sentence left by stream-final punctuation.
    if sentences[-1] == []:
        sentences = sentences[:-1]
    return sentences


print(segment_sentences("""Smith ran . J . Smith really ran . """.split()))