"""Sentence-boundary detection (NLTK book, ch. 6).

Flattens the Penn Treebank raw corpus into one token stream, trains a
Naive Bayes classifier to decide whether a '.', '?' or '!' token ends a
sentence, and uses it to segment a flat word list back into sentences.
"""

import nltk
from nltk.corpus import names  # NOTE(review): appears unused here — kept in case a later chunk needs it
from pylab import *  # NOTE(review): appears unused here — kept for compatibility
import random as pyrandom

# Keep only real sentences: longer than 3 tokens and without the
# corpus's artificial "START" marker.
sents = nltk.corpus.treebank_raw.sents()
sents = [s for s in sents if len(s) > 3]
sents = [s for s in sents if "START" not in s]

# Flatten into one token stream; `boundaries` holds the index of each
# sentence-final token. A set gives O(1) membership tests in the
# training loop below (the original list made that loop O(n^2)).
tokens = []
boundaries = set()
for s in sents:
    tokens += s
    boundaries.add(len(tokens) - 1)

print(tokens[:200])


def features(s, i):
    """Return classification features for the punctuation token s[i].

    Looks one token behind and one ahead, so callers must ensure
    1 <= i <= len(s) - 2.

    Bug fix: the original body read the global `tokens` instead of the
    `s` argument, so classifying any word list other than the training
    corpus silently used corpus data.
    """
    return dict(
        current=s[i],
        prev=s[i - 1],
        next=s[i + 1],
        upper=s[i + 1][0].isupper(),
        plen=len(s[i - 1]),
        nlen=len(s[i + 1]),
    )


# Build (features, is-boundary) pairs for every candidate punctuation
# token in the corpus.
data = []
for i in range(1, len(tokens) - 1):
    if tokens[i] not in [".", "?", "!"]:
        continue
    data.append((features(tokens, i), i in boundaries))

pyrandom.shuffle(data)
n = len(data)
print(n)

# 90/10 train/test split.
training_set = data[n // 10:]
test_set = data[: n // 10]
classifier = nltk.NaiveBayesClassifier.train(training_set)
print(nltk.classify.accuracy(classifier, test_set))
print(classifier.classify(features("The quick . brown".split(), 2)))


def segment_sentences(words):
    """Split the flat token list `words` into a list of sentences.

    A new sentence starts after any '.', '?' or '!' token that the
    classifier labels as a boundary.
    """
    sentences = [[words[0]]]
    for i in range(1, len(words)):
        sentences[-1].append(words[i])
        # features() looks one token ahead, so the final token can
        # never be classified; stream-final punctuation simply ends
        # the last sentence anyway.
        is_boundary = (
            words[i] in [".", "?", "!"]
            and i + 1 < len(words)
            and classifier.classify(features(words, i))
        )
        if is_boundary:
            sentences.append([])
    # Drop a trailing empty sentence left by stream-final punctuation.
    if sentences[-1] == []:
        sentences = sentences[:-1]
    return sentences


print(segment_sentences("""Smith ran . J . Smith really ran . """.split()))