import nltk
from nltk.corpus import names
from pylab import *
import random as pyrandom
from nltk.corpus import brown
# Flat list of (word, tag) pairs from the Brown "news" category.
# NOTE(review): appears unused below — the training loop reads
# brown.tagged_sents directly; confirm before removing.
tagged_words = brown.tagged_words(categories='news')
def features(s, i, y):
    """Extract classifier features for the token at position *i*.

    Parameters
    ----------
    s : sequence of str
        The words of the sentence.
    i : int
        Index of the current token in ``s``.
    y : sequence of str
        Tags assigned so far; only ``y[i-1]`` is consulted.

    Returns
    -------
    dict
        Feature dict: previous tag, previous word, and the last one,
        two, and three characters of the current word (suffix features).
        ``"^"`` marks the sentence start for the previous-tag/word slots.
    """
    f = dict(
        ltag=y[i - 1] if i > 0 else "^",   # previous tag
        lword=s[i - 1] if i > 0 else "^",  # previous word
        s1=s[i][-1:],                      # current-word suffix features
        s2=s[i][-2:],
        s3=s[i][-3:],
    )
    return f
# Build one (feature_dict, tag) example per token of the Brown "news"
# corpus, then train and evaluate a Naive Bayes POS tagger.
data = []
for sy in brown.tagged_sents(categories='news'):
    s, y = zip(*sy)  # split tagged sentence into parallel word/tag tuples
    for i in range(len(s)):
        data.append((features(s, i, y), y[i]))

n = len(data)
# Hold out the first 10% of examples for testing; train on the rest.
training_set = data[n // 10:]
test_set = data[:n // 10]
classifier = nltk.NaiveBayesClassifier.train(training_set)
# Bare expression was pasted REPL output; print it so the script reports it.
print(nltk.classify.accuracy(classifier, test_set))
# Observed accuracy from the original session: 0.8176031824962705
class MyTagger:
    """Greedy left-to-right POS tagger driven by a trained classifier.

    Each token is tagged using ``features``, which may consult the tag
    already assigned to the previous token.
    """

    def __init__(self, classifier):
        # Classifier must expose ``classify(feature_dict) -> tag``.
        self.classifier = classifier

    def tag(self, s):
        """Tag the token sequence *s*; return a list of (word, tag) pairs.

        Parameters
        ----------
        s : sequence of str
            Tokenized sentence.
        """
        y = []
        for i in range(len(s)):
            # Bug fix: use the classifier stored on the instance, not the
            # module-level ``classifier`` global. (Also dropped an unused
            # duplicate ``features`` call.)
            y.append(self.classifier.classify(features(s, i, y)))
        # list(...) so the result survives repeated iteration on Python 3,
        # matching the list shown in the original session transcript.
        return list(zip(s, y))
# Demo: tag a sample sentence with the freshly trained classifier.
tagger = MyTagger(classifier)
# Bare call was pasted REPL output; print so the script shows the result.
print(tagger.tag("The quick brown fox jumped over the lazy dogs.".split()))
# Output from the original session:
# [('The', 'AT'), ('quick', 'NN'), ('brown', 'NN'), ('fox', 'NPS-TL'),
#  ('jumped', 'VBD'), ('over', 'RP'), ('the', 'AT'), ('lazy', 'JJ'),
#  ('dogs.', 'NP')]