"""Train and evaluate several NLTK part-of-speech taggers for Spanish.

The tagged sentences of the ``cess_esp`` corpus are split so that every
tenth sentence goes to the test set and the rest to the training set.
Unigram, bigram, trigram and HMM taggers are trained and their accuracy
on the test set is printed. Finally the unigram tagger is pickled to
``unigram_spanish.pickle`` and loaded back.
"""
import pickle

from nltk import UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import cess_esp
from nltk.tag.hmm import HiddenMarkovModelTagger

sents = cess_esp.tagged_sents()

# Sentences whose index is a multiple of 10 form the test set; all the
# others are used for training.
training = []
test = []
for i, sent in enumerate(sents):
    if i % 10:
        training.append(sent)
    else:
        test.append(sent)

unigram_tagger = UnigramTagger(training)
# Uses the unigram tagger in case it can't tag a word.
bigram_tagger = BigramTagger(training, backoff=unigram_tagger)
# Backs off directly to the unigram tagger (not to the bigram tagger).
trigram_tagger = TrigramTagger(training, backoff=unigram_tagger)
hmm_tagger = HiddenMarkovModelTagger.train(training)

# A single parenthesised argument prints identically on Python 2 and 3.
print('UnigramTagger: %.1f %%' % (unigram_tagger.evaluate(test) * 100))
print('BigramTagger: %.1f %%' % (bigram_tagger.evaluate(test) * 100))
print('TrigramTagger: %.1f %%' % (trigram_tagger.evaluate(test) * 100))
print('HMM: %.1f %%' % (hmm_tagger.evaluate(test) * 100))

# Dump trained tagger.
# Pickle data is a byte stream, so the file must be opened in binary mode;
# text mode fails on Python 3 and can corrupt the data on Windows.
with open('unigram_spanish.pickle', 'wb') as fd:
    pickle.dump(unigram_tagger, fd)

# Load tagger.
# NOTE: only unpickle files from a trusted source; unpickling can execute
# arbitrary code.
with open('unigram_spanish.pickle', 'rb') as fd:
    tagger = pickle.load(fd)