from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# Build a categorized corpus over the .txt files in the collections
# directory; the capture group in cat_pattern turns a file's base name into
# its category label (e.g. 'gpo.txt' -> 'gpo').
reader = CategorizedPlaintextCorpusReader(
    '/media/storage/dpla-data/new/words-6mill/colls/',
    r'.*\.txt',
    cat_pattern=r'(\w+)\.txt',
)
# Look up the category assigned to the GPO collection file.
reader.categories(fileids='gpo.txt')
['gpo']
# Token sequence for the GPO collection file.
gpowords = reader.words(fileids='gpo.txt')
# Total number of tokens in that file.
len(gpowords)
36646650
# Vocabulary size: number of distinct lower-cased tokens. A set comprehension
# over the existing gpowords sequence avoids materialising a throwaway list
# with one entry per token before de-duplicating.
len({w.lower() for w in gpowords})
438479
# The cells below call nltk.ConditionalFreqDist and nltk.FreqDist, so the
# top-level package has to be imported under its real name.
import nltk
# Count token frequencies separately per collection: each (condition, sample)
# pair fed to the distribution is (category name, token).
cfd = nltk.ConditionalFreqDist(
    (collection, token)
    for collection in ('gpo', 'artstor')
    for token in reader.words(categories=collection)
)
# A ConditionalFreqDist maps each condition to its own frequency distribution
# and has no most_common() of its own, so rank the samples per condition.
# Sorting the (sample, count) pairs directly keeps this independent of which
# FreqDist helper methods the installed NLTK provides.
top_n = 20
{
    condition: sorted(cfd[condition].items(),
                      key=lambda pair: pair[1], reverse=True)[:top_n]
    for condition in cfd.conditions()
}
# Frequency distribution over every token in the 'gpo' category.
fd = nltk.FreqDist(reader.words(categories='gpo'))
# fd.most_common()  # left disabled
# Samples are looked up by key, so this asks for the count of the integer 1.
fd[1]
0
# Plot only the 50 most frequent samples. Calling plot() with no limit draws
# every distinct token, which is unreadable and is what pushed a non-ASCII
# byte-string token through unicode() and raised UnicodeDecodeError.
# NOTE(review): a non-ASCII byte-string token among the plotted samples would
# still fail; loading the corpus as unicode text would remove that risk --
# confirm the files' encoding before setting it on the reader.
fd.plot(50)
# Rebuild the per-collection token counts, conditioned on category name.
corpus_categories = ['gpo', 'artstor']
cfd = nltk.ConditionalFreqDist(
    (name, token)
    for name in corpus_categories
    for token in reader.words(categories=name)
)