In [2]:

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# One plain-text file per collection; the category of each file is taken from
# its base name via cat_pattern (so 'gpo.txt' is filed under 'gpo').
reader = CategorizedPlaintextCorpusReader(
    '/media/storage/dpla-data/new/words-6mill/colls/',
    r'.*\.txt',
    cat_pattern=r'(\w+)\.txt',
)

In [3]:

# Sanity check: which categories did cat_pattern assign to this file?
reader.categories(fileids='gpo.txt')

Out[3]:

['gpo']

In [4]:

# Word tokens of the GPO collection file.
gpowords = reader.words(fileids='gpo.txt')

In [5]:

# Total number of word tokens read from gpo.txt.
len(gpowords)

Out[5]:

36646650

In [6]:

# Case-insensitive vocabulary size of gpo.txt.
# A set comprehension deduplicates while iterating, so no intermediate list
# with one entry per token is built before the set is created.
len({w.lower() for w in reader.words('gpo.txt')})

Out[6]:

In [7]:

# The package is named `nltk`; the cells below reference it as `nltk.`.
import nltk

In [9]:

# Conditional frequency distribution: word counts keyed by corpus category.
cfd = nltk.ConditionalFreqDist(
    (category, token)
    for category in ('gpo', 'artstor')
    for token in reader.words(categories=category)
)

In [10]:

# ConditionalFreqDist has no most_common() of its own; it maps each condition
# to a separate FreqDist, so the ranking is requested per condition.
# Bounded to the top 20 so the cell output stays readable.
{genre: cfd[genre].most_common(20) for genre in cfd.conditions()}

In [16]:

# Frequency distribution over every word token in the 'gpo' category.
fd = nltk.FreqDist(
    token for token in reader.words(categories='gpo')
)
# NOTE(review): this looks up the count recorded for the sample `1` (an int),
# not the entry at rank 1 -- confirm this is the intended check.
fd[1]

Out[16]:

In [18]:

# Plot only the 50 most frequent samples. With no limit, plot() turns every
# sample in the distribution into an x-axis tick label, which is unreadable
# for a corpus-sized vocabulary and is the step where a non-ASCII byte-string
# token raised UnicodeDecodeError.
# NOTE(review): if a top-50 token is itself non-ASCII under Python 2, the
# corpus needs to be read with an explicit encoding so tokens are unicode --
# confirm the encoding of the source files.
fd.plot(50)

In [ ]:

# Rebuilds the same per-category word counts as the In [9] cell above.
cfd = nltk.ConditionalFreqDist(
    (category, token)
    for category in ('gpo', 'artstor')
    for token in reader.words(categories=category)
)