import numpy
import re
from random import choice
from numpy import bincount
import matplotlib.pyplot as plt
from pytree import read_trees
%matplotlib inline

# Alphabet the "monkey" types from: the 26 lowercase letters plus TWO spaces,
# so a space is drawn twice as often as any single letter (2 of 28 symbols).
LETTERS = '  abcdefghijklmnopqrstuvwxyz'


def monkeylanguage(n=10000):
    """Generate random "monkey language" text and return its words.

    Draws ``n`` characters uniformly at random from ``LETTERS``, concatenates
    them and splits the result on whitespace.

    Args:
        n: Number of random characters to draw (default 10000).

    Returns:
        A list of the non-empty lowercase words found in the random text.
        The list is empty when ``n`` is zero or negative.
    """
    # ``range`` instead of ``xrange``: ``xrange`` does not exist on Python 3,
    # while ``range`` works on both major versions.
    text = ''.join(choice(LETTERS) for _ in range(n))
    # split() without arguments already ignores leading/trailing whitespace
    # and collapses runs of spaces, so no empty "words" are produced.
    return text.split()
# What does "monkey language" look like?
# Show a small sample of generated words (displayed as notebook cell output).
monkeylanguage(200)

# Word-length histogram of the random text: counts[k] is the number of
# generated words that are exactly k characters long.
worte = monkeylanguage()
laengen = sorted(len(w) for w in worte)
counts = bincount(laengen)

# counts[k] belongs to the integer word length k, so the x axis has to be
# 0, 1, ..., len(counts) - 1.  linspace(0, N, N) would instead spread N points
# over [0, N] (step N / (N - 1)), shifting every bin away from its true length.
x_numbers = numpy.arange(len(counts), dtype=float)
plt.plot(x_numbers, counts)

# Exponential reference curve.  The decay constant 26/2 is the ratio of letters
# to spaces in LETTERS; the amplitude 50 looks hand-tuned to the default sample
# size (NOTE(review): confirm if n is changed).
plt.plot(x_numbers, 50*(numpy.exp(-x_numbers/(26/2.))))
plt.title('Monkey Language: Word length distribution');

def corpus_words(fname):
    """Collect the word of every terminal node in a treebank file.

    Args:
        fname: Path handed to ``read_trees``.

    Returns:
        A flat list with the ``word`` attribute of each entry in every tree's
        ``terminals``, in the order the trees and terminals are iterated.
    """
    return [
        terminal.word
        for tree in read_trees(fname)
        for terminal in tree.terminals
    ]

# Keep only tokens made up entirely of ASCII letters; this drops punctuation
# and numbers, and also any token containing non-ASCII letters.
word_re = re.compile('^[a-zA-Z]+$')
worte = [x.lower() for x in corpus_words('/home/yannick/corpora/tuebadz-8.0-mit-NE.export4')
         if word_re.match(x)]
# Peek at the first few corpus words (displayed as notebook cell output).
worte[:10]

# Word-length histogram of the corpus: counts[k] is the number of words that
# are exactly k characters long.  bincount already returns an ndarray.
laengen = sorted(len(w) for w in worte)
counts = bincount(laengen)

# counts[k] belongs to the integer word length k, so the x axis has to be
# 0, 1, ..., len(counts) - 1.  linspace(0, N, N) would spread N points over
# [0, N] instead, misaligning the bins and inflating avg_len by N / (N - 1).
# dtype=float keeps the division below a true division on Python 2 as well.
x_numbers = numpy.arange(len(counts), dtype=float)

# Mean word length, computed as the count-weighted average of the lengths.
avg_len = (counts*x_numbers).sum()/counts.sum()
plt.plot(x_numbers, counts)

# Exponential reference curve with the mean word length as decay constant.
# The amplitude 300000 looks hand-tuned to this corpus
# (NOTE(review): confirm when using a different corpus).
plt.plot(x_numbers, 300000*(numpy.exp(-x_numbers/(avg_len))))
plt.title('German Language: Word length distribution');