# Word-length distributions: random "monkey language" vs. a German corpus.
# (Exported from a notebook; bare expressions below are cell outputs.)

import numpy
import re
from random import choice
from numpy import bincount
import matplotlib.pyplot as plt
from pytree import read_trees

# Notebook-only IPython magic, kept as a comment so this file parses as
# plain Python. Re-enable it when running inside a notebook:
# %matplotlib inline

# Alphabet the "monkey" types from: a space plus the 26 lowercase letters.
LETTERS = ' abcdefghijklmnopqrstuvwxyz'


def monkeylanguage(n=10000):
    """Return the words of a random text of ``n`` characters.

    Each character is drawn independently with ``random.choice`` from
    ``LETTERS``. The resulting text is stripped and split on whitespace,
    so runs of spaces never produce empty words.

    :param n: number of random characters to generate.
    :returns: list of the generated words (may be empty).
    """
    text = ''.join(choice(LETTERS) for _ in range(n))
    return text.strip().split()


# What does "monkey language" look like?
monkeylanguage(200)

worte = monkeylanguage()
laengen = sorted([len(w) for w in worte])
# counts[i] is the number of words of length i.
counts = bincount(laengen)
# Index i of ``counts`` corresponds to word length i, so the x axis must be
# exactly 0..len(counts)-1. (linspace(0, n, n) would stretch it by n/(n-1).)
x_numbers = numpy.arange(len(counts), dtype=float)
plt.plot(x_numbers, counts)
# Exponential reference curve; amplitude 50 and decay constant 26/2 appear
# to be hand-picked to overlay the histogram -- NOTE(review): confirm.
plt.plot(x_numbers, 50*(numpy.exp(-x_numbers/(26/2.))))
# Trailing semicolon suppresses the return-value echo in a notebook cell.
plt.title('Monkey Language: Word length distribution');


def corpus_words(fname):
    """Collect the word of every terminal node in a treebank file.

    Iterates over the trees yielded by ``read_trees(fname)`` and, for each
    tree, over its ``terminals``, gathering each node's ``word`` attribute
    in iteration order.

    :param fname: path handed to ``pytree.read_trees``.
    :returns: list of the ``word`` values of all terminals.
    """
    return [node.word
            for tree in read_trees(fname)
            for node in tree.terminals]


# Keep only tokens consisting purely of ASCII letters.
word_re = re.compile('^[a-zA-Z]+$')
worte = [x.lower()
         for x in corpus_words('/home/yannick/corpora/tuebadz-8.0-mit-NE.export4')
         if word_re.match(x)]
worte[:10]

laengen = sorted([len(w) for w in worte])
# counts[i] is the number of corpus words of length i.
counts = bincount(laengen)
# Float dtype keeps the average below a true division on Python 2 as well.
x_numbers = numpy.arange(len(counts), dtype=float)
# Mean word length, computed as the count-weighted mean of the lengths.
avg_len = (counts*x_numbers).sum()/counts.sum()
plt.plot(x_numbers, counts)
# Exponential reference curve with the mean length as decay constant; the
# amplitude 300000 appears to be hand-picked -- NOTE(review): confirm.
plt.plot(x_numbers, 300000*(numpy.exp(-x_numbers/(avg_len))))
plt.title('German Language: Word length distribution');