In [1]:
import numpy
from collections import Counter
from matplotlib import pyplot
import re
In [2]:
# Scan the dump line by line and record the AnswerCount attribute of every
# line that carries one. Lines without the attribute are skipped, so
# len(posts) is the number of lines that matched.
posts = []

# Raw string: "\d" inside a normal string literal is an invalid escape
# sequence (a warning today, an error in future Python versions). The
# compiled pattern is unchanged: AnswerCount="<digits>".
row_pattern = re.compile(r'AnswerCount="(\d+)"')

with open("stackoverflow_1000.xml", encoding="utf-8") as file_reader:
    for line in file_reader:
        row_match = row_pattern.search(line)
        if row_match:
            # group(1) is the digit run captured between the quotes.
            posts.append({"answers": int(row_match.group(1))})

len(posts)
Out[2]:
487
In [3]:
# Pull the "answers" field out of every parsed record, pack the values into a
# NumPy array for the statistics below, and peek at the first five entries.
answers_per_post = [record["answers"] for record in posts]
answer_counts = numpy.array(answers_per_post)
print(answer_counts[:5])
[0 5 1 1 1]
In [4]:
# Average number of answers per post.
numpy.mean(answer_counts)
Out[4]:
0.8870636550308009
In [5]:
# Population variance of the answer counts (NumPy's default, ddof=0).
numpy.var(answer_counts)
Out[5]:
0.8681488727447516
In [6]:
# Tally how many posts received each distinct number of answers.
answer_tally = Counter()
answer_tally.update(answer_counts)
answer_tally
Out[6]:
Counter({0: 187, 5: 3, 1: 205, 2: 70, 3: 18, 6: 1, 4: 3})
In [7]:
# Histogram of answers per post, one bin per integer count.
#
# Bin edges run from 0 to max + 1. Matplotlib treats the last bin as closed on
# both sides, so with fixed edges 0..6 the values 5 and 6 would share a single
# bar; extending the edges one past the largest observed count gives every
# value its own bar. An empty array falls back to a single [0, 1] bin.
largest_count = int(answer_counts.max()) if answer_counts.size else 0
bin_edges = range(0, largest_count + 2)

pyplot.hist(answer_counts, bins=bin_edges)
pyplot.xlabel("Answers per post")
pyplot.ylabel("Number of posts")
pyplot.show()
In [8]:
# Show the observed answer counts next to five Poisson samples drawn with the
# same mean and sample size, in a 3x2 grid. The real data sit in panel 4; the
# panels are deliberately left untitled (presumably so the real one has to be
# picked out by eye -- confirm intent before adding titles).
real_panel = 4
panel_count = 6

# Derive the simulation parameters from the data instead of hard-coding the
# values seen in earlier outputs, so they stay correct if the input changes.
poisson_rate = answer_counts.mean()
sample_size = answer_counts.size

# Seeded generator so the figure is identical on every Restart & Run All.
rng = numpy.random.default_rng(42)

panels = [
    answer_counts if position == real_panel
    else rng.poisson(poisson_rate, size=sample_size)
    for position in range(1, panel_count + 1)
]

# Shared bin edges for all panels, one bin per integer count. Going one past
# the largest value keeps the top two counts out of the same (closed) last
# bin and ensures no simulated value falls outside the plotted range.
largest_count = max(int(panel.max()) for panel in panels)
bin_edges = range(0, largest_count + 2)

pyplot.figure(figsize=(10, 6))
for position, panel in enumerate(panels, start=1):
    pyplot.subplot(3, 2, position)
    pyplot.hist(panel, bins=bin_edges)
pyplot.show()