In [2]:

```
import math
import random
num_marbles = 4000
num_jars = 100
jars = []
## empty out all the jars
for j in xrange(num_jars):
jars.append(0)
## fill the jars randomly
for i in xrange(num_marbles):
jar = random.randint(0,num_jars-1) ## make sure in the range [0..num_jars-1]
jars[jar] += 1
## count how many jars have 42 marbles
target_val = 42
target_count = 0.
for j in xrange(num_jars):
if (jars[j] == target_val):
print "Jar %d has %d marbles" % (j, target_val)
target_count += 1
sample_mean = (sum(jars)+0.) / num_jars
print "There were %d jars with %d marbles for a probability of %.05f" % (target_count, target_val, target_count / num_jars)
print "The mean number of marbles per jar was %.02f" % (sample_mean)
```

In [3]:

```
## Plot the number of marbles in each jar
import matplotlib.pyplot as plt
plt.figure(figsize=(15,4), dpi=100)
plt.xlabel("Jar Index")
plt.ylabel("Marbles in Jar")
plt.bar(range(num_jars), jars)
plt.show()
```

In [4]:

```
## Plot the distribution on the number of marbles in each jar
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Number of Jars with that many marbles")
plt.hist(jars, bins=range(max(jars)+3))
plt.show()
```

In [5]:

```
## Plot the density function, notice bars sum to exactly 1 but otherwise just scales display
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.hist(jars, bins=range(max(jars)+3), normed=True)
plt.show()
```

In [6]:

```
## Compute mean & stdev of distribution
mean = (sum(jars) + 0.) / num_jars
sumdiff = 0.0
for count in jars:
diff = (count - mean) ** 2
sumdiff += diff
sumdiff /= num_jars
stdev = math.sqrt(sumdiff)
print "The mean values was %0.2f +/- %0.2f" % (mean, stdev)
```

In [7]:

```
## Overlap the normal distribution with this mean and stdev
gaus_prob_dist = []
pi = 3.1415926
for k in xrange(int(max(jars)*1.2)+3):
var = float(stdev)**2
num = math.exp(-(float(k)-float(mean))**2/(2*var))
denom = (2*pi*var)**.5
gaus_prob = num/denom
gaus_prob_dist.append(gaus_prob)
## overlay the analytic distributions over the data
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.hist(jars, bins=range(int(max(jars)*1.2+3)), normed=True, label="Observed")
plt.plot(gaus_prob_dist, color='green', linewidth=4, linestyle='dashed', label="Normal Fit")
plt.legend(loc="upper left")
plt.show()
```

In [8]:

```
## Mean value is intuitively num_marbles / num_jars
## but what is the expected stdev?
expected_mean = num_marbles / num_jars
print "The range in values was %0.2f +/- %0.2f" % (mean, stdev)
print "The expected frequency is %.02f +/- ???" % (expected_mean)
```

In [9]:

```
## Lets simulate many experiments with different mean values to see what happens to the stdev
import sys
num_trials = 100
max_sim_mean = 1000
sim_target_means = []
sim_means = []
sim_stdevs = []
for t in xrange(num_trials):
target_mean = random.randint(0,max_sim_mean)
num_marbles = num_jars * target_mean
sim_jars = []
## empty out all the jars
for j in xrange(num_jars):
sim_jars.append(0)
## fill the jars randomly
for i in xrange(num_marbles):
jar = random.randint(1,num_jars) - 1 ## make sure in the range 0..num_jars-1
sim_jars[jar] += 1
## Compute mean & stdev of distribution
sim_mean = (sum(sim_jars) + 0.) / num_jars
sumdiff = 0.0
for count in sim_jars:
diff = (count - sim_mean) ** 2
sumdiff += diff
sumdiff /= num_jars
sim_stdev = math.sqrt(sumdiff)
sim_target_means.append(target_mean)
sim_means.append(sim_mean)
sim_stdevs.append(sim_stdev)
if (t % 10 == 0):
print "Completed %d trials" % (t)
```

In [10]:

```
## first make sure the target and simulated means agree
plt.figure()
plt.xlabel("Target means for simulations")
plt.ylabel("Simulated means")
plt.scatter(sim_target_means, sim_means)
plt.show()
```

In [11]:

```
## Now plot the relationship between the mean and stdev of the different simulations
plt.figure()
plt.xlabel("Simulated means")
plt.ylabel("Simulated standard deviations")
plt.scatter(sim_means, sim_stdevs)
plt.show()
```

In [12]:

```
## Now plot the relationship between the mean and stdev of the different simulations
plt.figure()
plt.xlabel("Simulated means")
plt.ylabel("Simulated standard deviations")
plt.scatter(sim_means, sim_stdevs, label="observed")
sx = []
sy = []
for i in xrange(max_sim_mean):
sx.append(i)
sy.append(math.sqrt(i))
plt.plot(sx, sy, color='red', linewidth=4, label="sqrt")
plt.legend(loc="upper left")
plt.show()
```

**Eureka! The data look normal and the stdev is consistently the sqrt of the mean!**

This is the hallmark of the Poisson distribution

$$ prob(\text{jar has k marbles given the mean amount}) = \frac{(mean^{k})}{k!} e^{-mean} $$

In [13]:

```
prob_42 = math.exp(-sample_mean) * (sample_mean ** 42) / math.factorial(42)
print "The probability of having 42 marbles when the mean is %.02f is %.05f" % (sample_mean, prob_42)
print "There were %d jars with %d marbles for a probability of %.05f" % (target_count, target_val, target_count / num_jars)
```

In [14]:

```
## Compute the distribution analytically
pois_prob_dist = []
## This will crash for large sample means!
for k in xrange(int(max(jars)*1.2)+3):
prob_pois = math.exp(-sample_mean) * (sample_mean ** k) / math.factorial(k)
pois_prob_dist.append(prob_pois)
## Plot the poisson dist.
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.plot(pois_prob_dist, color='purple', linewidth=4, linestyle='solid')
plt.show()
```

In [15]:

```
## overlay the analytic distributions over the data
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.hist(jars, bins=range(int(max(jars)*1.2)+3), normed=True, label="observed")
plt.plot(pois_prob_dist, color='purple', linewidth=4, linestyle='solid', label="Poisson Fit")
plt.plot(gaus_prob_dist, color='green', linewidth=4, linestyle='dashed', label="Normal Fit")
plt.legend(loc="upper left")
plt.show()
```