import math
import random

num_marbles = 4000
num_jars    = 100

jars = []

## empty out all the jars

for j in xrange(num_jars):
    jars.append(0)
    
## fill the jars randomly
for i in xrange(num_marbles):
    jar = random.randint(0,num_jars-1) ## make sure in the range [0..num_jars-1]
    jars[jar] += 1
    
## count how many jars have 42 marbles
target_val = 42
target_count = 0.

for j in xrange(num_jars):
    if (jars[j] == target_val):
        print "Jar %d has %d marbles" % (j, target_val)
        target_count += 1

sample_mean = (sum(jars)+0.) / num_jars
print "There were %d jars with %d marbles for a probability of %.05f" % (target_count, target_val, target_count / num_jars)
print "The mean number of marbles per jar was %.02f" % (sample_mean)

## Plot the number of marbles in each jar
import matplotlib.pyplot as plt
plt.figure(figsize=(15,4), dpi=100)
plt.xlabel("Jar Index")
plt.ylabel("Marbles in Jar")
plt.bar(range(num_jars), jars)
plt.show()


## Plot the distribution on the number of marbles in each jar
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Number of Jars with that many marbles")
plt.hist(jars, bins=range(max(jars)+3))
plt.show()


## Plot the density function, notice bars sum to exactly 1 but otherwise just scales display
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.hist(jars, bins=range(max(jars)+3), normed=True)
plt.show()


## Compute mean & stdev of distribution
mean = (sum(jars) + 0.) / num_jars
sumdiff = 0.0
for count in jars:
    diff = (count - mean) ** 2
    sumdiff += diff

sumdiff /= num_jars
stdev = math.sqrt(sumdiff)

print "The mean values was %0.2f +/- %0.2f" % (mean, stdev)


## Overlap the normal distribution with this mean and stdev
gaus_prob_dist = []
pi = 3.1415926

for k in xrange(int(max(jars)*1.2)+3):
    var = float(stdev)**2
    num = math.exp(-(float(k)-float(mean))**2/(2*var))
    denom = (2*pi*var)**.5
    gaus_prob = num/denom
    gaus_prob_dist.append(gaus_prob)


## overlay the analytic distributions over the data
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.hist(jars, bins=range(int(max(jars)*1.2+3)), normed=True, label="Observed")
plt.plot(gaus_prob_dist, color='green', linewidth=4, linestyle='dashed', label="Normal Fit")
plt.legend(loc="upper left")
plt.show()


## Mean value is intuitively num_marbles / num_jars
## but what is the expected stdev?

expected_mean = num_marbles / num_jars
print "The range in values was %0.2f +/- %0.2f" % (mean, stdev)
print "The expected frequency is %.02f +/- ???" % (expected_mean)

## Lets simulate many experiments with different mean values to see what happens to the stdev
import sys
num_trials = 100
max_sim_mean = 1000

sim_target_means = []
sim_means  = []
sim_stdevs = []

for t in xrange(num_trials):
    target_mean = random.randint(0,max_sim_mean)
    num_marbles = num_jars * target_mean
    
    sim_jars = []

    ## empty out all the jars

    for j in xrange(num_jars):
        sim_jars.append(0)
    
    ## fill the jars randomly
    for i in xrange(num_marbles):
        jar = random.randint(1,num_jars) - 1 ## make sure in the range 0..num_jars-1
        sim_jars[jar] += 1
        
    ## Compute mean & stdev of distribution
    sim_mean = (sum(sim_jars) + 0.) / num_jars
    sumdiff = 0.0
    for count in sim_jars:
        diff = (count - sim_mean) ** 2
        sumdiff += diff

    sumdiff /= num_jars
    sim_stdev = math.sqrt(sumdiff)

    sim_target_means.append(target_mean)
    sim_means.append(sim_mean)
    sim_stdevs.append(sim_stdev)
    
    if (t % 10 == 0):
        print "Completed %d trials" % (t)
    

## first make sure the target and simulated means agree
plt.figure()
plt.xlabel("Target means for simulations")
plt.ylabel("Simulated means")
plt.scatter(sim_target_means, sim_means)
plt.show()


## Now plot the relationship between the mean and stdev of the different simulations
plt.figure()
plt.xlabel("Simulated means")
plt.ylabel("Simulated standard deviations")
plt.scatter(sim_means, sim_stdevs)
plt.show()


## Now plot the relationship between the mean and stdev of the different simulations
plt.figure()
plt.xlabel("Simulated means")
plt.ylabel("Simulated standard deviations")
plt.scatter(sim_means, sim_stdevs, label="observed")
sx = []
sy = []
for i in xrange(max_sim_mean):
    sx.append(i)
    sy.append(math.sqrt(i))
plt.plot(sx, sy, color='red', linewidth=4, label="sqrt")
plt.legend(loc="upper left")
plt.show()


prob_42 = math.exp(-sample_mean) * (sample_mean ** 42) / math.factorial(42) 
print "The probability of having 42 marbles when the mean is %.02f is %.05f" % (sample_mean, prob_42)
print "There were %d jars with %d marbles for a probability of %.05f" % (target_count, target_val, target_count / num_jars)


## Compute the distribution analytically
pois_prob_dist = []

## This will crash for large sample means!
for k in xrange(int(max(jars)*1.2)+3):
    prob_pois = math.exp(-sample_mean) * (sample_mean ** k) / math.factorial(k) 
    pois_prob_dist.append(prob_pois)
    
## Plot the poisson dist.
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.plot(pois_prob_dist, color='purple', linewidth=4, linestyle='solid')
plt.show()


## overlay the analytic distributions over the data
plt.figure()
plt.xlabel("Number of Marbles")
plt.ylabel("Probability of Jars with that many marbles")
plt.hist(jars, bins=range(int(max(jars)*1.2)+3), normed=True, label="observed")
plt.plot(pois_prob_dist, color='purple', linewidth=4, linestyle='solid', label="Poisson Fit")
plt.plot(gaus_prob_dist, color='green', linewidth=4, linestyle='dashed', label="Normal Fit")
plt.legend(loc="upper left")
plt.show()