import math import random num_marbles = 4000 num_jars = 100 jars = [] ## empty out all the jars for j in xrange(num_jars): jars.append(0) ## fill the jars randomly for i in xrange(num_marbles): jar = random.randint(0,num_jars-1) ## make sure in the range [0..num_jars-1] jars[jar] += 1 ## count how many jars have 42 marbles target_val = 42 target_count = 0. for j in xrange(num_jars): if (jars[j] == target_val): print "Jar %d has %d marbles" % (j, target_val) target_count += 1 sample_mean = (sum(jars)+0.) / num_jars print "There were %d jars with %d marbles for a probability of %.05f" % (target_count, target_val, target_count / num_jars) print "The mean number of marbles per jar was %.02f" % (sample_mean) ## Plot the number of marbles in each jar import matplotlib.pyplot as plt plt.figure(figsize=(15,4), dpi=100) plt.xlabel("Jar Index") plt.ylabel("Marbles in Jar") plt.bar(range(num_jars), jars) plt.show() ## Plot the distribution on the number of marbles in each jar plt.figure() plt.xlabel("Number of Marbles") plt.ylabel("Number of Jars with that many marbles") plt.hist(jars, bins=range(max(jars)+3)) plt.show() ## Plot the density function, notice bars sum to exactly 1 but otherwise just scales display plt.figure() plt.xlabel("Number of Marbles") plt.ylabel("Probability of Jars with that many marbles") plt.hist(jars, bins=range(max(jars)+3), normed=True) plt.show() ## Compute mean & stdev of distribution mean = (sum(jars) + 0.) / num_jars sumdiff = 0.0 for count in jars: diff = (count - mean) ** 2 sumdiff += diff sumdiff /= num_jars stdev = math.sqrt(sumdiff) print "The mean values was %0.2f +/- %0.2f" % (mean, stdev) ## Overlap the normal distribution with this mean and stdev gaus_prob_dist = [] pi = 3.1415926 for k in xrange(int(max(jars)*1.2)+3): var = float(stdev)**2 num = math.exp(-(float(k)-float(mean))**2/(2*var)) denom = (2*pi*var)**.5 gaus_prob = num/denom gaus_prob_dist.append(gaus_prob) ## overlay the analytic distributions over the data plt.figure() plt.xlabel("Number of Marbles") plt.ylabel("Probability of Jars with that many marbles") plt.hist(jars, bins=range(int(max(jars)*1.2+3)), normed=True, label="Observed") plt.plot(gaus_prob_dist, color='green', linewidth=4, linestyle='dashed', label="Normal Fit") plt.legend(loc="upper left") plt.show() ## Mean value is intuitively num_marbles / num_jars ## but what is the expected stdev? expected_mean = num_marbles / num_jars print "The range in values was %0.2f +/- %0.2f" % (mean, stdev) print "The expected frequency is %.02f +/- ???" % (expected_mean) ## Lets simulate many experiments with different mean values to see what happens to the stdev import sys num_trials = 100 max_sim_mean = 1000 sim_target_means = [] sim_means = [] sim_stdevs = [] for t in xrange(num_trials): target_mean = random.randint(0,max_sim_mean) num_marbles = num_jars * target_mean sim_jars = [] ## empty out all the jars for j in xrange(num_jars): sim_jars.append(0) ## fill the jars randomly for i in xrange(num_marbles): jar = random.randint(1,num_jars) - 1 ## make sure in the range 0..num_jars-1 sim_jars[jar] += 1 ## Compute mean & stdev of distribution sim_mean = (sum(sim_jars) + 0.) / num_jars sumdiff = 0.0 for count in sim_jars: diff = (count - sim_mean) ** 2 sumdiff += diff sumdiff /= num_jars sim_stdev = math.sqrt(sumdiff) sim_target_means.append(target_mean) sim_means.append(sim_mean) sim_stdevs.append(sim_stdev) if (t % 10 == 0): print "Completed %d trials" % (t) ## first make sure the target and simulated means agree plt.figure() plt.xlabel("Target means for simulations") plt.ylabel("Simulated means") plt.scatter(sim_target_means, sim_means) plt.show() ## Now plot the relationship between the mean and stdev of the different simulations plt.figure() plt.xlabel("Simulated means") plt.ylabel("Simulated standard deviations") plt.scatter(sim_means, sim_stdevs) plt.show() ## Now plot the relationship between the mean and stdev of the different simulations plt.figure() plt.xlabel("Simulated means") plt.ylabel("Simulated standard deviations") plt.scatter(sim_means, sim_stdevs, label="observed") sx = [] sy = [] for i in xrange(max_sim_mean): sx.append(i) sy.append(math.sqrt(i)) plt.plot(sx, sy, color='red', linewidth=4, label="sqrt") plt.legend(loc="upper left") plt.show() prob_42 = math.exp(-sample_mean) * (sample_mean ** 42) / math.factorial(42) print "The probability of having 42 marbles when the mean is %.02f is %.05f" % (sample_mean, prob_42) print "There were %d jars with %d marbles for a probability of %.05f" % (target_count, target_val, target_count / num_jars) ## Compute the distribution analytically pois_prob_dist = [] ## This will crash for large sample means! for k in xrange(int(max(jars)*1.2)+3): prob_pois = math.exp(-sample_mean) * (sample_mean ** k) / math.factorial(k) pois_prob_dist.append(prob_pois) ## Plot the poisson dist. plt.figure() plt.xlabel("Number of Marbles") plt.ylabel("Probability of Jars with that many marbles") plt.plot(pois_prob_dist, color='purple', linewidth=4, linestyle='solid') plt.show() ## overlay the analytic distributions over the data plt.figure() plt.xlabel("Number of Marbles") plt.ylabel("Probability of Jars with that many marbles") plt.hist(jars, bins=range(int(max(jars)*1.2)+3), normed=True, label="observed") plt.plot(pois_prob_dist, color='purple', linewidth=4, linestyle='solid', label="Poisson Fit") plt.plot(gaus_prob_dist, color='green', linewidth=4, linestyle='dashed', label="Normal Fit") plt.legend(loc="upper left") plt.show()