%run github_events.py print "Some User ids from the latest events (push, star, fork etc.) on Github." print ids[:10] print print "Number of unique ids found: ", ids.shape[0] print "Largest user id: ", ids.max() figsize(12.5, 3) plt.hist(ids, bins=45, alpha=0.9) plt.title("Histogram of %d Github User ids" % ids.shape[0]); plt.xlabel("User id") plt.ylabel("Frequency"); FACTOR = 1000000. import pymc as pm upper_bound = pm.Uniform("n_sign_ups", ids.max() / FACTOR, (ids.max()) / FACTOR + 1) obs = pm.Uniform("obs", 0, upper_bound, value=ids / FACTOR, observed=True) # code to be examplained in Chp. 3. mcmc = pm.MCMC([upper_bound, obs]) mcmc.sample(100000, 45000) from scipy.stats.mstats import mquantiles samples = mcmc.trace("n_sign_ups")[:] hist(samples, bins=100, label="Uniform prior", density=True, alpha=0.8, histtype="stepfilled", color="#7A68A6"); quantiles_mean = np.append(mquantiles(samples, [0.05, 0.5, 0.95]), samples.mean()) print "Quantiles: ", quantiles_mean[:3] print "Mean: ", quantiles_mean[-1] plt.vlines(quantiles_mean, 0, 33, linewidth=2, linestyles=["--", "--", "--", "-"], ) plt.title("Posterior distribution of total number of Github users") plt.xlabel("number of users (in millions)") plt.legend() plt.xlim(ids.max() / FACTOR - 0.01, ids.max() / FACTOR + 0.12);