%run github_events.py print "Some User ids from the latest events (push, star, fork etc.) on Github." print ids[:10] print print "Number of unique ids found: ", ids.shape[0] print "Largest user id: ", ids.max() figsize(12.5,3) plt.hist( ids, bins = 45, alpha = 0.9) plt.title("Histogram of %d Github User ids"%ids.shape[0] ); plt.xlabel("User id") plt.ylabel("Frequency"); FACTOR = 1000000. import pymc as pm upper_bound = pm.Uniform( "n_sign_ups", ids.max()/FACTOR, (ids.max())/FACTOR + 1) obs = pm.Uniform("obs", 0, upper_bound, value = ids/FACTOR, observed = True ) #code to be examplained in Chp. 3. mcmc = pm.MCMC([upper_bound, obs] ) mcmc.sample( 100000, 45000) from scipy.stats.mstats import mquantiles samples = mcmc.trace("n_sign_ups")[:] hist(samples, bins = 100, label = "Uniform prior", normed=True, alpha = 0.8, histtype="stepfilled", color = "#7A68A6" ); quantiles_mean = np.append( mquantiles( samples, [0.05, 0.5, 0.95]), samples.mean() ) print "Quantiles: ", quantiles_mean[:3] print "Mean: ", quantiles_mean[-1] plt.vlines( quantiles_mean, 0, 33, linewidth=2, linestyles = ["--", "--", "--", "-"], ) plt.title("Posterior distribution of total number of Github users" ) plt.xlabel("number of users (in millions)") plt.legend() plt.xlim( ids.max()/FACTOR - 0.01, ids.max()/FACTOR + 0.12 );