import numpy as np import scipy.stats as spstats %pylab inline total_population_size = 100000 sample_size = 0.05 known_metric = .3 # 30% of users click the widget. def get_samples(sample_ratio=sample_size): total_population = np.random.uniform(0, 1, total_population_size) < known_metric sampled_population = np.random.choice(total_population, int(total_population_size * sample_ratio), replace=False) return total_population, sampled_population total_population, sampled_population = get_samples() print "Actual population metric is {0}".format(total_population.sum() / float(total_population_size)) print "Sampled population metric is {0}".format(sum(sampled_population) / float(len(sampled_population))) np.array(spstats.binom.interval(.95, total_population_size, sum(sampled_population) / float(len(sampled_population)))) / total_population_size in_ci = [] ci_size = [] estimate = [] for i in range(1000): total_population, sampled_population = get_samples() # in reality, you only have sampled_population. # compute your metric. sample_mean = sum(sampled_population) / float(len(sampled_population)) estimate.append(sample_mean) # compute a confidence interval. ci = np.array(spstats.binom.interval(.95, len(sampled_population), sample_mean )) # Scale up. In our case, we are multiplying times inverse of sample ratio. ci_full_population = ci / sample_size # Record whether our CI includes the known population mean. in_ci.append( ci_full_population[0] <= sum(total_population) <= ci_full_population[1]) ci_size.append(ci_full_population) print "Your 95% CI contains the population value {0}% of the time".format(sum(in_ci)/ float(len(in_ci)) * 100) print "This should be near 95%." ci_width = ci_size[0] print "Here's a sample" print "Sample estimated num users who did action is {0}".format(estimate[0] * total_population_size) print "average Confidence Interval is {0}".format(ci_width) print "So the width of your estimate is {0}% of your metric value.".format((ci_width[1] - ci_width[0]) / estimate[0] /total_population_size* 100) in_ci = [] ci_size = [] estimate = [] hyperparameter_mean_sample_size = sample_size # same as above hyperparameter_sample_size_width = 0.005 # So we could be between 0.04 and 0.06 in our sample size. for i in range(100): local_sample_size = np.random.uniform(hyperparameter_mean_sample_size - hyperparameter_sample_size_width, hyperparameter_mean_sample_size + hyperparameter_sample_size_width) total_population, sampled_population = get_samples(local_sample_size) # in reality, you only have sampled_population. # compute your metric. sample_mean = sum(sampled_population) / float(len(sampled_population)) estimate.append(sample_mean) # compute a confidence interval. ci = np.array(spstats.binom.interval(.95, len(sampled_population), sample_mean )) # Scale up. In our case, we are multiplying times inverse of sample ratio. ci_full_population = ci / sample_size # Record whether our CI includes the known population mean. in_ci.append( ci_full_population[0] <= sum(total_population) <= ci_full_population[1]) ci_size.append(ci_full_population) print "Your 95% CI contains the population value {0}% of the time".format(sum(in_ci)/ float(len(in_ci)) * 100) print "This should be near 95%." ci_width = ci_size[0] print "Here's a sample" print "Sample estimated num users who did action is {0}".format(estimate[0] * total_population_size) print "average Confidence Interval is {0}".format(ci_width) print "So the width of your estimate is {0}% of your metric value.".format((ci_width[1] - ci_width[0]) / estimate[0] /total_population_size* 100) def generate_bootstrap_sample(): total_population = np.random.uniform(0, 1, total_population_size) < known_metric total_population_estimates = [] for i in xrange(400): local_sample_size = np.random.uniform(hyperparameter_mean_sample_size - hyperparameter_sample_size_width, hyperparameter_mean_sample_size + hyperparameter_sample_size_width) sampled_population = np.random.choice(total_population, int(total_population_size * local_sample_size), replace=False) sample_pop_estimate = sum(sampled_population) total_population_estimate = sample_pop_estimate / local_sample_size total_population_estimates.append(total_population_estimate) return total_population_estimates, sum(total_population) total_population_estimates, population_sum = generate_bootstrap_sample() population_sums = [] estimate_sums = [] lower_cis = [] upper_cis = [] in_ci = [] for i in xrange(50): total_population_estimates, population_sum = generate_bootstrap_sample() mean_estimate = np.mean(total_population_estimates) total_population_estimates.sort() lowerci = total_population_estimates[int(len(total_population_estimates) * 0.025)] upperci = total_population_estimates[int(len(total_population_estimates) * 0.975)] population_sums.append(population_sum) estimate_sums.append(mean_estimate) lower_cis.append(lowerci) upper_cis.append(upperci) in_ci.append(lowerci <= population_sum <= upperci) print population_sums lower_cis upper_cis estimate_sums in_ci int(len(total_population_estimates) * 0.975)