%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib.parallel import Parallel, delayed
def bootstrap(data, agg=np.mean, n_bootstraps=10000, seed=None, ci_width=0.95):
data = np.asarray(data)
rng = np.random.RandomState(seed)
n_samples = len(data)
all_aggs = []
for i in range(n_bootstraps):
idx = rng.random_integers(low=0, high=n_samples - 1, size=n_samples)
all_aggs.append(agg(data[idx]))
boostrapped = np.asarray(all_aggs)
boostrapped.sort()
ci_min = (1 - ci_width) / 2.
ci_max = 1. - ci_min
ci = boostrapped[int(ci_min * n_bootstraps)], boostrapped[int(ci_max * n_bootstraps)]
return boostrapped, ci
bins = np.linspace(0, 1, 30)
data = np.random.uniform(0, 1, size=10)
_ = plt.hist(data, bins=bins, alpha=0.5)
data.mean()
0.47800950435078776
bootstrapped, ci = bootstrap(data, ci_width=0.99)
_ = plt.hist(bootstrapped, bins=bins, alpha=0.5)
ci
(0.19418497980251903, 0.77660497016829755)
def unbiased_std(x):
return np.std(x, ddof=1)
bootstrapped, ci = bootstrap(data, agg=unbiased_std, ci_width=0.99)
_ = plt.hist(bootstrapped, bins=bins, alpha=0.5)
plt.title("Bootstrap distribution for the std dev of a uniform variable")
ci
(0.19711553043302424, 0.46172529784033078)
%%time
target_ci_width = 0.95
def compute_ci_correctness(n_samples, true_value, agg=np.mean, target_ci_width=0.95,
n_runs = 100):
correctness = []
for i in range(n_runs):
data = np.random.uniform(0, 1, size=n_samples)
_, ci = bootstrap(data, agg=agg, ci_width=target_ci_width,
n_bootstraps=5000)
correctness.append(float(ci[0] <= true_value <= ci[1]))
# Look Ma': boostrap CI on the boostrap CI correctness
_, ci_outcomes = bootstrap(correctness, ci_width=0.99,
n_bootstraps=10000)
return {
'n_samples': n_samples,
'correctness': np.mean(correctness),
'ci_0025_correct': ci_outcomes[0],
'ci_0975_correct': ci_outcomes[1],
}
all_outcomes = Parallel(n_jobs=-1)(delayed(compute_ci_correctness)(
n_samples, 0.5, agg=np.mean, target_ci_width=target_ci_width)
for n_samples in [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
all_outcomes = pd.DataFrame(all_outcomes)
CPU times: user 44 ms, sys: 20 ms, total: 64 ms Wall time: 11.8 s
def plot_outcomes(all_outcomes, target_ci_width):
plt.fill_between(all_outcomes.n_samples.values,
all_outcomes.ci_0025_correct.values,
all_outcomes.ci_0975_correct.values,
alpha=0.2, color='b')
plt.plot(all_outcomes.n_samples, all_outcomes.correctness, 'o-',
alpha=0.8, color='b')
plt.hlines(target_ci_width,
all_outcomes.n_samples.min(), all_outcomes.n_samples.max(),
linestyles='dotted')
plt.ylim(0.5, 1)
plt.title("Correctness of the bootstrap CI interval")
plt.xlabel("Number of source samples")
plt.ylabel("Rate of true value belonging to the CI interval")
plot_outcomes(all_outcomes, target_ci_width)
all_outcomes
ci_0025_correct | ci_0975_correct | correctness | n_samples | |
---|---|---|---|---|
0 | 0.51 | 0.76 | 0.64 | 3 |
1 | 0.67 | 0.88 | 0.78 | 5 |
2 | 0.79 | 0.95 | 0.88 | 10 |
3 | 0.92 | 1.00 | 0.97 | 20 |
4 | 0.87 | 0.99 | 0.94 | 30 |
5 | 0.92 | 1.00 | 0.97 | 40 |
6 | 0.96 | 1.00 | 0.99 | 50 |
7 | 0.92 | 1.00 | 0.97 | 60 |
8 | 0.90 | 1.00 | 0.96 | 70 |
9 | 0.92 | 1.00 | 0.97 | 80 |
10 | 0.92 | 1.00 | 0.97 | 90 |
11 | 0.90 | 1.00 | 0.96 | 100 |
all_outcomes = Parallel(n_jobs=-1)(delayed(compute_ci_correctness)(
n_samples, 1 / np.sqrt(12), target_ci_width=target_ci_width, agg=unbiased_std)
for n_samples in [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
all_outcomes = pd.DataFrame(all_outcomes)
plot_outcomes(all_outcomes, target_ci_width)
all_outcomes
ci_0025_correct | ci_0975_correct | correctness | n_samples | |
---|---|---|---|---|
0 | 0.24 | 0.49 | 0.36 | 3 |
1 | 0.50 | 0.75 | 0.63 | 5 |
2 | 0.80 | 0.96 | 0.89 | 10 |
3 | 0.82 | 0.97 | 0.90 | 20 |
4 | 0.84 | 0.98 | 0.92 | 30 |
5 | 0.87 | 0.99 | 0.94 | 40 |
6 | 0.87 | 0.99 | 0.94 | 50 |
7 | 0.83 | 0.97 | 0.91 | 60 |
8 | 0.89 | 1.00 | 0.95 | 70 |
9 | 0.92 | 1.00 | 0.97 | 80 |
10 | 0.85 | 0.99 | 0.93 | 90 |
11 | 0.92 | 1.00 | 0.97 | 100 |