Approximate Bayesian Computation

Florent Leclercq
Imperial Centre for Inference and Cosmology, Imperial College London
[email protected]

In [58]:
import numpy as np
from scipy.stats import norm, invgamma, gaussian_kde
from matplotlib import pyplot as plt
from cycler import cycler
np.random.seed(158724)
%matplotlib inline
plt.rc('lines', linewidth=2)
colors=[plt.cm.Set2(i) for i in np.linspace(0, 1, 8)]
plt.rc('axes', prop_cycle=cycler('color', colors))

Generate data

Data are samples from a Gaussian of known mean (zero) and unknown variance $\sigma^2$.

In [59]:
Nsamp=100
In [60]:
groundtruth=2.
likelihood=norm(loc=0.,scale=np.sqrt(groundtruth))
In [61]:
(dmin,dmax)=(-5,5)
data=likelihood.rvs(size=Nsamp)
lh_data=likelihood.pdf(data)
x_arr=np.arange(dmin,dmax,(dmax-dmin)/100.)
f_arr=likelihood.pdf(x_arr)
In [62]:
plt.xlim(dmin,dmax)
plt.plot(x_arr,f_arr,color=colors[3])
markerline, stemlines, baseline = plt.stem(data,lh_data)
plt.setp(markerline, color='black', markersize=1., markeredgewidth = 0.)
plt.setp(stemlines, color='black')
plt.setp(baseline, linestyle="None")
plt.title("Likelihood & Data")
plt.show()

Analytic solution

The Inverse-Gamma distribution is the conjugate prior for the unknown variance of a Gaussian likelihood with known mean. If the prior is $p(\sigma^2)=\mathrm{Inv\Gamma}(\alpha,\beta)$, then the posterior is $p(\sigma^2|d)=\mathrm{Inv\Gamma}(\alpha',\beta')$ with \begin{equation} \alpha'=\alpha + \frac{N_\mathrm{samp}}{2} \quad \mathrm{and} \quad \beta'=\beta+\frac{1}{2} \sum_k d_k^2 . \end{equation}

In [63]:
alpha=60
beta=130
prior=invgamma(alpha,loc=0,scale=beta)
alphaprime=alpha+Nsamp/2
betaprime=beta+1/2.*np.sum(data**2)
posterior=invgamma(alphaprime,loc=0,scale=betaprime)
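A quick sanity check of the update formula: for an Inverse-Gamma, the mean is $\beta'/(\alpha'-1)$, which for large $N_\mathrm{samp}$ is dominated by the data term $\frac{1}{N_\mathrm{samp}}\sum_k d_k^2$ (the prior still pulls it slightly here). A minimal sketch, reusing the variables of the cell above:

In [ ]:
# sanity check of the conjugate update:
# the Inverse-Gamma mean is beta'/(alpha'-1); scipy's frozen posterior should agree,
# and both should be close to the second moment of the data for large Nsamp
print(betaprime/(alphaprime-1))   # analytic posterior mean
print(posterior.mean())           # same quantity from scipy
print(np.mean(data**2))           # data-driven estimate of sigma^2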
In [64]:
a=1
b=4
x=np.arange(a,b,0.01)
plt.xlim(a,b)
plt.ylim(0,1.2)
plt.plot(x,prior.pdf(x)/prior.pdf(x).max(),label="prior")
plt.plot(x,posterior.pdf(x)/posterior.pdf(x).max(),label="posterior")
plt.plot([groundtruth,groundtruth],[0,1.2],linestyle='--',color='black')
plt.legend()
plt.show()

Approximate Bayesian Computation (Likelihood-free rejection sampling)

Given a parameter $\sigma^2$ drawn from the prior, the simulator generates $N_\mathrm{samp}$ samples from a Gaussian with mean $0$ and variance $\sigma^2$. The distance between a simulation $s$ and the real data $d$ is \begin{equation} D(s,d)=\sqrt{\left[S(s)-S(d)\right]^2}, \end{equation} where $S$ is a summary statistic of $s$ or $d$.

In [65]:
def simulator(var):
    return norm(loc=0.,scale=np.sqrt(var)).rvs(size=Nsamp)
def distance(sim_ss,data_ss):
    return np.sqrt(np.sum((sim_ss-data_ss)**2))
In [66]:
Ntries=10000
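
For reference, the draw-simulate-compare logic described above can be written as a small reusable helper. The sketch below (a hypothetical `abc_rejection` function, not used by the cells that follow, which spell out the same loop inline):

In [ ]:
# hypothetical helper: likelihood-free rejection sampling
# draw from the prior, simulate, summarise, and keep parameters whose
# summary lands within epsilon of the data summary
def abc_rejection(prior, simulator, summary_stat, distance, data_ss,
                  epsilon, n_tries):
    accepted = []
    for _ in range(n_tries):
        theta = prior.rvs(size=1)                 # draw a parameter from the prior
        sim_ss = summary_stat(simulator(theta))   # simulate and summarise
        if distance(sim_ss, data_ss) < epsilon:   # accept if close enough to the data
            accepted.append(theta)
    return np.concatenate(accepted) if accepted else np.array([])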

Sufficient summary statistics

We first try a sufficient summary statistic: in this case, the sample variance.

In [67]:
# sufficient summary statistics
def summary_stat(data):
    return np.var(data)
data_ss=summary_stat(data)
In [68]:
data_ss
Out[68]:
1.8657403043808352
In [69]:
for epsilon in [0.5,0.4,0.3,0.2,0.1]:
    # likelihood-free rejection sampler
    samples=[]
    for this_try in range(Ntries):
        this_var = prior.rvs(size=1)
        this_sim = simulator(this_var)
        this_ss = summary_stat(this_sim)
        if(distance(this_ss,data_ss)<epsilon):
            samples.append(this_var)
    samples=np.array(samples).T[0]
    fraction_accepted=float(len(samples))/Ntries
    
    # kernel density estimation of the approximate posterior
    kernel=gaussian_kde(samples)
    
    # produce a plot
    plt.xlim(a,b)
    plt.ylim(0,1.2)
    plt.plot([groundtruth,groundtruth],[0,1.2],linestyle='--',color='black')
    plt.plot(x,prior.pdf(x)/prior.pdf(x).max(),label="prior")
    plt.plot(x,posterior.pdf(x)/posterior.pdf(x).max(),label="true posterior")
    plt.plot(x,kernel.evaluate(x)/kernel.evaluate(x).max(),label="approximate posterior")
    plt.title("$\\varepsilon="+str(epsilon)+"$, acceptance ratio="+str(fraction_accepted))
    plt.legend()
    plt.show()

Insufficient summary statistics

We now try an "insufficient" summary statistic: we throw away most of the data/simulation before estimating the variance.

In [72]:
# insufficient summary statistics: throw away most of the information
def summary_stat(data):
    return np.var(data[0:int(Nsamp/3.)])
data_ss=summary_stat(data)
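
Using only the first third of the samples makes the summary statistic a much noisier estimator of $\sigma^2$ (its sampling variance is roughly three times larger, i.e. its standard deviation grows by about $\sqrt{3}$), which is what degrades the approximate posterior below. A minimal sketch of this effect, with an arbitrary repetition count:

In [ ]:
# compare the scatter of the variance estimator over full vs. truncated simulations
# generated at the ground-truth parameter
sims = np.array([simulator(groundtruth) for _ in range(1000)])
print(np.std([np.var(s) for s in sims]))                    # full simulation
print(np.std([np.var(s[0:int(Nsamp/3.)]) for s in sims]))   # first third only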
In [73]:
for epsilon in [0.5,0.4,0.3,0.2,0.1]:
    # likelihood-free rejection sampler
    samples=[]
    for this_try in range(Ntries):
        this_var = prior.rvs(size=1)
        this_sim = simulator(this_var)
        this_ss = summary_stat(this_sim)
        if(distance(this_ss,data_ss)<epsilon):
            samples.append(this_var)
    samples=np.array(samples).T[0]
    fraction_accepted=float(len(samples))/Ntries
    
    # kernel density estimation of the approximate posterior
    kernel=gaussian_kde(samples)
    
    # produce a plot
    plt.xlim(a,b)
    plt.ylim(0,1.2)
    plt.plot([groundtruth,groundtruth],[0,1.2],linestyle='--',color='black')
    plt.plot(x,prior.pdf(x)/prior.pdf(x).max(),label="prior")
    plt.plot(x,posterior.pdf(x)/posterior.pdf(x).max(),label="true posterior")
    plt.plot(x,kernel.evaluate(x)/kernel.evaluate(x).max(),label="approximate posterior")
    plt.title("$\\varepsilon="+str(epsilon)+"$, acceptance ratio="+str(fraction_accepted))
    plt.legend()
    plt.show()