#!/usr/bin/env python
# coding: utf-8
# # Approximate Bayesian Computation: Likelihood-free rejection sampling
# Florent Leclercq
# Imperial Centre for Inference and Cosmology, Imperial College London
# florent.leclercq@polytechnique.org
# In[1]:
import numpy as np
from scipy.stats import norm, invgamma, gaussian_kde
from matplotlib import pyplot as plt
from cycler import cycler
np.random.seed(123457)
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams.update({'lines.linewidth': 2})
colors=[plt.cm.Set2(i) for i in np.linspace(0, 1, 8)]
plt.rcParams.update({'axes.prop_cycle': cycler('color', colors)})
# ## Generate data
# In this notebook we try to infer the unknown variance $\sigma^2$ of a zero-mean Gaussian distribution.
#
# The data are $N_\mathrm{samp}$ samples from that distribution.
# In[2]:
Nsamp=100
# In[3]:
groundtruth=2.
likelihood=norm(loc=0.,scale=np.sqrt(groundtruth))
# In[4]:
(dmin,dmax)=(-5,5)
data=likelihood.rvs(size=Nsamp)
lh_data=likelihood.pdf(data)
x_arr=np.arange(dmin,dmax,(dmax-dmin)/100.)
f_arr=likelihood.pdf(x_arr)
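# As a quick sanity check (a sketch added here), the sample variance of the generated data should be close to the ground-truth variance $\sigma^2=2$.
# In[ ]:
print("sample variance of the data:", np.var(data))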
# In[5]:
plt.figure(figsize=(10,6))
plt.xlim([dmin,dmax])
plt.plot(x_arr,f_arr,color=colors[3],label="Likelihood")
markerline, stemlines, baseline = plt.stem(data,lh_data,linefmt='-k',markerfmt='k.',label="Data")
baseline.set_visible(False)
plt.title("Likelihood & Data")
plt.legend()
plt.ylim(bottom=0.)
plt.show()
# ## Analytic solution
# The Inverse Gamma distribution is the conjugate prior for the unknown variance of a Gaussian distribution with known mean.
# If the prior is $p(\sigma^2)=\mathrm{Inv\Gamma}(\alpha,\beta)$, then the posterior is
# $p(\sigma^2|d)=\mathrm{Inv\Gamma}(\alpha',\beta')$ with
# \begin{equation}
# \alpha'=\alpha + \frac{N_\mathrm{samp}}{2} \quad \mathrm{and} \quad \beta'=\beta+\frac{1}{2} \sum_k d_k^2 .
# \end{equation}
# In[6]:
alpha=60
beta=130
prior=invgamma(alpha,loc=0,scale=beta)
alphaprime=alpha+Nsamp/2
betaprime=beta+1/2.*np.sum(data**2)
posterior=invgamma(alphaprime,loc=0,scale=betaprime)
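# As a sanity check (a sketch; the grid range and resolution below are assumptions), we can verify the conjugate update numerically: multiply the prior pdf by the Gaussian likelihood of the data on a grid of $\sigma^2$ values, normalize, and compare with the analytic posterior.
# In[ ]:
# numerical verification of the conjugate update (sketch)
var_grid=np.linspace(1.,4.,400)
log_lh=np.array([norm(loc=0.,scale=np.sqrt(v)).logpdf(data).sum() for v in var_grid])
log_post=prior.logpdf(var_grid)+log_lh
numerical=np.exp(log_post-log_post.max())
numerical/=np.trapz(numerical,var_grid)
print("max |numerical-analytic| =",np.max(np.abs(numerical-posterior.pdf(var_grid))))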
# In[7]:
a=1
b=4
x=np.arange(a,b,0.01)
plt.figure(figsize=(10,6))
plt.xlim([a,b])
plt.xlabel(r"$\sigma^2$")
plt.ylim([0,1.2])
plt.plot([groundtruth,groundtruth],[0,1.2],linestyle='--',color='black',label="groundtruth")
plt.plot(x,prior.pdf(x)/prior.pdf(x).max(),label="prior")
plt.plot(x,posterior.pdf(x)/posterior.pdf(x).max(),label="posterior")
plt.legend()
plt.show()
# ## Likelihood-free rejection sampling
# Given a parameter $\sigma^2$ drawn from the prior, the simulator generates $N_\mathrm{samp}$ samples
# from a Gaussian with mean $0$ and variance $\sigma^2$.
# The distance between a simulation $s$ and real data $d$ is
# \begin{equation}
# D(s,d)=\sqrt{\left[S(s)-S(d)\right]^2},
# \end{equation}
# where $S$ denotes a summary statistic computed from a simulation or from the data.
# In[8]:
def simulator(var):
    # draw Nsamp points from a zero-mean Gaussian with variance var
    return norm(loc=0.,scale=np.sqrt(var)).rvs(size=Nsamp)
def distance(sim_ss,data_ss):
    # Euclidean distance between summary statistics
    return np.sqrt(np.sum((sim_ss-data_ss)**2))
# In[9]:
Ntries=10000
# ### Sufficient summary statistics
# We first use a sufficient summary statistic: here, the sample variance of the data.
# In[10]:
# sufficient summary statistics
def sufficient_summary_stat(data):
    return np.var(data)
data_ss=sufficient_summary_stat(data)
# In[11]:
data_ss
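# Strictly speaking, for the zero-mean model the likelihood depends on the data only through $\sum_k d_k^2$ (the quantity entering $\beta'$ above), so the mean of squares is the exactly sufficient statistic; np.var centres on the sample mean and differs only at order $1/N_\mathrm{samp}$. A quick comparison (sketch):
# In[ ]:
# compare np.var with the exactly sufficient statistic mean(d_k^2) (sketch)
print(np.var(data), np.mean(data**2))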
# In[12]:
for epsilon in [0.5,0.4,0.3,0.2,0.1]:
    # likelihood-free rejection sampler
    # note that we never call the likelihood function!
    samples=[]
    for this_try in range(Ntries):
        this_var = prior.rvs(size=1)
        this_sim = simulator(this_var)
        this_ss = sufficient_summary_stat(this_sim)
        # accept the proposal if the simulated summary is within epsilon of the data summary
        if(distance(this_ss,data_ss)<epsilon):
            samples.append(this_var[0])
    samples=np.array(samples)
    print("epsilon={}: accepted {} of {} proposals".format(epsilon,samples.size,Ntries))
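# The gaussian_kde import above suggests that the accepted samples are compared with the analytic posterior; a minimal sketch of such a comparison (plot details are assumptions; this uses the samples from the last, smallest epsilon):
# In[ ]:
# kernel density estimate of the accepted ABC samples vs the analytic posterior (sketch)
kde=gaussian_kde(samples)
x=np.arange(a,b,0.01)
plt.figure(figsize=(10,6))
plt.xlim([a,b])
plt.xlabel(r"$\sigma^2$")
plt.ylim([0,1.2])
plt.plot([groundtruth,groundtruth],[0,1.2],linestyle='--',color='black',label="groundtruth")
plt.plot(x,posterior.pdf(x)/posterior.pdf(x).max(),label="posterior")
plt.plot(x,kde(x)/kde(x).max(),label="ABC posterior (KDE)")
plt.legend()
plt.show()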