#!/usr/bin/env python
# coding: utf-8

# # MCMC: Metropolis-Hastings algorithm and diagnostics
# Florent Leclercq,
# Imperial Centre for Inference and Cosmology, Imperial College London,
# florent.leclercq@polytechnique.org

# In[1]:

import numpy as np
from scipy.stats import norm, binom, beta
from matplotlib import pyplot as plt
from cycler import cycler
np.random.seed(123456)
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams.update({'lines.linewidth': 2})

# In this notebook we consider a Bernoulli experiment (a series of $N_{\mathrm{trials}}$ independent experiments, each with its own boolean-valued outcome). The target parameter is the success rate $\theta$. The likelihood for this problem is a [binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution) and the [beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) provides a family of conjugate priors for Bayesian inference.

# ## Generate data

# In[2]:

groundtruth = 0.38
Ntrials = 100
Nsuccesses = binom(Ntrials, groundtruth).rvs()
Nsuccesses

# ## Analytic solution

# If the likelihood is a binomial distribution with $N_{\mathrm{successes}}$ successes out of $N_{\mathrm{trials}}$ trials, and if the prior is a beta distribution with parameters ($\alpha$, $\beta$), then the posterior is a beta distribution with parameters
# \begin{equation}
# \alpha' = \alpha+N_{\mathrm{successes}}, \quad \beta' = \beta + N_{\mathrm{trials}} - N_{\mathrm{successes}}
# \end{equation}
# (see for example https://en.wikipedia.org/wiki/Conjugate_prior#When_likelihood_function_is_a_discrete_distribution).

# In[3]:

def target_pdf(theta,Ntrials,Nsuccesses,lh,prior):
    # unnormalised posterior: binomial likelihood times beta prior,
    # with zero density outside the support [0,1]
    if theta < 0 or theta > 1:
        return 0
    else:
        return lh(Ntrials,theta).pmf(Nsuccesses)*prior.pdf(theta)

# In[4]:

a = 10
b = 10
lh = binom
prior = beta(a,b)
posterior = beta(a+Nsuccesses, b+Ntrials-Nsuccesses)

# In[5]:

thetas = np.linspace(0, 1, 200)
likelihood = lh(Ntrials,thetas).pmf(Nsuccesses)
plt.figure(figsize=(10,6))
plt.xlim([0,1])
plt.ylim([0,1.05])
plt.xlabel("$\\theta$")
plt.plot(thetas, prior.pdf(thetas)/prior.pdf(thetas).max(), label="prior")
plt.plot(thetas, likelihood/likelihood.max(), label="likelihood")
plt.plot(thetas, posterior.pdf(thetas)/posterior.pdf(thetas).max(), label="posterior")
plt.plot([groundtruth,groundtruth],[0,1.05],color='black',linestyle='--',label="groundtruth")
plt.title("Analytic solution")
plt.legend(loc='best')
plt.show()

# ## Metropolis-Hastings sampler

# In[6]:

def proposal_pdf(sigma):
    # zero-mean Gaussian proposal with standard deviation sigma
    return norm(0,sigma)

# In[7]:

def MH_sampler(Ntries,theta_start,Ntrials,Nsuccesses,lh,prior,proposal_sigma):
    Naccepted = 0
    samples = np.zeros(Ntries+1)
    samples[0] = theta_start
    theta = theta_start
    for i in range(Ntries):
        theta_p = theta + proposal_pdf(proposal_sigma).rvs()
        # the Gaussian proposal pdf is symmetric, q(theta_p|theta) = q(theta|theta_p),
        # so the Hastings factor cancels and the acceptance probability
        # reduces to the Metropolis ratio
        a = min(1, target_pdf(theta_p,Ntrials,Nsuccesses,lh,prior)/target_pdf(theta,Ntrials,Nsuccesses,lh,prior))
        u = np.random.uniform()
        if u < a:
            Naccepted += 1
            theta = theta_p
        samples[i+1] = theta
    return Naccepted, samples

# In[8]:

Ntries1 = 10000
Nburnin = 1000
proposal_sigma = 0.1
theta_start = 0.6
Naccepted, samples = MH_sampler(Ntries1,theta_start,Ntrials,Nsuccesses,lh,prior,proposal_sigma)
fraction_accepted = Naccepted/Ntries1

# In[9]:

fig, ax = plt.subplots(figsize=(10,6))
ax.set_xlim(0,Ntries1)
ax.set_xlabel("sample number")
ax.set_title("Trace plot")
ax.set_ylabel("$\\theta$")
ax.plot(np.arange(Ntries1+1),samples,marker='.',color='C4')
ymin, ymax = ax.get_ylim()
ax.set_ylim([ymin,ymax])
ax.plot([Nburnin,Nburnin],[ymin,ymax],color='black',linestyle=':')
plt.show()

# In[10]:

fraction_accepted

# In[11]:

burnedin_samples = samples[Nburnin:]
plt.figure(figsize=(10,6))
plt.xlim([0,1])
plt.xlabel("$\\theta$")
plt.plot(thetas, prior.pdf(thetas), label="prior")
plt.hist(burnedin_samples, 40, histtype='step', density=True, color='C2', linewidth=2, label="MH posterior")
plt.plot(thetas, posterior.pdf(thetas), color='C2', linestyle='--', label="analytic posterior")
plt.title("Metropolis-Hastings sampling, acceptance ratio={:.3f}".format(fraction_accepted))
plt.legend(loc='best')
plt.show()
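# As a quick sanity check (this cell is an addition to the original notebook), the first two moments of the burned-in chain should agree with those of the analytic beta posterior; `posterior` is the frozen scipy.stats distribution defined above.

# In[ ]:

# Monte Carlo estimates from the chain versus analytic posterior moments
print("MC estimate: mean = {:.4f}, std = {:.4f}".format(np.mean(burnedin_samples), np.std(burnedin_samples)))
print("analytic:    mean = {:.4f}, std = {:.4f}".format(posterior.mean(), posterior.std()))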
# ## Markov Chain diagnostics

# ### 1- Step size

# In[12]:

Ntries2 = 1000
theta_start = 0.6

# In[13]:

# Suitable step size
proposal_sigma_1 = 0.1
Naccepted_1, samples_1 = MH_sampler(Ntries2,theta_start,Ntrials,Nsuccesses,lh,prior,proposal_sigma_1)
fraction_accepted_1 = Naccepted_1/Ntries2

# In[14]:

# Step size too large
proposal_sigma_2 = 4
Naccepted_2, samples_2 = MH_sampler(Ntries2,theta_start,Ntrials,Nsuccesses,lh,prior,proposal_sigma_2)
fraction_accepted_2 = Naccepted_2/Ntries2

# In[15]:

# Step size too small
proposal_sigma_3 = 0.003
Naccepted_3, samples_3 = MH_sampler(Ntries2,theta_start,Ntrials,Nsuccesses,lh,prior,proposal_sigma_3)
fraction_accepted_3 = Naccepted_3/Ntries2

# In[16]:

f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(15,5))
ax1.set_xlim(0,Ntries2)
ax1.set_xlabel("sample number")
ax1.set_ylabel("$\\theta$")
ax1.plot(np.arange(Ntries2+1),samples_1,marker='.',color='C4')
ax1.set_title("sigma={}, acceptance ratio={:.3f}".format(proposal_sigma_1,fraction_accepted_1))
ax2.set_xlim(0,Ntries2)
ax2.set_xlabel("sample number")
ax2.plot(np.arange(Ntries2+1),samples_2,marker='.',color='C4')
ax2.set_title("sigma={}, acceptance ratio={:.3f}".format(proposal_sigma_2,fraction_accepted_2))
ax3.set_xlim(0,Ntries2)
ax3.set_xlabel("sample number")
ax3.plot(np.arange(Ntries2+1),samples_3,marker='.',color='C4')
ax3.set_title("sigma={}, acceptance ratio={:.3f}".format(proposal_sigma_3,fraction_accepted_3))
ymin, ymax = ax1.get_ylim()
ax1.text(200,ymax-(ymax-ymin)/10., "adequate step size",fontsize=14)
ax2.text(200,ymax-(ymax-ymin)/10., "step size too large",fontsize=14)
ax3.text(200,ymax-(ymax-ymin)/10., "step size too small",fontsize=14)
f.subplots_adjust(wspace=0.1)
plt.show()
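# The trace plots above can be quantified (this cell is an addition to the original notebook) with the empirical autocorrelation of each chain: slowly decaying autocorrelations indicate poor mixing, which happens both when the step size is too large (few accepted moves) and too small (tiny moves).

# In[ ]:

def autocorr(x, lag):
    # empirical lag-k autocorrelation of a 1-d chain
    x = x - np.mean(x)
    return np.sum(x[:-lag]*x[lag:])/np.sum(x*x)

for label, s in [("adequate",samples_1), ("too large",samples_2), ("too small",samples_3)]:
    print("step size {:>9}: lag-10 autocorrelation = {:.3f}".format(label, autocorr(s,10)))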
# ### 2- Multiple chains, different starting points

# In[17]:

Ntries3 = 100
proposal_sigma = 0.05
Nchains = 5

# In[18]:

# Run Nchains different chains starting at different positions in parameter space
chains = [MH_sampler(Ntries3,theta_start,Ntrials,Nsuccesses,lh,prior,proposal_sigma)
          for theta_start in np.linspace(0.1,0.9,Nchains)]

# In[19]:

fig, ax = plt.subplots(figsize=(10,6))
ax.set_xlim(0,Ntries3)
ax.set_xlabel("sample number")
ax.set_ylabel("$\\theta$")
ax.set_prop_cycle(cycler('color', [plt.cm.Set2(i) for i in np.linspace(0, 1., 8)]))
for Naccepted, samples in chains:
    ax.plot(np.arange(Ntries3+1),samples,marker='.')
plt.title("Trace plot for different chains")
plt.show()

# ### 3- Gelman-Rubin test

# Gelman *et al.*, "*Bayesian Data Analysis*" (third edition), p. 284-285
#
# **Parameters**:
# * $m$: number of chains
# * $n$: length of each chain
#
# **Definitions**:
# * "between" chains variance:
# \begin{equation}
# B \equiv \frac{n}{m-1} \sum_{j=1}^m \left( \bar{\psi}_{.j} - \bar{\psi}_{..} \right)^2 \quad \mathrm{where} \quad \bar{\psi}_{.j} = \frac{1}{n} \sum_{i=1}^n \psi_{ij} \quad \mathrm{and} \quad \bar{\psi}_{..} = \frac{1}{m} \sum_{j=1}^m \bar{\psi}_{.j}
# \end{equation}
# * "within" chains variance:
# \begin{equation}
# W \equiv \frac{1}{m} \sum_{j=1}^m s_j^2 \quad \mathrm{where} \quad s_j^2 = \frac{1}{n-1} \sum_{i=1}^n \left( \psi_{ij} - \bar{\psi}_{.j} \right)^2
# \end{equation}
#
# **Estimators**:
# Estimators of the marginal posterior variance of the estimand:
# * $\widehat{\mathrm{var}}^- \equiv W$: underestimates the variance
# * $\widehat{\mathrm{var}}^+ \equiv \frac{n-1}{n}W + \frac{1}{n} B$: overestimates the variance
#
# **Test**:
# * Potential scale reduction factor: $\widehat{R} \equiv \sqrt{\frac{\widehat{\mathrm{var}}^+}{\widehat{\mathrm{var}}^-}}$
# * Test: $\widehat{R} \rightarrow 1$ as $n \rightarrow \infty$; in practice, a common rule of thumb is to accept values below about 1.1 as indicating convergence.

# In[20]:

m = Nchains    # number of chains
n = Ntries3+1  # samples per chain (MH_sampler returns Ntries+1 samples)
psi_j = np.array([np.mean(chains[j][1]) for j in range(m)])
psi = np.mean(psi_j)
# "between" chains variance: B = n/(m-1) * sum_j (psi_bar_{.j} - psi_bar_{..})^2
B = n/(m-1)*np.sum((psi_j-psi)**2)
B

# In[21]:

# "within" chains variance: W = mean of the per-chain sample variances s_j^2
s_j = np.array([1./(n-1)*np.sum((chains[j][1]-psi_j[j])**2) for j in range(m)])
W = np.mean(s_j)
W

# In[22]:

var_minus = W
var_plus = (n-1)/n*W + 1./n*B
var_minus, var_plus

# In[23]:

R = np.sqrt(var_plus/var_minus)
R
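# For reuse (this cell is an addition to the original notebook), the computation above can be bundled into a single function; it assumes a list of equal-length 1-d sample arrays, one per chain.

# In[ ]:

def gelman_rubin(chains_samples):
    # potential scale reduction factor R-hat for a list of equal-length chains
    m = len(chains_samples)
    n = len(chains_samples[0])
    psi_j = np.array([np.mean(s) for s in chains_samples])
    B = n/(m-1)*np.sum((psi_j-np.mean(psi_j))**2)
    W = np.mean([np.sum((s-psi_j[j])**2)/(n-1) for j, s in enumerate(chains_samples)])
    return np.sqrt(((n-1)/n*W + B/n)/W)

# should reproduce the value of R computed above
gelman_rubin([samples for Naccepted, samples in chains])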