Kruschke's Doing Bayesian Data Analysis in PyMC3

The scripts from https://github.com/aloctavodia/Doing_bayesian_data_analysis collected into one IPython notebook, with minor edits:

  • all imports collected into the first cell
  • scipy.special.binom imported as special_binom so it does not shadow scipy.stats.binom
  • CSV files moved into cells and read with StringIO

Only some of the original scripts import division from __future__, but this does not appear to cause any bugs.
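
For example, a data cell can hold the raw CSV text in a string and parse it in place; a minimal sketch (the csv_text variable and its contents are invented for illustration):

csv_text = """x,y
1,2
3,4"""
df = pd.read_csv(StringIO(csv_text))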

In [3]:
from __future__ import division
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm

import sys
from StringIO import StringIO

from scipy import stats
from scipy.stats import norm, multivariate_normal, beta, binom
from scipy.stats import t as stats_t
from scipy.special import beta as beta_func
from scipy.special import binom as special_binom
from scipy.special import betaln as special_betaln
from scipy.optimize import fmin
from scipy.interpolate import spline
from mpl_toolkits.mplot3d.axes3d import Axes3D

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [21]:
"""
02_SimpleGraph.py
A simple graph drawn by Python :-)
"""
x = np.linspace(-2, 2, 40)
y = x**2

plt.figure(figsize=(16,6))
plt.plot(x, y)
plt.show()
In [22]:
"""
03_IntegralOfDensity.py
Graph of normal probability density function, with comb of intervals.
"""
meanval = 0.0              # Specify mean of distribution.
sdval = 0.2                # Specify standard deviation of distribution.
xlow = meanval - 3 * sdval  # Specify low end of x-axis.
xhigh = meanval + 3 * sdval  # Specify high end of x-axis.
dx = 0.02                  # Specify interval width on x-axis
# Specify comb points along the x axis:
x = np.arange(xlow, xhigh, dx)
# Compute y values, i.e., probability density at each value of x:
y = (1/(sdval*np.sqrt(2*np.pi))) * np.exp(-.5 * ((x - meanval)/sdval)**2)
# Plot the function. "plot" draws the bell curve. "stem" draws the intervals.
plt.figure(figsize=(16,6)) # Brian added
plt.plot(x, y)
plt.stem(x, y, markerfmt=' ')

plt.xlabel('$x$')
plt.ylabel('$p(x)$')
plt.title('Normal Probability Density')
# Approximate the integral as the sum of width * height for each interval.
area = np.sum(dx*y)
# Display info in the graph.
_ = plt.text(-.6, 1.7, r'$\mu$ = %s' % meanval)
_ = plt.text(-.6, 1.5, r'$\sigma$ = %s' % sdval)
_ = plt.text(.2, 1.7, r'$\Delta x$ = %s' % dx)
_ = plt.text(.2, 1.5, r'$\sum_{x}$ $\Delta x$ $p(x)$ = %5.3f' % area)
In [5]:
"""
03_RunningProportion.py
Goal: Toss a coin N times and compute the running proportion of heads.
"""

# Specify the total number of flips, denoted N.
N = 500
# Generate a random sample of N flips for a fair coin (heads=1, tails=0);
np.random.seed(47405)
flip_sequence = np.random.choice(a=(0, 1), p=(.5, .5), size=N, replace=True)
# Compute the running proportion of heads:
r = np.cumsum(flip_sequence)
n = np.linspace(1, N, N)  # n is a vector.
run_prop = r/n  # component by component division.

# Graph the running proportion:
plt.figure(figsize=(16,6)) # Brian added
plt.plot(n, run_prop, '-o', )
plt.xscale('log')  # an alternative to plot() and xscale() is semilogx()
plt.xlim(1, N)
plt.ylim(0, 1)
plt.xlabel('Flip Number')
plt.ylabel('Proportion Heads')
plt.title('Running Proportion of Heads')
# Plot a dotted horizontal line at y=.5, just as a reference line:
plt.axhline(y=.5, ls='dashed')

# Display the beginning of the flip sequence.
flipletters = ''.join(["T","H"][flip] for flip in flip_sequence[:10])

_ = plt.text(10, 0.8, 'Flip Sequence = %s...' % flipletters)
# Display the relative frequency at the end of the sequence.
_ = plt.text(25, 0.2, 'End Proportion = %s' % run_prop[-1])
In [24]:
"""
04_BayesUpdate.py
Bayesian updating of beliefs about the bias of a coin. The prior and posterior
distributions indicate probability masses at discrete candidate values of theta.
"""

# theta is the vector of candidate values for the parameter theta.
# n_theta_vals is the number of candidate theta values.
# To produce the examples in the book, set n_theta_vals to either 3 or 63.
n_theta_vals = 3
# Now make the vector of theta values:
theta = np.linspace(1/(n_theta_vals + 1), n_theta_vals/(n_theta_vals + 1), n_theta_vals)

# p_theta is the vector of prior probabilities on the theta values.
p_theta = np.minimum(theta, 1-theta)  # Makes a triangular belief distribution.
p_theta = p_theta / np.sum(p_theta)     # Makes sure that beliefs sum to 1.

# Specify the data. To produce the examples in the book, use either
# data = np.repeat([1,0], [3, 9]) or data = np.repeat([1,0], [1, 11])
data = np.repeat([1, 0], [3, 9])
n_heads = np.sum(data)
n_tails = len(data) - n_heads

# Compute the likelihood of the data for each value of theta:
p_data_given_theta = theta**n_heads * (1-theta)**n_tails

# Compute the posterior:
p_data = np.sum(p_data_given_theta * p_theta)
p_theta_given_data = p_data_given_theta * p_theta / p_data   # This is Bayes' rule!

# Plot the results.
plt.figure(figsize=(12, 11))
plt.subplots_adjust(hspace=0.7)

# Plot the prior:
plt.subplot(3, 1, 1)
plt.stem(theta, p_theta, markerfmt=' ')
plt.xlim(0, 1)
plt.xlabel('$\\theta$')
plt.ylabel('$P(\\theta)$')
plt.title('Prior')
# Plot the likelihood:
plt.subplot(3, 1, 2)
plt.stem(theta, p_data_given_theta, markerfmt=' ')
plt.xlim(0, 1)
plt.xlabel('$\\theta$')
plt.ylabel('$P(D|\\theta)$')
plt.title('Likelihood')
plt.text(0.6, np.max(p_data_given_theta)/2, 'D = %sH,%sT' % (n_heads, n_tails))
# Plot the posterior:
plt.subplot(3, 1, 3)
plt.stem(theta, p_theta_given_data, markerfmt=' ')
plt.xlim(0, 1)
plt.xlabel('$\\theta$')
plt.ylabel('$P(\\theta|D)$')
plt.title('Posterior')
_ = plt.text(0.6, np.max(p_theta_given_data)/2, 'P(D) = %g' % p_data)
In [25]:
"""
HDIofICDF.py
This program finds the HDI of a probability density function that is specified 
mathematically in Python.
"""

def HDIofICDF(dist_name, credMass=0.95, **args):
    # freeze distribution with given arguments
    distri = dist_name(**args)
    # initial guess for HDIlowTailPr
    incredMass =  1.0 - credMass

    def intervalWidth(lowTailPr):
        return distri.ppf(credMass + lowTailPr) - distri.ppf(lowTailPr)

    # find lowTailPr that minimizes intervalWidth
    HDIlowTailPr = fmin(intervalWidth, incredMass, ftol=1e-8, disp=False)[0]
    # return interval as array([low, high])
    return distri.ppf([HDIlowTailPr, credMass + HDIlowTailPr])
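
A quick check of the helper (not part of the original scripts; the beta shape parameters are arbitrary): the 95% HDI of a beta(12, 4) distribution.

HDIofICDF(beta, credMass=0.95, a=12, b=4)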
In [26]:
"""
05_BernBeta.py
Inferring a binomial proportion via exact mathematical analysis.
"""

def bern_beta(prior_shape, data_vec, cred_mass=0.95):
    """Bayesian updating for Bernoulli likelihood and beta prior.
     Input arguments:
       prior_shape
         vector of parameter values for the prior beta distribution.
       data_vec
         vector of 1's and 0's.
       cred_mass
         the probability mass of the HDI.
     Output:
       post_shape
         vector of parameter values for the posterior beta distribution.
     Graphics:
       Creates a three-panel graph of prior, likelihood, and posterior
       with highest posterior density interval.
     Example of use:
     post_shape = bern_beta(prior_shape=[1,1] , data_vec=[1,0,0,1,1])"""

    # Check for errors in input arguments:
    if len(prior_shape) != 2:
        sys.exit('prior_shape must have two components.')
    if any([i < 0 for i in prior_shape]):
        sys.exit('prior_shape components must be positive.')
    if any([i != 0 and i != 1 for i in data_vec]):
        sys.exit('data_vec must be a vector of 1s and 0s.')
    if cred_mass <= 0 or cred_mass >= 1.0:
        sys.exit('cred_mass must be between 0 and 1.')

    # Rename the prior shape parameters, for convenience:
    a = prior_shape[0]
    b = prior_shape[1]
    # Create summary values of the data:
    z = sum(data_vec[data_vec == 1])  # number of 1's in data_vec
    N = len(data_vec)   # number of flips in data_vec
    # Compute the posterior shape parameters:
    post_shape = [a+z, b+N-z]
    # Compute the evidence, p(D):
    p_data = beta_func(z+a, N-z+b)/beta_func(a, b)
    # Construct grid of theta values, used for graphing.
    bin_width = 0.005  # Arbitrary small value for comb on theta.
    theta = np.arange(bin_width/2, 1-(bin_width/2)+bin_width, bin_width)
    # Compute the prior at each value of theta.
    p_theta = beta.pdf(theta, a, b)
    # Compute the likelihood of the data at each value of theta.
    p_data_given_theta = theta**z * (1-theta)**(N-z)
    # Compute the posterior at each value of theta.
    post_a = a + z
    post_b = b+N-z
    p_theta_given_data = beta.pdf(theta, a+z, b+N-z)
    # Determine the limits of the highest density interval
    intervals = HDIofICDF(beta, cred_mass, a=post_shape[0], b=post_shape[1])

    # Plot the results.
    plt.figure(figsize=(12, 12))
    plt.subplots_adjust(hspace=0.7)

    # Plot the prior.
    locx = 0.05
    plt.subplot(3, 1, 1)
    plt.plot(theta, p_theta)
    plt.xlim(0, 1)
    plt.ylim(0, np.max(p_theta)*1.2)
    plt.xlabel(r'$\theta$')
    plt.ylabel(r'$P(\theta)$')
    plt.title('Prior')
    plt.text(locx, np.max(p_theta)/2, r'beta($\theta$;%s,%s)' % (a, b))
    # Plot the likelihood:
    plt.subplot(3, 1, 2)
    plt.plot(theta, p_data_given_theta)
    plt.xlim(0, 1)
    plt.ylim(0, np.max(p_data_given_theta)*1.2)
    plt.xlabel(r'$\theta$')
    plt.ylabel(r'$P(D|\theta)$')
    plt.title('Likelihood')
    plt.text(locx, np.max(p_data_given_theta)/2, 'Data: z=%s, N=%s' % (z, N))
    # Plot the posterior:
    plt.subplot(3, 1, 3)
    plt.plot(theta, p_theta_given_data)
    plt.xlim(0, 1)
    plt.ylim(0, np.max(p_theta_given_data)*1.2)
    plt.xlabel(r'$\theta$')
    plt.ylabel(r'$P(\theta|D)$')
    plt.title('Posterior')
    locy = np.linspace(0, np.max(p_theta_given_data), 5)
    plt.text(locx, locy[1], r'beta($\theta$;%s,%s)' % (post_a, post_b))
    plt.text(locx, locy[2], 'P(D) = %g' % p_data)
    # Plot the HDI
    plt.text(locx, locy[3],
             'Intervals = %.3f - %.3f' % (intervals[0], intervals[1]))
    plt.fill_between(theta, 0, p_theta_given_data,
                    where=np.logical_and(theta > intervals[0],
                    theta < intervals[1]),
                        color='blue', alpha=0.3)
    return intervals

data_vec = np.repeat([1, 0], [11, 3])  # 11 heads, 3 tails
intervals = bern_beta(prior_shape=[100, 100], data_vec=data_vec)
plt.show()
In [27]:
"""
05_BetaPosteriorPredictions.py
Posterior predictive check. Examine the veracity of the winning model by
simulating data from the winning model and checking whether the simulated data
'look like' the actual data.
"""
# Specify known values of prior and actual data.
prior_a = 100
prior_b = 1
actual_data_Z = 8
actual_data_N = 12
# Compute posterior parameter values.
post_a = prior_a + actual_data_Z
post_b = prior_b + actual_data_N - actual_data_Z
# Number of flips in a simulated sample should match the actual sample size:
sim_sample_size = actual_data_N
# Designate an arbitrarily large number of simulated samples.
n_sim_samples = 1000
# Set aside a vector in which to store the simulation results.
sim_sample_Z_record = np.zeros(n_sim_samples)
# Now generate samples from the posterior.
for sample_idx in range(0, n_sim_samples):
    # Generate a theta value for the new sample from the posterior.
    sample_theta = beta.rvs(post_a, post_b)
    # Generate a sample, using sample_theta.
    sample_data = np.random.choice([0, 1], p=[1-sample_theta, sample_theta],
                                  size=sim_sample_size, replace=True)
    sim_sample_Z_record[sample_idx] = sum(sample_data)


## Make a histogram of the number of heads in the samples.
plt.figure(figsize=(16,4))
plt.title("a histogram of the number of heads in the samples")
plt.hist(sim_sample_Z_record)
plt.show()
In [28]:
"""HPD.py
This code was taken from the PyMC library https://github.com/pymc-devs/pymc
"""

def calc_min_interval(x, alpha):
    """Internal method to determine the minimum interval of a given width
    Assumes that x is sorted numpy array.
    """

    n = len(x)
    cred_mass = 1.0-alpha

    interval_idx_inc = int(np.floor(cred_mass*n))
    n_intervals = n - interval_idx_inc
    interval_width = x[interval_idx_inc:] - x[:n_intervals]

    if len(interval_width) == 0:
        raise ValueError('Too few elements for interval calculation')

    min_idx = np.argmin(interval_width)
    hdi_min = x[min_idx]
    hdi_max = x[min_idx+interval_idx_inc]
    return hdi_min, hdi_max
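
# Note: the multivariate branch of hpd() below calls make_indices, which in the
# original HPD.py comes from elsewhere in the PyMC library. A minimal
# re-implementation (adapted from PyMC 2's utils) is included here so the cell
# is self-contained; only the univariate branch is exercised in this notebook.
def make_indices(dimensions):
    # Generate the complete set of index tuples for the given dimensions.
    level = len(dimensions)
    if level == 1:
        return list(range(dimensions[0]))
    indices = [[]]
    while level:
        _indices = []
        for j in range(dimensions[level - 1]):
            _indices += [[j] + i for i in indices]
        indices = _indices
        level -= 1
    try:
        return [tuple(i) for i in indices]
    except TypeError:
        return indices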


def hpd(x, alpha=0.05):
    """Calculate highest posterior density (HPD) of array for given alpha. 
    The HPD is the minimum width Bayesian credible interval (BCI).
    :Arguments:
        x : Numpy array
        An array containing MCMC samples
        alpha : float
        Desired probability of type I error (defaults to 0.05)
    """

    # Make a copy of trace
    x = x.copy()
    # For multivariate node
    if x.ndim > 1:
        # Transpose first, then sort
        tx = np.transpose(x, list(range(x.ndim))[1:]+[0])
        dims = np.shape(tx)
        # Container list for intervals
        intervals = np.resize(0.0, dims[:-1]+(2,))

        for index in make_indices(dims[:-1]):
            try:
                index = tuple(index)
            except TypeError:
                pass

            # Sort trace
            sx = np.sort(tx[index])
            # Append to list
            intervals[index] = calc_min_interval(sx, alpha)
        # Transpose back before returning
        return np.array(intervals)
    else:
        # Sort univariate node
        sx = np.sort(x)
        return np.array(calc_min_interval(sx, alpha))
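
A quick sanity check of hpd (not part of the original scripts; the beta(12, 4) sample and its size are arbitrary): the 95% HPD interval of a large sample should bracket most of that distribution's mass.

hpd(np.random.beta(12, 4, size=100000), alpha=0.05)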
In [29]:
"""
06_BernGrid.py
Inferring a binomial proportion via grid approximation.
"""

def bern_grid(theta, p_theta, data, credib=.95):
    """
    Bayesian updating for Bernoulli likelihood and prior specified on a grid.
    Input arguments:
     theta is a vector of theta values, all between 0 and 1.
     p_theta is a vector of corresponding probability _masses_.
     data is a vector of 1's and 0's, where 1 corresponds to a and 0 to b.
     credib is the probability mass of the credible interval, default is 0.95.
    Output:
     p_theta_given_data is a vector of posterior probability masses over theta.
     Also creates a three-panel graph of prior, likelihood, and posterior
     probability masses with credible interval.
    Example of use:
     Create a vector of theta values:
     > bin_width = 1/1000.
     > theta_grid = np.arange(0, 1+bin_width, bin_width)
     Specify the probability mass at each theta value:
     > rel_prob = np.minimum(theta_grid, 1-theta_grid)  # relative prob at each theta
     > prior = rel_prob / sum(rel_prob)  # probability mass at each theta
     Specify the data vector:
     > data_vec = np.repeat([1, 0], [11, 3])  # 11 heads, 3 tails
     Call the function:
     > posterior = bern_grid(theta=theta_grid, p_theta=prior, data=data_vec)
    """

# Create summary values of data
    z = sum(data[data == 1])  # number of 1's in data
    N = len(data)  # number of flips in data
# Compute the likelihood of the data for each value of theta.
    p_data_given_theta = theta**z * (1 - theta)**(N - z)
# Compute the evidence and the posterior.
    p_data = sum(p_data_given_theta * p_theta)
    p_theta_given_data = p_data_given_theta * p_theta / p_data
    # Determine the limits of the highest density interval
    x = np.random.choice(theta, size=5000, replace=True, p=p_theta_given_data)
    intervals = hpd(x, alpha=1-credib)

# Plot the results.
    plt.figure(figsize=(16, 12)) # Brian edited
    plt.subplots_adjust(hspace=0.7)

#    # Plot the prior.
    locx = 0.05
    mean_theta = sum(theta * p_theta)  # mean of prior, for plotting
    plt.subplot(3, 1, 1)
    plt.plot(theta, p_theta)
    plt.xlim(0, 1)
    plt.ylim(0, np.max(p_theta)*1.2)
    plt.xlabel(r'$\theta$')
    plt.ylabel(r'$P(\theta)$')
    plt.title('Prior')
    plt.text(locx, np.max(p_theta)/2, r'mean($\theta$;%5.2f)' % mean_theta)
    # Plot the likelihood:
    plt.subplot(3, 1, 2)
    plt.plot(theta, p_data_given_theta)
    plt.xlim(0, 1)
    plt.ylim(0, np.max(p_data_given_theta)*1.2)
    plt.xlabel(r'$\theta$')
    plt.ylabel(r'$P(D|\theta)$')
    plt.title('Likelihood')
    plt.text(locx, np.max(p_data_given_theta)/2, 'data: z=%s, N=%s' % (z, N))
    # Plot the posterior:
    mean_theta_given_data = sum(theta * p_theta_given_data)
    plt.subplot(3, 1, 3)
    plt.plot(theta, p_theta_given_data)
    plt.xlim(0, 1)
    plt.ylim(0, np.max(p_theta_given_data)*1.2)
    plt.xlabel(r'$\theta$')
    plt.ylabel(r'$P(\theta|D)$')
    plt.title('Posterior')
    loc = np.linspace(0, np.max(p_theta_given_data), 5)
    plt.text(locx, loc[1], r'mean($\theta$;%5.2f)' % mean_theta_given_data)
    plt.text(locx, loc[2], 'P(D) = %g' % p_data)
    # Plot the HDI
    plt.text(locx, loc[3],
             'Intervals =%s' % ', '.join('%.3f' % x for x in intervals))
    for i in range(0, len(intervals), 2):
        plt.fill_between(theta, 0, p_theta_given_data,
                         where=np.logical_and(theta > intervals[i],
                                              theta < intervals[i+1]),
                         color='blue', alpha=0.3)
    plt.show()
    return p_theta_given_data


###Create vector of theta values.
bin_width = 1/1000.
theta_grid = np.arange(0, 1+bin_width, bin_width)
##Specify probability mass at each theta value.
rel_prob = np.array([0.1] * len(theta_grid))  # uniform prior
prior = rel_prob / sum(rel_prob)  # probability mass at each theta


#### figure 6.2 ###
#np.random.seed(123)
#a = [0.1] * 50
#b = np.linspace(0.1, 1, 50)
#c = np.linspace(1, 0.1, 50)
#d = [0.1] * 50
#p_theta = np.concatenate((a, b, c, d))
#prior = np.where(p_theta != 0 , p_theta / sum(p_theta), 0.)
#width = 1. / len(p_theta)
#theta_grid = np.arange(width/2 , (1-width/2)+width, width)

### figure 6.3 ###
#np.random.seed(123)
#a = np.repeat([0], [50])
#b = np.linspace(0, 1, 50)
#c = (np.linspace(1, 0, 20))**2
#d = np.random.uniform(size=3)
#e = np.repeat([1], [20])
#p_theta = np.concatenate((a, b, c, d, e))
#prior = np.where(p_theta != 0 , p_theta / sum(p_theta), 0.)
#width = 1. / len(p_theta)
#theta_grid = np.arange(width/2 , (1-width/2)+width, width)

###Specify the data vector.
data_vec = np.repeat([1, 0], [11, 3])  # 11 heads, 3 tails
###Call the function.
posterior = bern_grid(theta=theta_grid, p_theta=prior, data=data_vec)
In [30]:
"""plot_post.py"""

def plot_post(param_sample_vec, cred_mass=0.95, comp_val=False,
              ROPE=False, ylab='', xlab='parameter', fontsize=14, labelsize=14,
              title='', framealpha=1, facecolor='skyblue', edgecolor='white',
              show_mode=True, bins=50):
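    """Plot a histogram of an MCMC sample with its HDI, and optionally a
    comparison value and a ROPE; return a dictionary of summary statistics
    (mean, median, mode, HDI limits, and related quantities)."""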
    
    #compute HDI
    HDI = hpd(param_sample_vec, 1-cred_mass)

    post_summary = {'mean':0,'median':0,'mode':0, 'hdi_mass':0,'hdi_low':0,
                   'hdi_high':0, 'comp_val':0, 'pc_gt_comp_val':0, 'ROPE_low':0,
                   'ROPE_high':0, 'pc_in_ROPE':0}
    post_summary['mean'] = np.mean(param_sample_vec)
    post_summary['median'] = np.median(param_sample_vec)
    post_summary['mode'] = stats.mode(param_sample_vec)[0]
    post_summary['hdi_mass'] = cred_mass
    post_summary['hdi_low'] = HDI[0]
    post_summary['hdi_high'] = HDI[1]

    # Plot histogram.
    n, bins, patches = plt.hist(param_sample_vec, normed=True, bins=bins,
                                edgecolor=edgecolor, facecolor=facecolor)
    plt.xlabel(xlab, fontsize=fontsize)
    plt.ylabel(ylab, fontsize=fontsize)
    plt.title(title, fontsize=fontsize)

    cv_ht = 0.75*np.max(n)
    cen_tend_ht = 0.9 * cv_ht
    ROPE_text_ht = 0.55 * cv_ht
#    # Display mean or mode:
    if show_mode:
        plt.plot(0, label='mode = %.2f' % post_summary['mode'], alpha=0)
    else:
        plt.plot(0, label='mean = %.2f' % post_summary['mean'], alpha=0)
    # Display the comparison value.

    if comp_val is not False:
        pc_gt_comp_val = 100 * np.sum(param_sample_vec > comp_val)/len(param_sample_vec)
        pc_lt_comp_val = 100 - pc_gt_comp_val
        plt.plot([comp_val, comp_val], [0, cv_ht], color='darkgreen',
                 linestyle='--', linewidth=2,
                 label='%.1f%% <%.1f < %.1f%%'
                 % (pc_lt_comp_val, comp_val, pc_gt_comp_val))
        post_summary['comp_val'] = comp_val
        post_summary['pc_gt_comp_val'] = pc_gt_comp_val
#    # Display the ROPE.
    if ROPE is not False:
        rope_col = 'darkred'
        pc_in_ROPE = round(np.sum((param_sample_vec > ROPE[0]) & (param_sample_vec < ROPE[1]))/len(param_sample_vec)*100)
        plt.plot([ROPE[0], ROPE[0]], [0, 0.96*ROPE_text_ht], color=rope_col,
                linestyle=':', linewidth=4,
                label='%.1f%% in ROPE' % pc_in_ROPE)
        plt.plot([ROPE[1], ROPE[1]], [0, 0.96*ROPE_text_ht], color=rope_col,
                linestyle=':', linewidth=4)
        post_summary['ROPE_low'] = ROPE[0] 
        post_summary['ROPE_high'] = ROPE[1] 
        post_summary['pc_in_ROPE'] = pc_in_ROPE
#    # Display the HDI.
    plt.plot(HDI, [0, 0], linewidth=6, color='k', label='HDI %.1f%% %.3f-%.3f' % (cred_mass*100, HDI[0], HDI[1]))
    plt.legend(loc='upper left', fontsize=labelsize, framealpha=framealpha)
    return post_summary
In [31]:
"""
07_BernBetaPyMCFull.py
Inferring a binomial proportion using PyMC.
"""
# Generate the data
y = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0])  # 11 heads and 3 tails


with pm.Model() as model:
    # define the prior
    theta = pm.Beta('theta', 1, 1)  # prior
    # define the likelihood
    y = pm.Bernoulli('y', p=theta, observed=y)

    # Generate a MCMC chain
    trace = pm.sample(5000, pm.Metropolis(),
                      progressbar=False)  # Use Metropolis sampling
#    start = pm.find_MAP()  # Find starting value by optimization
#    step = pm.NUTS()  # Instantiate NUTS sampler
#    trace = pm.sample(5000, step, start=start, progressbar=False)

# create an array with the posterior sample
theta_sample = trace['theta']

print "theta_sample", theta_sample
plt.figure(figsize=(16,6)) # Brian added

plt.subplot(1, 2, 1)
plt.plot(theta_sample[:500], np.arange(500), marker='o')
plt.xlim(0, 1)
plt.xlabel(r'$\theta$')
plt.ylabel('Position in Chain')

plt.subplot(1, 2, 2)
mcmc_info = plot_post(theta_sample, xlab=r'$\theta$', show_mode=False)

# Posterior prediction:
# For each step in the chain, use posterior theta to flip a coin:
y_pred = np.zeros(len(theta_sample))
for i, p_head in enumerate(theta_sample):
    y_pred[i] = np.random.choice([0, 1], p=[1 - p_head, p_head])

# Jitter the 0,1 y values for plotting purposes:
y_pred_jittered = y_pred + np.random.uniform(-.05, .05, size=len(theta_sample))

# Now plot the jittered values:
plt.figure(figsize=(16,6)) # Brian added

plt.plot(theta_sample[:500], y_pred_jittered[:500], 'ro')
plt.xlim(-.1, 1.1)
plt.ylim(-.1, 1.1)
plt.xlabel(r'$\theta$')
plt.ylabel('y (jittered)')

mean_y = np.mean(y_pred)
mean_theta = np.mean(theta_sample)

plt.plot(mean_theta, mean_y, 'k+', markersize=15)
plt.annotate('mean(y) = %.2f\nmean($\\theta$) = %.2f' %
    (mean_y, mean_theta), xy=(mean_theta, mean_y))
plt.plot([0, 1], [0, 1], linestyle='--')

plt.show()
Output: theta_sample [ 0.5  0.5  0.5 ...,  0.80979487  0.80979487  0.80979487]
In [32]:
"""
07_BernMetropolisTemplate.py
Use this program as a template for experimenting with the Metropolis algorithm
applied to a single parameter called theta, defined on the interval [0,1].
"""

# Specify the data, to be used in the likelihood function.
# This is a vector with one component per flip,
# in which 1 means a "head" and 0 means a "tail".
my_data = np.repeat([1, 0], [11, 3])  # 11 heads, 3 tails

# Define the Bernoulli likelihood function, p(D|theta).
# The argument theta could be a vector, not just a scalar.
def likelihood(theta, data):
    theta = np.array(theta) # ensure you have an array
    z = sum(data[data == 1])  # number of 1's in Data
    N = len(data)  # number of flips in Data
# Compute the likelihood of the Data for each value of Theta.
    if np.size(theta) == 1:  # if theta is a scalar
        p_data_given_theta = 0
        if theta < 1 and theta > 0:
            p_data_given_theta = theta**z * (1-theta)**(N-z)
    else: # if theta is an array
        p_data_given_theta = theta**z * (1-theta)**(N-z)
        # The theta values passed into this function are generated at random,
        # and therefore might be inadvertently greater than 1 or less than 0.
        # The likelihood for theta > 1 or for theta < 0 is zero:
        p_data_given_theta[(theta > 1) | (theta < 0)] = 0
    return p_data_given_theta


# Define the prior density function. For purposes of computing p(D),
# at the end of this program, we want this prior to be a proper density.
# The argument theta could be a vector, not just a scalar.
def prior(theta):
    theta = np.array(theta) # ensure you have an array
# For kicks, here's a bimodal prior. To try it, uncomment the next 2 lines.
    #from scipy.stats import beta
    #prior = beta.pdf(np.minimum(2*theta, 2*(1-theta)), 2, 2)
    if np.size(theta) == 1:  # if theta is a scalar
        prior = 0
        if theta < 1 and theta > 0:
            prior = 1
    else: # if theta is an array
        prior = np.ones(len(theta))  # uniform density over [0,1]
        # The theta values passed into this function are generated at random,
        # and therefore might be inadvertently greater than 1 or less than 0.
        # The likelihood for theta > 1 or for theta < 0 is zero:
        prior[(theta > 1) | (theta < 0)] = 0
    return prior



# Define the relative probability of the target distribution, 
# as a function of vector theta. For our application, this
# target distribution is the unnormalized posterior distribution.
def target_rel_prob(theta, data):
    target_rel_prob = likelihood(theta , data) * prior(theta)
    return target_rel_prob

# Specify the length of the trajectory, i.e., the number of jumps to try:
traj_length = 5000 # arbitrary large number
# Initialize the vector that will store the results:
trajectory = np.zeros(traj_length)
# Specify where to start the trajectory:
trajectory[0] = 0.50 # arbitrary value
# Specify the burn-in period:
burn_in = int(np.ceil(0.1 * traj_length))  # arbitrary number, less than traj_length
# Initialize accepted, rejected counters, just to monitor performance:
n_accepted = 0
n_rejected = 0
# Specify seed to reproduce same random walk:
np.random.seed(4745)

# Now generate the random walk. The 't' index is time or trial in the walk.
for t in range(traj_length-1):
    current_position = trajectory[t]
    # Use the proposal distribution to generate a proposed jump.
    # The shape and variance of the proposal distribution can be changed
    # to whatever you think is appropriate for the target distribution.
    proposed_jump = np.random.normal(loc=0 , scale=0.1, size=1)
    
#    # Compute the probability of accepting the proposed jump.
    prob_accept = np.minimum(1, 
                            target_rel_prob(current_position + proposed_jump, my_data)
                            / target_rel_prob(current_position, my_data))
#    # Generate a random uniform value from the interval [0,1] to
#    # decide whether or not to accept the proposed jump.
    if np.random.rand() < prob_accept:
        # accept the proposed jump
        trajectory[t+1] = current_position + proposed_jump
        # increment the accepted counter, just to monitor performance
        if t > burn_in:
            n_accepted += 1
    else:
        # reject the proposed jump, stay at current position
        trajectory[t+1] = current_position
        # increment the rejected counter, just to monitor performance
        if t > burn_in:
            n_rejected += 1


# Extract the post-burn_in portion of the trajectory.
accepted_traj = trajectory[burn_in:]
# End of Metropolis algorithm.



# Display rejected/accepted ratio in the plot.
mean_traj = np.mean(accepted_traj)
std_traj = np.std(accepted_traj)

plt.figure(figsize=(16,6)) # Brian added

plt.plot(0, label=r'$N_{pro}=%s$ $\frac{N_{acc}}{N_{pro}} = %.3f$' % (len(accepted_traj), (n_accepted/len(accepted_traj))), alpha=0)

# Evidence for model, p(D).

# Compute a,b parameters for beta distribution that has the same mean
# and stdev as the sample from the posterior. This is a useful choice
# when the likelihood function is Bernoulli.
a = mean_traj * ((mean_traj*(1 - mean_traj)/std_traj**2) - 1)
b = (1 - mean_traj) * ((mean_traj*(1 - mean_traj)/std_traj**2) - 1)

# For every theta value in the posterior sample, compute 
# dbeta(theta,a,b) / likelihood(theta)*prior(theta)
# This computation assumes that likelihood and prior are proper densities,
# i.e., not just relative probabilities. This computation also assumes that
# the likelihood and prior functions were defined to accept a vector argument,
# not just a single-component scalar argument.
wtd_evid = beta.pdf(accepted_traj, a, b) / (likelihood(accepted_traj, my_data) * prior(accepted_traj))
p_data = 1 / np.mean(wtd_evid)


# Display p(D) in the graph
plt.plot(0, label='p(D) = %.3e' % p_data, alpha=0)

# Display the posterior.
ROPE = np.array([0.76, 0.8])
mcmc_info = plot_post(accepted_traj, xlab='theta', show_mode=False, comp_val=0.9, ROPE=ROPE)


# Uncomment next line if you want to save the graph.
plt.show()
In [33]:
"""
HDI_of_grid.py
Arguments:
probMassVec is a vector of probability masses at each grid point.
credMass is the desired mass of the HDI region.
Returns a dictionary with:
indices: a vector of the indices that are in the HDI
mass: the total probability mass of the included indices
height: the smallest component probability mass in the HDI
"""

def HDI_of_grid(probMassVec, credMass=0.95):
    sortedProbMass = np.sort(probMassVec, axis=None)[::-1]
    HDIheightIdx = np.min(np.where(np.cumsum(sortedProbMass) >= credMass))
    HDIheight = sortedProbMass[HDIheightIdx]
    HDImass = np.sum(probMassVec[probMassVec >= HDIheight])
    idx = np.where(probMassVec >= HDIheight)
    return {'indices':idx, 'mass':HDImass, 'height':HDIheight}
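
A small illustration of HDI_of_grid on a one-dimensional grid (not part of the original scripts; the grid resolution and the beta(12, 4) shape are arbitrary):

theta_1d = np.linspace(0, 1, 1000)
pmf = beta.pdf(theta_1d, 12, 4)
pmf = pmf / np.sum(pmf)
info = HDI_of_grid(pmf, credMass=0.95)
hdi_lo, hdi_hi = theta_1d[info['indices']].min(), theta_1d[info['indices']].max()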
In [34]:
"""
08_BernTwoGrid.py
Inferring two binomial proportions via grid approximation.
"""

# Specify the grid on theta1,theta2 parameter space.
n_int = 500  # arbitrary number of intervals for grid on theta.
theta1 = np.linspace(0, 1, n_int)
theta2 = theta1

theta1_grid, theta2_grid = np.meshgrid(theta1, theta2)

# Specify the prior probability _masses_ on the grid.
prior_name = ("Beta","Ripples","Null","Alt")[0]  # or define your own.
if prior_name == "Beta":
    a1, b1, a2, b2 = 3, 3, 3, 3
    prior1 = beta.pdf(theta1_grid, a1, b1)
    prior2 = beta.pdf(theta2_grid, a2, b2)
    prior = prior1 * prior2
    prior = prior / np.sum(prior)

if prior_name == "Ripples":
    m1, m2, k = 0, 1, 0.75 * np.pi
    prior = np.sin((k*(theta1_grid-m1))**2 + (k*(theta2_grid-m2))**2)**2
    prior = prior / np.sum(prior)

if prior_name == "Null":
    # 1's at theta1=theta2, 0's everywhere else:
    prior = np.eye(len(theta1_grid), len(theta2_grid))
    prior = prior / np.sum(prior)

if prior_name == "Alt":
#    # Uniform:
    prior = np.ones((len(theta1_grid), len(theta2_grid)))
    prior = prior / np.sum(prior)

# Specify likelihood
z1, N1, z2, N2 = 5, 7, 2, 7  # data are specified here
likelihood = theta1_grid**z1 * (1-theta1_grid)**(N1-z1) * theta2_grid**z2 * (1-theta2_grid)**(N2-z2)

# Compute posterior from point-by-point multiplication and normalizing:
p_data = np.sum(prior * likelihood)
posterior = (prior * likelihood) / p_data

# Specify the probability mass for the HDI region
credib = .95
thin = 4
color = 'skyblue'

fig = plt.figure(figsize=(12,12))

# prior
ax = fig.add_subplot(3, 2, 1, projection='3d')
ax.plot_surface(theta1_grid[::thin,::thin], theta2_grid[::thin,::thin], prior[::thin,::thin], color=color)
ax.set_xlabel(r'$\theta1$')
ax.set_ylabel(r'$\theta2$')
ax.set_zlabel(r'$p(t1,t2)$')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])

plt.subplot(3, 2, 2)
plt.contour(theta1_grid, theta2_grid, prior, colors=color)
plt.xlabel(r'$\theta1$')
plt.ylabel(r'$\theta2$')

# likelihood
ax = fig.add_subplot(3, 2, 3, projection='3d')
ax.plot_surface(theta1_grid[::thin,::thin], theta2_grid[::thin,::thin], likelihood[::thin,::thin], color=color)
ax.set_xlabel(r'$\theta1$')
ax.set_ylabel(r'$\theta2$')
ax.set_zlabel(r'$p(D|t1,t2)$')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])

plt.subplot(3, 2, 4)
plt.contour(theta1_grid, theta2_grid, likelihood, colors=color)
plt.xlabel(r'$\theta1$')
plt.ylabel(r'$\theta2$')
plt.plot(0, label='z1,N1,z2,N2=%s,%s,%s,%s' % (z1, N1, z2, N2), alpha=0)
plt.legend(loc='upper left')

# posterior
ax = fig.add_subplot(3, 2, 5, projection='3d')
ax.plot_surface(theta1_grid[::thin,::thin], theta2_grid[::thin,::thin],posterior[::thin,::thin], color=color)
ax.set_xlabel(r'$\theta1$')
ax.set_ylabel(r'$\theta2$')
ax.set_zlabel(r'$p(t1,t2|D)$')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])

plt.subplot(3, 2, 6)
plt.contour(theta1_grid, theta2_grid, posterior, colors=color)
plt.xlabel(r'$\theta1$')
plt.ylabel(r'$\theta2$')
plt.plot(0, label='p(D) = %.3e' % p_data, alpha=0)
plt.legend(loc='upper left')

# Mark the highest posterior density region
HDI_height = HDI_of_grid(posterior)['height']
plt.contour(theta1_grid, theta2_grid, posterior, levels=[HDI_height], colors='k')

plt.tight_layout()
plt.show()