#!/usr/bin/env python # coding: utf-8 # Kruschke's Doing Bayesian Data Analysis in PyMC3 # ========================== # # The scripts from https://github.com/aloctavodia/Doing_bayesian_data_analysis collected into one IPython notebook, with minor edits: # - all imports collected into the first cell # - import scipy.special.binom so it does not overwrite scipy.stats.binom # - move csv files into cells, and read them using StringIO # # Only some files import division from \_\_future__ but this does not appear to cause any bugs. # In[3]: from __future__ import division import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns import pymc3 as pm import sys from StringIO import StringIO from scipy.stats import norm, multivariate_normal, beta, binom, stats from scipy.stats import t as stats_t from scipy.special import beta as beta_func from scipy.special import binom as special_binom from scipy.special import betaln as special_betaln from scipy.optimize import fmin from scipy.interpolate import spline from mpl_toolkits.mplot3d.axes3d import Axes3D get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'") # In[21]: """ 02_SimpleGraph.py A simple graph drawn by Python :-) """ x = np.linspace(-2, 2, 40) y = x**2 plt.figure(figsize=(16,6)) plt.plot(x, y) plt.show() # In[22]: """ 03_IntegralOfDensity.py Graph of normal probability density function, with comb of intervals. """ meanval = 0.0 # Specify mean of distribution. sdval = 0.2 # Specify standard deviation of distribution. xlow = meanval - 3 * sdval # Specify low end of x-axis. xhigh = meanval + 3 * sdval # Specify high end of x-axis. dx = 0.02 # Specify interval width on x-axis # Specify comb points along the x axis: x = np.arange(xlow, xhigh, dx) # Compute y values, i.e., probability density at each value of x: y = (1/(sdval*np.sqrt(2*np.pi))) * np.exp(-.5 * ((x - meanval)/sdval)**2) # Plot the function. "plot" draws the bell curve. "stem" draws the intervals. plt.figure(figsize=(16,6)) # Brian added plt.plot(x, y) plt.stem(x, y, markerfmt=' ') plt.xlabel('$x$') plt.ylabel('$p(x)$') plt.title('Normal Probability Density') # Approximate the integral as the sum of width * height for each interval. area = np.sum(dx*y) # Display info in the graph. _ = plt.text(-.6, 1.7, '$\mu$ = %s' % meanval) _ = plt.text(-.6, 1.5, '$\sigma$ = %s' % sdval) _ = plt.text(.2, 1.7, '$\Delta x$ = %s' % dx) _ = plt.text(.2, 1.5, '$\sum_{x}$ $\Delta x$ $p(x)$ = %5.3f' % area) # In[5]: """ 03_RunningProportion.py Goal: Toss a coin N times and compute the running proportion of heads. """ # Specify the total number of flips, denoted N. N = 500 # Generate a random sample of N flips for a fair coin (heads=1, tails=0); np.random.seed(47405) flip_sequence = np.random.choice(a=(0, 1), p=(.5, .5), size=N, replace=True) # Compute the running proportion of heads: r = np.cumsum(flip_sequence) n = np.linspace(1, N, N) # n is a vector. run_prop = r/n # component by component division. # Graph the running proportion: plt.figure(figsize=(16,6)) # Brian added plt.plot(n, run_prop, '-o', ) plt.xscale('log') # an alternative to plot() and xscale() is semilogx() plt.xlim(1, N) plt.ylim(0, 1) plt.xlabel('Flip Number') plt.ylabel('Proportion Heads') plt.title('Running Proportion of Heads') # Plot a dotted horizontal line at y=.5, just as a reference line: plt.axhline(y=.5, ls='dashed') # Display the beginning of the flip sequence. 
flipletters = ''.join(["T","H"][flip] for flip in flip_sequence[:10]) _ = plt.text(10, 0.8, 'Flip Sequence = %s...' % flipletters) # Display the relative frequency at the end of the sequence. _ = plt.text(25, 0.2, 'End Proportion = %s' % run_prop[-1]) # In[24]: """ 04_BayesUpdate.py Bayesian updating of beliefs about the bias of a coin. The prior and posterior distributions indicate probability masses at discrete candidate values of theta. """ # theta is the vector of candidate values for the parameter theta. # n_theta_vals is the number of candidate theta values. # To produce the examples in the book, set n_theta_vals to either 3 or 63. n_theta_vals = 3. # Now make the vector of theta values: theta = np.linspace(1/(n_theta_vals +1), n_theta_vals /(n_theta_vals +1), n_theta_vals ) # p_theta is the vector of prior probabilities on the theta values. p_theta = np.minimum(theta, 1-theta) # Makes a triangular belief distribution. p_theta = p_theta / np.sum(p_theta) # Makes sure that beliefs sum to 1. # Specify the data. To produce the examples in the book, use either # data = np.repeat([1,0], [3, 9]) or data = np.repeat([1,0], [1, 11]) data = np.repeat([1, 0], [3, 9]) n_heads = np.sum(data) n_tails = len(data) - n_heads # Compute the likelihood of the data for each value of theta: p_data_given_theta = theta**n_heads * (1-theta)**n_tails # Compute the posterior: p_data = np.sum(p_data_given_theta * p_theta) p_theta_given_data = p_data_given_theta * p_theta / p_data # This is Bayes' rule! # Plot the results. plt.figure(figsize=(12, 11)) plt.subplots_adjust(hspace=0.7) # Plot the prior: plt.subplot(3, 1, 1) plt.stem(theta, p_theta, markerfmt=' ') plt.xlim(0, 1) plt.xlabel('$\\theta$') plt.ylabel('$P(\\theta)$') plt.title('Prior') # Plot the likelihood: plt.subplot(3, 1, 2) plt.stem(theta, p_data_given_theta, markerfmt=' ') plt.xlim(0, 1) plt.xlabel('$\\theta$') plt.ylabel('$P(D|\\theta)$') plt.title('Likelihood') plt.text(0.6, np.max(p_data_given_theta)/2, 'D = %sH,%sT' % (n_heads, n_tails)) # Plot the posterior: plt.subplot(3, 1, 3) plt.stem(theta, p_theta_given_data, markerfmt=' ') plt.xlim(0, 1) plt.xlabel('$\\theta$') plt.ylabel('$P(\\theta|D)$') plt.title('Posterior') _ = plt.text(0.6, np.max(p_theta_given_data)/2, 'P(D) = %g' % p_data) # In[25]: """ HIDofICDF.py This program finds the HDI of a probability density function that is specified mathematically in Python. """ def HDIofICDF(dist_name, credMass=0.95, **args): # freeze distribution with given arguments distri = dist_name(**args) # initial guess for HDIlowTailPr incredMass = 1.0 - credMass def intervalWidth(lowTailPr): return distri.ppf(credMass + lowTailPr) - distri.ppf(lowTailPr) # find lowTailPr that minimizes intervalWidth HDIlowTailPr = fmin(intervalWidth, incredMass, ftol=1e-8, disp=False)[0] # return interval as array([low, high]) return distri.ppf([HDIlowTailPr, credMass + HDIlowTailPr]) # In[26]: """ 05_BernBeta.py Inferring a binomial proportion via exact mathematical analysis. """ def bern_beta(prior_shape, data_vec, cred_mass=0.95): """Bayesian updating for Bernoulli likelihood and beta prior. Input arguments: prior_shape vector of parameter values for the prior beta distribution. data_vec vector of 1's and 0's. cred_mass the probability mass of the HDI. Output: post_shape vector of parameter values for the posterior beta distribution. Graphics: Creates a three-panel graph of prior, likelihood, and posterior with highest posterior density interval. 
Example of use: post_shape = bern_beta(prior_shape=[1,1] , data_vec=[1,0,0,1,1])""" # Check for errors in input arguments: if len(prior_shape) != 2: sys.exit('prior_shape must have two components.') if any([i < 0 for i in prior_shape]): sys.exit('prior_shape components must be positive.') if any([i != 0 and i != 1 for i in data_vec]): sys.exit('data_vec must be a vector of 1s and 0s.') if cred_mass <= 0 or cred_mass >= 1.0: sys.exit('cred_mass must be between 0 and 1.') # Rename the prior shape parameters, for convenience: a = prior_shape[0] b = prior_shape[1] # Create summary values of the data: z = sum(data_vec[data_vec == 1]) # number of 1's in data_vec N = len(data_vec) # number of flips in data_vec # Compute the posterior shape parameters: post_shape = [a+z, b+N-z] # Compute the evidence, p(D): p_data = beta_func(z+a, N-z+b)/beta_func(a, b) # Construct grid of theta values, used for graphing. bin_width = 0.005 # Arbitrary small value for comb on theta. theta = np.arange(bin_width/2, 1-(bin_width/2)+bin_width, bin_width) # Compute the prior at each value of theta. p_theta = beta.pdf(theta, a, b) # Compute the likelihood of the data at each value of theta. p_data_given_theta = theta**z * (1-theta)**(N-z) # Compute the posterior at each value of theta. post_a = a + z post_b = b+N-z p_theta_given_data = beta.pdf(theta, a+z, b+N-z) # Determine the limits of the highest density interval intervals = HDIofICDF(beta, cred_mass, a=post_shape[0], b=post_shape[1]) # Plot the results. plt.figure(figsize=(12, 12)) plt.subplots_adjust(hspace=0.7) # Plot the prior. locx = 0.05 plt.subplot(3, 1, 1) plt.plot(theta, p_theta) plt.xlim(0, 1) plt.ylim(0, np.max(p_theta)*1.2) plt.xlabel(r'$\theta$') plt.ylabel(r'$P(\theta)$') plt.title('Prior') plt.text(locx, np.max(p_theta)/2, r'beta($\theta$;%s,%s)' % (a, b)) # Plot the likelihood: plt.subplot(3, 1, 2) plt.plot(theta, p_data_given_theta) plt.xlim(0, 1) plt.ylim(0, np.max(p_data_given_theta)*1.2) plt.xlabel(r'$\theta$') plt.ylabel(r'$P(D|\theta)$') plt.title('Likelihood') plt.text(locx, np.max(p_data_given_theta)/2, 'Data: z=%s, N=%s' % (z, N)) # Plot the posterior: plt.subplot(3, 1, 3) plt.plot(theta, p_theta_given_data) plt.xlim(0, 1) plt.ylim(0, np.max(p_theta_given_data)*1.2) plt.xlabel(r'$\theta$') plt.ylabel(r'$P(\theta|D)$') plt.title('Posterior') locy = np.linspace(0, np.max(p_theta_given_data), 5) plt.text(locx, locy[1], r'beta($\theta$;%s,%s)' % (post_a, post_b)) plt.text(locx, locy[2], 'P(D) = %g' % p_data) # Plot the HDI plt.text(locx, locy[3], 'Intervals = %.3f - %.3f' % (intervals[0], intervals[1])) plt.fill_between(theta, 0, p_theta_given_data, where=np.logical_and(theta > intervals[0], theta < intervals[1]), color='blue', alpha=0.3) return intervals data_vec = np.repeat([1, 0], [11, 3]) # 11 heads, 3 tail intervals = bern_beta(prior_shape=[100, 100], data_vec=data_vec) plt.show() # In[27]: """ 05_BetaPosteriorPredictions.py Posterior predictive check. Examine the veracity of the winning model by simulating data sampled from the winning model and see if the simulated data 'look like' the actual data. """ # Specify known values of prior and actual data. prior_a = 100 prior_b = 1 actual_data_Z = 8 actual_data_N = 12 # Compute posterior parameter values. post_a = prior_a + actual_data_Z post_b = prior_b + actual_data_N - actual_data_Z # Number of flips in a simulated sample should match the actual sample size: sim_sample_size = actual_data_N # Designate an arbitrarily large number of simulated samples. 
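# (A rough guide to "arbitrarily large", not from the original script: the Monte Carlo
#  standard error of any proportion estimated from the simulated samples is about
#  sqrt(p*(1-p)/n_sim_samples), so for a proportion near 0.5 and 1000 samples it is
#  np.sqrt(0.25/1000), roughly 0.016. Increase n_sim_samples for finer resolution.)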
n_sim_samples = 1000 # Set aside a vector in which to store the simulation results. sim_sample_Z_record = np.zeros(n_sim_samples) # Now generate samples from the posterior. for sample_idx in range(0, n_sim_samples): # Generate a theta value for the new sample from the posterior. sample_theta = beta.rvs(post_a, post_b) # Generate a sample, using sample_theta. sample_data = np.random.choice([0, 1], p=[1-sample_theta, sample_theta], size=sim_sample_size, replace=True) sim_sample_Z_record[sample_idx] = sum(sample_data) ## Make a histogram of the number of heads in the samples. plt.figure(figsize=(16,4)) plt.title("a histogram of the number of heads in the samples") plt.hist(sim_sample_Z_record) plt.show() # In[28]: """HPD.py This code was taken form the PyMC library https://github.com/pymc-devs/pymc """ def calc_min_interval(x, alpha): """Internal method to determine the minimum interval of a given width Assumes that x is sorted numpy array. """ n = len(x) cred_mass = 1.0-alpha interval_idx_inc = int(np.floor(cred_mass*n)) n_intervals = n - interval_idx_inc interval_width = x[interval_idx_inc:] - x[:n_intervals] if len(interval_width) == 0: raise ValueError('Too few elements for interval calculation') min_idx = np.argmin(interval_width) hdi_min = x[min_idx] hdi_max = x[min_idx+interval_idx_inc] return hdi_min, hdi_max def hpd(x, alpha=0.05): """Calculate highest posterior density (HPD) of array for given alpha. The HPD is the minimum width Bayesian credible interval (BCI). :Arguments: x : Numpy array An array containing MCMC samples alpha : float Desired probability of type I error (defaults to 0.05) """ # Make a copy of trace x = x.copy() # For multivariate node if x.ndim > 1: # Transpose first, then sort tx = np.transpose(x, list(range(x.ndim))[1:]+[0]) dims = np.shape(tx) # Container list for intervals intervals = np.resize(0.0, dims[:-1]+(2,)) for index in make_indices(dims[:-1]): try: index = tuple(index) except TypeError: pass # Sort trace sx = np.sort(tx[index]) # Append to list intervals[index] = calc_min_interval(sx, alpha) # Transpose back before returning return np.array(intervals) else: # Sort univariate node sx = np.sort(x) return np.array(calc_min_interval(sx, alpha)) # In[29]: """ 06_BernGrid.py Inferring a binomial proportion via grid aproximation. """ def bern_grid(theta, p_theta, data, credib=.95): """ Bayesian updating for Bernoulli likelihood and prior specified on a grid. Input arguments: theta is a vector of theta values, all between 0 and 1. p_theta is a vector of corresponding probability _masses_. data is a vector of 1's and 0's, where 1 corresponds to a and 0 to b. credib is the probability mass of the credible interval, default is 0.95. Output: p_theta_given_data is a vector of posterior probability masses over theta. Also creates a three-panel graph of prior, likelihood, and posterior probability masses with credible interval. Example of use: Create vector of theta values. bin_width = 1/1000 theta_grid = np.arange(0, 1+bin_width, bin_width) Specify probability mass at each theta value. > rel_prob = np.minimum(theta_grid, 1-theta_grid) relative prob at each theta > prior = rel_prob / sum(rel_prob) probability mass at each theta Specify the data vector. data_vec = np.repeat([1, 0], [11, 3]) # 3 heads, 1 tail Call the function. 
> posterior = bern_grid( theta=theta_grid , p_theta=prior , data=data_vec ) """ # Create summary values of data z = sum(data[data == 1]) # number of 1's in data N = len(data) # number of flips in data # Compute the likelihood of the data for each value of theta. p_data_given_theta = theta**z * (1 - theta)**(N - z) # Compute the evidence and the posterior. p_data = sum(p_data_given_theta * p_theta) p_theta_given_data = p_data_given_theta * p_theta / p_data # Determine the limits of the highest density interval x = np.random.choice(theta, size=5000, replace=True, p=p_theta_given_data) intervals = hpd(x, alpha=1-credib) # Plot the results. plt.figure(figsize=(16, 12)) # Brian edited plt.subplots_adjust(hspace=0.7) # # Plot the prior. locx = 0.05 mean_theta = sum(theta * p_theta) # mean of prior, for plotting plt.subplot(3, 1, 1) plt.plot(theta, p_theta) plt.xlim(0, 1) plt.ylim(0, np.max(p_theta)*1.2) plt.xlabel(r'$\theta$') plt.ylabel(r'$P(\theta)$') plt.title('Prior') plt.text(locx, np.max(p_theta)/2, r'mean($\theta$;%5.2f)' % mean_theta) # Plot the likelihood: plt.subplot(3, 1, 2) plt.plot(theta, p_data_given_theta) plt.xlim(0, 1) plt.ylim(0, np.max(p_data_given_theta)*1.2) plt.xlabel(r'$\theta$') plt.ylabel(r'$P(D|\theta)$') plt.title('Likelihood') plt.text(locx, np.max(p_data_given_theta)/2, 'data: z=%s, N=%s' % (z, N)) # Plot the posterior: mean_theta_given_data = sum(theta * p_theta_given_data) plt.subplot(3, 1, 3) plt.plot(theta, p_theta_given_data) plt.xlim(0, 1) plt.ylim(0, np.max(p_theta_given_data)*1.2) plt.xlabel(r'$\theta$') plt.ylabel(r'$P(\theta|D)$') plt.title('Posterior') loc = np.linspace(0, np.max(p_theta_given_data), 5) plt.text(locx, loc[1], r'mean($\theta$;%5.2f)' % mean_theta_given_data) plt.text(locx, loc[2], 'P(D) = %g' % p_data) # Plot the HDI plt.text(locx, loc[3], 'Intervals =%s' % ', '.join('%.3f' % x for x in intervals)) for i in range(0, len(intervals), 2): plt.fill_between(theta, 0, p_theta_given_data, where=np.logical_and(theta > intervals[i], theta < intervals[i+1]), color='blue', alpha=0.3) plt.show() return p_theta_given_data ###Create vector of theta values. bin_width = 1/1000. theta_grid = np.arange(0, 1+bin_width, bin_width) ##Specify probability mass at each theta value. rel_prob = np.array([0.1] * len(theta_grid)) # uniform prior rel_prob = np.array([0.1] * len(theta_grid)) # uniform prior prior = rel_prob / sum(rel_prob) # probability mass at each theta #### figure 6.2 ### #np.random.seed(123) #a = [0.1] * 50 #b = np.linspace(0.1, 1, 50) #c = np.linspace(1, 0.1, 50) #d = [0.1] * 50 #p_theta = np.concatenate((a, b, c, d)) #prior = np.where(p_theta != 0 , p_theta / sum(p_theta), 0.) #width = 1. / len(p_theta) #theta_grid = np.arange(width/2 , (1-width/2)+width, width) ### figure 6.3 ### #np.random.seed(123) #a = np.repeat([0], [50]) #b = np.linspace(0, 1, 50) #c = (np.linspace(1, 0, 20))**2 #d = np.random.uniform(size=3) #e = np.repeat([1], [20]) #p_theta = np.concatenate((a, b, c, d, e)) #prior = np.where(p_theta != 0 , p_theta / sum(p_theta), 0.) #width = 1. / len(p_theta) #theta_grid = np.arange(width/2 , (1-width/2)+width, width) ###Specify the data vector. data_vec = np.repeat([1, 0], [11, 3]) # 3 heads, 1 tail ###Call the function. 
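### (A hedged alternative, taken from the docstring example of bern_grid above: a
###  triangular prior instead of the uniform one. Uncomment these two lines before
###  the call to try it.)
#rel_prob = np.minimum(theta_grid, 1-theta_grid)  # relative prob at each theta
#prior = rel_prob / sum(rel_prob)                 # probability mass at each theta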
posterior = bern_grid(theta=theta_grid, p_theta=prior, data=data_vec) # In[30]: """plot_post.py""" def plot_post(param_sample_vec, cred_mass=0.95, comp_val=False, ROPE=False, ylab='', xlab='parameter', fontsize=14, labelsize=14, title='', framealpha=1, facecolor='skyblue', edgecolor='white', show_mode=True, bins=50): #compute HDI HDI = hpd(param_sample_vec, 1-cred_mass) post_summary = {'mean':0,'median':0,'mode':0, 'hdi_mass':0,'hdi_low':0, 'hdi_high':0, 'comp_val':0, 'pc_gt_comp_val':0, 'ROPE_low':0, 'ROPE_high':0, 'pc_in_ROPE':0} post_summary['mean'] = np.mean(param_sample_vec) post_summary['median'] = np.median(param_sample_vec) post_summary['mode'] = stats.mode(param_sample_vec)[0] post_summary['hdi_mass'] = cred_mass post_summary['hdi_low'] = HDI[0] post_summary['hdi_high'] = HDI[1] # Plot histogram. n, bins, patches = plt.hist(param_sample_vec, normed=True, bins=bins, edgecolor=edgecolor, facecolor=facecolor) plt.xlabel(xlab, fontsize=fontsize) plt.ylabel(ylab, fontsize=fontsize) plt.title(title, fontsize=fontsize) cv_ht = 0.75*np.max(n) cen_tend_ht = 0.9 * cv_ht ROPE_text_ht = 0.55 * cv_ht # # Display mean or mode: if show_mode: plt.plot(0, label='mode = %.2f' % post_summary['mode'], alpha=0) else: plt.plot(0, label='mean = %.2f' % post_summary['mean'], alpha=0) # Display the comparison value. if comp_val is not False: pc_gt_comp_val = 100 * np.sum(param_sample_vec > comp_val)/len(param_sample_vec) pc_lt_comp_val = 100 - pc_gt_comp_val plt.plot([comp_val, comp_val], [0, cv_ht], color='darkgreen', linestyle='--', linewidth=2, label='%.1f%% <%.1f < %.1f%%' % (pc_lt_comp_val, comp_val, pc_gt_comp_val)) post_summary['comp_val'] = comp_val post_summary['pc_gt_comp_val'] = pc_gt_comp_val # # Display the ROPE. if ROPE is not False: rope_col = 'darkred' pc_in_ROPE = round(np.sum((param_sample_vec > ROPE[0]) & (param_sample_vec < ROPE[1]))/len(param_sample_vec)*100) plt.plot([ROPE[0], ROPE[0]], [0, 0.96*ROPE_text_ht], color=rope_col, linestyle=':', linewidth=4, label='%.1f%% in ROPE' % pc_in_ROPE) plt.plot([ROPE[1], ROPE[1]], [0, 0.96*ROPE_text_ht], color=rope_col, linestyle=':', linewidth=4) post_summary['ROPE_low'] = ROPE[0] post_summary['ROPE_high'] = ROPE[1] post_summary['pc_in_ROPE'] = pc_in_ROPE # # Display the HDI. plt.plot(HDI, [0, 0], linewidth=6, color='k', label='HDI %.1f%% %.3f-%.3f' % (cred_mass*100, HDI[0], HDI[1])) plt.legend(loc='upper left', fontsize=labelsize, framealpha=framealpha) return post_summary # In[31]: """ 07_BernBetaPyMCFull.py Inferring a binomial proportion using PyMC. 
""" # Generate the data y = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]) # 11 heads and 3 tails with pm.Model() as model: # define the prior theta = pm.Beta('theta', 1, 1) # prior # define the likelihood y = pm.Bernoulli('y', p=theta, observed=y) # Generate a MCMC chain trace = pm.sample(5000, pm.Metropolis(), progressbar=False) # Use Metropolis sampling # start = pm.find_MAP() # Find starting value by optimization # step = pm.NUTS() # Instantiate NUTS sampler # trace = pm.sample(5000, step, start=start, progressbar=False) # create an array with the posterior sample theta_sample = trace['theta'] print "theta_sample", theta_sample plt.figure(figsize=(16,6)) # Brian added plt.subplot(1, 2, 1) plt.plot(theta_sample[:500], np.arange(500), marker='o') plt.xlim(0, 1) plt.xlabel(r'$\theta$') plt.ylabel('Position in Chain') plt.subplot(1, 2, 2) mcmc_info = plot_post(theta_sample, xlab=r'$\theta', show_mode=False) # Posterior prediction: # For each step in the chain, use posterior theta to flip a coin: y_pred = np.zeros(len(theta_sample)) for i, p_head in enumerate(theta_sample): y_pred[i] = np.random.choice([0, 1], p=[1 - p_head, p_head]) # Jitter the 0,1 y values for plotting purposes: y_pred_jittered = y_pred + np.random.uniform(-.05, .05, size=len(theta_sample)) # Now plot the jittered values: plt.figure(figsize=(16,6)) # Brian added plt.plot(theta_sample[:500], y_pred_jittered[:500], 'ro') plt.xlim(-.1, 1.1) plt.ylim(-.1, 1.1) plt.xlabel(r'$\theta$') plt.ylabel('y (jittered)') mean_y = np.mean(y_pred) mean_theta = np.mean(theta_sample) plt.plot(mean_y, mean_theta, 'k+', markersize=15) plt.annotate('mean(y) = %.2f\nmean($\\theta$) = %.2f' % (mean_y, mean_theta), xy=(mean_y, mean_theta)) plt.plot([0, 1], [0, 1], linestyle='--') plt.show() # In[32]: """ 07_BernMetropolisTemplate.py Use this program as a template for experimenting with the Metropolis algorithm applied to a single parameter called theta, defined on the interval [0,1]. """ # Specify the data, to be used in the likelihood function. # This is a vector with one component per flip, # in which 1 means a "head" and 0 means a "tail". my_data = np.repeat([1, 0], [11, 3]) # 11 heads, 2 tail # Define the Bernoulli likelihood function, p(D|theta). # The argument theta could be a vector, not just a scalar. def likelihood(theta, data): theta = np.array(theta) # ensure you have an array z = sum(data[data == 1]) # number of 1's in Data N = len(data) # number of flips in Data # Compute the likelihood of the Data for each value of Theta. if np.size(theta) == 1: # if theta is an scalar p_data_given_theta = 0 if theta < 1 and theta > 0: p_data_given_theta = theta**z * (1-theta)**(N-z) else: # if theta is an array p_data_given_theta = theta**z * (1-theta)**(N-z) # The theta values passed into this function are generated at random, # and therefore might be inadvertently greater than 1 or less than 0. # The likelihood for theta > 1 or for theta < 0 is zero: p_data_given_theta[(theta > 1) | (theta < 0)] = 0 return p_data_given_theta # Define the prior density function. For purposes of computing p(D), # at the end of this program, we want this prior to be a proper density. # The argument theta could be a vector, not just a scalar. def prior(theta): theta = np.array(theta) # ensure you have an array # For kicks, here's a bimodal prior. To try it, uncomment the next 2 lines. 
#from scipy.stats import beta #prior = dbeta(np.minium(2*theta, 2*(1-theta)), 2, 2) if np.size(theta) == 1: # if theta is an scalar prior = 0 if theta < 1 and theta > 0: prior = 1 else: # if theta is an array prior = np.ones(len(theta)) # uniform density over [0,1] # The theta values passed into this function are generated at random, # and therefore might be inadvertently greater than 1 or less than 0. # The likelihood for theta > 1 or for theta < 0 is zero: prior[(theta > 1) | (theta < 0)] = 0 return prior # Define the relative probability of the target distribution, # as a function of vector theta. For our application, this # target distribution is the unnormalized posterior distribution. def target_rel_prob(theta, data): target_rel_prob = likelihood(theta , data) * prior(theta) return target_rel_prob # Specify the length of the trajectory, i.e., the number of jumps to try: traj_length = 5000 # arbitrary large number # Initialize the vector that will store the results: trajectory = np.zeros(traj_length) # Specify where to start the trajectory: trajectory[0] = 0.50 # arbitrary value # Specify the burn-in period: burn_in = np.ceil(0.1 * traj_length) # arbitrary number, less than traj_length # Initialize accepted, rejected counters, just to monitor performance: n_accepted = 0 n_rejected = 0 # Specify seed to reproduce same random walk: np.random.seed(4745) # Now generate the random walk. The 't' index is time or trial in the walk. for t in range(traj_length-1): current_position = trajectory[t] # Use the proposal distribution to generate a proposed jump. # The shape and variance of the proposal distribution can be changed # to whatever you think is appropriate for the target distribution. proposed_jump = np.random.normal(loc=0 , scale=0.1, size=1) # # Compute the probability of accepting the proposed jump. prob_accept = np.minimum(1, target_rel_prob(current_position + proposed_jump, my_data) / target_rel_prob(current_position, my_data)) # # Generate a random uniform value from the interval [0,1] to # # decide whether or not to accept the proposed jump. if np.random.rand() < prob_accept: # accept the proposed jump trajectory[t+1] = current_position + proposed_jump # increment the accepted counter, just to monitor performance if t > burn_in: n_accepted += 1 else: # reject the proposed jump, stay at current position trajectory[t+1] = current_position # increment the rejected counter, just to monitor performance if t > burn_in: n_rejected += 1 # Extract the post-burn_in portion of the trajectory. accepted_traj = trajectory[burn_in:] # End of Metropolis algorithm. # Display rejected/accepted ratio in the plot. mean_traj = np.mean(accepted_traj) std_traj = np.std(accepted_traj) plt.figure(figsize=(16,6)) # Brian added plt.plot(0, label=r'$N_{pro}=%s$ $\frac{N_{acc}}{N_{pro}} = %.3f$' % (len(accepted_traj), (n_accepted/len(accepted_traj))), alpha=0) # Evidence for model, p(D). # Compute a,b parameters for beta distribution that has the same mean # and stdev as the sample from the posterior. This is a useful choice # when the likelihood function is Bernoulli. a = mean_traj * ((mean_traj*(1 - mean_traj)/std_traj**2) - 1) b = (1 - mean_traj) * ((mean_traj*(1 - mean_traj)/std_traj**2) - 1) # For every theta value in the posterior sample, compute # dbeta(theta,a,b) / likelihood(theta)*prior(theta) # This computation assumes that likelihood and prior are proper densities, # i.e., not just relative probabilities. 
This computation also assumes that # the likelihood and prior functions were defined to accept a vector argument, # not just a single-component scalar argument. wtd_evid = beta.pdf(accepted_traj, a, b) / (likelihood(accepted_traj, my_data) * prior(accepted_traj)) p_data = 1 / np.mean(wtd_evid) # Display p(D) in the graph plt.plot(0, label='p(D) = %.3e' % p_data, alpha=0) # Display the posterior. ROPE = np.array([0.76, 0.8]) mcmc_info = plot_post(accepted_traj, xlab='theta', show_mode=False, comp_val=0.9, ROPE=ROPE) # Uncomment next line if you want to save the graph. plt.show() # In[33]: """ HDI_of_grid.py Arguments: probMassVec is a vector of probability masses at each grid point. credMass is the desired mass of the HDI region. Return a dictionary with: indices is a vector of indices that are in the HDI mass is the total mass of the included indices height is the smallest component probability mass in the HDI """ def HDI_of_grid(probMassVec, credMass=0.95): sortedProbMass = np.sort(probMassVec, axis=None)[::-1] HDIheightIdx = np.min(np.where(np.cumsum(sortedProbMass) >= credMass)) HDIheight = sortedProbMass[HDIheightIdx] HDImass = np.sum(probMassVec[probMassVec >= HDIheight]) idx = np.where(probMassVec >= HDIheight) return {'indices':idx, 'mass':HDImass, 'height':HDIheight} # In[34]: """ 08_BernTwoGrid.py Inferring two binomial proportions via grid aproximation. """ # Specify the grid on theta1,theta2 parameter space. n_int = 500 # arbitrary number of intervals for grid on theta. theta1 = np.linspace(0, 1, n_int) theta2 = theta1 theta1_grid, theta2_grid = np.meshgrid(theta1, theta2) # Specify the prior probability _masses_ on the grid. prior_name = ("Beta","Ripples","Null","Alt")[0] # or define your own. if prior_name == "Beta": a1, b1, a2, b2 = 3, 3, 3, 3 prior1 = beta.pdf(theta1_grid, a1, b1) prior2 = beta.pdf(theta2_grid, a1, b1) prior = prior1 * prior2 prior = prior / np.sum(prior) if prior_name == "Ripples": m1, m2, k = 0, 1, 0.75 * np.pi prior = np.sin((k*(theta1_grid-m1))**2 + (k*(theta2_grid-m2))**2)**2 prior = prior / np.sum(prior) if prior_name == "Null": # 1's at theta1=theta2, 0's everywhere else: prior = np.eye(len(theta1_grid), len(theta2_grid)) prior = prior / np.sum(prior) if prior_name == "Alt": # # Uniform: prior = np.ones((len(theta1_grid), len(theta2_grid))) prior = prior / np.sum(prior) # Specify likelihood z1, N1, z2, N2 = 5, 7, 2, 7 # data are specified here likelihood = theta1_grid**z1 * (1-theta1_grid)**(N1-z1) * theta2_grid**z2 * (1-theta2_grid)**(N2-z2) # Compute posterior from point-by-point multiplication and normalizing: p_data = np.sum(prior * likelihood) posterior = (prior * likelihood) / p_data # Specify the probability mass for the HDI region credib = .95 thin = 4 color = 'skyblue' fig = plt.figure(figsize=(12,12)) # prior ax = fig.add_subplot(3, 2, 1, projection='3d') ax.plot_surface(theta1_grid[::thin,::thin], theta2_grid[::thin,::thin], prior[::thin,::thin], color=color) ax.set_xlabel(r'$\theta1$') ax.set_ylabel(r'$\theta1$') ax.set_zlabel(r'$p(t1,t2)$') ax.set_xticklabels([]) ax.set_yticklabels([]) ax.set_zticklabels([]) plt.subplot(3, 2, 2) plt.contour(theta1_grid, theta2_grid, prior, colors=color) plt.xlabel(r'$\theta1$') plt.ylabel(r'$\theta1$') # likelihood ax = fig.add_subplot(3, 2, 3, projection='3d') ax.plot_surface(theta1_grid[::thin,::thin], theta2_grid[::thin,::thin], likelihood[::thin,::thin], color=color) ax.set_xlabel(r'$\theta1$') ax.set_ylabel(r'$\theta1$') ax.set_zlabel(r'$p(D|t1,t2)$') ax.set_xticklabels([]) ax.set_yticklabels([]) 
ax.set_zticklabels([]) plt.subplot(3, 2, 4) plt.contour(theta1_grid, theta2_grid, likelihood, colors=color) plt.xlabel(r'$\theta1$') plt.ylabel(r'$\theta1$') plt.plot(0, label='z1,N1,z2,N2=%s,%s,%s,%s' % (z1, N1, z2, N2), alpha=0) plt.legend(loc='upper left') # posterior ax = fig.add_subplot(3, 2, 5, projection='3d') ax.plot_surface(theta1_grid[::thin,::thin], theta2_grid[::thin,::thin],posterior[::thin,::thin], color=color) ax.set_xlabel(r'$\theta1$') ax.set_ylabel(r'$\theta1$') ax.set_zlabel(r'$p(t1,t2|D)$') ax.set_xticklabels([]) ax.set_yticklabels([]) ax.set_zticklabels([]) plt.subplot(3, 2, 6) plt.contour(theta1_grid, theta2_grid, posterior, colors=color) plt.xlabel(r'$\theta1$') plt.ylabel(r'$\theta1$') plt.plot(0, label='p(D) = %.3e' % p_data, alpha=0) plt.legend(loc='upper left') # Mark the highest posterior density region HDI_height = HDI_of_grid(posterior)['height'] plt.contour(theta1_grid, theta2_grid, posterior, levels=[HDI_height], colors='k') plt.tight_layout() plt.show() # In[6]: """ 08_BernTwoMetropolis.py Use this program as a template for experimenting with the Metropolis algorithm applied to 2 parameters called theta1,theta2 defined on the domain [0,1]x[0,1]. """ # Define the likelihood function. # The input argument is a vector: theta = [theta1 , theta2] def likelihood(theta): # Data are constants, specified here: z1, N1, z2, N2 = 5, 7, 2, 7 likelihood = (theta[0]**z1 * (1-theta[0])**(N1-z1) * theta[1]**z2 * (1-theta[1])**(N2-z2)) return likelihood # Define the prior density function. # The input argument is a vector: theta = [theta1 , theta2] def prior(theta): # Here's a beta-beta prior: a1, b1, a2, b2 = 3, 3, 3, 3 prior = beta.pdf(theta[0], a1, b1) * beta.pdf(theta[1], a2, b2) return prior # Define the relative probability of the target distribution, as a function # of theta. The input argument is a vector: theta = [theta1 , theta2]. # For our purposes, the value returned is the UNnormalized posterior prob. def target_rel_prob(theta): if ((theta >= 0.0).all() & (theta <= 1.0).all()): target_rel_probVal = likelihood(theta) * prior(theta) else: # This part is important so that the Metropolis algorithm # never accepts a jump to an invalid parameter value. target_rel_probVal = 0.0 return target_rel_probVal # if ( all( theta >= 0.0 ) & all( theta <= 1.0 ) ) { # target_rel_probVal = likelihood( theta ) * prior( theta ) # Specify the length of the trajectory, i.e., the number of jumps to try: traj_length = 5000 # arbitrary large number # Initialize the vector that will store the results. trajectory = np.zeros((traj_length, 2)) # Specify where to start the trajectory trajectory[0, ] = [0.50, 0.50] # arbitrary start values of the two param's # Specify the burn-in period. burn_in = np.ceil(.1 * traj_length) # arbitrary number # Initialize accepted, rejected counters, just to monitor performance. n_accepted = 0 n_rejected = 0 # Specify the seed, so the trajectory can be reproduced. np.random.seed(47405) # Specify the covariance matrix for multivariate normal proposal distribution. n_dim, sd1, sd2 = 2, 0.2, 0.2 covar_mat = [[sd1**2, 0], [0, sd2**2]] # Now generate the random walk. step is the step in the walk. for step in range(traj_length-1): current_position = trajectory[step, ] # Use the proposal distribution to generate a proposed jump. # The shape and variance of the proposal distribution can be changed # to whatever you think is appropriate for the target distribution. 
proposed_jump = np.random.multivariate_normal(mean=np.zeros((n_dim)), cov=covar_mat) # Compute the probability of accepting the proposed jump. prob_accept = np.minimum(1, target_rel_prob(current_position + proposed_jump) / target_rel_prob(current_position)) # Generate a random uniform value from the interval [0,1] to # decide whether or not to accept the proposed jump. if np.random.rand() < prob_accept: # accept the proposed jump trajectory[step+1, ] = current_position + proposed_jump # increment the accepted counter, just to monitor performance if step > burn_in: n_accepted += 1 else: # reject the proposed jump, stay at current position trajectory[step+1, ] = current_position # increment the rejected counter, just to monitor performance if step > burn_in: n_rejected += 1 # End of Metropolis algorithm. #----------------------------------------------------------------------- # Begin making inferences by using the sample generated by the # Metropolis algorithm. # Extract just the post-burnIn portion of the trajectory. accepted_traj = trajectory[burn_in:] # Compute the means of the accepted points. mean_traj = np.mean(accepted_traj, axis=0) # Compute the standard deviations of the accepted points. stdTraj = np.std(accepted_traj, axis=0) # Plot the trajectory of the last 500 sampled values. plt.figure(figsize=(16,4)) plt.plot(accepted_traj[:,0], accepted_traj[:,1], marker='o', alpha=0.3) plt.xlim(0, 1) plt.ylim(0, 1) plt.xlabel(r'$\theta1$') plt.ylabel(r'$\theta2$') # Display means in plot. plt.plot(0, label='M = %.3f, %.3f' % (mean_traj[0], mean_traj[1]), alpha=0.0) # Display rejected/accepted ratio in the plot. plt.plot(0, label=r'$N_{pro}=%s$ $\frac{N_{acc}}{N_{pro}} = %.3f$' % (len(accepted_traj), (n_accepted/len(accepted_traj))), alpha=0) # Evidence for model, p(D). # Compute a,b parameters for beta distribution that has the same mean # and stdev as the sample from the posterior. This is a useful choice # when the likelihood function is binomial. a = mean_traj * ((mean_traj*(1-mean_traj)/stdTraj**2) - np.ones(n_dim)) b = (1-mean_traj) * ( (mean_traj*(1-mean_traj)/stdTraj**2) - np.ones(n_dim)) # For every theta value in the posterior sample, compute # beta.pdf(theta, a, b) / likelihood(theta) * prior(theta) # This computation assumes that likelihood and prior are properly normalized, # i.e., not just relative probabilities. wtd_evid = np.zeros(np.shape(accepted_traj)[0]) for idx in range(np.shape(accepted_traj)[0]): wtd_evid[idx] = (beta.pdf(accepted_traj[idx,0],a[0],b[0] ) * beta.pdf(accepted_traj[idx,1],a[1],b[1]) / (likelihood(accepted_traj[idx,]) * prior(accepted_traj[idx,]))) p_data = 1 / np.mean(wtd_evid) # Display p(D) in the graph plt.plot(0, label='p(D) = %.3e' % p_data, alpha=0) plt.legend(loc='upper left') # Estimate highest density region by evaluating posterior at each point. accepted_traj = trajectory[burn_in:] npts = np.shape(accepted_traj)[0] post_prob = np.zeros((npts)) for ptIdx in range(npts): post_prob[ptIdx] = target_rel_prob(accepted_traj[ptIdx,]) # Determine the level at which credmass points are above: credmass = 0.95 waterline = np.percentile(post_prob, (credmass)) HDI_points = accepted_traj[post_prob > waterline, ] plt.figure(figsize=(16,4)) plt.plot(HDI_points[:,0], HDI_points[:,1], 'ro') plt.xlim(0,1) plt.ylim(0,1) plt.xlabel(r'$\theta1$') plt.ylabel(r'$\theta2$') # Display means in plot. plt.plot(0, label='M = %.3f, %.3f' % (mean_traj[0], mean_traj[1]), alpha=0.0) # Display rejected/accepted ratio in the plot. 
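# (A rough rule of thumb, not from the original script: acceptance ratios of roughly
#  0.2-0.5 usually indicate a reasonably tuned proposal; much higher suggests the proposal
#  sd is too small (tiny steps), much lower suggests it is too large.)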
plt.plot(0, label=r'$N_{pro}=%s$ $\frac{N_{acc}}{N_{pro}} = %.3f$' % (len(accepted_traj), (n_accepted/len(accepted_traj))), alpha=0)
# Display p(D) in the graph
plt.plot(0, label='p(D) = %.3e' % p_data, alpha=0)
plt.legend(loc='upper left')
plt.show()


# In[36]:

"""
08_BernTwoPyMC.py
Inferring two binomial proportions using PyMC.
"""

# Generate the data
y1 = np.array([1, 1, 1, 1, 1, 0, 0])  # 5 heads and 2 tails
y2 = np.array([1, 1, 0, 0, 0, 0, 0])  # 2 heads and 5 tails

with pm.Model() as model:
    # define the prior
    theta1 = pm.Beta('theta1', 3, 3)  # prior
    theta2 = pm.Beta('theta2', 3, 3)  # prior
    # define the likelihood
    y1 = pm.Bernoulli('y1', p=theta1, observed=y1)
    y2 = pm.Bernoulli('y2', p=theta2, observed=y2)
    # Generate a MCMC chain
    start = pm.find_MAP()  # Find starting value by optimization
    trace = pm.sample(10000, pm.Metropolis(), progressbar=False)  # Use Metropolis sampling
#    start = pm.find_MAP()  # Find starting value by optimization
#    step = pm.NUTS()  # Instantiate NUTS sampler
#    trace = pm.sample(10000, step, start=start, progressbar=False)

# create an array with the posterior sample
theta1_sample = trace['theta1']
theta2_sample = trace['theta2']

# Plot the trajectory of the last 500 sampled values.
plt.figure(figsize=(16,4))
plt.plot(theta1_sample[-500:], theta2_sample[-500:], marker='o')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel(r'$\theta1$')
plt.ylabel(r'$\theta2$')
# Display means in plot.
plt.plot(0, label='M = %.3f, %.3f' % (np.mean(theta1_sample), np.mean(theta2_sample)), alpha=0.0)
plt.legend(loc='upper left')

# Plot a histogram of the posterior differences of theta values.
plt.figure(figsize=(16,4))
theta_diff = theta1_sample - theta2_sample
plot_post(theta_diff, xlab=r'$\theta1-\theta2$', comp_val=0.0, show_mode=False, bins=30)

# For Exercise 8.5:
# Posterior prediction. For each step in the chain, use the posterior thetas
# to flip the coins.
chain_len = len(theta1_sample)
# Create matrix to hold results of simulated flips:
y_pred = np.zeros((2, chain_len))
for step_idx in range(chain_len):  # step through the chain
    # flip the first coin:
    p_head1 = theta1_sample[step_idx]
    y_pred[0, step_idx] = np.random.choice([0,1], p=[1-p_head1, p_head1])
    # flip the second coin:
    p_head2 = theta2_sample[step_idx]
    y_pred[1, step_idx] = np.random.choice([0,1], p=[1-p_head2, p_head2])

# Now determine the proportion of times that y1==1 and y2==0
pY1eq1andY2eq0 = sum((y_pred[0] == 1) & (y_pred[1] == 0)) / chain_len
print "P(Y1==1 and Y2==0)", pY1eq1andY2eq0
plt.show()


# In[37]:

"""
09_BernBetaMuKappaPyMC.py
Bernoulli Likelihood with Hierarchical Prior!
"""

# Data for figure 9.11
N = [10, 10, 10]  # Number of flips per coin
z = [5, 5, 5]  # Number of heads per coin
## Data for figure 9.12
#N = [10, 10, 10]  # Number of flips per coin
#z = [1, 5, 9]  # Number of heads per coin
## Data for exercise 9.1
#ncoins = 50
#nflipspercoin = 5
#mu_act = .7
#kappa_act = 20
#theta_act = beta.rvs(mu_act*kappa_act+1, (1-mu_act)*kappa_act+1, size=ncoins)
#z = binom.rvs(n=nflipspercoin, p=theta_act, size=ncoins)
#N = [nflipspercoin] * ncoins

# Arrange the data into a more convenient way to feed the PyMC model.
coin = []  # list/vector index for each coin (from 0 to number of coins)
y = []  # list/vector with head (1) or tails (0) for each flip.
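# For example, with N = [10, 10, 10] and z = [5, 5, 5] as above, the loop below produces
# y as 5 ones followed by 5 zeros for each coin (30 values in all) and
# coin = [0]*10 + [1]*10 + [2]*10, so that theta[coin] pairs one theta with every flip.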
for i, flips in enumerate(N): heads = z[i] if heads > flips: sys.exit("The number of heads can't be greater than the number of flips") else: y = y + [1] * heads + [0] * (flips-heads) coin = coin + [i] * flips # Specify the model in PyMC with pm.Model() as model: # define the hyperparameters mu = pm.Beta('mu', 2, 2) kappa = pm.Gamma('kappa', 1, 0.1) # define the prior theta = pm.Beta('theta', mu * kappa, (1 - mu) * kappa, shape=len(N)) # define the likelihood y = pm.Bernoulli('y', p=theta[coin], observed=y) # Generate a MCMC chain step = pm.Metropolis() trace = pm.sample(5000, step, progressbar=False) # Restricted models like this could be difficult to sample. This is related # to the censoring comment in the book. One way to detect that something is # wrong with the sampling is to compare the autocorrelation plot and the # sampled values under different sampler, or you can try combinations of # sampler like this # step1 = pm.Metropolis([theta, mu]) # step2 = pm.Slice([kappa]) # trace = pm.sample(5000, [step1, step2], progressbar=False) # or this (this combination was used to generate the figures) # start = pm.find_MAP() # step1 = pm.Metropolis([theta, mu]) # step2 = pm.NUTS([kappa]) # trace = pm.sample(5000, [step1, step2], start=start, progressbar=False) ## Check the results. burnin = 2000 # posterior samples to discard thin = 10 # posterior samples to discard ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation pm.autocorrplot(trace[burnin::thin], vars =[mu, kappa]) #pm.autocorrplot(trace, vars =[mu, kappa]) ## Plot KDE and sampled values for each parameter. pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) # Create arrays with the posterior sample theta1_sample = trace['theta'][:,0][burnin::thin] theta2_sample = trace['theta'][:,1][burnin::thin] theta3_sample = trace['theta'][:,2][burnin::thin] mu_sample = trace['mu'][burnin::thin] kappa_sample = trace['kappa'][burnin::thin] fig = plt.figure(figsize=(12,12)) # Scatter plot hyper-parameters plt.subplot(4, 3, 1) plt.scatter(mu_sample, kappa_sample, marker='o') plt.xlim(0,1) plt.xlabel(r'$\mu$') plt.ylabel(r'$\kappa$') # Plot mu histogram plt.subplot(4, 3, 2) plot_post(mu_sample, xlab=r'$\mu$', show_mode=False, labelsize=9, framealpha=0.5) plt.xlim(0,1) # Plot kappa histogram plt.subplot(4, 3, 3) plot_post(kappa_sample, xlab=r'$\kappa$', show_mode=False, labelsize=9, framealpha=0.5) # Plot theta 1 plt.subplot(4, 3, 4) plot_post(theta1_sample, xlab=r'$\theta1$', show_mode=False, labelsize=9, framealpha=0.5) plt.xlim(0,1) # Scatter theta 1 vs mu plt.subplot(4, 3, 5) plt.scatter(theta1_sample, mu_sample, marker='o') plt.xlim(0,1) plt.ylim(0,1) plt.xlabel(r'$\theta1$') plt.ylabel(r'$\mu$') # Scatter theta 1 vs kappa plt.subplot(4, 3, 6) plt.scatter(theta1_sample, kappa_sample, marker='o') plt.xlim(0,1) plt.xlabel(r'$\theta1$') plt.ylabel(r'$\kappa$') # Plot theta 2 plt.subplot(4, 3, 7) plot_post(theta2_sample, xlab=r'$\theta2$', show_mode=False, labelsize=9, framealpha=0.5) plt.xlim(0,1) # Scatter theta 2 vs mu plt.subplot(4, 3, 8) plt.scatter(theta2_sample, mu_sample, marker='o') plt.xlim(0,1) plt.ylim(0,1) plt.xlabel(r'$\theta2$') plt.ylabel(r'$\mu$') # Scatter theta 2 vs kappa plt.subplot(4, 3, 9) plt.scatter(theta2_sample, kappa_sample, marker='o') plt.xlim(0,1) plt.xlabel(r'$\theta2$') plt.ylabel(r'$\kappa$') # Plot theta 3 plt.subplot(4, 3, 10) plot_post(theta3_sample, xlab=r'$\theta3$', show_mode=False, labelsize=9, framealpha=0.5) plt.xlim(0,1) # Scatter theta 
3 vs mu plt.subplot(4, 3, 11) plt.scatter(theta3_sample, mu_sample, marker='o') plt.xlim(0,1) plt.ylim(0,1) plt.xlabel(r'$\theta3$') plt.ylabel(r'$\mu$') # Scatter theta 3 vs kappa plt.subplot(4, 3, 12) plt.scatter(theta3_sample, kappa_sample, marker='o') plt.xlim(0,1) plt.xlabel(r'$\theta3$') plt.ylabel(r'$\kappa$') plt.tight_layout() plt.show() # In[38]: """ 09_BernBetaMuKappaPyMC_TT.py Bernoulli Likelihood with Hierarchical Prior. The Therapeutic Touch example. """ ## Therapeutic touch data: z = [1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8] # Number of heads per coin N = [10] * len(z) # Number of flips per coin # rearrange the data to load it PyMC model. coin = [] # list/vector index for each coins (from 0 to number of coins) y = [] # list/vector with head (1) or tails (0) for each flip. for i, flips in enumerate(N): heads = z[i] if heads > flips: sys.exit("The number of heads can't be greater than the number of flips") else: y = y + [1] * heads + [0] * (flips-heads) coin = coin + [i] * flips # Specify the model in PyMC with pm.Model() as model: # define the hyperparameters mu = pm.Beta('mu', 2, 2) kappa = pm.Gamma('kappa', 1, 0.1) # define the prior theta = pm.Beta('theta', mu * kappa, (1 - mu) * kappa, shape=len(N)) # define the likelihood y = pm.Bernoulli('y', p=theta[coin], observed=y) # Generate a MCMC chain start = pm.find_MAP() # find a reasonable starting point. step1 = pm.Metropolis([theta, mu]) step2 = pm.NUTS([kappa]) trace = pm.sample(10000, [step1, step2], start=start, random_seed=(123), progressbar=False) ## Check the results. burnin = 2000 # posterior samples to discard thin = 10 # posterior samples to discard ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation pm.autocorrplot(trace[burnin::thin], vars =[mu, kappa]) #pm.autocorrplot(trace, vars =[mu, kappa]) ## Plot KDE and sampled values for each parameter. pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) # Create arrays with the posterior sample theta1_sample = trace['theta'][:,0][burnin::thin] theta28_sample = trace['theta'][:,27][burnin::thin] mu_sample = trace['mu'][burnin::thin] kappa_sample = trace['kappa'][burnin::thin] fig = plt.figure(figsize=(16,12)) # Plot mu histogram plt.subplot(2, 2, 1) plot_post(mu_sample, xlab=r'$\mu$', show_mode=False, labelsize=9, framealpha=0.5) # Plot kappa histogram plt.subplot(2, 2, 2) plot_post(kappa_sample, xlab=r'$\kappa$', show_mode=False, labelsize=9, framealpha=0.5) # Plot theta 1 plt.subplot(2, 2, 3) plot_post(theta1_sample, xlab=r'$\theta1$', show_mode=False, labelsize=9, framealpha=0.5) # Plot theta 28 plt.subplot(2, 2, 4) plot_post(theta28_sample, xlab=r'$\theta28$', show_mode=False, labelsize=9, framealpha=0.5) plt.tight_layout() plt.show() # In[39]: """ 09_FilconPyMC.py More Hierarchical models. The filtration-condensation experiment. """ # Data # For each subject, specify the condition s/he was in, # the number of trials s/he experienced, and the number correct. 
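# (Layout of the data below: ncond = 4 conditions x nSubj = 40 subjects, 64 trials each.
#  z holds each subject's number of correct responses, and
#  condition = np.repeat([0, 1, 2, 3], nSubj) gives each subject's condition index, so
#  mu[condition] and kappa[condition] in the model pick out condition-level parameters
#  for every subject.)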
ncond = 4 nSubj = 40 trials = 64 N = np.repeat([trials], (ncond * nSubj)) z = np.array([45, 63, 58, 64, 58, 63, 51, 60, 59, 47, 63, 61, 60, 51, 59, 45, 61, 59, 60, 58, 63, 56, 63, 64, 64, 60, 64, 62, 49, 64, 64, 58, 64, 52, 64, 64, 64, 62, 64, 61, 59, 59, 55, 62, 51, 58, 55, 54, 59, 57, 58, 60, 54, 42, 59, 57, 59, 53, 53, 42, 59, 57, 29, 36, 51, 64, 60, 54, 54, 38, 61, 60, 61, 60, 62, 55, 38, 43, 58, 60, 44, 44, 32, 56, 43, 36, 38, 48, 32, 40, 40, 34, 45, 42, 41, 32, 48, 36, 29, 37, 53, 55, 50, 47, 46, 44, 50, 56, 58, 42, 58, 54, 57, 54, 51, 49, 52, 51, 49, 51, 46, 46, 42, 49, 46, 56, 42, 53, 55, 51, 55, 49, 53, 55, 40, 46, 56, 47, 54, 54, 42, 34, 35, 41, 48, 46, 39, 55, 30, 49, 27, 51, 41, 36, 45, 41, 53, 32, 43, 33]) condition = np.repeat([0,1,2,3], nSubj) # Specify the model in PyMC with pm.Model() as model: kappa = pm.Gamma('kappa', 1, 0.1, shape=ncond) mu = pm.Beta('mu', 1, 1, shape=ncond) theta = pm.Beta('theta', mu[condition] * kappa[condition], (1 - mu[condition]) * kappa[condition], shape=len(z)) y = pm.Binomial('y', p=theta, n=N, observed=z) start = pm.find_MAP() step1 = pm.Metropolis([mu]) step2 = pm.Metropolis([theta]) step3 = pm.NUTS([kappa]) # samplers = [pm.Metropolis([rv]) for rv in model.unobserved_RVs] trace = pm.sample(10000, [step1, step2, step3], start=start, progressbar=False) ## Check the results. burnin = 5000 # posterior samples to discard thin = 10 # posterior samples to discard ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace, vars =[mu, kappa]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) pm.traceplot(trace) # Create arrays with the posterior sample mu1_sample = trace['mu'][:,0][burnin::thin] mu2_sample = trace['mu'][:,1][burnin::thin] mu3_sample = trace['mu'][:,2][burnin::thin] mu4_sample = trace['mu'][:,3][burnin::thin] fig = plt.figure(figsize=(16, 6)) # Plot differences among filtrations experiments plt.subplot(1, 3, 1) plot_post((mu1_sample-mu2_sample), xlab=r'$\mu1-\mu2$', show_mode=False, comp_val=0, framealpha=0.5) # Plot differences among condensation experiments plt.subplot(1, 3, 2) plot_post((mu3_sample-mu4_sample), xlab=r'$\mu3-\mu4$', show_mode=False, comp_val=0, framealpha=0.5) # Plot differences between filtration and condensation experiments plt.subplot(1, 3, 3) a = (mu1_sample+mu2_sample)/2 - (mu3_sample+mu4_sample)/2 plot_post(a, xlab=r'$(\mu1+\mu2)/2 - (\mu3+\mu4)/2$', show_mode=False, comp_val=0, framealpha=0.5) plt.tight_layout() plt.show() # In[40]: """ 09_FilconPyMC_ex9.2.A.py More Hierarchical models. The filtration-condensation experiment. A single kappa for all conditions. """ # Data # For each subject, specify the condition s/he was in, # the number of trials s/he experienced, and the number correct. 
ncond = 4 nSubj = 40 trials = 64 N = np.repeat([trials], (ncond * nSubj)) z = np.array([45, 63, 58, 64, 58, 63, 51, 60, 59, 47, 63, 61, 60, 51, 59, 45, 61, 59, 60, 58, 63, 56, 63, 64, 64, 60, 64, 62, 49, 64, 64, 58, 64, 52, 64, 64, 64, 62, 64, 61, 59, 59, 55, 62, 51, 58, 55, 54, 59, 57, 58, 60, 54, 42, 59, 57, 59, 53, 53, 42, 59, 57, 29, 36, 51, 64, 60, 54, 54, 38, 61, 60, 61, 60, 62, 55, 38, 43, 58, 60, 44, 44, 32, 56, 43, 36, 38, 48, 32, 40, 40, 34, 45, 42, 41, 32, 48, 36, 29, 37, 53, 55, 50, 47, 46, 44, 50, 56, 58, 42, 58, 54, 57, 54, 51, 49, 52, 51, 49, 51, 46, 46, 42, 49, 46, 56, 42, 53, 55, 51, 55, 49, 53, 55, 40, 46, 56, 47, 54, 54, 42, 34, 35, 41, 48, 46, 39, 55, 30, 49, 27, 51, 41, 36, 45, 41, 53, 32, 43, 33]) condition = np.repeat([0,1,2,3], nSubj) # Specify the model in PyMC with pm.Model() as model: # define the hyperparameters kappa = pm.Gamma('kappa', 1, 0.1) mu = pm.Beta('mu', 1, 1, shape=ncond) # define the prior theta = pm.Beta('theta', mu[condition] * kappa, (1 - mu[condition]) * kappa, shape=len(z)) # define the likelihood y = pm.Binomial('y', p=theta, n=N, observed=z) start = pm.find_MAP() step1 = pm.Metropolis([mu]) step2 = pm.Metropolis([theta]) step3 = pm.NUTS([kappa]) # samplers = [pm.Metropolis([rv]) for rv in model.unobserved_RVs] trace = pm.sample(10000, [step1, step2, step3], start=start, progressbar=False) ## Check the results. burnin = 5000 # posterior samples to discard thin = 10 # posterior samples to discard ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace, vars =[mu, kappa]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) pm.traceplot(trace) # Create arrays with the posterior sample mu1_sample = trace['mu'][:,0][burnin::thin] mu2_sample = trace['mu'][:,1][burnin::thin] mu3_sample = trace['mu'][:,2][burnin::thin] mu4_sample = trace['mu'][:,3][burnin::thin] fig = plt.figure(figsize=(16, 6)) # Plot differences among filtrations experiments plt.subplot(1, 3, 1) plot_post((mu1_sample-mu2_sample), xlab=r'$\mu1-\mu2$', show_mode=False, comp_val=0, framealpha=0.5) # Plot differences among condensation experiments plt.subplot(1, 3, 2) plot_post((mu3_sample-mu4_sample), xlab=r'$\mu3-\mu4$', show_mode=False, comp_val=0, framealpha=0.5) # Plot differences between filtration and condensation experiments plt.subplot(1, 3, 3) a = (mu1_sample+mu2_sample)/2 - (mu3_sample+mu4_sample)/2 plot_post(a, xlab=r'$(\mu1+\mu2)/2 - (\mu3+\mu4)/2$', show_mode=False, comp_val=0, framealpha=0.5) plt.tight_layout() plt.show() # In[41]: """ 09_FilconPyMC_ex9.2.B.py More Hierarchical models. The filtration-condensation experiment. Hyperparameters for kappafor all conditions. """ # Data # For each subject, specify the condition s/he was in, # the number of trials s/he experienced, and the number correct. 
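# (A note on the kappa hyperprior used in the model below: for a Gamma(shape=s, rate=r)
#  distribution, mean = s/r and sd = sqrt(s)/r, so s_kappa = mean_gamma**2/sd_gamma**2 and
#  r_kappa = mean_gamma/sd_gamma**2 is simple moment matching. For instance, mean = 10 and
#  sd = 10 give shape 1 and rate 0.1, i.e. the fixed Gamma(1, 0.1) hyperprior of the
#  previous cells. A quick check, not part of the original script:)
from scipy.stats import gamma as gamma_check  # local import, used only for this check
assert np.allclose([gamma_check(a=1.0, scale=1/0.1).mean(),
                    gamma_check(a=1.0, scale=1/0.1).std()], [10.0, 10.0])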
ncond = 4 nSubj = 40 trials = 64 N = np.repeat([trials], (ncond * nSubj)) z = np.array([45, 63, 58, 64, 58, 63, 51, 60, 59, 47, 63, 61, 60, 51, 59, 45, 61, 59, 60, 58, 63, 56, 63, 64, 64, 60, 64, 62, 49, 64, 64, 58, 64, 52, 64, 64, 64, 62, 64, 61, 59, 59, 55, 62, 51, 58, 55, 54, 59, 57, 58, 60, 54, 42, 59, 57, 59, 53, 53, 42, 59, 57, 29, 36, 51, 64, 60, 54, 54, 38, 61, 60, 61, 60, 62, 55, 38, 43, 58, 60, 44, 44, 32, 56, 43, 36, 38, 48, 32, 40, 40, 34, 45, 42, 41, 32, 48, 36, 29, 37, 53, 55, 50, 47, 46, 44, 50, 56, 58, 42, 58, 54, 57, 54, 51, 49, 52, 51, 49, 51, 46, 46, 42, 49, 46, 56, 42, 53, 55, 51, 55, 49, 53, 55, 40, 46, 56, 47, 54, 54, 42, 34, 35, 41, 48, 46, 39, 55, 30, 49, 27, 51, 41, 36, 45, 41, 53, 32, 43, 33]) condition = np.repeat([0,1,2,3], nSubj) # Specify the model in PyMC with pm.Model() as model: # define the hyper-hyperparameters for kappa mean_gamma = pm.Uniform('mean_gamma', 0, 30) sd_gamma = pm.Uniform('sd_gamma', 0, 30) s_kappa = mean_gamma**2/sd_gamma**2 r_kappa = mean_gamma/sd_gamma**2 # define the hyperparameters kappa = pm.Gamma('kappa', s_kappa, r_kappa) mu = pm.Beta('mu', 1, 1, shape=ncond) # define the prior theta = pm.Beta('theta', mu[condition] * kappa, (1 - mu[condition]) * kappa, shape=len(z)) # define the likelihood y = pm.Binomial('y', p=theta, n=N, observed=z) start = pm.find_MAP() step1 = pm.Metropolis([mu]) step2 = pm.Metropolis([theta]) step3 = pm.NUTS([kappa]) # samplers = [pm.Metropolis([rv]) for rv in model.unobserved_RVs] trace = pm.sample(10000, [step1, step2, step3], start=start, progressbar=False) ## Check the results. burnin = 5000 # posterior samples to discard thin = 10 # posterior samples to discard ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace, vars =[mu, kappa]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) # Create arrays with the posterior sample mu1_sample = trace['mu'][:,0][burnin::thin] mu2_sample = trace['mu'][:,1][burnin::thin] mu3_sample = trace['mu'][:,2][burnin::thin] mu4_sample = trace['mu'][:,3][burnin::thin] fig = plt.figure(figsize=(16, 6)) # Plot differences among filtrations experiments plt.subplot(1, 3, 1) plot_post((mu1_sample-mu2_sample), xlab=r'$\mu1-\mu2$', show_mode=False, comp_val=0, framealpha=0.5) # Plot differences among condensation experiments plt.subplot(1, 3, 2) plot_post((mu3_sample-mu4_sample), xlab=r'$\mu3-\mu4$', show_mode=False, comp_val=0, framealpha=0.5) # Plot differences between filtration and condensation experiments plt.subplot(1, 3, 3) a = (mu1_sample+mu2_sample)/2 - (mu3_sample+mu4_sample)/2 plot_post(a, xlab=r'$(\mu1+\mu2)/2 - (\mu3+\mu4)/2$', show_mode=False, comp_val=0, framealpha=0.5) plt.tight_layout() plt.show() # In[42]: """ 10_BernBetaModelCompPyMC.py Comparing models using Hierarchical modelling. """ ## specify the Data y = np.repeat([0, 1], [3, 6]) # 3 tails 6 heads with pm.Model() as model: # Hyperhyperprior: model_index = pm.DiscreteUniform('model_index', lower=0, upper=1) # Hyperprior: kappa_theta = 12 mu_theta = pm.switch(pm.eq(model_index, 1), 0.25, 0.75) # Prior distribution: a_theta = mu_theta * kappa_theta b_theta = (1 - mu_theta) * kappa_theta theta = pm.Beta('theta', a_theta, b_theta) # theta distributed as beta density #likelihood y = pm.Bernoulli('y', theta, observed=y) start = pm.find_MAP() step = pm.Metropolis() trace = pm.sample(10000, step, start=start, progressbar=False) ## Check the results. 
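# (How the comparison below works: the chain visits model_index = 1 (mu = 0.25) and
#  model_index = 0 (mu = 0.75) in proportion to each model's posterior probability, so the
#  relative frequencies p_M1 and p_M2 computed below estimate p(M|D). A quick mixing check,
#  not in the original script, is the model-switching rate,
#  e.g. np.mean(np.abs(np.diff(model_idx_sample))) once model_idx_sample is defined.)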
burnin = 2000 # posterior samples to discard thin = 1 # posterior samples to discard ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) ## Get the posterior sample of model_index: model_idx_sample = trace['model_index'][burnin::thin] ## Compute the proportion of model_index at each value: p_M1 = sum(model_idx_sample == 1) / len(model_idx_sample) p_M2 = 1 - p_M1 ## Get the posterior sample of theta: theta_sample = trace['theta'][burnin::thin] ## Extract theta values when model_index is 1: theta_sample_M1 = theta_sample[model_idx_sample == 1] ## Extract theta values when model_index is 2: theta_sample_M2 = theta_sample[model_idx_sample == 0] ## Plot histograms of sampled theta values for each model, plt.figure(figsize=(16,6)) plt.subplot(1, 2, 1) plt.hist(theta_sample_M1, label='p(M1|D) = %.3f' % p_M1) plt.xlabel(r'$\theta$') plt.ylabel(r'$p(\theta|\mu=0.25,D)$') plt.xlim(0, 1) plt.legend(loc='upper right', framealpha=0.5) plt.subplot(1, 2, 2) plt.hist(theta_sample_M2, label='p(M2|D) = %.3f' % p_M2) plt.xlabel(r'$\theta$') plt.ylabel(r'$p(\theta|\mu=0.75,D)$') plt.xlim(0, 1) plt.legend(loc='upper right', framealpha=0.5) plt.show() # In[43]: """ 10_FilconModelCompPyMC.py Comparing models using Hierarchical modelling. """ # THE DATA. # For each subject, specify the condition s/he was in, # the number of trials s/he experienced, and the number correct. cond_of_subj = np.repeat([0,1,2,3], 40) n_trl_of_subj = np.array([64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]) n_corr_of_subj = np.array([45,63,58,64,58,63,51,60,59,47,63,61,60,51,59,45,61, 59,60,58,63,56,63,64,64,60,64,62,49,64,64,58,64,52,64,64,64,62,64,61,59,59, 55,62,51,58,55,54,59,57,58,60,54,42,59,57,59,53,53,42,59,57,29,36,51,64,60, 54,54,38,61,60,61,60,62,55,38,43,58,60,44,44,32,56,43,36,38,48,32,40,40,34, 45,42,41,32,48,36,29,37,53,55,50,47,46,44,50,56,58,42,58,54,57,54,51,49,52, 51,49,51,46,46,42,49,46,56,42,53,55,51,55,49,53,55,40,46,56,47,54,54,42,34, 35,41,48,46,39,55,30,49,27,51,41,36,45,41,53,32,43,33]) n_subj = len(cond_of_subj) n_cond = len(set(cond_of_subj)) # THE MODEL. 
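# (Structure of the model below, summarized: model_index selects between two
#  parameterizations of the per-subject accuracies. When model_index == 0 the likelihood
#  uses theta1, whose beta prior gets a separate kappa1 for each condition; when
#  model_index == 1 it uses theta0, whose beta prior shares a single kappa0 across all
#  four conditions. Both branches share the condition means mu.)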
with pm.Model() as model: # Hyperprior on model index: model_index = pm.DiscreteUniform('model_index', lower=0, upper=1) # Constants for hyperprior: shape_Gamma = 1.0 rate_Gamma = 0.1 # Hyperprior on mu and kappa: mu = pm.Beta('mu', 1, 1, shape=n_cond) kappa0 = pm.Gamma('kappa0', shape_Gamma, rate_Gamma) a_Beta0 = mu[cond_of_subj] * kappa0 b_Beta0 = (1 - mu[cond_of_subj]) * kappa0 kappa1 = pm.Gamma('kappa1', shape_Gamma, rate_Gamma, shape=n_cond) a_Beta1 = mu[cond_of_subj] * kappa1[cond_of_subj] b_Beta1 = (1 - mu[cond_of_subj]) * kappa1[cond_of_subj] #Prior on theta theta0 = pm.Beta('theta0', a_Beta0, b_Beta0, shape=n_subj) theta1 = pm.Beta('theta1', a_Beta1, b_Beta1, shape=n_subj) # if model_index == 0 then sample from theta1 else sample from theta0 theta = pm.switch(pm.eq(model_index, 0), theta1, theta0) # Likelihood: y = pm.Binomial('y', p=theta, n=n_trl_of_subj, observed=n_corr_of_subj) # Sampling start = pm.find_MAP() steps = [pm.Metropolis([i]) for i in model.unobserved_RVs[1:]] steps.append(pm.ElemwiseCategoricalStep(var=model_index,values=[0,1])) trace = pm.sample(50000, steps, start=start, progressbar=False) # EXAMINE THE RESULTS. burnin = 1000 thin = 1 model_idx_sample = trace['model_index'][burnin::thin] pM1 = sum(model_idx_sample == 1) / len(model_idx_sample) pM2 = 1 - pM1 plt.figure(figsize=(15, 15)) plt.subplot2grid((5,4), (0,0), colspan=4) plt.plot(model_idx_sample, label='p(M1|D) = %.3f ; p(M2|D) = %.3f' % (pM1, pM2)); plt.xlabel('Step in Markov Chain') plt.legend(loc='upper right', framealpha=0.75) for m in range(0, 2): kappa0_sample = trace['kappa0'][burnin::thin][model_idx_sample == m] plt.subplot2grid((5,4), (3+m, 1), colspan=2) plt.hist(kappa0_sample, bins=20) plt.title(r'Post. $\kappa_0$ for M=%s' % (m+1), fontsize=14) plt.xlabel(r'$\kappa_0$') plt.xlim(0, 30) for i in range(0, 4): kappa1_sample = trace['kappa1'][:,i][burnin::thin][model_idx_sample == m] plt.subplot2grid((5,4), (m+1, i)) plt.hist(kappa1_sample) plt.title(r'Post. $\kappa_%s$ for M=%s' % (i+1, m+1), fontsize=14) plt.xlabel(r'$\kappa_%s$' % (i+1)) plt.xlim(0, 30) plt.tight_layout() plt.show() # In[44]: """ 10_ToyModelCompPyMC.py Comparing models using Hierarchical modelling. Toy Model. """ # THE DATA. N = 30 z = 8 y = np.repeat([1, 0], [z, N-z]) # THE MODEL. with pm.Model() as model: # Hyperprior on model index: model_index = pm.DiscreteUniform('model_index', lower=0, upper=1) # Prior nu = pm.Normal('nu', mu=0, tau=0.1) # it is posible to use tau or sd eta = pm.Gamma('eta', .1, .1) theta0 = 1 / (1 + pm.exp(-nu)) # theta from model index 0 theta1 = pm.exp(-eta) # theta from model index 1 theta = pm.switch(pm.eq(model_index, 0), theta0, theta1) # Likelihood y = pm.Bernoulli('y', p=theta, observed=y) # Sampling start = pm.find_MAP() step1 = pm.Metropolis(model.vars[1:]) step2 = pm.ElemwiseCategoricalStep(var=model_index,values=[0,1]) trace = pm.sample(10000, [step1, step2], start=start, progressbar=False) # EXAMINE THE RESULTS. burnin = 1000 thin = 5 ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace[burnin::thin], vars =[nu, eta]) #pm.autocorrplot(trace, vars =[nu, eta]) ## Plot KDE and sampled values for each parameter. 
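# Added note: the two sub-models above express the Bernoulli rate through
# different links: model 0 uses the logistic of nu, and model 1 uses exp(-eta),
# so the traces of nu and eta live on different scales.  A quick numeric sketch
# of both mappings (illustrative grid values only):
import numpy as np
nu_grid = np.array([-2.0, 0.0, 2.0])
eta_grid = np.array([0.1, 1.0, 3.0])
print('1/(1+exp(-nu)) = %s' % (1.0 / (1.0 + np.exp(-nu_grid))))
print('exp(-eta)      = %s' % np.exp(-eta_grid))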
#pm.traceplot(trace[burnin::thin])
#pm.traceplot(trace)

model_idx_sample = trace['model_index'][burnin::thin]
pM1 = sum(model_idx_sample == 0) / len(model_idx_sample)
pM2 = 1 - pM1

nu_sample_M1 = trace['nu'][burnin::thin][model_idx_sample == 0]
eta_sample_M2 = trace['eta'][burnin::thin][model_idx_sample == 1]

plt.figure(figsize=(16,6))
plt.subplot(2, 1, 1)
plot_post(nu_sample_M1, xlab=r'$\nu$', show_mode=False, labelsize=9, framealpha=0.5)
plt.xlabel(r'$\nu$')
plt.ylabel('frequency')
plt.title(r'p($\nu$|D,M1), with p(M1|D)=%.3f' % pM1, fontsize=14)
plt.xlim(-8, 8)
plt.subplot(2, 1, 2)
plot_post(eta_sample_M2, xlab=r'$\eta$', show_mode=False, labelsize=9, framealpha=0.5)
plt.xlabel(r'$\eta$')
plt.ylabel('frequency')
plt.title(r'p($\eta$|D,M2), with p(M2|D)=%.3f' % pM2, fontsize=14)
plt.xlim(0, 8)
plt.show()


# In[48]:

"""
12_OneOddGroupModelComp.py
Testing a point ('null') hypothesis (not using pseudopriors).
"""

# THE DATA.
# For each subject, specify the condition s/he was in,
# the number of trials s/he experienced, and the number correct.
# (Randomly generated fictitious data.)
npg = 20   # number of subjects per group
ntrl = 20  # number of trials per subject
cond_of_subj = np.repeat([0, 1, 2, 3], npg)
n_trl_of_subj = np.repeat([ntrl], 4*npg)
np.random.seed(47401)
n_corr_of_subj = np.concatenate((binom.rvs(n=ntrl, p=.61, size=npg),
                                 binom.rvs(n=ntrl, p=.50, size=npg),
                                 binom.rvs(n=ntrl, p=.49, size=npg),
                                 binom.rvs(n=ntrl, p=.51, size=npg)))
n_subj = len(cond_of_subj)
n_cond = len(set(cond_of_subj))

# THE MODEL
with pm.Model() as model:
    # Hyperprior on model index:
    model_index = pm.DiscreteUniform('model_index', lower=0, upper=1)
    # Constants for hyperprior:
    shape_Gamma = 1.0
    rate_Gamma = 0.1
    # Hyperprior on mu and kappa:
    kappa = pm.Gamma('kappa', shape_Gamma, rate_Gamma, shape=n_cond)

    mu0 = pm.Beta('mu0', 1, 1)
    a_Beta0 = mu0 * kappa[cond_of_subj]
    b_Beta0 = (1 - mu0) * kappa[cond_of_subj]

    mu1 = pm.Beta('mu1', 1, 1, shape=n_cond)
    a_Beta1 = mu1[cond_of_subj] * kappa[cond_of_subj]
    b_Beta1 = (1 - mu1[cond_of_subj]) * kappa[cond_of_subj]

    # Prior on theta
    theta0 = pm.Beta('theta0', a_Beta0, b_Beta0, shape=n_subj)
    theta1 = pm.Beta('theta1', a_Beta1, b_Beta1, shape=n_subj)
    # if model_index == 0 then sample from theta1 else sample from theta0
    theta = pm.switch(pm.eq(model_index, 0), theta1, theta0)

    # Likelihood:
    y = pm.Binomial('y', p=theta, n=n_trl_of_subj, observed=n_corr_of_subj)

    # Sampling
    start = pm.find_MAP()
    step1 = pm.Metropolis(model.vars[1:])
    step2 = pm.ElemwiseCategoricalStep(var=model_index, values=[0, 1])
    trace = pm.sample(20000, [step1, step2], start=start, progressbar=False)

# EXAMINE THE RESULTS.
burnin = 10000
thin = 10

## Print summary for each trace
#pm.summary(trace[burnin::thin])
#pm.summary(trace)

## Check for mixing and autocorrelation
#pm.autocorrplot(trace, vars =[mu, kappa])

## Plot KDE and sampled values for each parameter.
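# Added note: because model_index has a 50/50 prior, the ratio of the two
# posterior proportions computed below is also the Bayes factor of the
# 'different mu' model against the 'same mu' model.  Sketch with illustrative
# numbers (not results from this run):
p_diff_demo = 0.70
p_same_demo = 1.0 - p_diff_demo
prior_odds = 1.0   # DiscreteUniform(0, 1) puts probability 0.5 on each model
print('BF(DiffMu vs SameMu) = %.2f' % ((p_diff_demo / p_same_demo) / prior_odds))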
#pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) model_idx_sample = trace['model_index'][burnin::thin] pM1 = sum(model_idx_sample == 1) / len(model_idx_sample) pM2 = 1 - pM1 plt.figure(figsize=(16, 15)) plt.subplot2grid((3,3), (0,0), colspan=3) plt.plot(model_idx_sample, label='p(DiffMu|D) = %.3f ; p(SameMu|D) = %.3f' % (pM1, pM2)); plt.xlabel('Step in Markov Chain') plt.legend(loc='upper right', framealpha=0.75) count = 0 position = [(1,0), (1,1), (1,2), (2,0), (2,1), (2,2)] for i in range(0, 4): mui_sample = trace['mu1'][:,i][burnin::thin][model_idx_sample == 0] for j in range(i+1, 4): muj_sample = trace['mu1'][:,j][burnin::thin][model_idx_sample == 0] plt.subplot2grid((3,3), position[count]) plot_post(mui_sample-muj_sample, xlab=r'$\mu_%s - \mu_%s$' % (i+1, j+1), show_mode=False, comp_val=0, framealpha=0.5) plt.xlim(-0.3, 0.3) count += 1 plt.tight_layout() plt.show() # In[49]: """ HDIofICDF.py This program finds the HDI of a probability density function that is specified mathematically in Python. """ def HDIofICDF(dist_name, credMass=0.95, **args): # freeze distribution with given arguments distri = dist_name(**args) # initial guess for HDIlowTailPr incredMass = 1.0 - credMass def intervalWidth(lowTailPr): return distri.ppf(credMass + lowTailPr) - distri.ppf(lowTailPr) # find lowTailPr that minimizes intervalWidth HDIlowTailPr = fmin(intervalWidth, incredMass, ftol=1e-8, disp=False)[0] # return interval as array([low, high]) return distri.ppf([HDIlowTailPr, credMass + HDIlowTailPr]) # In[50]: """ 13_minNforHDIpower.py The program described in this section was used to generate Tables 13.1 and 13.2. The program determines the minimal sample size needed to achieve a specified goal with a specified power, when flipping a single coin. """ def minNforHDIpower(genPriorMean, genPriorN, HDImaxwid=None, nullVal=None, ROPE=None, desiredPower=0.8, audPriorMean=0.5, audPriorN=2, HDImass=0.95, initSampSize=1, verbose=True): if HDImaxwid != None and nullVal != None: sys.exit('One and only one of HDImaxwid and nullVal must be specified') if ROPE == None: ROPE = [nullVal, nullVal] # Convert prior mean and N to a, b parameter values of beta distribution. genPriorA = genPriorMean * genPriorN genPriorB = (1.0 - genPriorMean) * genPriorN audPriorA = audPriorMean * audPriorN audPriorB = (1.0 - audPriorMean) * audPriorN # Initialize loop for incrementing sampleSize sampleSize = initSampSize # Increment sampleSize until desired power is achieved. while True: zvec = np.arange(0, sampleSize+1) # All possible z values for N flips. # Compute probability of each z value for data-generating prior. pzvec = np.exp(np.log(special_binom(sampleSize, zvec)) + special_betaln(zvec + genPriorA, sampleSize - zvec + genPriorB) - special_betaln(genPriorA, genPriorB)) # For each z value, compute HDI. hdiMat is min, max of HDI for each z. hdiMat = np.zeros((len(zvec), 2)) for zIdx in range(0, len(zvec)): z = zvec[zIdx] # Determine the limits of the highest density interval # hdp is a function from PyMC package and takes a sample vector as # input, not a function. 
hdiMat[zIdx] = HDIofICDF(beta, credMass=HDImass, a=(z + audPriorA), b=(sampleSize - z + audPriorB)) if HDImaxwid != None: hdiWid = hdiMat[:,1] - hdiMat[:,0] powerHDI = np.sum(pzvec[hdiWid < HDImaxwid]) if nullVal != None: powerHDI = np.sum(pzvec[(hdiMat[:,0] > ROPE[1]) | (hdiMat[:,1] < ROPE[0])]) if verbose: print " For sample size = %s\npower = %s\n" % (sampleSize, powerHDI) if powerHDI > desiredPower: break else: sampleSize += 1 return sampleSize print minNforHDIpower(genPriorMean=.85 , genPriorN=2000 , nullVal=0.5, verbose=False) #print minNforHDIpower(genPriorMean=.85 , genPriorN=10 , HDImaxwid=0.2, verbose=False) # In[51]: Systems = """Aircraft Failure Days DaysTransf 1. 1. 194. 2.867876 1. 2. 15. 1.718772 1. 3. 41. 2.101632 1. 4. 29. 1.961009 1. 5. 33. 2.012347 1. 6. 181. 2.828367 2. 1. 413. 3.335723 2. 2. 14. 1.695218 2. 3. 58. 2.252608 2. 4. 37. 2.058924 2. 5. 100. 2.511886 2. 6. 65. 2.304532 2. 7. 9. 1.551846 2. 8. 169. 2.789827 2. 9. 447. 3.388921 2. 10. 184. 2.837681 2. 11. 36. 2.047673 2. 12. 201. 2.888279 2. 13. 118. 2.596429 3. 1. 90. 2.459509 3. 2. 10. 1.584893 3. 3. 60. 2.267933 3. 4. 186. 2.843823 3. 5. 61. 2.275443 3. 6. 49. 2.177906 3. 7. 14. 1.695218 3. 8. 24. 1.888175 3. 9. 56. 2.236854 3. 10. 20. 1.820564 3. 11. 79. 2.396213 3. 12. 84. 2.425805 3. 13. 44. 2.131526 3. 14. 59. 2.260322 3. 15. 29. 1.961009 3. 16. 118. 2.596429 3. 17. 25. 1.903654 3. 18. 156. 2.745522 3. 19. 310. 3.149723 3. 20. 76. 2.377731 3. 21. 26. 1.918645 3. 22. 44. 2.131526 3. 23. 23. 1.872171 3. 24. 62. 2.282855 4. 1. 74. 2.365083 4. 2. 57. 2.244786 4. 3. 48. 2.168944 4. 4. 29. 1.961009 4. 5. 502. 3.468492 4. 6. 12. 1.643752 4. 7. 70. 2.338943 4. 8. 21. 1.838416 4. 9. 29. 1.961009 4. 10. 386. 3.290921 4. 11. 59. 2.260322 4. 12. 27. 1.933182 5. 1. 55. 2.228807 5. 2. 320. 3.169786 5. 3. 56. 2.236854 5. 4. 104. 2.531668 5. 5. 220. 2.940929 5. 6. 239. 2.990058 5. 7. 47. 2.15983 5. 8. 246. 3.007371 5. 9. 176. 2.812565 5. 10. 182. 2.831485 5. 11. 33. 2.012347 6. 1. 23. 1.872171 6. 2. 261. 3.043183 6. 3. 87. 2.44289 6. 4. 7. 1.475773 6. 5. 120. 2.605171 6. 6. 14. 1.695218 6. 7. 62. 2.282855 6. 8. 47. 2.15983 6. 9. 225. 2.954177 6. 10. 71. 2.345588 6. 11. 246. 3.007371 6. 12. 21. 1.838416 6. 13. 42. 2.111786 6. 14. 20. 1.820564 6. 15. 5. 1.37973 6. 16. 12. 1.643752 6. 17. 120. 2.605171 6. 18. 11. 1.615394 6. 19. 3. 1.245731 6. 20. 14. 1.695218 6. 21. 71. 2.345588 6. 22. 11. 1.615394 6. 23. 14. 1.695218 6. 24. 11. 1.615394 6. 25. 16. 1.741101 6. 26. 90. 2.459509 6. 27. 1. 1. 6. 28. 16. 1.741101 6. 29. 52. 2.203945 6. 30. 95. 2.48625 7. 1. 97. 2.496631 7. 2. 51. 2.195402 7. 3. 11. 1.615394 7. 4. 4. 1.319508 7. 5. 141. 2.690567 7. 6. 18. 1.782602 7. 7. 142. 2.694373 7. 8. 68. 2.325422 7. 9. 77. 2.383956 7. 10. 80. 2.402249 7. 11. 1. 1. 7. 12. 16. 1.741101 7. 13. 106. 2.541331 7. 14. 206. 2.902508 7. 15. 82. 2.414142 7. 16. 54. 2.220643 7. 17. 31. 1.987341 7. 18. 216. 2.930156 7. 19. 46. 2.15056 7. 20. 111. 2.564865 7. 21. 39. 2.080717 7. 22. 63. 2.290172 7. 23. 18. 1.782602 7. 24. 191. 2.858951 7. 25. 18. 1.782602 7. 26. 163. 2.769731 7. 27. 24. 1.888175 8. 1. 50. 2.186724 8. 2. 44. 2.131526 8. 3. 102. 2.521855 8. 4. 72. 2.352158 8. 5. 22. 1.855601 8. 6. 39. 2.080717 8. 7. 3. 1.245731 8. 8. 15. 1.718772 8. 9. 197. 2.876691 8. 10. 188. 2.849913 8. 11. 79. 2.396213 8. 12. 88. 2.44848 8. 13. 46. 2.15056 8. 14. 5. 1.37973 8. 15. 5. 1.37973 8. 16. 36. 2.047673 8. 17. 22. 1.855601 8. 18. 139. 2.682891 8. 19. 210. 2.913693 8. 20. 97. 2.496631 8. 21. 30. 1.97435 8. 22. 23. 1.872171 8. 23. 13. 1.670278 8. 24. 14. 
1.695218 9. 1. 359. 3.243537 9. 2. 9. 1.551846 9. 3. 12. 1.643752 9. 4. 270. 3.063887 9. 5. 603. 3.598019 9. 6. 3. 1.245731 9. 7. 104. 2.531668 9. 8. 2. 1.148698 9. 9. 438. 3.375164 10. 1. 50. 2.186724 10. 2. 254. 3.026682 10. 3. 5. 1.37973 10. 4. 283. 3.092839 10. 5. 35. 2.036168 10. 6. 12. 1.643752 11. 1. 487. 3.447512 11. 2. 18. 1.782602 11. 3. 100. 2.511886 11. 4. 7. 1.475773 11. 5. 98. 2.501758 11. 6. 5. 1.37973 11. 7. 85. 2.431553 11. 8. 91. 2.464951 11. 9. 43. 2.121747 11. 10. 230. 2.967191 11. 11. 3. 1.245731 11. 12. 130. 2.647212 12. 1. 102. 2.521855 12. 2. 209. 2.910913 12. 3. 14. 1.695218 12. 4. 57. 2.244786 12. 5. 54. 2.220643 12. 6. 32. 2. 12. 7. 67. 2.318542 12. 8. 59. 2.260322 12. 9. 134. 2.663305 12. 10. 152. 2.731296 12. 11. 27. 1.933182 12. 12. 14. 1.695218 12. 13. 230. 2.967191 12. 14. 66. 2.311579 12. 15. 61. 2.275443 12. 16. 34. 2.024397 """ # In[52]: ''' 15_SystemsPyMC.py Estimating the mean and standard deviation of a Gaussian likelihood with a hierarchical model. ''' # THE DATA. # Load the aircraft data: data = np.genfromtxt(StringIO(Systems), skip_header=True) n_subj = len(set(data[:,0])) # Put it into generic variables so easier to change data in other applications: y = data[:,3] subj = data[:,0].astype(int) ## Specify the model in PyMC with pm.Model() as model: # define the HyperPriors muG = pm.Normal('muG', mu=2.3, tau=0.1) tauG = pm.Gamma('tauG', 1, .5) m = pm.Gamma('m', 1, .25) d = pm.Gamma('d', 1, .5) sG = m**2 / d**2 rG = m / d**2 # define the priors tau = pm.Gamma('tau', sG, rG, shape=n_subj) mu = pm.Normal('mu', mu=muG, tau=tauG, shape=n_subj) # define the likelihood y = pm.Normal('y', mu=mu[subj-1], tau=tau[subj-1], observed=y) # Generate a MCMC chain #start = pm.find_MAP() #step = pm.Metropolis() step = pm.Metropolis() trace = pm.sample(20000, step, progressbar=False) # EXAMINE THE RESULTS burnin = 5000 thin = 100 ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace[burnin::thin], vars =[mu, tau]) #pm.autocorrplot(trace, vars =[mu, tau]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) ## Extract chains muG_sample = trace['muG'][burnin::thin] tauG_sample = trace['tauG'][burnin::thin] m_sample = trace['m'][burnin::thin] d_sample = trace['d'][burnin::thin] # Plot the hyperdistributions: plt.figure(figsize=(20, 5)) plt.subplot(1, 4, 1) plot_post(muG_sample, xlab=r'$\mu_g$', bins=30, show_mode=False) plt.subplot(1, 4, 2) plot_post(tauG_sample, xlab=r'$\tau_g$', bins=30, show_mode=False) plt.subplot(1, 4, 3) plot_post(m_sample, xlab='m', bins=30, show_mode=False) plt.subplot(1, 4, 4) plot_post(d_sample, xlab='d', bins=30, show_mode=False) plt.show() # In[53]: ''' 15_YmetricXsinglePyMC.py Estimating the mean and standard deviation of a Gaussian likelihood. ''' # THE DATA. 
# Generate random data from known parameter values: np.random.seed(4745) true_mu = 100 true_std = 15 y = norm.rvs(true_mu, true_std, 500) # Specify the model in PyMC with pm.Model() as model: # define the priors tau = pm.Gamma('tau', 0.01, 0.01) mu = pm.Normal('mu', mu=0, tau=1E-10) # PyMC support precission and std #define the likelihood y = pm.Normal('y', mu=mu, tau=tau, observed=y) # Generate a MCMC chain start = pm.find_MAP() step = pm.Metropolis() trace = pm.sample(5000, step, start, progressbar=False) # EXAMINE THE RESULTS burnin = 1000 thin = 10 ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace[burnin::thin], vars =[mu, tau]) #pm.autocorrplot(trace, vars =[mu, tau]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) mu_sample = trace['mu'][burnin::thin] tau_sample = trace['tau'][burnin::thin] sigma_sample = 1 / np.sqrt(tau_sample) # Convert precision to std plt.figure(figsize=(16, 6)) plt.subplot(1, 2, 1) plot_post(mu_sample, xlab='mu', bins=30, title='Posterior', show_mode=False) plt.xlim(98, 102) plt.subplot(1, 2, 2) mu_mean = np.mean(mu_sample) sigma_mean = np.mean(sigma_sample) plt.scatter(mu_sample, sigma_sample , c='gray') plt.plot(mu_mean, sigma_mean, 'r*', label=r'$\mu$ = %.1f, $\sigma$ = %.1f' % (mu_mean, sigma_mean)) plt.xlabel('mu') plt.ylabel('sigma') plt.title('Posterior') plt.legend(loc=0) plt.show() # In[54]: """ HtWtDataGenerator.py Random height, weight generator for males and females. Uses parameters from Brainard, J. & Burmaster, D. E. (1992). Bivariate distributions for height and weight of men and women in the United States. Risk Analysis, 12(2), 267-275. John K. Kruschke, January 2008. """ def HtWtDataGenerator(nSubj, rndsd=None): # Specify parameters of multivariate normal (MVN) distributions. # Men: HtMmu = 69.18 HtMsd = 2.87 lnWtMmu = 5.14 lnWtMsd = 0.17 Mrho = 0.42 Mmean = np.array([HtMmu , lnWtMmu]) Msigma = np.array([[HtMsd**2, Mrho * HtMsd * lnWtMsd], [Mrho * HtMsd * lnWtMsd, lnWtMsd**2]]) # Women cluster 1: HtFmu1 = 63.11 HtFsd1 = 2.76 lnWtFmu1 = 5.06 lnWtFsd1 = 0.24 Frho1 = 0.41 prop1 = 0.46 Fmean1 = np.array([HtFmu1, lnWtFmu1]) Fsigma1 = np.array([[HtFsd1**2, Frho1 * HtFsd1 * lnWtFsd1], [Frho1 * HtFsd1 * lnWtFsd1, lnWtFsd1**2]]) # Women cluster 2: HtFmu2 = 64.36 HtFsd2 = 2.49 lnWtFmu2 = 4.86 lnWtFsd2 = 0.14 Frho2 = 0.44 prop2 = 1 - prop1 Fmean2 = np.array([HtFmu2, lnWtFmu2]) Fsigma2 = np.array([[HtFsd2**2 , Frho2 * HtFsd2 * lnWtFsd2], [Frho2 * HtFsd2 * lnWtFsd2 , lnWtFsd2**2]]) # Randomly generate data values from those MVN distributions. if rndsd is not None: np.random.seed(rndsd) datamatrix = np.zeros((nSubj, 3)) # arbitrary coding values maleval = 1 femaleval = 0 for i in range(0, nSubj): # Flip coin to decide sex sex = np.random.choice([maleval, femaleval], replace=True, p=(.5,.5), size=1) if sex == maleval: datum = multivariate_normal.rvs(mean=Mmean, cov=Msigma) if sex == femaleval: Fclust = np.random.choice([1, 2], replace=True, p=(prop1, prop2), size=1) if Fclust == 1: datum = multivariate_normal.rvs(mean=Fmean1, cov=Fsigma1) if Fclust == 2: datum = multivariate_normal.rvs(mean=Fmean2, cov=Fsigma2) datamatrix[i] = np.concatenate([sex, np.round([datum[0], np.exp(datum[1])], 1)]) return datamatrix # In[55]: """ 16_SimpleLinearRegressionPyMC.py Estimating the mean and standard deviation of a Gaussian likelihood with a hierarchical model. """ # THE DATA. 
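# Added check (not in the original script): HtWtDataGenerator above builds each
# 2x2 covariance matrix from two marginal SDs and a correlation as
# [[sd1**2, rho*sd1*sd2], [rho*sd1*sd2, sd2**2]].  Simulated draws recover the
# intended correlation (illustrative parameter values, not the published ones):
import numpy as np
from scipy.stats import multivariate_normal
sd1, sd2, rho = 3.0, 0.2, 0.5
cov_demo = np.array([[sd1**2, rho * sd1 * sd2],
                     [rho * sd1 * sd2, sd2**2]])
draws = multivariate_normal.rvs(mean=[0.0, 0.0], cov=cov_demo, size=5000)
print('sample correlation = %.2f (target %.2f)' % (np.corrcoef(draws.T)[0, 1], rho))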
# Simulated height and weight data: n_subj = 30 HtWtData = HtWtDataGenerator(n_subj, rndsd=5678) x = HtWtData[:,1] y = HtWtData[:,2] # Re-center data at mean, to reduce autocorrelation in MCMC sampling. # Standardize (divide by SD) to make initialization easier. x_m = np.mean(x) x_sd = np.std(x) y_m = np.mean(y) y_sd = np.std(y) zx = (x - x_m) / x_sd zy = (y - y_m) / y_sd # THE MODEL with pm.Model() as model: # define the priors tau = pm.Gamma('tau', 0.001, 0.001) beta0 = pm.Normal('beta0', mu=0, tau=1.0E-12) beta1 = pm.Normal('beta1', mu=0, tau=1.0E-12) mu = beta0 + beta1 * zx # define the likelihood yl = pm.Normal('yl', mu=mu, tau=tau, observed=zy) # Generate a MCMC chain start = pm.find_MAP() step = pm.Metropolis() trace = pm.sample(10000, step, start, progressbar=False) # EXAMINE THE RESULTS burnin = 5000 thin = 10 ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace[burnin::thin], vars =[tau]) #pm.autocorrplot(trace, vars =[tau]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) ## Extract chain values: z0 = trace['beta0'] z1 = trace['beta1'] z_tau = trace['tau'] z_sigma = 1 / np.sqrt(z_tau) # Convert precision to SD # Convert to original scale: b1 = z1 * y_sd / x_sd b0 = (z0 * y_sd + y_m - z1 * y_sd * x_m / x_sd) sigma = z_sigma * y_sd # Posterior prediction: # Specify x values for which predicted y's are needed: x_post_pred = np.arange(55, 81) # Define matrix for recording posterior predicted y values at each x value. # One row per x value, with each row holding random predicted y values. post_samp_size = len(b1) y_post_pred = np.zeros((len(x_post_pred), post_samp_size)) # Define matrix for recording HDI limits of posterior predicted y values: y_HDI_lim = np.zeros((len(x_post_pred), 2)) # Generate posterior predicted y values. # This gets only one y value, at each x, for each step in the chain. for chain_idx in range(post_samp_size): y_post_pred[:,chain_idx] = norm.rvs(loc=b0[chain_idx] + b1[chain_idx] * x_post_pred , scale = np.repeat([sigma[chain_idx]], [len(x_post_pred)]), size=len(x_post_pred)) for x_idx in range(len(x_post_pred)): y_HDI_lim[x_idx] = hpd(y_post_pred[x_idx]) ## Display believable beta0 and b1 values plt.figure(figsize=(16,6)) plt.subplot(1, 2, 1) thin_idx = 50 plt.plot(z1[::thin_idx], z0[::thin_idx], 'b.', alpha=0.7) plt.ylabel('Standardized Intercept') plt.xlabel('Standardized Slope') plt.subplot(1, 2, 2) plt.plot(b1[::thin_idx], b0[::thin_idx], 'b.', alpha=0.7) plt.ylabel('Intercept (ht when wt=0)') plt.xlabel('Slope (pounds per inch)') plt.tight_layout() # Display the posterior of the b1: plt.figure(figsize=(16, 8)) plt.subplot(1, 2, 1) plot_post(z1, xlab='Standardized slope', comp_val=0.0, bins=30, show_mode=False) plt.subplot(1, 2, 2) plot_post(b1, xlab='Slope (pounds per inch)', comp_val=0.0, bins=30, show_mode=False) plt.tight_layout() # Display data with believable regression lines and posterior predictions. 
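# Added check of the scale conversion used above: if zy = (y - y_m)/y_sd and
# zx = (x - x_m)/x_sd, then the standardized line zy = z0 + z1*zx is the same as
# y = b0 + b1*x with b1 = z1*y_sd/x_sd and b0 = z0*y_sd + y_m - z1*y_sd*x_m/x_sd.
# Quick numeric verification with arbitrary values:
import numpy as np
z0_demo, z1_demo = 0.3, 0.8
x_m_demo, x_sd_demo, y_m_demo, y_sd_demo = 66.0, 3.0, 160.0, 30.0
x_demo = np.array([60.0, 66.0, 72.0])
y_via_z = (z0_demo + z1_demo * (x_demo - x_m_demo) / x_sd_demo) * y_sd_demo + y_m_demo
b1_demo = z1_demo * y_sd_demo / x_sd_demo
b0_demo = z0_demo * y_sd_demo + y_m_demo - z1_demo * y_sd_demo * x_m_demo / x_sd_demo
print(np.allclose(y_via_z, b0_demo + b1_demo * x_demo))   # -> True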
plt.figure(figsize=(16,6)) # Plot data values: x_rang = np.max(x) - np.min(x) y_rang = np.max(y) - np.min(y) lim_mult = 0.25 x_lim = [np.min(x)-lim_mult*x_rang, np.max(x)+lim_mult*x_rang] y_lim = [np.min(y)-lim_mult*y_rang, np.max(y)+lim_mult*y_rang] plt.plot(x, y, 'k.') plt.title('Data with credible regression lines') plt.xlabel('X (height in inches)') plt.ylabel('Y (weight in pounds)') plt.xlim(x_lim) plt.ylim(y_lim) # Superimpose a smattering of believable regression lines: for i in range(0, len(b0), 100): plt.plot(x, b0[i] + b1[i]*x , c='k', alpha=0.05 ) # Display data with HDIs of posterior predictions. plt.figure(figsize=(16,6)) # Plot data values: y_lim = [np.min(y_HDI_lim), np.max(y_HDI_lim)] plt.plot(x, y, 'k.') plt.xlim(x_lim) plt.ylim(y_lim) plt.xlabel('X (height in inches)') plt.ylabel('Y (weight in pounds)') plt.title('Data with 95% HDI & Mean of Posterior Predictions') # Superimpose posterior predicted 95% HDIs: y_post_pred_ave = np.average(y_post_pred, axis=1) #Book version of the HDI representation #plt.errorbar(x_post_pred,y_post_pred_ave, # yerr=[abs(y_HDI_lim[:,0]-y_post_pred_ave), # abs(y_HDI_lim[:,1]-y_post_pred_ave)], fmt='.') #Smoothed version of the HDI representation x_new = np.linspace(x_post_pred.min(), x_post_pred.max(), 200) y_HDI_lim_smooth = spline(x_post_pred, y_HDI_lim, x_new) plt.plot(x_post_pred, y_post_pred_ave) plt.fill_between(x_new, y_HDI_lim_smooth[:,0], y_HDI_lim_smooth[:,1], alpha=0.3) plt.show() # In[56]: McIntyre1994 = """Brand,Tar,Nic,Wt,CO Alpine,14.1,0.86,0.9853,13.6 BensonAndHedges,16.0,1.06,1.0938,16.6 BullDurham,29.8,2.03,1.1650,23.5 CamelLights,8.0,0.67,0.9280,10.2 Carlton,4.1,0.40,0.9462,5.4 Chesterfield,15.0,1.04,0.8885,15.0 GoldenLights,8.8,0.76,1.0267,9.0 Kent,12.4,0.95,0.9225,12.3 Kool,16.6,1.12,0.9372,16.3 LandM,14.9,1.02,0.8858,15.4 LarkLights,13.7,1.01,0.9643,13.0 Marlboro,15.1,0.90,0.9316,14.4 Merit,7.8,0.57,0.9705,10.0 MultiFilter,11.4,0.78,1.1240,10.2 NewportLights,9.0,0.74,0.8517,9.5 Now,1.0,0.13,0.7851,1.5 OldGold,17.0,1.26,0.9186,18.5 PallMallLight,12.8,1.08,1.0395,12.6 Raleigh,15.8,0.96,0.9573,17.5 SalemUltra,4.5,0.42,0.9106,4.9 Tareyton,14.5,1.01,1.0070,15.9 True,7.3,0.61,0.9806,8.5 ViceroyRichLight,8.6,0.69,0.9693,10.6 VirginiaSlims,15.2,1.02,0.9496,13.9 WinstonLights,12.0,0.82,1.1184,14.9 """ # In[57]: """ 16_SimpleRobustLinearRegressionPyMC.py Simple Robust Linear Regression """ # THE DATA. #cig_data = np.genfromtxt("McIntyre1994data.csv", skip_header=True, delimiter=",") cig_data = np.genfromtxt(StringIO(McIntyre1994), skip_header=True, delimiter=",") n_subj = len(cig_data) x = cig_data[:,3] y = cig_data[:,1] # Re-center data at mean, to reduce autocorrelation in MCMC sampling. # Standardize (divide by SD) to make initialization easier. x_m = np.mean(x) x_sd = np.std(x) y_m = np.mean(y) y_sd = np.std(y) zx = (x - x_m) / x_sd zy = (y - y_m) / y_sd tdf_gain = 1 # 1 for low-baised tdf, 100 for high-biased tdf # THE MODEL with pm.Model() as model: # define the priors udf = pm.Uniform('udf', 0, 1) tdf = 1 - tdf_gain * pm.log(1 - udf) # tdf in [1,Inf). 
tau = pm.Gamma('tau', 0.001, 0.001) beta0 = pm.Normal('beta0', mu=0, tau=1.0E-12) beta1 = pm.Normal('beta1', mu=0, tau=1.0E-12) mu = beta0 + beta1 * zx # define the likelihood yl = pm.T('yl', mu=mu, lam=tau, nu=tdf, observed=zy) # Generate a MCMC chain start = pm.find_MAP() step = pm.Metropolis() trace = pm.sample(20000, step, start, progressbar=False) # EXAMINE THE RESULTS burnin = 1000 thin = 10 ## Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) ## Check for mixing and autocorrelation #pm.autocorrplot(trace[burnin::thin], vars =[tau]) #pm.autocorrplot(trace, vars =[tau]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) # Extract chain values: tdf_samp = 1 - tdf_gain * np.log(1 - trace['udf'][burnin::thin]) tdf_m = np.mean(tdf_samp) z0 = trace["beta0"][burnin::thin] z1 = trace["beta1"][burnin::thin] z_tau = trace["tau"][burnin::thin] z_sigma = 1 / np.sqrt(z_tau) # Convert precision to SD # Convert to original scale: b1 = z1 * y_sd / x_sd b0 = (z0 * y_sd + y_m - z1 * y_sd * x_m / x_sd) sigma = z_sigma * y_sd # Posterior prediction: # Specify x values for which predicted y's are needed: x_rang = np.max(x) - np.min(x) y_rang = np.max(y) - np.min(y) lim_mult = 0.25 x_lim = [np.min(x) - lim_mult * x_rang, np.max(x) + lim_mult * x_rang] #y_lim = [np.min(y) - lim_mult*y_rang, np.max(y) + lim_mult*y_rang] y_lim = [-10, 40] x_post_pred = np.linspace(x_lim[0], x_lim[1], 20) # Define matrix for recording posterior predicted y values at each x value. # One row per x value, with each row holding random predicted y values. post_samp_size = len(b1) y_post_pred = np.zeros((len(x_post_pred), post_samp_size)) # Define matrix for recording HDI limits of posterior predicted y values: y_HDI_lim = np.zeros((len(x_post_pred), 2)) # Generate posterior predicted y values. # This gets only one y value, at each x, for each step in the chain. for chain_idx in range(post_samp_size): y_post_pred[:,chain_idx] = stats_t.rvs(df=np.repeat([tdf_samp[chain_idx]], [len(x_post_pred)]), loc = b0[chain_idx] + b1[chain_idx] * x_post_pred, scale = np.repeat([sigma[chain_idx]], [len(x_post_pred)])) for x_idx in range(len(x_post_pred)): y_HDI_lim[x_idx] = hpd(y_post_pred[x_idx]) # Display believable beta0 and b1 values plt.figure(figsize=(16,6)) thin_idx = 5 plt.plot(b1[::thin_idx], b0[::thin_idx], '.') plt.ylabel("Intercept") plt.xlabel("Slope") # Display the posterior of the b1: plt.figure(figsize=(16,6)) plot_post(b1, xlab=r'Slope ($\Delta$ tar / $\Delta$ weight)', comp_val=0.0, show_mode=False, bins=30) plt.title('Mean tdf = %.2f' % tdf_m) # Display data with believable regression lines and posterior predictions. 
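# Added note on the normality parameter: small tdf means heavy tails, so single
# outlying points have little leverage on the fitted line, while large tdf
# approaches the normal likelihood.  Tail mass beyond 3 scale units for a few df:
from scipy.stats import t as stats_t
for df_demo in (2, 5, 30, 100):
    print('df = %3d : P(|t| > 3) = %.4f' % (df_demo, 2 * stats_t.sf(3, df_demo)))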
plt.figure(figsize=(16,6)) plt.plot(x, y, 'k.') plt.title('Data with credible regression lines') plt.xlabel('weight') plt.ylabel('tar') plt.xlim(x_lim) plt.ylim(y_lim) # Superimpose a smattering of believable regression lines: for i in range(0, len(b0), 5): plt.plot(x, b0[i] + b1[i]*x , c='k', alpha=0.05 ) plt.figure(figsize=(16,6)) # Plot data values: plt.plot(x, y, 'k.') plt.xlim(x_lim) plt.ylim(y_lim) plt.xlabel('weight') plt.ylabel('tar') plt.title('Data with 95% HDI & Mean of Posterior Predictions') # Superimpose posterior predicted 95% HDIs: y_post_pred_med = np.median(y_post_pred, axis=1) #Book version of the HDI representation #plt.errorbar(x_post_pred, y_post_pred_med, # yerr=[abs(y_HDI_lim[:,0]-y_post_pred_med), # abs(y_HDI_lim[:,1]-y_post_pred_med)], fmt='.') #Smoothed version of the HDI representation x_new = np.linspace(x_post_pred.min(), x_post_pred.max(), 200) y_HDI_lim_smooth = spline(x_post_pred, y_HDI_lim, x_new) plt.plot(x_post_pred, y_post_pred_med) plt.fill_between(x_new, y_HDI_lim_smooth[:,0], y_HDI_lim_smooth[:,1], alpha=0.3) plt.show() # In[58]: Guber1999 = """"Alabama" 4.405 17.2 31.144 8 491 538 1029 "Alaska" 8.963 17.6 47.951 47 445 489 934 "Arizona" 4.778 19.3 32.175 27 448 496 944 "Arkansas" 4.459 17.1 28.934 6 482 523 1005 "California" 4.992 24.0 41.078 45 417 485 902 "Colorado" 5.443 18.4 34.571 29 462 518 980 "Connecticut" 8.817 14.4 50.045 81 431 477 908 "Delaware" 7.030 16.6 39.076 68 429 468 897 "Florida" 5.718 19.1 32.588 48 420 469 889 "Georgia" 5.193 16.3 32.291 65 406 448 854 "Hawaii" 6.078 17.9 38.518 57 407 482 889 "Idaho" 4.210 19.1 29.783 15 468 511 979 "Illinois" 6.136 17.3 39.431 13 488 560 1048 "Indiana" 5.826 17.5 36.785 58 415 467 882 "Iowa" 5.483 15.8 31.511 5 516 583 1099 "Kansas" 5.817 15.1 34.652 9 503 557 1060 "Kentucky" 5.217 17.0 32.257 11 477 522 999 "Louisiana" 4.761 16.8 26.461 9 486 535 1021 "Maine" 6.428 13.8 31.972 68 427 469 896 "Maryland" 7.245 17.0 40.661 64 430 479 909 "Massachusetts" 7.287 14.8 40.795 80 430 477 907 "Michigan" 6.994 20.1 41.895 11 484 549 1033 "Minnesota" 6.000 17.5 35.948 9 506 579 1085 "Mississippi" 4.080 17.5 26.818 4 496 540 1036 "Missouri" 5.383 15.5 31.189 9 495 550 1045 "Montana" 5.692 16.3 28.785 21 473 536 1009 "Nebraska" 5.935 14.5 30.922 9 494 556 1050 "Nevada" 5.160 18.7 34.836 30 434 483 917 "New Hampshire" 5.859 15.6 34.720 70 444 491 935 "New Jersey" 9.774 13.8 46.087 70 420 478 898 "New Mexico" 4.586 17.2 28.493 11 485 530 1015 "New York" 9.623 15.2 47.612 74 419 473 892 "North Carolina" 5.077 16.2 30.793 60 411 454 865 "North Dakota" 4.775 15.3 26.327 5 515 592 1107 "Ohio" 6.162 16.6 36.802 23 460 515 975 "Oklahoma" 4.845 15.5 28.172 9 491 536 1027 "Oregon" 6.436 19.9 38.555 51 448 499 947 "Pennsylvania" 7.109 17.1 44.510 70 419 461 880 "Rhode Island" 7.469 14.7 40.729 70 425 463 888 "South Carolina" 4.797 16.4 30.279 58 401 443 844 "South Dakota" 4.775 14.4 25.994 5 505 563 1068 "Tennessee" 4.388 18.6 32.477 12 497 543 1040 "Texas" 5.222 15.7 31.223 47 419 474 893 "Utah" 3.656 24.3 29.082 4 513 563 1076 "Vermont" 6.750 13.8 35.406 68 429 472 901 "Virginia" 5.327 14.6 33.987 65 428 468 896 "Washington" 5.906 20.2 36.151 48 443 494 937 "West Virginia" 6.107 14.8 31.944 17 448 484 932 "Wisconsin" 6.930 15.9 37.746 9 501 572 1073 "Wyoming" 6.160 14.9 31.285 10 476 525 1001 """ # In[59]: """ 17_MultiLinRegressHyperPyMC.py Multiple linear regression with hyperpriors. """ # THE DATA. 
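# Added note (not in the original script): both this cell and the next build the
# linear predictor as pm.dot(beta1, x.values.T), i.e. the vectorized form of
# beta1[0]*x[:,0] + beta1[1]*x[:,1] + ...  A NumPy sketch of the shapes involved,
# with made-up numbers:
import numpy as np
X_demo = np.arange(12.0).reshape(6, 2)     # 6 hypothetical cases, 2 predictors
beta_demo = np.array([0.5, -1.0])          # one slope per predictor
print(np.dot(beta_demo, X_demo.T).shape)   # (6,): one linear predictor value per case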
tdfBgain = 1 dataSource = ["Guber1999", "McIntyre1994", "random"][0] if dataSource == "Guber1999": fname = "Guber1999" # file name for saved graphs #data = pd.read_csv('Guber1999data.txt', sep='\s+', data = pd.read_csv(StringIO(Guber1999), sep='\s+', names = ["State","Spend","StuTchRat","Salary", "PrcntTake","SATV","SATM","SATT"]) # Specify variables to be used in BUGS analysis: predicted_name = "SATT" predictor_names = ["Spend" , "PrcntTake"] n_data = len(data) y = data[predicted_name] x = data[predictor_names] n_predictors = len(x.columns) if dataSource == "McIntyre1994": fname = "McIntyre1994" # file name for saved graphs data = pd.read_csv(StringIO(McIntyre1994)) predicted_name = "CO" predictor_names = ["Tar","Nic","Wt"] n_data = len(data) y = data[predicted_name] x = data[predictor_names] n_data = len(data) if dataSource == "random": fname = "Random" # file name for saved graphs # Generate random data. # True parameter values: beta_true = np.repeat(0, 21) beta_true = np.insert(beta_true, [0,0,0], [100, 1, 2]) # beta0 is first component n_predictors = len(beta_true) - 1 sd_true = 2 tau_true = 1/sd_true**2 # Random X values: np.random.seed(47405) xM = 5 xSD = 2 n_data = 100 x = norm.rvs(xM, xSD, n_predictors*n_data).reshape(100, -1) x = pd.DataFrame(x, columns=['X%s' % i for i in range(0, n_predictors)]) # Random Y values generated from linear model with true parameter values: y = np.sum(x * beta_true[1:].T, axis=1) + beta_true[0] + norm.rvs(0, sd_true, n_data) # Select which predictors to include include_only = range(0, n_predictors) # default is to include all #x = x.iloc[include_only] predictor_names = x.columns n_predictors = len(predictor_names) # THE MODEL with pm.Model() as model: # define hyperpriors muB = pm.Normal('muB', 0,.100 ) tauB = pm.Gamma('tauB', .01, .01) udfB = pm.Uniform('udfB', 0, 1) tdfB = 1 + tdfBgain * (-pm.log(1 - udfB)) # define the priors tau = pm.Gamma('tau', 0.01, 0.01) beta0 = pm.Normal('beta0', mu=0, tau=1.0E-12) beta1 = pm.T('beta1', mu=muB, lam=tauB, nu=tdfB, shape=n_predictors) mu = beta0 + pm.dot(beta1, x.values.T) # define the likelihood #mu = beta0 + beta1[0] * x.values[:,0] + beta1[1] * x.values[:,1] yl = pm.Normal('yl', mu=mu, tau=tau, observed=y) # Generate a MCMC chain start = pm.find_MAP() step1 = pm.NUTS([beta1]) step2 = pm.Metropolis([beta0, tau, muB, tauB, udfB]) trace = pm.sample(10000, [step1, step2], start, progressbar=False) # EXAMINE THE RESULTS burnin = 2000 thin = 1 # Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) # Check for mixing and autocorrelation #pm.autocorrplot(trace[burnin::thin], vars =[mu, tau]) #pm.autocorrplot(trace, vars =[beta0]) ## Plot KDE and sampled values for each parameter. 
#pm.traceplot(trace[burnin::thin])
#pm.traceplot(trace)

# Extract chain values:
b0_samp = trace['beta0'][burnin::thin]
b_samp = trace['beta1'][burnin::thin]
tau_samp = trace['tau'][burnin::thin]
sigma_samp = 1 / np.sqrt(tau_samp)  # Convert precision to SD
chain_length = len(tau_samp)

if n_predictors >= 6:  # don't display if too many predictors
    n_predictors = 6

columns = ['Sigma y', 'Intercept']
[columns.append('Slope_%s' % i) for i in predictor_names[:n_predictors]]
traces = np.array([sigma_samp, b0_samp, b_samp[:,0], b_samp[:,1]]).T
df = pd.DataFrame(traces, columns=columns)
sns.set_style('dark')
g = sns.PairGrid(df)
g.map(plt.scatter)

## Display the posterior:
sns.set_style('darkgrid')
plt.figure(figsize=(16,4))
plt.subplot(1, n_predictors+2, 1)
plot_post(sigma_samp, xlab=r'$\sigma y$', show_mode=False, framealpha=0.5)
plt.subplot(1, n_predictors+2, 2)
plot_post(b0_samp, xlab='Intercept', show_mode=False, framealpha=0.5)
for i in range(0, n_predictors):
    plt.subplot(1, n_predictors+2, 3+i)
    plot_post(b_samp[:,i], xlab='Slope_%s' % predictor_names[i],
              show_mode=False, framealpha=0.5, comp_val=0)
plt.tight_layout()

# Posterior prediction:
# Define matrix for recording posterior predicted y values for each xPostPred.
# One row per xPostPred value, with each row holding random predicted y values.
y_post_pred = np.zeros((len(x), chain_length))
# Define matrix for recording HDI limits of posterior predicted y values:
y_HDI_lim = np.zeros((len(x), 2))
# Generate posterior predicted y values.
# This gets only one y value, at each x, for each step in the chain.
for chain_idx in range(chain_length):
    y_post_pred[:,chain_idx] = norm.rvs(loc=b0_samp[chain_idx] + np.dot(b_samp[chain_idx], x.values.T),
                                        scale=np.repeat([sigma_samp[chain_idx]], [len(x)]))
for x_idx in range(len(x)):
    y_HDI_lim[x_idx] = hpd(y_post_pred[x_idx])

#for i in range(len(x)):
#    print np.mean(y_post_pred, axis=1)[i], y_HDI_lim[i]

plt.show()


# In[60]:

"""
17_MultipleLinearRegressionPyMC.py
Multiple linear regression
"""

# THE DATA.
dataSource = ["Guber1999", "McIntyre1994", "random"][0]

if dataSource == "Guber1999":
    fname = "Guber1999"  # file name for saved graphs
    data = pd.read_csv(StringIO(Guber1999), sep='\s+',
                       names=["State", "Spend", "StuTchRat", "Salary",
                              "PrcntTake", "SATV", "SATM", "SATT"])
    # Specify variables to be used in BUGS analysis:
    predictedName = "SATT"
    predictorNames = ["Spend", "PrcntTake"]
    nData = len(data)
    y = data[predictedName]
    x = data[predictorNames]
    n_predictors = len(x.columns)

if dataSource == "McIntyre1994":
    fname = "McIntyre1994"  # file name for saved graphs
    data = pd.read_csv(StringIO(McIntyre1994))
    predictedName = "CO"
    predictorNames = ["Tar", "Nic", "Wt"]
    nData = len(data)
    y = data[predictedName]
    x = data[predictorNames]
    nData = len(data)

if dataSource == "random":
    fname = "Random"  # file name for saved graphs
    # Generate random data.
# True parameter values: beta_true = np.repeat(0, 21) beta_true = np.insert(beta_true, [0,0,0], [100, 1, 2]) # beta0 is first component n_predictors = len(beta_true) - 1 sd_true = 2 tau_true = 1/sd_true**2 # Random X values: np.random.seed(47405) xM = 5 xSD = 2 nData = 100 x = norm.rvs(xM, xSD, n_predictors*nData).reshape(100, -1) x = pd.DataFrame(x, columns=['X%s' % i for i in range(0, n_predictors)]) # Random Y values generated from linear model with true parameter values: y = np.sum(x * beta_true[1:].T, axis=1) + beta_true[0] + norm.rvs(0, sd_true, nData) # Select which predictors to include includeOnly = range(0, n_predictors) # default is to include all #x = x.iloc[includeOnly] predictorNames = x.columns n_predictors = len(predictorNames) # THE MODEL with pm.Model() as model: # define the priors beta0 = pm.Normal('beta0', mu=0, tau=1.0E-12) beta1 = pm.Normal('beta1', mu= 0, tau=1.0E-12, shape=n_predictors) tau = pm.Gamma('tau', 0.01, 0.01) mu = beta0 + pm.dot(beta1, x.values.T) # define the likelihood yl = pm.Normal('yl', mu=mu, tau=tau, observed=y) # Generate a MCMC chain start = pm.find_MAP() step1 = pm.NUTS([beta1]) step2 = pm.Metropolis([beta0, tau]) trace = pm.sample(10000, [step1, step2], start, progressbar=False) # EXAMINE THE RESULTS burnin = 5000 thin = 1 # Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) # Check for mixing and autocorrelation #pm.autocorrplot(trace[burnin::thin], vars =[mu, tau]) #pm.autocorrplot(trace, vars =[beta0]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) #pm.traceplot(trace) # Extract chain values: b0_samp = trace['beta0'][burnin::thin] b_samp = trace['beta1'][burnin::thin] Tau_samp = trace['tau'][burnin::thin] Sigma_samp = 1 / np.sqrt(Tau_samp) # Convert precision to SD chain_length = len(Tau_samp) if n_predictors >= 6: # don't display if too many predictors n_predictors == 6 columns = ['Sigma y', 'Intercept'] [columns.append('Slope_%s' % i) for i in predictorNames[:n_predictors]] traces = np.array([Sigma_samp, b0_samp, b_samp[:,0], b_samp[:,1]]).T df = pd.DataFrame(traces, columns=columns) sns.set_style('dark') g = sns.PairGrid(df) g.map(plt.scatter) ## Display the posterior: sns.set_style('darkgrid') plt.figure(figsize=(16,4)) plt.subplot(1, n_predictors+2, 1) plot_post(Sigma_samp, xlab=r'$\sigma y$', show_mode=False, framealpha=0.5) plt.subplot(1, n_predictors+2, 2) plot_post(b0_samp, xlab='Intercept', show_mode=False, framealpha=0.5) for i in range(0, n_predictors): plt.subplot(1, n_predictors+2, 3+i) plot_post(b_samp[:,i], xlab='Slope_%s' % predictorNames[i], show_mode=False, framealpha=0.5, comp_val=0) plt.tight_layout() # Posterior prediction: # Define matrix for recording posterior predicted y values for each xPostPred. # One row per xPostPred value, with each row holding random predicted y values. y_post_pred = np.zeros((len(x), chain_length)) # Define matrix for recording HDI limits of posterior predicted y values: y_HDI_lim = np.zeros((len(x), 2)) # Generate posterior predicted y values. # This gets only one y value, at each x, for each step in the chain. 
#or chain_idx in range(chain_length): for chain_idx in range(chain_length): y_post_pred[:,chain_idx] = norm.rvs(loc = b0_samp[chain_idx] + np.dot(b_samp[chain_idx], x.values.T), scale = np.repeat([Sigma_samp[chain_idx]], [len(x)])) for x_idx in range(len(x)): y_HDI_lim[x_idx] = hpd(y_post_pred[x_idx]) #for i in range(len(x)): # print np.mean(y_post_pred, axis=1)[i], y_HDI_lim[i] plt.show() # In[61]: McDonaldSK1991="""Group Size Site 1 0.0571 OregonT 1 0.0813 OregonT 1 0.0831 OregonT 1 0.0976 OregonT 1 0.0817 OregonT 1 0.0859 OregonT 1 0.0735 OregonT 1 0.0659 OregonT 1 0.0923 OregonT 1 0.0836 OregonT 2 0.0873 OregonN 2 0.0662 OregonN 2 0.0672 OregonN 2 0.0819 OregonN 2 0.0749 OregonN 2 0.0649 OregonN 2 0.0835 OregonN 2 0.0725 OregonN 3 0.0974 Alaska 3 0.1352 Alaska 3 0.0817 Alaska 3 0.1016 Alaska 3 0.0968 Alaska 3 0.1064 Alaska 3 0.1050 Alaska 4 0.1033 Russia 4 0.0915 Russia 4 0.0781 Russia 4 0.0685 Russia 4 0.0677 Russia 4 0.0697 Russia 4 0.0764 Russia 4 0.0689 Russia 5 0.0703 Finland 5 0.1026 Finland 5 0.0956 Finland 5 0.0973 Finland 5 0.1039 Finland 5 0.1045 Finland """ # In[62]: SolariLS2008="""Type Acid 1 7.12 1 7.16 1 4.57 1 3.79 1 4.20 1 5.84 1 5.56 1 5.02 1 3.69 1 2.99 1 4.99 1 2.16 2 4.42 2 5.68 2 5.15 2 3.83 2 3.30 2 4.44 2 3.51 2 4.60 2 4.85 2 4.84 2 5.45 2 4.71 3 6.49 3 8.09 3 8.79 3 8.44 3 6.11 3 5.17 3 8.13 3 7.58 3 6.47 3 5.45 3 6.18 3 4.34 4 8.07 4 2.86 4 6.84 4 6.85 4 4.12 4 3.32 4 1.74 4 1.74 4 1.57 4 3.02 4 5.08 4 4.96 5 8.05 5 5.82 5 2.47 5 3.28 5 5.38 5 3.98 5 6.08 5 6.28 5 5.72 5 2.88 5 6.40 5 4.58 6 5.09 6 4.57 6 6.06 6 4.87 6 4.52 6 5.08 6 4.29 6 6.19 6 3.45 6 5.85 6 2.51 6 4.93 7 5.87 7 5.36 7 5.85 7 6.27 7 5.96 7 4.95 7 5.85 7 4.70 7 1.53 7 3.88 7 2.88 7 2.07 8 6.57 8 5.08 8 5.95 8 7.51 8 3.79 8 4.33 8 3.70 8 5.21 8 4.48 8 5.17 8 4.69 8 2.12 9 4.13 9 7.31 9 4.47 9 2.53 9 3.96 9 5.30 9 2.66 9 4.12 9 3.54 9 2.98 9 5.08 9 5.15 """ # In[63]: """ 18_ANOVAonewayNonhomogvarBrugs.py One way BANOVA Non Homogeneous Variance """ # THE DATA. # Specify data source: dataSource = ["McDonaldSK1991" , "SolariLS2008" , "Random"][0] # Load the data: if dataSource == "McDonaldSK1991": datarecord = pd.read_csv(StringIO(McDonaldSK1991), sep='\s+') y = datarecord['Size'] Ntotal = len(y) x = (datarecord['Group'] - 1).values xnames = pd.unique(datarecord['Site']) NxLvl = len(xnames) contrast_dict = {'BIGvSMALL':[-1/3,-1/3,1/2,-1/3,1/2], 'ORE1vORE2': [1,-1,0,0,0], 'ALAvORE':[-1/2,-1/2,1,0,0], 'NPACvORE':[-1/2,-1/2,1/2,1/2,0], 'USAvRUS':[1/3,1/3,1/3,-1,0], 'FINvPAC':[-1/4,-1/4,-1/4,-1/4,1], 'ENGvOTH':[1/3,1/3,1/3,-1/2,-1/2], 'FINvRUS':[0,0,0,-1,1]} if dataSource == "SolariLS2008": datarecord = pd.read_csv(StringIO(SolariLS2008), sep='\s+') y = datarecord['Acid'] Ntotal = len(y) x = (datarecord['Type'] - 1).values xnames = pd.unique(x) NxLvl = len(xnames) contrast_dict = {'G3vOTHER':[-1/8,-1/8,1,-1/8,-1/8,-1/8,-1/8,-1/8,-1/8]} if dataSource == "Random": np.random.seed(47405) ysdtrue = 4.0 a0true = 100 atrue = [2, -2] # sum to zero npercell = 8 x = [] y = [] for xidx in range(len(atrue)): for subjidx in range(npercell): x.append(xidx) y.append(a0true + atrue[xidx] + norm.rvs(1, ysdtrue)) Ntotal = len(y) NxLvl = len(set(x)) # # Construct list of all pairwise comparisons, to compare with NHST TukeyHSD: contrast_dict = None for g1idx in range(NxLvl): for g2idx in range(g1idx+1, NxLvl): cmpVec = np.repeat(0, NxLvl) cmpVec[g1idx] = -1 cmpVec[g2idx] = 1 contrast_dict = (contrast_dict, cmpVec) z = (y - np.mean(y))/np.std(y) ## THE MODEL. 
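# Added note for the model below (not in the original script): a Gamma
# distribution with mean m and standard deviation d has shape m**2/d**2 and rate
# m/d**2, which is why sG and rG are defined that way.  Quick check with scipy,
# using illustrative values of m and d:
from scipy.stats import gamma as stats_gamma
m_demo, d_demo = 2.0, 0.5
sG_demo, rG_demo = m_demo**2 / d_demo**2, m_demo / d_demo**2
print('mean = %.2f, sd = %.2f' % (stats_gamma.mean(sG_demo, scale=1.0 / rG_demo),
                                  stats_gamma.std(sG_demo, scale=1.0 / rG_demo)))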
with pm.Model() as model: # define the hyperpriors a_SD_unabs = pm.T('a_SD_unabs', mu=0, lam=0.001, nu=1) a_SD = abs(a_SD_unabs) + 0.1 atau = 1 / a_SD**2 m = pm.Gamma('m', 1, 1) d = pm.Gamma('d', 1, 1) sG = m**2 / d**2 rG = m / d**2 # define the priors sigma = pm.Uniform('sigma', 0, 10) # y values are assumed to be standardized tau = pm.Gamma('tau', sG, rG) a0 = pm.Normal('a0', mu=0, tau=0.001) # y values are assumed to be standardized a = pm.Normal('a', mu=0 , tau=atau, shape=NxLvl) mu = a0 + a # define the likelihood yl = pm.Normal('yl', mu[x], tau=tau, observed=z) # Generate a MCMC chain start = pm.find_MAP() steps = pm.Metropolis() trace = pm.sample(20000, steps, start, progressbar=False) # EXAMINE THE RESULTS burnin = 2000 thin = 50 # Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) # Check for mixing and autocorrelation #plt.figure(figsize=(16,6)) pm.autocorrplot(trace[burnin::thin], vars=model.unobserved_RVs[:-1]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) pm.traceplot(trace) a0_sample = trace['a0'][burnin::thin] a_sample = trace['a'][burnin::thin] # Convert baseline to the original scale m_sample = a0_sample.repeat(NxLvl).reshape(len(a0_sample), NxLvl) + a_sample b0_sample = m_sample.mean(axis=1) b0_sample = b0_sample * np.std(y) + np.mean(y) # Convert baseline to the original scale n_sample = b0_sample.repeat(NxLvl).reshape(len(b0_sample), NxLvl) b_sample = (m_sample - n_sample) b_sample = b_sample * np.std(y) plt.figure(figsize=(20, 4)) for i in range(5): plt.subplot(1, 5, i+1) plot_post(b_sample[:,i], xlab=r'$\beta1_{%s}$' % i, show_mode=False, framealpha=0.5, bins=50, title='x:%s' % i) plt.tight_layout() nContrasts = len(contrast_dict) if nContrasts > 0: plt.figure(figsize=(20, 8)) count = 0 for key, value in contrast_dict.items(): contrast = np.dot(b_sample, value) plt.subplot(2, 4, count) plot_post(contrast, title='Contrast %s' % key, comp_val=0.0, show_mode=False, framealpha=0.5, bins=50) count += 1 plt.tight_layout() plt.show() # In[64]: """ 18_ANOVAonewayPyMC.py One way BANOVA """ # THE DATA. 
# Specify data source: dataSource = ["McDonaldSK1991" , "SolariLS2008" , "Random"][0] # Load the data: if dataSource == "McDonaldSK1991": datarecord = pd.read_csv(StringIO(McDonaldSK1991), sep='\s+') y = datarecord['Size'] Ntotal = len(y) x = (datarecord['Group'] - 1).values xnames = pd.unique(datarecord['Site']) NxLvl = len(xnames) contrast_dict = {'BIGvSMALL':[-1/3,-1/3,1/2,-1/3,1/2], 'ORE1vORE2': [1,-1,0,0,0], 'ALAvORE':[-1/2,-1/2,1,0,0], 'NPACvORE':[-1/2,-1/2,1/2,1/2,0], 'USAvRUS':[1/3,1/3,1/3,-1,0], 'FINvPAC':[-1/4,-1/4,-1/4,-1/4,1], 'ENGvOTH':[1/3,1/3,1/3,-1/2,-1/2], 'FINvRUS':[0,0,0,-1,1]} if dataSource == "SolariLS2008": datarecord = pd.read_csv(StringIO(SolariLS2008), sep='\s+') y = datarecord['Acid'] Ntotal = len(y) x = (datarecord['Type'] - 1).values xnames = pd.unique(x) NxLvl = len(xnames) contrast_dict = {'G3vOTHER':[-1/8,-1/8,1,-1/8,-1/8,-1/8,-1/8,-1/8,-1/8]} if dataSource == "Random": np.random.seed(47405) ysdtrue = 4.0 a0true = 100 atrue = [2, -2] # sum to zero npercell = 8 x = [] y = [] for xidx in range(len(atrue)): for subjidx in range(npercell): x.append(xidx) y.append(a0true + atrue[xidx] + norm.rvs(1, ysdtrue)) Ntotal = len(y) NxLvl = len(set(x)) # # Construct list of all pairwise comparisons, to compare with NHST TukeyHSD: contrast_dict = None for g1idx in range(NxLvl): for g2idx in range(g1idx+1, NxLvl): cmpVec = np.repeat(0, NxLvl) cmpVec[g1idx] = -1 cmpVec[g2idx] = 1 contrast_dict = (contrast_dict, cmpVec) z = (y - np.mean(y))/np.std(y) ## THE MODEL. with pm.Model() as model: # define the hyperpriors a_SD_unabs = pm.T('a_SD_unabs', mu=0, lam=0.001, nu=1) a_SD = abs(a_SD_unabs) + 0.1 atau = 1 / a_SD**2 # define the priors sigma = pm.Uniform('sigma', 0, 10) # y values are assumed to be standardized tau = 1 / sigma**2 a0 = pm.Normal('a0', mu=0, tau=0.001) # y values are assumed to be standardized a = pm.Normal('a', mu=0 , tau=atau, shape=NxLvl) mu = a0 + a # define the likelihood yl = pm.Normal('yl', mu[x], tau=tau, observed=z) # Generate a MCMC chain start = pm.find_MAP() steps = pm.Metropolis() trace = pm.sample(20000, steps, start, progressbar=False) # EXAMINE THE RESULTS burnin = 2000 thin = 50 # Print summary for each trace #pm.summary(trace[burnin::thin]) #pm.summary(trace) # Check for mixing and autocorrelation pm.autocorrplot(trace[burnin::thin], vars=model.unobserved_RVs[:-1]) ## Plot KDE and sampled values for each parameter. #pm.traceplot(trace[burnin::thin]) pm.traceplot(trace) a0_sample = trace['a0'][burnin::thin] a_sample = trace['a'][burnin::thin] # Convert baseline to the original scale m_sample = a0_sample.repeat(NxLvl).reshape(len(a0_sample), NxLvl) + a_sample b0_sample = m_sample.mean(axis=1) b0_sample = b0_sample * np.std(y) + np.mean(y) # Convert baseline to the original scale n_sample = b0_sample.repeat(NxLvl).reshape(len(b0_sample), NxLvl) b_sample = (m_sample - n_sample) b_sample = b_sample * np.std(y) plt.figure(figsize=(20, 4)) for i in range(5): plt.subplot(1, 5, i+1) plot_post(b_sample[:,i], xlab=r'$\beta1_{%s}$' % i, show_mode=False, framealpha=0.5, bins=50, title='x:%s' % i) plt.tight_layout() nContrasts = len(contrast_dict) if nContrasts > 0: plt.figure(figsize=(20, 8)) count = 0 for key, value in contrast_dict.items(): contrast = np.dot(b_sample, value) plt.subplot(2, 4, count) plot_post(contrast, title='Contrast %s' % key, comp_val=0.0, show_mode=False, framealpha=0.5, bins=50) count += 1 plt.tight_layout() plt.show()
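# Added closing sketch (not from the original scripts): the contrast plots above
# reduce the posterior sample of group deflections to one derived value per MCMC
# step via a dot product with the contrast weights.  In miniature, with
# hypothetical numbers:
import numpy as np
b_demo = np.array([[0.10, -0.05, 0.02, -0.07, 0.00],    # one row per MCMC step,
                   [0.12, -0.01, 0.01, -0.09, -0.03]])  # one column per group
weights_demo = np.array([1, -1, 0, 0, 0])               # e.g. group 1 minus group 2
print(np.dot(b_demo, weights_demo))                     # posterior sample of the contrast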