Think Bayes

Second Edition

Copyright 2020 Allen B. Downey

License: Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)

In [1]:
# If we're running on Colab, install empiricaldist
# https://pypi.org/project/empiricaldist/

import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    !pip install empiricaldist
In [2]:
# Get utils.py and create directories

import os

if not os.path.exists('utils.py'):
    !wget https://github.com/AllenDowney/ThinkBayes2/raw/master/soln/utils.py
if not os.path.exists('figs'):
    !mkdir figs
In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from empiricaldist import Pmf, Cdf
from utils import decorate, savefig

Univariate normal

Generate data

In [4]:
from scipy.stats import norm

data = norm(10, 2).rvs(20)
data
Out[4]:
array([ 9.19413254,  7.93174418,  9.26052972, 11.09719199,  8.40716297,
       10.28738107,  9.40427944,  7.29124596,  9.94310494, 10.34056147,
        8.25335541,  9.52849993,  8.49452561, 11.50807398,  9.39053403,
        8.14435859, 15.22433046,  6.37571855, 13.72639858,  9.00185182])
In [5]:
n = len(data)
xbar = np.mean(data)
s2 = np.var(data)    # population variance (ddof=0), so n * s2 is the sum of squared deviations

n, xbar, s2
Out[5]:
(20, 9.640249062837732, 4.073028457273443)

Grid algorithm

In [6]:
mus = np.linspace(8, 12, 101)
prior_mu = Pmf(1, mus)
prior_mu.index.name = 'mu'
In [7]:
sigmas = np.linspace(0.01, 5, 100)
ps = sigmas**-2    # uninformative prior, proportional to 1/sigma^2
prior_sigma = Pmf(ps, sigmas)
prior_sigma.index.name = 'sigma'
In [8]:
from utils import make_joint

prior = make_joint(prior_mu, prior_sigma)
In [9]:
from utils import normalize

def update_norm(prior, data):
    """Update the prior based on data.
    
    prior: joint distribution of mu and sigma
    data: sequence of observations
    """
    # X is mu (columns), Y is sigma (rows), Z is the data along axis 2;
    # the product over axis 2 multiplies the likelihoods of the observations
    X, Y, Z = np.meshgrid(prior.columns, prior.index, data)
    likelihood = norm(X, Y).pdf(Z).prod(axis=2)

    posterior = prior * likelihood
    normalize(posterior)

    return posterior
In [10]:
posterior = update_norm(prior, data)
In [11]:
from utils import marginal

posterior_mu_grid = marginal(posterior, 0)
posterior_sigma_grid = marginal(posterior, 1)
In [12]:
posterior_mu_grid.plot()
decorate(title='Posterior distribution of mu')
In [13]:
posterior_sigma_grid.plot(color='C1')
decorate(title='Posterior distribution of sigma')

Update

Mostly following notation in Murphy, Conjugate Bayesian analysis of the Gaussian distribution
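
The parameters of the normal-inverse-gamma posterior are:

$$m_n = \frac{\kappa_0 m_0 + n \bar{x}}{\kappa_0 + n} \qquad \kappa_n = \kappa_0 + n$$

$$\alpha_n = \alpha_0 + \frac{n}{2} \qquad \beta_n = \beta_0 + \frac{n s^2}{2} + \frac{\kappa_0 \, n \, (\bar{x} - m_0)^2}{2(\kappa_0 + n)}$$

where $s^2$ is the variance computed with `ddof=0`, so $n s^2$ is the sum of squared deviations.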

In [14]:
m0 = 0
kappa0 = 0
alpha0 = 0
beta0 = 0
In [15]:
m_n = (kappa0 * m0 + n * xbar) / (kappa0 + n)
m_n
Out[15]:
9.640249062837732
In [16]:
kappa_n = kappa0 + n
kappa_n
Out[16]:
20
In [17]:
alpha_n = alpha0 + n/2
alpha_n
Out[17]:
10.0
In [18]:
beta_n = beta0 + n*s2/2 + n * kappa0 * (xbar-m0)**2 / (kappa0 + n)  / 2
beta_n
Out[18]:
40.73028457273443
In [19]:
def update_normal(prior, summary):
    m0, kappa0, alpha0, beta0 = prior
    n, xbar, s2 = summary

    m_n = (kappa0 * m0 + n * xbar) / (kappa0 + n)
    kappa_n = kappa0 + n
    alpha_n = alpha0 + n/2
    beta_n = (beta0 + n*s2/2 + 
              n * kappa0 * (xbar-m0)**2 / (kappa0 + n) / 2)

    return m_n, kappa_n, alpha_n, beta_n
In [20]:
prior = 0, 0, 0, 0
summary = n, xbar, s2
update_normal(prior, summary)
Out[20]:
(9.640249062837732, 20, 10.0, 40.73028457273443)

Posterior distribution of sigma
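
According to Murphy, the marginal posterior distribution of the variance is inverse gamma with shape $\alpha_n$ and scale $\beta_n$:

$$\sigma^2 \sim \mathrm{InvGamma}(\alpha_n, \beta_n)$$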

In [21]:
from scipy.stats import invgamma

dist_sigma2 = invgamma(alpha_n, scale=beta_n)
In [22]:
dist_sigma2.mean()
Out[22]:
4.52558717474827
In [23]:
dist_sigma2.std()
Out[23]:
1.6000366900576855
In [24]:
sigma2s = np.linspace(0.01, 20, 101)
ps = dist_sigma2.pdf(sigma2s)
posterior_sigma2_invgammas = Pmf(ps, sigma2s)
posterior_sigma2_invgammas.normalize()
Out[24]:
5.002242322922353
In [25]:
posterior_sigma2_invgammas.plot()
decorate(xlabel=r'$\sigma^2$',
         ylabel='PDF',
         title='Posterior distribution of variance')
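
Because the grid of `sigma2s` is equally spaced, each point carries probability mass proportional to its density; taking square roots relabels the grid points without changing their masses, so we can reuse `ps` to get the distribution of $\sigma$.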
In [26]:
sigmas = np.sqrt(sigma2s)
posterior_sigma_invgammas = Pmf(ps, sigmas)
posterior_sigma_invgammas.normalize()
Out[26]:
5.002242322922353
In [27]:
posterior_sigma_grid.make_cdf().plot(color='gray', label='grid')
posterior_sigma_invgammas.make_cdf().plot(color='C1', label='invgamma')

decorate(xlabel=r'$\sigma$',
         ylabel='CDF',
         title='Posterior distribution of standard deviation')
In [28]:
posterior_sigma_invgammas.mean(), posterior_sigma_grid.mean()
Out[28]:
(2.0978773059173537, 2.0974647660359413)
In [29]:
posterior_sigma_invgammas.std(), posterior_sigma_grid.std()
Out[29]:
(0.3514983795094244, 0.35127637938193573)
In [30]:
2 / np.sqrt(2 * (n-1))    # approximate standard error of the sample standard deviation
Out[30]:
0.3244428422615251

Posterior distribution of mu
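
The marginal posterior distribution of the mean is a Student t with $2\alpha_n$ degrees of freedom, location $m_n$, and scale $\sqrt{\beta_n / (\alpha_n \kappa_n)}$.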

In [31]:
from scipy.stats import t as student_t

def make_student_t(df, loc, scale):
    return student_t(df, loc=loc, scale=scale)
In [32]:
df = 2 * alpha_n
precision = alpha_n * kappa_n / beta_n
dist_mu = make_student_t(df, m_n, 1/np.sqrt(precision))
In [33]:
dist_mu.mean()
Out[33]:
9.640249062837732
In [34]:
dist_mu.std()
Out[34]:
0.47568829997952805
In [35]:
np.sqrt(4/n)    # standard error of the mean, sigma / sqrt(n)
Out[35]:
0.4472135954999579
In [36]:
mus = np.linspace(8, 12, 101)
ps = dist_mu.pdf(mus)
posterior_mu_student = Pmf(ps, mus)
posterior_mu_student.normalize()
Out[36]:
24.980970615100787
In [37]:
posterior_mu_student.plot()
decorate(xlabel=r'$\mu$',
         ylabel='PDF',
         title='Posterior distribution of mu')
In [38]:
posterior_mu_grid.make_cdf().plot(color='gray', label='grid')
posterior_mu_student.make_cdf().plot(label='student t')
decorate(xlabel=r'$\mu$',
         ylabel='CDF',
         title='Posterior distribution of mu')
In [39]:
def make_posterior_mu(m_n, kappa_n, alpha_n, beta_n):
    df = 2 * alpha_n
    loc = m_n
    precision = alpha_n * kappa_n / beta_n
    dist_mu = make_student_t(df, loc, 1/np.sqrt(precision))
    return dist_mu

Posterior joint distribution
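
The joint posterior factors as $p(\mu, \sigma^2) = p(\sigma^2) \, p(\mu \mid \sigma^2)$, where $\sigma^2$ follows the inverse gamma distribution above and $\mu \mid \sigma^2 \sim \mathrm{Normal}(m_n, \sigma^2 / \kappa_n)$.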

In [40]:
mu_mesh, sigma2_mesh = np.meshgrid(mus, sigma2s)
In [41]:
joint = (dist_sigma2.pdf(sigma2_mesh) * 
         norm(m_n, np.sqrt(sigma2_mesh/kappa_n)).pdf(mu_mesh))
joint_df = pd.DataFrame(joint, columns=mus, index=sigma2s)
In [42]:
from utils import plot_contour

plot_contour(joint_df)
decorate(xlabel=r'$\mu$',
         ylabel=r'$\sigma^2$',
         title='Posterior joint distribution')

Sampling from posterior predictive
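
To sample the posterior predictive distribution, we draw $\sigma^2$ from its inverse gamma posterior, draw $\mu$ conditioned on $\sigma^2$, and then draw a predictive value from $\mathrm{Normal}(\mu, \sigma^2)$.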

In [43]:
sample_sigma2 = dist_sigma2.rvs(1000)
In [44]:
sample_mu = norm(m_n, np.sqrt(sample_sigma2 / kappa_n)).rvs()
In [45]:
sample_pred = norm(sample_mu, np.sqrt(sample_sigma2)).rvs()
In [46]:
cdf_pred = Cdf.from_seq(sample_pred)
cdf_pred.plot()
In [47]:
sample_pred.mean(), sample_pred.var()
Out[47]:
(9.516734504437707, 4.577270204875544)

Analytic posterior predictive
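
Analytically, the posterior predictive distribution is a Student t with $2\alpha_n$ degrees of freedom, location $m_n$, and scale $\sqrt{\beta_n (\kappa_n + 1) / (\alpha_n \kappa_n)}$.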

In [48]:
df = 2 * alpha_n
precision = alpha_n * kappa_n / beta_n / (kappa_n+1)
dist_pred = make_student_t(df, m_n, 1/np.sqrt(precision))
In [49]:
xs = np.linspace(2, 16, 101)
ys = dist_pred.cdf(xs)

plt.plot(xs, ys, color='gray', label='student t')
cdf_pred.plot(label='sample')

decorate(title='Predictive distribution')
In [50]:
def make_posterior_pred(m_n, kappa_n, alpha_n, beta_n):
    df = 2 * alpha_n
    loc = m_n
    precision = alpha_n * kappa_n / beta_n / (kappa_n+1)
    dist_pred = make_student_t(df, loc, 1/np.sqrt(precision))
    return dist_pred

Multivariate normal

Generate data

In [51]:
mean = [10, 20]

sigma_x = 2
sigma_y = 3
rho = 0.3
cov = rho * sigma_x * sigma_y 

Sigma = [[sigma_x**2, cov], [cov, sigma_y**2]]
Sigma
Out[51]:
[[4, 1.7999999999999998], [1.7999999999999998, 9]]
In [52]:
from scipy.stats import multivariate_normal

n = 20
data = multivariate_normal(mean, Sigma).rvs(n)
data
Out[52]:
array([[ 9.09443543, 17.71994515],
       [16.44233547, 23.6472648 ],
       [12.01135574, 24.03660932],
       [ 9.0748985 , 21.66032914],
       [13.11185908, 18.83369963],
       [13.63682486, 21.49850919],
       [ 8.24152444, 22.92781604],
       [13.56010439, 29.07263218],
       [11.65768605, 20.22149536],
       [ 9.63956732, 16.85039916],
       [ 9.31995128, 15.62154768],
       [ 7.95094601, 14.22500863],
       [ 8.72949594, 20.7225257 ],
       [10.01041019, 21.53598675],
       [10.2989316 , 16.31756551],
       [ 7.95686713, 18.08908348],
       [ 6.84997664, 16.56372388],
       [ 8.88606479, 18.42782995],
       [10.0890154 , 21.17141536],
       [11.62347859, 17.63973527]])
In [53]:
n = len(data)
n
Out[53]:
20
In [54]:
xbar = np.mean(data, axis=0)
xbar
Out[54]:
array([10.40928644, 19.83915611])
In [55]:
S = np.cov(data.transpose())
S
Out[55]:
array([[ 5.77766323,  4.71029497],
       [ 4.71029497, 12.38587614]])
In [56]:
np.corrcoef(data.transpose())
Out[56]:
array([[1.        , 0.55681205],
       [0.55681205, 1.        ]])
In [57]:
stds = np.sqrt(np.diag(S))
stds
Out[57]:
array([2.40367702, 3.51935735])
In [58]:
corrcoef = S / np.outer(stds, stds)
corrcoef
Out[58]:
array([[1.        , 0.55681205],
       [0.55681205, 1.        ]])
In [59]:
def unpack_cov(S):
    stds = np.sqrt(np.diag(S))
    corrcoef = S / np.outer(stds, stds)
    return stds[0], stds[1], corrcoef[0][1]
In [60]:
sigma_x, sigma_y, rho = unpack_cov(S)
sigma_x, sigma_y, rho
Out[60]:
(2.4036770231837497, 3.519357348098385, 0.5568120515289984)
In [61]:
def pack_cov(sigma_x, sigma_y, rho):
    cov = sigma_x * sigma_y * rho
    return np.array([[sigma_x**2, cov], [cov, sigma_y**2]])
In [62]:
pack_cov(sigma_x, sigma_y, rho)
Out[62]:
array([[ 5.77766323,  4.71029497],
       [ 4.71029497, 12.38587614]])
In [63]:
S
Out[63]:
array([[ 5.77766323,  4.71029497],
       [ 4.71029497, 12.38587614]])

Update
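
The update for the normal-inverse-Wishart parameters follows the same pattern as the univariate case:

$$m_n = \frac{\kappa_0 m_0 + n \bar{x}}{\kappa_0 + n} \qquad \kappa_n = \kappa_0 + n \qquad \nu_n = \nu_0 + n$$

$$\Lambda_n = \Lambda_0 + S + \frac{\kappa_0 \, n}{\kappa_0 + n} (\bar{x} - m_0)(\bar{x} - m_0)^T$$

where $S$ is the sample covariance matrix computed above.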

In [64]:
m_0 = 0
Lambda_0 = 0
nu_0 = 0
kappa_0 = 0
In [65]:
m_n = (kappa_0 * m_0 + n * xbar) / (kappa_0 + n)
m_n
Out[65]:
array([10.40928644, 19.83915611])
In [66]:
xbar
Out[66]:
array([10.40928644, 19.83915611])
In [67]:
diff = (xbar - m_0)
D = np.outer(diff, diff)
D
Out[67]:
array([[108.35324426, 206.51145873],
       [206.51145873, 393.59211512]])
In [68]:
Lambda_n = Lambda_0 + S + n * kappa_0 * D / (kappa_0 + n)
Lambda_n
Out[68]:
array([[ 5.77766323,  4.71029497],
       [ 4.71029497, 12.38587614]])
In [69]:
S
Out[69]:
array([[ 5.77766323,  4.71029497],
       [ 4.71029497, 12.38587614]])
In [70]:
nu_n = nu_0 + n
nu_n
Out[70]:
20
In [71]:
kappa_n = kappa_0 + n
kappa_n
Out[71]:
20

Posterior distribution of covariance

In [72]:
from scipy.stats import invwishart

def make_invwishart(nu, Lambda):
    # scale by (nu - d - 1) so the mean of the distribution is Lambda,
    # since the mean of invwishart(nu, Psi) is Psi / (nu - d - 1)
    d, _ = Lambda.shape
    return invwishart(nu, scale=Lambda * (nu - d - 1))
In [73]:
dist_cov = make_invwishart(nu_n, Lambda_n)
In [74]:
dist_cov.mean()
Out[74]:
array([[ 5.77766323,  4.71029497],
       [ 4.71029497, 12.38587614]])
In [75]:
S
Out[75]:
array([[ 5.77766323,  4.71029497],
       [ 4.71029497, 12.38587614]])
In [76]:
sample_Sigma = dist_cov.rvs(1000)
np.mean(sample_Sigma, axis=0)
Out[76]:
array([[ 5.75009385,  4.73096796],
       [ 4.73096796, 12.48177402]])
In [77]:
res = [unpack_cov(Sigma) for Sigma in sample_Sigma]
In [78]:
sample_sigma_x, sample_sigma_y, sample_rho = np.transpose(res)
sample_sigma_x.mean(), sample_sigma_y.mean(), sample_rho.mean()
Out[78]:
(2.3607897260964137, 3.4767864662050987, 0.5480438257464233)
In [79]:
unpack_cov(S)
Out[79]:
(2.4036770231837497, 3.519357348098385, 0.5568120515289984)
In [80]:
Cdf.from_seq(sample_sigma_x).plot(label=r'$\sigma_x$')
Cdf.from_seq(sample_sigma_y).plot(label=r'$\sigma_y$')

decorate(xlabel='Standard deviation',
         ylabel='CDF',
         title='Posterior distribution of standard deviation')
In [81]:
Cdf.from_seq(sample_rho).plot()

decorate(xlabel='Coefficient of correlation',
         ylabel='CDF',
         title='Posterior distribution of correlation')

Evaluate the Inverse Wishart PDF
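
To evaluate the inverse Wishart PDF on a grid, we enumerate combinations of $\sigma_x$, $\sigma_y$, and $\rho$, pack each triple into a covariance matrix, and evaluate the density there.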

In [82]:
num = 51
sigma_xs = np.linspace(0.01, 10, num)
In [83]:
sigma_ys = np.linspace(0.01, 10, num)
In [84]:
rhos = np.linspace(-0.3, 0.9, num)
In [85]:
index = pd.MultiIndex.from_product([sigma_xs, sigma_ys, rhos],
                                   names=['sigma_x', 'sigma_y', 'rho'])
joint = Pmf(0, index)
joint.head()
Out[85]:
sigma_x  sigma_y  rho   
0.01     0.01     -0.300    0
                  -0.276    0
                  -0.252    0
                  -0.228    0
                  -0.204    0
dtype: int64
In [86]:
dist_cov.pdf(S)
Out[86]:
0.006027909520828536
In [87]:
for sigma_x, sigma_y, rho in joint.index:
    Sigma = pack_cov(sigma_x, sigma_y, rho)
    joint.loc[sigma_x, sigma_y, rho] = dist_cov.pdf(Sigma)
    
joint.normalize()
Out[87]:
4.700704063935837
In [88]:
from utils import pmf_marginal

posterior_sigma_x = pmf_marginal(joint, 0)
posterior_sigma_y = pmf_marginal(joint, 1)
marginal_rho = pmf_marginal(joint, 2)

posterior_sigma_x.mean(), posterior_sigma_y.mean(), marginal_rho.mean()
Out[88]:
(2.210173131819553, 3.236037871074533, 0.5138102647160548)
In [89]:
unpack_cov(S)
Out[89]:
(2.4036770231837497, 3.519357348098385, 0.5568120515289984)
In [90]:
posterior_sigma_x.plot(label=r'$\sigma_x$')
posterior_sigma_y.plot(label=r'$\sigma_y$')

decorate(xlabel='Standard deviation',
         ylabel='PDF',
         title='Posterior distribution of standard deviation')
In [91]:
posterior_sigma_x.make_cdf().plot(color='gray')
posterior_sigma_y.make_cdf().plot(color='gray')

Cdf.from_seq(sample_sigma_x).plot(label=r'$\sigma_x$')
Cdf.from_seq(sample_sigma_y).plot(label=r'$\sigma_y$')

decorate(xlabel='Standard deviation',
         ylabel='CDF',
         title='Posterior distribution of standard deviation')
In [92]:
marginal_rho.make_cdf().plot(color='gray')

Cdf.from_seq(sample_rho).plot()

decorate(xlabel='Coefficient of correlation',
         ylabel='CDF',
         title='Posterior distribution of correlation')

Posterior distribution of mu
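
To sample the posterior distribution of $\mu$, we draw $\Sigma$ from the inverse Wishart posterior, then draw $\mu$ conditioned on $\Sigma$ from a multivariate normal with mean $m_n$ and covariance $\Sigma / \kappa_n$.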

In [93]:
m_n
Out[93]:
array([10.40928644, 19.83915611])
In [94]:
sample_mu = [multivariate_normal(m_n, Sigma/kappa_n).rvs()
             for Sigma in sample_Sigma]
In [95]:
sample_mu0, sample_mu1 = np.transpose(sample_mu)

sample_mu0.mean(), sample_mu1.mean()
Out[95]:
(10.417787293315167, 19.808873575990383)
In [96]:
xbar
Out[96]:
array([10.40928644, 19.83915611])
In [97]:
sample_mu0.std(), sample_mu1.std()
Out[97]:
(0.5383503263225001, 0.7858092840001913)
In [98]:
2 / np.sqrt(n), 3 / np.sqrt(n)    # standard errors of the means, sigma / sqrt(n)
Out[98]:
(0.4472135954999579, 0.6708203932499369)
In [99]:
Cdf.from_seq(sample_mu0).plot(label=r'$\mu_0$ sample')
Cdf.from_seq(sample_mu1).plot(label=r'$\mu_1$ sample')

decorate(xlabel=r'$\mu$',
         ylabel='CDF',
         title=r'Posterior distribution of $\mu$')

Multivariate Student t

Let's use the following implementation of the multivariate Student t PDF.

In [100]:
from scipy.special import gammaln

def multistudent_pdf(x, mean, shape, df):
    return np.exp(logpdf(x, mean, shape, df))

def logpdf(x, mean, shape, df):
    p = len(mean)
    # use the eigendecomposition of the shape matrix to compute
    # its log-determinant and the Mahalanobis distance
    vals, vecs = np.linalg.eigh(shape)
    logdet     = np.log(vals).sum()
    valsinv    = np.array([1.0/v for v in vals])
    U          = vecs * np.sqrt(valsinv)
    dev        = x - mean
    maha       = np.square(dev @ U).sum(axis=-1)

    t = 0.5 * (df + p)
    A = gammaln(t)
    B = gammaln(0.5 * df)
    C = p/2. * np.log(df * np.pi)
    D = 0.5 * logdet
    E = -t * np.log(1 + (1./df) * maha)

    return A - B - C - D + E
In [101]:
d = len(m_n)
x = m_n
mean = m_n
df = nu_n - d + 1
shape = Lambda_n / kappa_n
multistudent_pdf(x, mean, shape, df)
Out[101]:
0.4530003997972322
In [102]:
mu0s = np.linspace(8, 12, 91)
mu1s = np.linspace(18, 22, 101)
In [103]:
mu_mesh = np.dstack(np.meshgrid(mu0s, mu1s))
mu_mesh.shape
Out[103]:
(101, 91, 2)
In [104]:
ps = multistudent_pdf(mu_mesh, mean, shape, df)
In [105]:
joint = pd.DataFrame(ps, columns=mu0s, index=mu1s)
normalize(joint)
Out[105]:
549.2874162564223
In [106]:
plot_contour(joint)
Out[106]:
<matplotlib.contour.QuadContourSet at 0x7fb168b42410>
In [107]:
from utils import marginal

posterior_mu0_student = marginal(joint, 0)
posterior_mu1_student = marginal(joint, 1)
In [108]:
posterior_mu0_student.make_cdf().plot(color='gray', label=r'$\mu_0$ multi t')
posterior_mu1_student.make_cdf().plot(color='gray', label=r'$\mu_1$ multi t')

Cdf.from_seq(sample_mu0).plot(label=r'$\mu_0$ sample')
Cdf.from_seq(sample_mu1).plot(label=r'$\mu_1$ sample')

decorate(xlabel=r'$\mu$',
         ylabel='CDF',
         title=r'Posterior distribution of $\mu$')

Compare to analytic univariate distributions
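
As a check, we can update each dimension separately, using the univariate normal-inverse-gamma model with the corresponding sample mean and variance.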

In [109]:
prior = 0, 0, 0, 0
summary = n, xbar[0], S[0][0]
summary
Out[109]:
(20, 10.409286443363563, 5.777663231781492)
In [110]:
params = update_normal(prior, summary)
params
Out[110]:
(10.409286443363563, 20, 10.0, 57.77663231781492)
In [111]:
dist_mu0 = make_posterior_mu(*params)
dist_mu0.mean(), dist_mu0.std()
Out[111]:
(10.409286443363563, 0.5665521076251745)
In [112]:
mu0s = np.linspace(7, 12, 101)
ps = dist_mu0.pdf(mu0s)
posterior_mu0 = Pmf(ps, index=mu0s)
posterior_mu0.normalize()
Out[112]:
19.930298060446894
In [113]:
prior = 0, 0, 0, 0
summary = n, xbar[1], S[1][1]
summary
Out[113]:
(20, 19.839156108992704, 12.385876143614098)
In [114]:
params = update_normal(prior, summary)
params
Out[114]:
(19.839156108992704, 20, 10.0, 123.85876143614098)
In [115]:
dist_mu1 = make_posterior_mu(*params)
dist_mu1.mean(), dist_mu1.std()
Out[115]:
(19.839156108992704, 0.8295204820863575)
In [116]:
mu1s = np.linspace(17, 23, 101)
ps = dist_mu1.pdf(mu1s)
posterior_mu1 = Pmf(ps, index=mu1s)
posterior_mu1.normalize()
Out[116]:
16.64813735166017
In [117]:
posterior_mu0.make_cdf().plot(label=r'$\mu_0$ uni t', color='gray')
posterior_mu1.make_cdf().plot(label=r'$\mu_1$ uni t', color='gray')

Cdf.from_seq(sample_mu0).plot(label=r'$\mu_0$ sample')
Cdf.from_seq(sample_mu1).plot(label=r'$\mu_1$ sample')

decorate(xlabel=r'$\mu$',
         ylabel='CDF',
         title=r'Posterior distribution of $\mu$')

Sampling from posterior predictive

In [118]:
sample_pred = [multivariate_normal(mu, Sigma).rvs()
               for mu, Sigma in zip(sample_mu, sample_Sigma)]
In [119]:
sample_x0, sample_x1 = np.transpose(sample_pred)

sample_x0.mean(), sample_x1.mean()
Out[119]:
(10.343334031620982, 19.79428758822268)
In [120]:
sample_x0.std(), sample_x1.std()
Out[120]:
(2.4491840957223703, 3.5023208219780724)
In [121]:
prior = 0, 0, 0, 0
summary = n, xbar[0], S[0][0]
params = update_normal(prior, summary)
dist_x0 = make_posterior_pred(*params)
dist_x0.mean(), dist_x0.std()
Out[121]:
(10.409286443363563, 2.5962679183291297)
In [122]:
x0s = np.linspace(2, 18, 101)
ps = dist_x0.pdf(x0s)
pred_x0 = Pmf(ps, index=x0s)
pred_x0.normalize()
Out[122]:
6.224974183920316
In [123]:
prior = 0, 0, 0, 0
summary = n, xbar[1], S[1][1]
params = update_normal(prior, summary)
dist_x1 = make_posterior_pred(*params)
dist_x1.mean(), dist_x1.std()
Out[123]:
(19.839156108992704, 3.8013403996769934)
In [124]:
x1s = np.linspace(10, 30, 101)
ps = dist_x1.pdf(x1s)
pred_x1 = Pmf(ps, index=x1s)
pred_x1.normalize()
Out[124]:
4.944574125732073
In [125]:
pred_x0.make_cdf().plot(label=r'$x_0$ student t', color='gray')
pred_x1.make_cdf().plot(label=r'$x_1$ student t', color='gray')

Cdf.from_seq(sample_x0).plot(label=r'$x_0$ sample')
Cdf.from_seq(sample_x1).plot(label=r'$x_1$ sample')

decorate(xlabel='Quantity',
         ylabel='CDF',
         title='Posterior predictive distributions')

Comparing to the multivariate Student t

In [126]:
d = len(m_n)
x = m_n
mean = m_n
df = nu_n - d + 1
shape = Lambda_n * (kappa_n+1) / kappa_n
multistudent_pdf(x, mean, shape, df)
Out[126]:
0.02157144760939201
In [127]:
x0s = np.linspace(0, 20, 91)
x1s = np.linspace(10, 30, 101)
In [128]:
x_mesh = np.dstack(np.meshgrid(x0s, x1s))
x_mesh.shape
Out[128]:
(101, 91, 2)
In [129]:
ps = multistudent_pdf(x_mesh, mean, shape, df)
In [130]:
joint = pd.DataFrame(ps, columns=x0s, index=x1s)
normalize(joint)
Out[130]:
22.23349484051357
In [131]:
plot_contour(joint)
Out[131]:
<matplotlib.contour.QuadContourSet at 0x7fb16888dc50>
In [132]:
from utils import marginal

posterior_x0_student = marginal(joint, 0)
posterior_x1_student = marginal(joint, 1)
In [133]:
posterior_x0_student.make_cdf().plot(color='gray', label=r'$x_0$ multi t')
posterior_x1_student.make_cdf().plot(color='gray', label=r'$x_1$ multi t')

Cdf.from_seq(sample_x0).plot(label=r'$x_0$ sample')
Cdf.from_seq(sample_x1).plot(label=r'$x_1$ sample')

decorate(xlabel='Quantity',
         ylabel='CDF',
         title='Posterior predictive distributions')

Bayesian linear regression

Generate data

In [ ]:
inter, slope = 5, 2
sigma = 3
n = 20
In [ ]:
xs = norm(0, 3).rvs(n)
xs = np.sort(xs)
ys = inter + slope * xs + norm(0, sigma).rvs(n)

plt.plot(xs, ys, 'o');
In [ ]:
import statsmodels.api as sm

X = sm.add_constant(xs)
X
In [ ]:
model = sm.OLS(ys, X)
results = model.fit()
results.summary()
In [ ]:
beta_hat = results.params
beta_hat
In [ ]:
# results.df_model doesn't count the intercept, so set k directly
k = 2
In [ ]:
s2 = results.resid @ results.resid / (n - k)
s2
In [ ]:
s2 = results.ssr / (n - k)
s2
In [ ]:
np.sqrt(s2)

Grid algorithm

In [ ]:
beta0s = np.linspace(2, 8, 71)
prior_inter = Pmf(1, beta0s, name='inter')
prior_inter.index.name = 'Intercept'
In [ ]:
beta1s = np.linspace(1, 3, 61)
prior_slope = Pmf(1, beta1s, name='slope')
prior_slope.index.name = 'Slope'
In [ ]:
sigmas = np.linspace(1, 6, 51)
ps = sigmas**-2    # prior proportional to 1/sigma^2
prior_sigma = Pmf(ps, sigmas, name='sigma')
prior_sigma.index.name = 'Sigma'
prior_sigma.normalize()

prior_sigma.plot()
In [ ]:
from utils import make_joint

def make_joint3(pmf1, pmf2, pmf3):
    """Make a joint distribution with three parameters.
    
    pmf1: Pmf object
    pmf2: Pmf object
    pmf3: Pmf object
    
    returns: Pmf representing a joint distribution
    """
    joint2 = make_joint(pmf2, pmf1).stack()
    joint3 = make_joint(pmf3, joint2).stack()
    return Pmf(joint3)
In [ ]:
prior3 = make_joint3(prior_slope, prior_inter, prior_sigma)
prior3.head()
In [ ]:
from utils import normalize

def update_optimized(prior, data):
    """Posterior distribution of regression parameters
    `slope`, `inter`, and `sigma`.
    
    prior: Pmf representing the joint prior
    data: DataFrame with columns `x` and `y`
    
    returns: Pmf representing the joint posterior
    """
    xs = data['x']
    ys = data['y']
    sigmas = prior.columns
    likelihood = prior.copy()

    for slope, inter in prior.index:
        expected = slope * xs + inter
        resid = ys - expected
        resid_mesh, sigma_mesh = np.meshgrid(resid, sigmas)
        densities = norm.pdf(resid_mesh, 0, sigma_mesh)
        likelihood.loc[slope, inter] = densities.prod(axis=1)
        
    posterior = prior * likelihood
    normalize(posterior)
    return posterior
In [ ]:
data = pd.DataFrame(dict(x=xs, y=ys))
In [ ]:
from utils import normalize

posterior = update_optimized(prior3.unstack(), data)
normalize(posterior)
In [ ]:
from utils import marginal

posterior_sigma_grid = marginal(posterior, 0)
posterior_sigma_grid.plot(label='grid')

decorate(title='Posterior distribution of sigma')
In [ ]:
joint_posterior = marginal(posterior, 1).unstack()
plot_contour(joint_posterior)
In [ ]:
posterior_beta0_grid = marginal(joint_posterior, 0)
posterior_beta1_grid = marginal(joint_posterior, 1)
In [ ]:
posterior_beta0_grid.make_cdf().plot(label=r'$\beta_0$')
posterior_beta1_grid.make_cdf().plot(label=r'$\beta_1$')

decorate(title='Posterior distributions of parameters')

Posterior distribution of sigma

According to Gelman et al., the posterior distribution of $\sigma^2$ is scaled inverse $\chi^2$ with $\nu = n-k$ and scale $s^2$.

According to Wikipedia, that's equivalent to inverse gamma with parameters $\nu/2$ and $\nu s^2 / 2$.
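
That is,

$$\mathrm{Scale\text{-}inv\text{-}}\chi^2(\nu, s^2) = \mathrm{InvGamma}\left(\frac{\nu}{2}, \; \frac{\nu s^2}{2}\right)$$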

In [ ]:
nu = n-k
nu/2, nu*s2/2
In [ ]:
from scipy.stats import invgamma

dist_sigma2 = invgamma(nu/2, scale=nu*s2/2)
dist_sigma2.mean()
In [ ]:
sigma2s = np.linspace(0.01, 30, 101)
ps = dist_sigma2.pdf(sigma2s)
posterior_sigma2_invgamma = Pmf(ps, sigma2s)
posterior_sigma2_invgamma.normalize()
In [ ]:
posterior_sigma2_invgamma.plot()
In [ ]:
sigmas = np.sqrt(sigma2s)
posterior_sigma_invgamma = Pmf(ps, sigmas)
posterior_sigma_invgamma.normalize()
In [ ]:
posterior_sigma_invgamma.mean(), posterior_sigma_grid.mean()
In [ ]:
posterior_sigma_grid.make_cdf().plot(color='gray', label='grid')
posterior_sigma_invgamma.make_cdf().plot(label='invgamma')

decorate(title='Posterior distribution of sigma')

Posterior distribution of sigma, updatable version

Per the Wikipedia page: https://en.wikipedia.org/wiki/Bayesian_linear_regression
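
The update equations for the parameters are:

$$\Lambda_n = \Lambda_0 + X^T X \qquad \mu_n = \Lambda_n^{-1} (\Lambda_0 \mu_0 + X^T X \hat{\beta})$$

$$a_n = a_0 + \frac{n}{2} \qquad b_n = b_0 + \frac{1}{2} \left( y^T y + \mu_0^T \Lambda_0 \mu_0 - \mu_n^T \Lambda_n \mu_n \right)$$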

In [ ]:
Lambda_0 = np.zeros((k, k))
Lambda_n = Lambda_0 + X.T @ X
Lambda_n
In [ ]:
from scipy.linalg import inv

mu_0 = np.zeros(k)
mu_n = inv(Lambda_n) @ (Lambda_0 @ mu_0 + X.T @ X @ beta_hat)
mu_n
In [ ]:
a_0 = 0
a_n = a_0 + n / 2
a_n
In [ ]:
b_0 = 0
b_n = b_0 + (ys.T @ ys + 
             mu_0.T @ Lambda_0 @ mu_0 - 
             mu_n.T @ Lambda_n @ mu_n) / 2
b_n
In [ ]:
a_n, nu/2
In [ ]:
b_n, nu * s2 / 2

Sampling the posterior of the parameters
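
To sample the posterior of the parameters, we draw $\sigma^2$ from its inverse gamma marginal, then draw $\beta$ conditioned on $\sigma^2$:

$$\beta \mid \sigma^2 \sim \mathrm{Normal}(\hat{\beta}, \; \sigma^2 V_\beta) \qquad V_\beta = (X^T X)^{-1}$$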

In [ ]:
sample_sigma2 = dist_sigma2.rvs(1000)
In [ ]:
sample_sigma = np.sqrt(sample_sigma2)
In [ ]:
from scipy.linalg import inv

V_beta = inv(X.T @ X)
V_beta
In [ ]:
sample_beta = [multivariate_normal(beta_hat, V_beta * sigma2).rvs()
               for sigma2 in sample_sigma2]
In [ ]:
np.mean(sample_beta, axis=0)
In [ ]:
beta_hat
In [ ]:
np.std(sample_beta, axis=0)
In [ ]:
results.bse
In [ ]:
sample_beta0, sample_beta1 = np.transpose(sample_beta)
In [ ]:
Cdf.from_seq(sample_beta0).plot(label=r'$\beta_0$')
Cdf.from_seq(sample_beta1).plot(label=r'$\beta_1$')

decorate(title='Posterior distributions of the parameters')

Posterior using multivariate Student t
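
The marginal posterior of $\beta$ is a multivariate Student t with $n - k$ degrees of freedom, location $\hat{\beta}$, and shape matrix $s^2 V_\beta$.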

In [ ]:
x = beta_hat
mean = beta_hat
df = (n - k)
shape = (V_beta * s2)
multistudent_pdf(x, mean, shape, df)
In [ ]:
low, high = sample_beta0.min(), sample_beta0.max()
low, high
In [ ]:
beta0s = np.linspace(0.9*low, 1.1*high, 101)
In [ ]:
low, high = sample_beta1.min(), sample_beta1.max()
In [ ]:
beta1s = np.linspace(0.9*low, 1.1*high, 91)
In [ ]:
beta0_mesh, beta1_mesh = np.meshgrid(beta0s, beta1s)
In [ ]:
beta_mesh = np.dstack(np.meshgrid(beta0s, beta1s))
beta_mesh.shape
In [ ]:
ps = multistudent_pdf(beta_mesh, mean, shape, df)
ps.shape
In [ ]:
joint = pd.DataFrame(ps, columns=beta0s, index=beta1s)
In [ ]:
from utils import normalize

normalize(joint)
In [ ]:
from utils import plot_contour

plot_contour(joint)
decorate(xlabel=r'$\beta_0$',
         ylabel=r'$\beta_1$')
In [ ]:
marginal_beta0_student = marginal(joint, 0)
marginal_beta1_student = marginal(joint, 1)
In [ ]:
from utils import marginal

posterior_beta0_grid.make_cdf().plot(color='gray', label=r'grid $\beta_0$')
posterior_beta1_grid.make_cdf().plot(color='gray', label=r'grid $\beta_1$')

marginal_beta0_student.make_cdf().plot(label=r'student $\beta_0$', color='gray')
marginal_beta1_student.make_cdf().plot(label=r'student $\beta_1$', color='gray')

Cdf.from_seq(sample_beta0).plot(label=r'sample $\beta_0$')
Cdf.from_seq(sample_beta1).plot(label=r'sample $\beta_1$')

decorate()

Sampling the predictive distribution

In [ ]:
t = [X @ beta + norm(0, sigma).rvs(n)
     for beta, sigma in zip(sample_beta, sample_sigma)]
predictions = np.array(t)
predictions.shape
In [ ]:
low, median, high = np.percentile(predictions, [5, 50, 95], axis=0)
In [ ]:
plt.plot(xs, ys, 'o')
plt.plot(xs, median)
plt.fill_between(xs, low, high, color='C1', alpha=0.3)

Modeling the predictive distribution
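
For a new design matrix $\tilde{X}$, the posterior predictive distribution is a multivariate Student t with $n - k$ degrees of freedom, location $\tilde{X} \hat{\beta}$, and shape matrix $s^2 (I + \tilde{X} V_\beta \tilde{X}^T)$.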

In [ ]:
xnew = [1, 2, 3]
Xnew = sm.add_constant(xnew)
Xnew
In [ ]:
t = [Xnew @ beta + norm(0, sigma).rvs(len(xnew))
     for beta, sigma in zip(sample_beta, sample_sigma)]
predictions = np.array(t)
predictions.shape
In [ ]:
x0, x1, x2 = predictions.T

Cdf.from_seq(x0).plot()
Cdf.from_seq(x1).plot()
Cdf.from_seq(x2).plot()
In [ ]:
mu_new = Xnew @ beta_hat
mu_new
In [ ]:
cov_new = s2 * (np.eye(len(xnew)) + Xnew @ V_beta @ Xnew.T)
cov_new
In [ ]:
x = mu_new
mean = mu_new
df = (n - k)
shape = cov_new
multistudent_pdf(x, mean, shape, df)
In [ ]:
y1s = np.linspace(0, 20, 51)
y0s = np.linspace(0, 20, 61)
y2s = np.linspace(0, 20, 71)
In [ ]:
mesh = np.stack(np.meshgrid(y0s, y1s, y2s), axis=-1)
mesh.shape
In [ ]:
ps = multistudent_pdf(mesh, mean, shape, df)
ps.shape
In [ ]:
ps /= ps.sum()
ps.sum()
In [ ]:
p1s = ps.sum(axis=1).sum(axis=1)
p1s.shape
In [ ]:
p0s = ps.sum(axis=0).sum(axis=1)
p0s.shape
In [ ]:
p2s = ps.sum(axis=0).sum(axis=0)
p2s.shape
In [ ]:
pmf_y0 = Pmf(p0s, y0s)
pmf_y1 = Pmf(p1s, y1s)
pmf_y2 = Pmf(p2s, y2s)

pmf_y0.mean(), pmf_y1.mean(), pmf_y2.mean()
In [ ]:
pmf_y0.make_cdf().plot(color='gray')
pmf_y1.make_cdf().plot(color='gray')
pmf_y2.make_cdf().plot(color='gray')

Cdf.from_seq(x0).plot()
Cdf.from_seq(x1).plot()
Cdf.from_seq(x2).plot()
In [ ]:
stop  # intentional NameError: halts "Run All" before the leftover cells below

Leftovers

In [ ]:
from scipy.stats import chi2


class NormalInverseWishartDistribution(object):
    def __init__(self, mu, lmbda, nu, psi):
        self.mu = mu
        self.lmbda = float(lmbda)
        self.nu = nu
        self.psi = psi
        self.inv_psi = np.linalg.inv(psi)

    def sample(self):
        sigma = np.linalg.inv(self.wishartrand())
        return (np.random.multivariate_normal(self.mu, sigma / self.lmbda), sigma)

    def wishartrand(self):
        dim = self.inv_psi.shape[0]
        chol = np.linalg.cholesky(self.inv_psi)
        foo = np.zeros((dim,dim))

        for i in range(dim):
            for j in range(i+1):
                if i == j:
                    foo[i,j] = np.sqrt(chi2.rvs(self.nu-(i+1)+1))
                else:
                    foo[i,j]  = np.random.normal(0,1)
        return np.dot(chol, np.dot(foo, np.dot(foo.T, chol.T)))

    def posterior(self, data):
        n = len(data)
        mean_data = np.mean(data, axis=0)
        sum_squares = np.sum([np.outer(x - mean_data, x - mean_data) for x in data], axis=0)
        mu_n = (self.lmbda * self.mu + n * mean_data) / (self.lmbda + n)
        lmbda_n = self.lmbda + n
        nu_n = self.nu + n
        dev = mean_data - self.mu
        psi_n = (self.psi + sum_squares + 
                 self.lmbda * n / (self.lmbda + n) * np.outer(dev, dev))
        return NormalInverseWishartDistribution(mu_n, lmbda_n, nu_n, psi_n)

    

x = NormalInverseWishartDistribution(np.array([0,0])-3,1,3,np.eye(2))
samples = [x.sample() for _ in range(100)]
data = [np.random.multivariate_normal(mu,cov) for mu,cov in samples]
y = NormalInverseWishartDistribution(np.array([0,0]),1,3,np.eye(2))
z = y.posterior(data)

print('mu_n: {0}'.format(z.mu))

print('psi_n: {0}'.format(z.psi))
In [ ]:
from scipy.stats import chi2
from scipy.linalg import cholesky

def wishartrand(nu, Lambda):
    d, _ = Lambda.shape
    chol = cholesky(Lambda)
    foo = np.empty((d, d))

    for i in range(d):
        for j in range(i+1):
            if i == j:
                foo[i,j] = np.sqrt(chi2.rvs(nu-(i+1)+1))
            else:
                foo[i,j] = np.random.normal(0, 1)
                
    return np.dot(chol, np.dot(foo, np.dot(foo.T, chol.T)))
In [ ]:
sample = [wishartrand(nu_n, Lambda_n) for i in range(1000)]
In [ ]:
np.mean(sample, axis=0)
In [ ]:
Lambda_n