# If we're running on Colab, install empiricaldist
# https://pypi.org/project/empiricaldist/
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
!pip install empiricaldist
# Get utils.py
from os.path import basename, exists
def download(url):
filename = basename(url)
if not exists(filename):
from urllib.request import urlretrieve
local, _ = urlretrieve(url, filename)
print('Downloaded ' + local)
download('https://github.com/AllenDowney/ThinkBayes2/raw/master/soln/utils.py')
Downloaded utils.py
from utils import set_pyplot_params
set_pyplot_params()
In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead. In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
The following cell downloads the data.
download('https://github.com/AllenDowney/ThinkBayes2/raw/master/examples/Osogbo2021.csv')
I'll use Pandas to load the data into a DataFrame
.
import pandas as pd
df = pd.read_csv('Osogbo2021.csv')
df.head()
TIME(In Days) | AGE OF PATIENTS | CENSOR | AGE AT MENARACHE | BREASTFEED | CONTRACEPT | DETECTION | NEOADJUVANT | |
---|---|---|---|---|---|---|---|---|
0 | 48 | 44 | 1 | 18 | 1.5 | 2 | 1 | 2 |
1 | 898 | 43 | 0 | 17 | 1.0 | 1 | 1 | 2 |
2 | 36 | 40 | 0 | 15 | 2.0 | 2 | 1 | 2 |
3 | 23 | 56 | 1 | 15 | 1.5 | 1 | 1 | 1 |
4 | 300 | 53 | 0 | 12 | 1.0 | 2 | 1 | 2 |
import numpy as np
log_duration = np.log10(duration)
from empiricaldist import Cdf
from utils import decorate
cdf = Cdf.from_seq(log_duration)
cdf.plot()
decorate(xlabel='Survival time (log10 days)',
ylabel='CDF',
title='Distribution raw survival times')
mu = log_duration.mean()
sigma = log_duration.std()
mu, sigma
(2.0641865669356645, 0.8499523545190062)
from empiricaldist import Pmf
def make_uniform(qs, name=None, **options):
"""Make a Pmf that represents a uniform distribution."""
pmf = Pmf(1.0, qs, **options)
pmf.normalize()
if name:
pmf.index.name = name
return pmf
import numpy as np
qs = np.linspace(2, 4, num=101)
prior_mu = make_uniform(qs, name='mean')
I chose the lower and upper bounds by trial and error. I'll explain how when we look at the posterior distribution.
Here's the prior distribution for sigma
:
qs = np.linspace(0.01, 1, num=90)
prior_sigma = make_uniform(qs, name='std')
Now we can use make_joint
to make the joint prior distribution.
from utils import make_joint
prior = make_joint(prior_mu, prior_sigma)
And we'll start by working with the data from the control group.
complete = (df['CENSOR'] == 0)
data_complete = log_duration[complete]
data_ongoing = log_duration[~complete]
len(data_complete), len(data_ongoing)
(38, 51)
First update
mu_mesh, sigma_mesh, data_mesh = np.meshgrid(
prior.columns, prior.index, data_complete)
mu_mesh.shape
(90, 101, 38)
from scipy.stats import norm
densities = norm(mu_mesh, sigma_mesh).pdf(data_mesh)
densities.shape
(90, 101, 38)
likelihood = densities.prod(axis=2)
likelihood.shape
(90, 101)
from utils import normalize
posterior = prior * likelihood
normalize(posterior)
posterior.shape
(90, 101)
Second update
mu_mesh, sigma_mesh, data_mesh = np.meshgrid(
prior.columns, prior.index, data_ongoing)
mu_mesh.shape
(90, 101, 51)
from scipy.stats import norm
densities = norm(mu_mesh, sigma_mesh).sf(data_mesh)
densities.shape
(90, 101, 51)
likelihood = densities.prod(axis=2)
likelihood.shape
(90, 101)
from utils import normalize
posterior2 = posterior * likelihood
normalize(posterior2)
posterior2.shape
(90, 101)
import matplotlib.pyplot as plt
from utils import plot_contour
plot_contour(posterior2, cmap='Blues')
decorate(xlabel='Mean (mu)',
ylabel='Standard deviation (sigma)',
title='Joint posterior distributions of mu and sigma')
I'll use marginal
, which we saw in <<_MarginalDistributions>>, to extract the posterior marginal distributions for the population means.
from utils import marginal
Here's what they look like:
pmf_mean = marginal(posterior, 0)
pmf_mean.plot()
pmf_mean2 = marginal(posterior2, 0)
pmf_mean2.plot()
decorate(xlabel='Population mean (mu)',
ylabel='PDF',
title='Posterior distributions of mu')
pmf_std = marginal(posterior, 1)
pmf_std.plot()
pmf_std2 = marginal(posterior2, 1)
pmf_std2.plot()
decorate(xlabel='Population std (sigma)',
ylabel='PDF')
Think Bayes, Second Edition
Copyright 2020 Allen B. Downey
License: Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)