# Configure Jupyter so figures appear in the notebook
%matplotlib inline
# Configure Jupyter to display the assigned value after an assignment
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'
import numpy as np
import pandas as pd
# import classes from thinkbayes2
from thinkbayes2 import Pmf, Cdf, Suite, Joint
import thinkplot
Here's a problem from Bayesian Methods for Hackers
On January 28, 1986, the twenty-fifth flight of the U.S. space shuttle program ended in disaster when one of the rocket boosters of the Shuttle Challenger exploded shortly after lift-off, killing all seven crew members. The presidential commission on the accident concluded that it was caused by the failure of an O-ring in a field joint on the rocket booster, and that this failure was due to a faulty design that made the O-ring unacceptably sensitive to a number of factors including outside temperature. Of the previous 24 flights, data were available on failures of O-rings on 23, (one was lost at sea), and these data were discussed on the evening preceding the Challenger launch, but unfortunately only the data corresponding to the 7 flights on which there was a damage incident were considered important and these were thought to show no obvious trend. The data are shown below (see 1):
# Download the data (uncomment on first run):
# !wget https://raw.githubusercontent.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/master/Chapter2_MorePyMC/data/challenger_data.csv

# Expected column layout of the CSV.
# NOTE(review): `columns` is never passed to read_csv, so the file's own
# header row is what actually names the columns — confirm whether this
# variable is vestigial.
columns = ['Date', 'Temperature', 'Incident']

# Parse the first column (the launch date) as datetimes.
df = pd.read_csv('challenger_data.csv', parse_dates=[0])

# Drop two rows with no usable outcome — presumably row 3 is the flight
# lost at sea and row 24 is the Challenger launch itself; TODO confirm
# against the CSV contents.
df.drop(labels=[3, 24], inplace=True)
df
Date | Temperature | Damage Incident | |
---|---|---|---|
0 | 1981-04-12 | 66 | 0 |
1 | 1981-11-12 | 70 | 1 |
2 | 1982-03-22 | 69 | 0 |
4 | 1982-01-11 | 68 | 0 |
5 | 1983-04-04 | 67 | 0 |
6 | 1983-06-18 | 72 | 0 |
7 | 1983-08-30 | 73 | 0 |
8 | 1983-11-28 | 70 | 0 |
9 | 1984-02-03 | 57 | 1 |
10 | 1984-04-06 | 63 | 1 |
11 | 1984-08-30 | 70 | 1 |
12 | 1984-10-05 | 78 | 0 |
13 | 1984-11-08 | 67 | 0 |
14 | 1985-01-24 | 53 | 1 |
15 | 1985-04-12 | 67 | 0 |
16 | 1985-04-29 | 75 | 0 |
17 | 1985-06-17 | 70 | 0 |
18 | 1985-07-29 | 81 | 0 |
19 | 1985-08-27 | 76 | 0 |
20 | 1985-10-03 | 79 | 0 |
21 | 1985-10-30 | 75 | 1 |
22 | 1985-11-26 | 76 | 0 |
23 | 1986-01-12 | 58 | 1 |
# Recode the outcome as float in a new 'Incident' column
# (float rather than int so missing values could be represented as NaN).
df['Incident'] = df['Damage Incident'].astype(float)
df
Date | Temperature | Damage Incident | Incident | |
---|---|---|---|---|
0 | 1981-04-12 | 66 | 0 | 0.0 |
1 | 1981-11-12 | 70 | 1 | 1.0 |
2 | 1982-03-22 | 69 | 0 | 0.0 |
4 | 1982-01-11 | 68 | 0 | 0.0 |
5 | 1983-04-04 | 67 | 0 | 0.0 |
6 | 1983-06-18 | 72 | 0 | 0.0 |
7 | 1983-08-30 | 73 | 0 | 0.0 |
8 | 1983-11-28 | 70 | 0 | 0.0 |
9 | 1984-02-03 | 57 | 1 | 1.0 |
10 | 1984-04-06 | 63 | 1 | 1.0 |
11 | 1984-08-30 | 70 | 1 | 1.0 |
12 | 1984-10-05 | 78 | 0 | 0.0 |
13 | 1984-11-08 | 67 | 0 | 0.0 |
14 | 1985-01-24 | 53 | 1 | 1.0 |
15 | 1985-04-12 | 67 | 0 | 0.0 |
16 | 1985-04-29 | 75 | 0 | 0.0 |
17 | 1985-06-17 | 70 | 0 | 0.0 |
18 | 1985-07-29 | 81 | 0 | 0.0 |
19 | 1985-08-27 | 76 | 0 | 0.0 |
20 | 1985-10-03 | 79 | 0 | 0.0 |
21 | 1985-10-30 | 75 | 1 | 1.0 |
22 | 1985-11-26 | 76 | 0 | 0.0 |
23 | 1986-01-12 | 58 | 1 | 1.0 |
import matplotlib.pyplot as plt

# Scatter plot of the binary outcome (0/1) against launch temperature.
plt.scatter(df.Temperature, df.Incident, s=75, color="k",
            alpha=0.5)
plt.yticks([0, 1])
plt.ylabel("Damage Incident?")
plt.xlabel("Outside temperature (Fahrenheit)")
plt.title("Defects of the Space Shuttle O-Rings vs temperature");
We can solve the problem first using a grid algorithm, with parameters $b_0$ and $b_1$, and

$\mathrm{logit}(p) = b_0 + b_1 T$

and each datum being a temperature $T$ and a boolean outcome `fail`, which is true if there was damage and false otherwise.

Hint: the `expit` function from `scipy.special` computes the inverse of the `logit` function.
from scipy.special import expit
class Logistic(Suite, Joint):
    """Joint distribution of the logistic-regression parameters (b0, b1).

    Exercise scaffold: Likelihood below is a placeholder that returns 1,
    i.e., every datum is treated as uninformative, until you fill it in.
    """

    def Likelihood(self, data, hypo):
        """
        data: T, fail
        hypo: b0, b1
        """
        return 1
# Solution

from scipy.special import expit

class Logistic(Suite, Joint):
    """Joint posterior over the logistic-regression parameters (b0, b1)."""

    def Likelihood(self, data, hypo):
        """Likelihood of one launch record under one parameter pair.

        data: (T, fail) — temperature and outcome (1, 0, or NaN)
        hypo: (b0, b1) — intercept and slope on the log-odds scale
        """
        temperature, outcome = data
        intercept, slope = hypo
        prob_damage = expit(intercept + slope * temperature)
        if outcome == 1:
            return prob_damage
        if outcome == 0:
            return 1 - prob_damage
        # Outcome is NaN (missing): the record carries no information.
        return 1
# Grid over the intercept b0 and slope b1 (trailing semicolons suppress
# notebook echo of the assigned arrays).
b0 = np.linspace(0, 50, 101);
b1 = np.linspace(-1, 1, 101);

# Lazy Cartesian product of the two grids: one hypothesis per (b0, b1) pair.
from itertools import product
hypos = product(b0, b1)
<itertools.product at 0x7f25c84d75e8>
# Build the suite over all (b0, b1) pairs (uniform prior over the grid)...
suite = Logistic(hypos);

# ...and update it with each (temperature, outcome) pair in turn.
for data in zip(df.Temperature, df.Incident):
    print(data)
    suite.Update(data)
(66, 0.0) (70, 1.0) (69, 0.0) (68, 0.0) (67, 0.0) (72, 0.0) (73, 0.0) (70, 0.0) (57, 1.0) (63, 1.0) (70, 1.0) (78, 0.0) (67, 0.0) (53, 1.0) (67, 0.0) (75, 0.0) (70, 0.0) (81, 0.0) (76, 0.0) (79, 0.0) (75, 1.0) (76, 0.0) (58, 1.0)
# Posterior marginal of the intercept b0.
thinkplot.Pdf(suite.Marginal(0))
thinkplot.decorate(xlabel='Intercept',
                   ylabel='PMF',
                   title='Posterior marginal distribution')

# Posterior marginal of the slope b1 (change in log odds per degree F).
thinkplot.Pdf(suite.Marginal(1))
thinkplot.decorate(xlabel='Log odds ratio',
                   ylabel='PMF',
                   title='Posterior marginal distribution')
According to the posterior distribution, what was the probability of damage when the shuttle launched at 31 degF?
# Solution

# Posterior predictive probability of damage at T = 31 degF:
# average p(fail | T, b0, b1) over the posterior for (b0, b1).
T = 31
total = 0
for hypo, p in suite.Items():
    b0, b1 = hypo
    log_odds = b0 + b1 * T
    p_fail = expit(log_odds)
    total += p * p_fail
total
0.9909003512180481
# Solution

# Equivalent shortcut: Update on a copy returns the normalizing constant,
# which here is the same predictive probability of damage at 31 degF.
pred = suite.Copy()
pred.Update((31, True))
0.9909003512180481
Implement this model using MCMC. As a starting place, you can use this example from the PyMC3 docs.
As a challenge, try writing the model more explicitly, rather than using the GLM module.
from warnings import simplefilter
simplefilter('ignore', FutureWarning)

import pymc3 as pm

# Solution

# Logistic regression via PyMC3's GLM formula interface; the Binomial
# family gives a Bernoulli/logit model for the 0/1 Incident column.
with pm.Model() as model:
    pm.glm.GLM.from_formula('Incident ~ Temperature', df,
                            family=pm.glm.families.Binomial())

    # Initialize the sampler at the MAP estimate; see the next cell for
    # what happens with the default initialization instead.
    start = pm.find_MAP()
    trace = pm.sample(1000, start=start, tune=1000)
logp = -18, ||grad|| = 11.915: 100%|██████████| 27/27 [00:00<00:00, 1502.12it/s] Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [Temperature, Intercept] Sampling 2 chains: 100%|██████████| 4000/4000 [00:06<00:00, 594.28draws/s] The estimated number of effective samples is smaller than 200 for some parameters.
# Plot the sampled traces and posterior marginals for each parameter.
pm.traceplot(trace);
# Solution

# Same model WITHOUT find_MAP initialization: with this dataset the
# default jitter+adapt_diag init can start at a point of infinite energy,
# so sampling fails — see the "Bad initial energy" traceback below.
with pm.Model() as model:
    pm.glm.GLM.from_formula('Incident ~ Temperature', df,
                            family=pm.glm.families.Binomial())

    trace = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [Temperature, Intercept] Sampling 2 chains: 0%| | 0/4000 [00:00<?, ?draws/s]
--------------------------------------------------------------------------- RemoteTraceback Traceback (most recent call last) RemoteTraceback: """ Traceback (most recent call last): File "/home/downey/anaconda3/lib/python3.6/site-packages/pymc3/parallel_sampling.py", line 73, in run self._start_loop() File "/home/downey/anaconda3/lib/python3.6/site-packages/pymc3/parallel_sampling.py", line 113, in _start_loop point, stats = self._compute_point() File "/home/downey/anaconda3/lib/python3.6/site-packages/pymc3/parallel_sampling.py", line 139, in _compute_point point, stats = self._step_method.step(self._point) File "/home/downey/anaconda3/lib/python3.6/site-packages/pymc3/step_methods/arraystep.py", line 247, in step apoint, stats = self.astep(array) File "/home/downey/anaconda3/lib/python3.6/site-packages/pymc3/step_methods/hmc/base_hmc.py", line 117, in astep 'might be misspecified.' % start.energy) ValueError: Bad initial energy: inf. The model might be misspecified. """ The above exception was the direct cause of the following exception: ValueError Traceback (most recent call last) ValueError: Bad initial energy: inf. The model might be misspecified. 
The above exception was the direct cause of the following exception: RuntimeError Traceback (most recent call last) <ipython-input-20-18c7e811f775> in <module>() 5 family=pm.glm.families.Binomial()) 6 ----> 7 trace = pm.sample(1000, tune=1000) ~/anaconda3/lib/python3.6/site-packages/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, nuts_kwargs, step_kwargs, progressbar, model, random_seed, live_plot, discard_tuned_samples, live_plot_kwargs, compute_convergence_checks, use_mmap, **kwargs) 447 _print_step_hierarchy(step) 448 try: --> 449 trace = _mp_sample(**sample_args) 450 except pickle.PickleError: 451 _log.warning("Could not pickle model, sampling singlethreaded.") ~/anaconda3/lib/python3.6/site-packages/pymc3/sampling.py in _mp_sample(draws, tune, step, chains, cores, chain, random_seed, start, progressbar, trace, model, use_mmap, **kwargs) 997 try: 998 with sampler: --> 999 for draw in sampler: 1000 trace = traces[draw.chain - chain] 1001 if trace.supports_sampler_stats and draw.stats is not None: ~/anaconda3/lib/python3.6/site-packages/pymc3/parallel_sampling.py in __iter__(self) 303 304 while self._active: --> 305 draw = ProcessAdapter.recv_draw(self._active) 306 proc, is_last, draw, tuning, stats, warns = draw 307 if self._progress is not None: ~/anaconda3/lib/python3.6/site-packages/pymc3/parallel_sampling.py in recv_draw(processes, timeout) 221 if msg[0] == 'error': 222 old = msg[1] --> 223 six.raise_from(RuntimeError('Chain %s failed.' % proc.chain), old) 224 elif msg[0] == 'writing_done': 225 proc._readable = True ~/anaconda3/lib/python3.6/site-packages/six.py in raise_from(value, from_value) RuntimeError: Chain 0 failed.
The posterior distributions for these parameters should be similar to what we got with the grid algorithm.