#!/usr/bin/env python
# coding: utf-8
# # Bayesian Zig Zag
#
# Developing probabilistic models using grid methods and MCMC.
#
# Thanks to Chris Fonnesbeck for his help with this notebook, and to Colin Carroll, who added features to pymc3 to support some of these examples.
#
# Copyright 2018 Allen Downey
#
# MIT License: https://opensource.org/licenses/MIT
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InteractiveShell.ast_node_interactivity='last_expr_or_assign'")
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
# ## Simulating hockey
#
# I'll model hockey as a Poisson process, where each team has some long-term average scoring rate, `lambda`, in goals per game.
#
# For the first example, I'll assume that `lambda` is somehow known to be 2.7. Since regulation play (as opposed to overtime) is 60 minutes, we can compute the goal scoring rate per minute.
# In[2]:
lam_per_game = 2.7
min_per_game = 60
lam_per_min = lam_per_game / min_per_game
lam_per_min, lam_per_min**2
# If we assume that a goal is equally likely during any minute of the game, and we ignore the possibility of scoring more than one goal in the same minute, we can simulate a game by generating one random value each minute. (The second value above, `lam_per_min**2`, approximates the probability of two goals in the same minute, which is small enough to ignore.)
# In[3]:
def simulate_game(p, n=60):
    goals = np.random.choice([0, 1], n, p=[1-p, p])
    return np.sum(goals)
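# Equivalently (a sketch, not in the original), since the sum of `n` independent
# Bernoulli trials is binomial, a whole game can be simulated with one binomial draw.
# In[ ]:
def simulate_game_binomial(p, n=60):
    """Simulate a game as a single binomial draw."""
    return np.random.binomial(n, p)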
# And simulate 10 games.
# In[4]:
size = 10
sample = [simulate_game(lam_per_min) for i in range(size)]
# If we simulate 1000 games, we can see what the distribution looks like. The average of this sample should be close to `lam_per_game`.
# In[5]:
size = 1000
sample_sim = [simulate_game(lam_per_min) for i in range(size)]
np.mean(sample_sim), lam_per_game
# ## PMFs
#
# To visualize distributions, I'll start with a probability mass function (PMF), which I'll implement using a `Counter`.
#
#
# In[6]:
from collections import Counter
class Pmf(Counter):

    def normalize(self):
        """Normalizes the PMF so the probabilities add to 1."""
        total = sum(self.values())
        for key in self:
            self[key] /= total

    def sorted_items(self):
        """Returns the outcomes and their probabilities."""
        return zip(*sorted(self.items()))
# Here are some functions for plotting PMFs.
# In[7]:
plot_options = dict(linewidth=3, alpha=0.6)

def underride(options):
    """Add key-value pairs from plot_options to options,
    only if the key is not already there.

    options: dictionary
    """
    for key, val in plot_options.items():
        options.setdefault(key, val)
    return options

def plot(xs, ys, **options):
    """Line plot with plot_options."""
    plt.plot(xs, ys, **underride(options))

def bar(xs, ys, **options):
    """Bar plot with plot_options."""
    plt.bar(xs, ys, **underride(options))

def plot_pmf(sample, **options):
    """Compute and plot a PMF."""
    pmf = Pmf(sample)
    pmf.normalize()
    xs, ps = pmf.sorted_items()
    bar(xs, ps, **options)

def decorate_pmf_goals():
    """Decorate the axes."""
    plt.xlabel('Number of goals')
    plt.ylabel('PMF')
    plt.title('Distribution of goals scored')
    legend()

def legend(**options):
    """Draw a legend only if there are labeled items."""
    ax = plt.gca()
    handles, labels = ax.get_legend_handles_labels()
    if len(labels):
        plt.legend(**options)
# Here's what the results from the simulation look like.
# In[8]:
plot_pmf(sample_sim, label='simulation')
decorate_pmf_goals()
plt.savefig('zigzag1.png', dpi=150)
# ## Poisson process
#
# For large values of `n` and small values of `p`, the binomial process we just simulated converges to the Poisson distribution with parameter `mu = n * p`, which is also `mu = lam_per_game`.
#
# We can use NumPy to generate a sample from a Poisson distribution.
# In[9]:
n = 60
p = lam_per_min
mu = n * p
sample_poisson = np.random.poisson(mu, size)
np.mean(sample_poisson)
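# As an aside (not in the original), we can check the convergence analytically:
# SciPy's binomial PMF for `n` trials with probability `p` should be close to
# the Poisson PMF with `mu = n * p`.
# In[ ]:
from scipy.stats import binom, poisson

ks = np.arange(11)
# Largest absolute difference between the two PMFs; should be small.
np.max(np.abs(binom.pmf(ks, n, p) - poisson.pmf(ks, mu)))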
# And confirm that the results are similar to what we got from the model.
# In[10]:
plot_pmf(sample_sim, label='simulation')
plot_pmf(sample_poisson, label='Poisson')
decorate_pmf_goals()
plt.savefig('zigzag2.png', dpi=150)
# But comparing PMFs by overlaying bar plots is a bad way to see differences between distributions. It's better to use the cumulative distribution function (CDF).
# In[11]:
def plot_cdf(sample, **options):
    """Compute and plot the CDF of a sample."""
    pmf = Pmf(sample)
    xs, freqs = pmf.sorted_items()
    ps = np.cumsum(freqs, dtype=float)
    ps /= ps[-1]
    plot(xs, ps, **options)

def decorate_cdf_rates():
    """Decorate the axes."""
    plt.xlabel('Goal scoring rate (mu)')
    plt.ylabel('CDF')
    plt.title('Distribution of goal scoring rate')
    legend()

def decorate_cdf_goals():
    """Decorate the axes."""
    plt.xlabel('Number of goals')
    plt.ylabel('CDF')
    plt.title('Distribution of goals scored')
    legend()

def plot_cdfs(*sample_seq, **options):
    """Plot multiple CDFs."""
    for sample in sample_seq:
        plot_cdf(sample, **options)
    decorate_cdf_goals()
# Comparing CDFs makes it clearer that the results from the simulation are consistent with the Poisson model.
# In[12]:
plot_cdf(sample_sim, label='simulation')
plot_cdf(sample_poisson, label='Poisson')
decorate_cdf_goals()
plt.savefig('zigzag3.png', dpi=150)
# ## Warming up PyMC
#
# Soon we will want to use `pymc3` to do inference, which is really what it's for. But just to get warmed up, I will use it to generate a sample from a Poisson distribution.
# In[13]:
model = pm.Model()
with model:
    goals = pm.Poisson('goals', mu)
    trace = pm.sample_prior_predictive(1000)
# In[14]:
len(trace['goals'])
# In[15]:
sample_pm = trace['goals']
np.mean(sample_pm)
# This example is like using a cannon to kill a fly. But it helps us learn to use the cannon.
# In[16]:
plot_cdf(sample_sim, label='simulation')
plot_cdf(sample_pm, label='PyMC')
decorate_cdf_goals()
# ## Evaluating the Poisson distribution
#
# One of the nice things about the Poisson distribution is that we can compute its PMF analytically, and SciPy provides an implementation.
# In[17]:
from scipy.stats import poisson
xs = np.arange(11)
ps = poisson.pmf(xs, mu)
bar(xs, ps, label='Poisson PMF')
decorate_pmf_goals()
# And here's a function that computes the probability of scoring a given number of goals in a game, for a known value of `mu`.
# In[18]:
def poisson_likelihood(goals, mu):
    """Probability of goals given scoring rate.

    goals: observed number of goals (scalar or sequence)
    mu: hypothetical goals per game

    returns: probability
    """
    return np.prod(poisson.pmf(goals, mu))
# Here's the probability of scoring 6 goals in a game if the long-term rate is 2.7 goals per game.
# In[19]:
poisson_likelihood(goals=6, mu=2.7)
# Here's the probability of scoring 3 goals.
# In[20]:
poisson_likelihood(goals=3, mu=2.7)
# This function also works with a sequence of goals, so we can compute the probability of scoring 6 goals in the first game and 3 in the second.
# In[21]:
poisson_likelihood(goals=[6, 3], mu=2.7)
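# As a quick check (not in the original): because the games are independent,
# this equals the product of the two single-game probabilities computed above.
# In[ ]:
poisson_likelihood(goals=6, mu=2.7) * poisson_likelihood(goals=3, mu=2.7)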
# ## Bayesian inference with grid approximation
#
# Ok, it's finally time to do some inference! The function we just wrote computes the likelihood of the data, given a hypothetical value of `mu`:
#
# $\mathrm{Prob}~(x ~|~ \mu)$
#
# But what we really want is the distribution of `mu`, given the data:
#
# $\mathrm{Prob}~(\mu ~|~ x)$
#
# If only there were some theorem that relates these probabilities!
#
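# Bayes's theorem relates them:
#
# $\mathrm{Prob}~(\mu ~|~ x) = \mathrm{Prob}~(x ~|~ \mu)~\mathrm{Prob}~(\mu) ~/~ \mathrm{Prob}~(x)$
#
# For each hypothetical value of `mu`, we multiply its prior probability by the likelihood of the data, then normalize so the posterior probabilities add up to 1.
#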
# The following class implements Bayes's theorem.
# In[22]:
class Suite(Pmf):
    """Represents a set of hypotheses and their probabilities."""

    def bayes_update(self, data, like_func):
        """Perform a Bayesian update.

        data: some representation of observed data
        like_func: likelihood function that takes (data, hypo), where
                   hypo is the hypothetical value of some parameter,
                   and returns P(data | hypo)
        """
        for hypo in self:
            self[hypo] *= like_func(data, hypo)
        self.normalize()

    def plot(self, **options):
        """Plot the hypotheses and their probabilities."""
        xs, ps = self.sorted_items()
        plot(xs, ps, **options)

def decorate_pdf_rate():
    """Decorate the axes."""
    plt.xlabel('Goals per game (mu)')
    plt.ylabel('PDF')
    plt.title('Distribution of goal scoring rate')
    legend()
# I'll start with a uniform prior just to keep things simple. We'll choose a better prior later.
# In[23]:
hypo_mu = np.linspace(0, 15, num=51)
hypo_mu
# Initially `uniform_prior` represents the prior distribution of `mu`.
# In[24]:
uniform_prior = Suite(hypo_mu)
uniform_prior.normalize()
uniform_prior.plot(label='prior')
decorate_pdf_rate()
plt.savefig('zigzag4.png', dpi=150)
# Now we can update it with the data and plot the posterior.
# In[25]:
uniform_prior.bayes_update(data=3, like_func=poisson_likelihood)
uniform_prior.plot(label='posterior')
decorate_pdf_rate()
plt.savefig('zigzag5.png', dpi=150)
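# As a quick check (not in the original), we can read off the MAP, the value of
# `mu` with the highest posterior probability. After `bayes_update`, `uniform_prior`
# holds the posterior probabilities.
# In[ ]:
max(uniform_prior, key=uniform_prior.get)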
# With a uniform prior, the posterior is just the normalized likelihood function, so the MAP is the value of `mu` that maximizes the likelihood, which is the observed number of goals, 3.
#
# This result is probably not reasonable, because the prior was not reasonable: it gives as much weight to implausible rates like 15 goals per game as to typical ones.
# ## A better prior
#
# To construct a better prior, I'll use a Gamma distribution with parameters chosen to be consistent with previous playoff games.
# We can use `make_gamma_suite` to construct a prior suite with the given parameters.
# In[26]:
from scipy.stats import gamma
def make_gamma_suite(xs, alpha, beta):
    """Makes a suite based on a gamma distribution.

    xs: places to evaluate the PDF
    alpha, beta: parameters of the distribution

    returns: Suite
    """
    ps = gamma(a=alpha, scale=1/beta).pdf(xs)
    prior = Suite(dict(zip(xs, ps)))
    prior.normalize()
    return prior
# Here's what it looks like.
# In[27]:
alpha = 9
beta = 3
hypo_mu = np.linspace(0, 15, num=101)
gamma_prior = make_gamma_suite(hypo_mu, alpha, beta)
gamma_prior.plot(label='gamma prior')
decorate_pdf_rate()
plt.savefig('zigzag6.png', dpi=150)
# And we can update this prior using the observed data. If we only score 1 goal in a game, our estimate for the goal scoring rate shifts left.
# In[28]:
posterior = gamma_prior.copy()
posterior.bayes_update(data=1, like_func=poisson_likelihood)
gamma_prior.plot(label='prior')
posterior.plot(label='posterior')
decorate_pdf_rate()
plt.savefig('zigzag7.png', dpi=150)
# If we go back to the original prior, and score 6 goals in a game, the estimate shifts to the right.
# In[29]:
posterior = gamma_prior.copy()
posterior.bayes_update(data=6, like_func=poisson_likelihood)
gamma_prior.plot(label='prior')
posterior.plot(label='posterior')
decorate_pdf_rate()
plt.savefig('zigzag8.png', dpi=150)
# If we score 3 goals in a game, the estimate doesn't shift much, but it gets pointier, which means we are more certain about the goal scoring rate.
# In[30]:
posterior = gamma_prior.copy()
posterior.bayes_update(data=3, like_func=poisson_likelihood)
gamma_prior.plot(label='prior')
posterior.plot(label='posterior')
decorate_pdf_rate()
plt.savefig('zigzag8b.png', dpi=150)  # distinct filename so it doesn't overwrite the previous figure
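# Because the gamma distribution is the conjugate prior of the Poisson, we can
# check this update analytically (a check not in the original): after observing
# one game with 3 goals, the posterior is `Gamma(alpha + 3, beta + 1)`.
# In[ ]:
# Posterior mean and SD from the grid approximation.
xs, ps = np.transpose(list(posterior.items()))
grid_mean = np.sum(xs * ps)
grid_sd = np.sqrt(np.sum(ps * (xs - grid_mean)**2))

# Mean and SD of the analytic conjugate posterior; they should be close.
conj = gamma(a=alpha+3, scale=1/(beta+1))
(grid_mean, grid_sd), (conj.mean(), conj.std())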
# ## Posterior predictive distribution
#
# Now, let's get to what is usually the point of this whole exercise, making predictions.
#
# The posterior represents what we believe about `mu` based on the data (and our prior beliefs).
#
# Each value of `mu` is a possible goal scoring rate.
#
# For a given value of `mu`, we can generate a distribution of goals scored in a particular game, which is Poisson.
#
# But we don't have a single value of `mu`; we have a whole distribution of possible values, each with its own probability.
#
# So the posterior predictive distribution is a mixture of Poissons with different parameters.
#
# The simplest way to generate the posterior predictive distribution is to
#
# 1. Draw a random `mu` from the posterior distribution.
#
# 2. Draw a random number of goals from `Poisson(mu)`.
#
# 3. Repeat.
#
# Here's a function that draws a sample from a posterior `Suite`:
# In[31]:
def sample_suite(suite, size):
    """Draw a random sample from a Suite.

    suite: Suite object
    size: sample size
    """
    xs, ps = np.transpose(list(suite.items()))
    return np.random.choice(xs, size, replace=True, p=ps)
# Here's a sample of `mu` drawn from the posterior distribution (after one game).
# In[32]:
size = 10000
sample_post = sample_suite(posterior, size)
np.mean(sample_post)
# Now, for each value of `mu` in the posterior sample, we draw one sample from `Poisson(mu)`.
# In[33]:
sample_post_pred = np.random.poisson(sample_post)
np.mean(sample_post_pred)
# The result is a sample from the posterior predictive distribution. Here's what it looks like.
# In[34]:
plot_pmf(sample_post_pred, label='posterior predictive sample')
decorate_pmf_goals()
plt.savefig('zigzag9.png', dpi=150)
# The prior and posterior are distributions of mu, which is a continuous value.
#
# The prior predictive and posterior predictive are distributions of goals, which are discrete.
#
# To help keep them straight, I will generally plot distributions of `mu` as CDFs; for distributions of goals, I'll use PMFs to show their shapes and CDFs to compare them.
#
# ## Back to PyMC
#
# Previously we used PyMC to draw a sample from a Poisson distribution with known `mu`.
#
# Now we'll use it to draw a sample from a gamma distribution of `mu`, with known `alpha` and `beta`.
#
# And then use that value of `mu` to draw a sample from a Poisson distribution.
#
# Here are the values of the parameters:
# In[35]:
print(alpha, beta)
# Here's what the model looks like:
# In[36]:
model = pm.Model()
with model:
    mu = pm.Gamma('mu', alpha, beta)
    goals = pm.Poisson('goals', mu)
    trace = pm.sample_prior_predictive(1000)
# The distribution of `mu` from this model is the prior. Let's see what it looks like:
# In[37]:
sample_prior_pm = trace['mu']
np.mean(sample_prior_pm)
# And compare it to a sample from the gamma prior.
# In[38]:
sample_prior = sample_suite(gamma_prior, 1000)
np.mean(sample_prior)
# In[39]:
plot_cdf(sample_prior, label='prior')
plot_cdf(sample_prior_pm, label='prior pymc')
decorate_cdf_rates()
plt.savefig('zigzag10.png', dpi=150)
# It looks pretty good.
# The distribution of `goals` from this model is the prior predictive.
# In[40]:
sample_prior_pred_pm = trace['goals']
np.mean(sample_prior_pred_pm)
# And let's compare it to a prior predictive distribution estimated by sampling.
# In[41]:
sample_prior_pred = np.random.poisson(sample_prior)
np.mean(sample_prior_pred)
# In[42]:
plot_cdf(sample_prior_pred, label='prior pred')
plot_cdf(sample_prior_pred_pm, label='prior pred pymc')
decorate_cdf_goals()
plt.savefig('zigzag11.png', dpi=150)
# Looks good.
# ## Now with PyMC
#
# Finally, we are ready to use PyMC for actual inference. We just have to make one small change.
#
# Instead of generating `goals`, we'll mark goals as `observed` and provide the observed data, `3`.
#
# And instead of calling `sample_prior_predictive`, we'll call `sample`, which is understood to sample from the posterior distribution of `mu`.
# In[43]:
model = pm.Model()
with model:
    mu = pm.Gamma('mu', alpha, beta)
    goals = pm.Poisson('goals', mu, observed=3)
    trace = pm.sample(1000, tune=2000)
# With `goals` fixed, the only unknown is `mu`, so `trace` contains a sample drawn from the posterior distribution of `mu`. We can plot the posterior using a function provided by PyMC:
# In[44]:
pm.plot_posterior(trace)
decorate_pdf_rate()
# And we can extract a sample from the posterior of `mu`:
# In[45]:
sample_post_pm = trace['mu']
np.mean(sample_post_pm)
# And compare it to the sample we drew from the grid approximation:
# In[46]:
plot_cdf(sample_post, label='posterior grid')
plot_cdf(sample_post_pm, label='posterior pymc')
decorate_cdf_rates()
plt.savefig('zigzag12.png', dpi=150)
# Again, it looks pretty good.
#
# To sample from the posterior predictive distribution, we can use `sample_posterior_predictive`:
# In[47]:
with model:
    post_pred = pm.sample_posterior_predictive(trace, samples=1000)
# Here's what it looks like:
# In[48]:
sample_post_pred_pm = post_pred['goals']
sample_post_pred_pm.shape
# In[49]:
sample_post_pred_pm = post_pred['goals']
np.mean(sample_post_pred_pm)
# In[50]:
plot_cdf(sample_post_pred, label='posterior pred grid')
plot_cdf(sample_post_pred_pm, label='posterior pred pm')
decorate_cdf_goals()
plt.savefig('zigzag13.png', dpi=150)
# Looks pretty good!
# ## Two teams
#
# We can extend the model to estimate different values of `mu` for the two teams.
# In[51]:
model = pm.Model()
with model:
    mu_BOS = pm.Gamma('mu_BOS', alpha, beta)
    mu_ANA = pm.Gamma('mu_ANA', alpha, beta)
    goals_BOS = pm.Poisson('goals_BOS', mu_BOS, observed=3)
    goals_ANA = pm.Poisson('goals_ANA', mu_ANA, observed=1)
    trace = pm.sample(1000, tune=2000)
# We can use `traceplot` to review the results and do some visual diagnostics.
# In[52]:
pm.traceplot(trace);
# Here are the posterior distributions for `mu_BOS` and `mu_ANA`.
# In[53]:
mu_BOS = trace['mu_BOS']
plot_cdf(mu_BOS, label='mu_BOS posterior')
mu_ANA = trace['mu_ANA']
plot_cdf(mu_ANA, label='mu_ANA posterior')
decorate_cdf_rates()
np.mean(mu_BOS), np.mean(mu_ANA)
plt.savefig('zigzag14.png', dpi=150)
# On the basis of one game, here's the probability that Boston is the better team, that is, the fraction of posterior samples in which `mu_BOS` exceeds `mu_ANA`.
# In[54]:
np.mean(mu_BOS > mu_ANA)
# And the probability that the two rates are exactly equal, which is effectively zero because the sampled values are continuous.
# In[55]:
np.mean(mu_BOS == mu_ANA)
# ## Predictions
#
# Even if Boston is the better team, that doesn't mean they'll win the next game.
#
# We can use `sample_posterior_predictive` to generate predictions.
# In[56]:
with model:
    post_pred = pm.sample_posterior_predictive(trace, samples=1000)
# Here are the posterior predictive distributions of goals scored.
# In[57]:
goals_BOS = post_pred['goals_BOS'].flatten()
goals_ANA = post_pred['goals_ANA'].flatten()
plot_cdf(goals_BOS, label='BOS')
plot_cdf(goals_ANA, label='ANA')
decorate_cdf_goals()
plt.savefig('zigzag15.png', dpi=150)
# Here's the chance that Boston wins the next game.
# In[58]:
win = np.mean(goals_BOS > goals_ANA)
win
# The chance that they lose.
# In[59]:
lose = np.mean(goals_ANA > goals_BOS)
lose
# And the chance of a tie.
# In[60]:
tie = np.mean(goals_BOS == goals_ANA)
tie
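# As a sanity check (not in the original), the three outcomes are mutually
# exclusive and exhaustive, so the probabilities should add up to 1.
# In[ ]:
win + lose + tie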
# ## Overtime!
#
# In the event of a tie, the teams play a five-minute overtime period during which the team that scores first wins.
#
# In a Poisson process with rate parameter `mu`, the time until the next event follows an exponential distribution with rate `mu`, so its mean is `1/mu` (the `scale` argument that NumPy's exponential generator expects).
#
# So we can take a sample from the posterior distributions of `mu`:
# In[61]:
mu_ANA = trace['mu_ANA']
mu_BOS = trace['mu_BOS']
# And generate the time to score, `tts`, for each team:
# In[62]:
tts_ANA = np.random.exponential(1/mu_ANA)
np.mean(tts_ANA)
# In[63]:
tts_BOS = np.random.exponential(1/mu_BOS)
np.mean(tts_BOS)
# Here's the chance that Boston wins in overtime.
# In[64]:
win_ot = np.mean(tts_BOS < tts_ANA)
win_ot
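# As a consistency check (not in the original): given the rates, the probability
# that Boston scores first is `mu_BOS / (mu_BOS + mu_ANA)` for competing
# exponentials. Averaging over the posterior samples should be close to `win_ot`.
# In[ ]:
np.mean(mu_BOS / (mu_BOS + mu_ANA))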
# Since `tts` is continuous, ties in overtime are vanishingly unlikely. So the total probability that Boston wins is the chance they win in regulation, plus the chance of a tie in regulation followed by a win in overtime.
# In[65]:
total_win = win + tie * win_ot
total_win