#!/usr/bin/env python
# coding: utf-8

# # Think Bayes
#
# Copyright 2018 Allen B. Downey
#
# MIT License: https://opensource.org/licenses/MIT

# In[1]:

# Configure Jupyter so figures appear in the notebook
get_ipython().run_line_magic('matplotlib', 'inline')

# Configure Jupyter to display the assigned value after an assignment
get_ipython().run_line_magic('config', "InteractiveShell.ast_node_interactivity='last_expr_or_assign'")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import classes from thinkbayes2
from thinkbayes2 import Pmf, Cdf, Suite, Joint
import thinkplot

# ## Lions and Tigers and Bears
#
# Suppose we visit a wild animal preserve where we know that the only animals are lions and tigers and bears, but we don't know how many of each there are.
#
# During the tour, we see 3 lions, 2 tigers, and 1 bear. Assuming that every animal had an equal chance to appear in our sample, estimate the prevalence of each species.
#
# What is the probability that the next animal we see is a bear?

# ### Grid algorithm
#
# I'll start with a grid algorithm, enumerating the space of prevalences `p1`, `p2`, and `p3` that add up to 1, and computing the likelihood of the data for each triple of prevalences.

# In[2]:

class LionsTigersBears(Suite, Joint):

    def Likelihood(self, data, hypo):
        """
        data: string, one of 'L', 'T', 'B'
        hypo: p1, p2, p3
        """
        # Fill this in.

# In[3]:

# Solution

class LionsTigersBears(Suite, Joint):

    def Likelihood(self, data, hypo):
        """
        data: string, one of 'L', 'T', 'B'
        hypo: p1, p2, p3
        """
        # the likelihood of each observation is the prevalence
        # of the observed species
        p1, p2, p3 = hypo
        if data == 'L':
            return p1
        if data == 'T':
            return p2
        if data == 'B':
            return p3

# In[4]:

ps = np.linspace(0, 1, 101);

# Here's a simple way to find the eligible triples, but it is inefficient, and it runs into problems with floating-point approximation.

# In[5]:

from itertools import product

def enumerate_triples(ps):
    for p1, p2, p3 in product(ps, ps, ps):
        if p1 + p2 + p3 == 1:
            yield p1, p2, p3

# As an exercise, write a better version of `enumerate_triples`.

# In[6]:

# Solution

from itertools import product

def enumerate_triples(ps):
    # choose p1 and p2, then solve for p3, which avoids
    # testing floating-point sums for equality
    for p1, p2 in product(ps, ps):
        if p1 + p2 > 1:
            continue
        p3 = 1 - p1 - p2
        yield p1, p2, p3

# Now we can initialize the suite.

# In[7]:

suite = LionsTigersBears(enumerate_triples(ps));

# Here are functions for displaying the distributions.

# In[8]:

def plot_marginal_pmfs(joint):
    pmf_lion = joint.Marginal(0)
    pmf_tiger = joint.Marginal(1)
    pmf_bear = joint.Marginal(2)

    thinkplot.Pdf(pmf_lion, label='lions')
    thinkplot.Pdf(pmf_tiger, label='tigers')
    thinkplot.Pdf(pmf_bear, label='bears')

    thinkplot.decorate(xlabel='Prevalence',
                       ylabel='PMF')

# In[9]:

def plot_marginal_cdfs(joint):
    pmf_lion = joint.Marginal(0)
    pmf_tiger = joint.Marginal(1)
    pmf_bear = joint.Marginal(2)

    thinkplot.Cdf(pmf_lion.MakeCdf(), label='lions')
    thinkplot.Cdf(pmf_tiger.MakeCdf(), label='tigers')
    thinkplot.Cdf(pmf_bear.MakeCdf(), label='bears')

    thinkplot.decorate(xlabel='Prevalence',
                       ylabel='CDF')

# Here are the prior distributions.

# In[10]:

plot_marginal_cdfs(suite)

# Now we can do the update.

# In[11]:

for data in 'LLLTTB':
    suite.Update(data)

# And here are the posteriors.

# In[12]:

plot_marginal_cdfs(suite)

# To get the predictive probability of a bear, we can take the mean of the posterior marginal distribution:

# In[13]:

suite.Marginal(2).Mean()

# Or we can do a pseudo-update and use the total probability of the data.
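# `Update` returns the normalizing constant, which is the total probability of the data; so updating a copy of the posterior with 'B' yields the predictive probability of a bear. Here's a sketch of the same computation done explicitly (assuming, as in `thinkbayes2`, that `Items` yields (hypothesis, probability) pairs); it should match the mean we just computed:

# In[ ]:

# total probability of 'B' = sum of p3 weighted by posterior probability
sum(prob * hypo[2] for hypo, prob in suite.Items())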
# In[14]:

suite.Copy().Update('B')

# ### Using the Dirichlet object
#
# The Dirichlet distribution is the conjugate prior for this likelihood function, so we can use the `Dirichlet` object to do the update.
#
# The following is a [monkey patch](https://en.wikipedia.org/wiki/Monkey_patch) that gives `Dirichlet` objects a `Marginal` method.

# In[15]:

from thinkbayes2 import Dirichlet

def DirichletMarginal(dirichlet, i):
    return dirichlet.MarginalBeta(i).MakePmf()

Dirichlet.Marginal = DirichletMarginal

# Here are the priors:

# In[16]:

dirichlet = Dirichlet(3)
plot_marginal_cdfs(dirichlet)

# Here's the update.

# In[17]:

dirichlet.Update((3, 2, 1))

# Here are the posterior PMFs.

# In[18]:

plot_marginal_pmfs(dirichlet)

# And the CDFs.

# In[19]:

plot_marginal_cdfs(dirichlet)

# And we can confirm that we get the same results as the grid algorithm.

# In[20]:

thinkplot.PrePlot(6)
plot_marginal_cdfs(dirichlet)
plot_marginal_cdfs(suite)

# ### MCMC
#
# Exercise: Implement this model using MCMC. You might want to start with [this example](http://christianherta.de/lehre/dataScience/bayesian/Multinomial-Dirichlet.slides.php).

# In[21]:

import warnings
warnings.simplefilter('ignore', FutureWarning)

import pymc3 as pm

# Here's the data.

# In[22]:

observed = [0, 0, 0, 1, 1, 2]

# number of distinct species observed
k = len(Pmf(observed))

# uniform Dirichlet prior
a = np.ones(k)

# Here's the MCMC model:

# In[23]:

# Solution

model = pm.Model()

with model:
    ps = pm.Dirichlet('ps', a, shape=a.shape)
    xs = pm.Categorical('xs', ps, observed=observed, shape=1)

model

# In[24]:

# Solution

with model:
    start = pm.find_MAP()
    step = pm.Metropolis()
    trace = pm.sample(1000, start=start, step=step, tune=1000)

# Check the traceplot.

# In[25]:

pm.traceplot(trace);

# And let's see the results.

# In[26]:

def plot_trace_cdfs(trace):
    rows = trace['ps'].transpose()

    cdf_lion = Cdf(rows[0])
    cdf_tiger = Cdf(rows[1])
    cdf_bear = Cdf(rows[2])

    thinkplot.Cdf(cdf_lion, label='lions')
    thinkplot.Cdf(cdf_tiger, label='tigers')
    thinkplot.Cdf(cdf_bear, label='bears')

    thinkplot.decorate(xlabel='Prevalence',
                       ylabel='CDF')

# In[27]:

plot_trace_cdfs(trace)

# And compare them to what we got with Dirichlet:

# In[28]:

thinkplot.PrePlot(6)
plot_marginal_cdfs(dirichlet)
plot_trace_cdfs(trace)

# ### Using a Multinomial distribution
#
# Here's another solution that uses a Multinomial distribution instead of a Categorical. In this case, we represent the observed data using just the counts, `[3, 2, 1]`, rather than a specific sequence of observations, `[0, 0, 0, 1, 1, 2]`.
#
# I suspect that this is a better option: because it uses a less specific representation of the data (without losing any information), I would expect the probability space to be easier to search.
#
# This solution is based on [this excellent notebook](http://nbviewer.jupyter.org/github/WillKoehrsen/probabilistic-programming/blob/master/Allen%20Downey%20Problem.ipynb) by Will Koehrsen.

# In[29]:

animals = ['lions', 'tigers', 'bears']
c = np.array([3, 2, 1])
a = np.array([1, 1, 1])

# In[30]:

warnings.simplefilter('ignore', UserWarning)

with pm.Model() as model:
    # Probabilities for each species
    ps = pm.Dirichlet('ps', a=a, shape=3)
    # Observed data is a multinomial distribution with 6 trials
    xs = pm.Multinomial('xs', n=6, p=ps, shape=3, observed=c)

# In[31]:

model

# In[32]:

with model:
    # Sample from the posterior
    trace = pm.sample(draws=1000, tune=1000)

# In[33]:

pm.traceplot(trace);

# In[34]:

thinkplot.PrePlot(6)
plot_marginal_cdfs(dirichlet)
plot_trace_cdfs(trace)

# The results look good. We can use `summary` to get the posterior means and other summary stats.
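# Because the prior is conjugate, we also know the posterior in closed form: Dirichlet(a + c) = Dirichlet(4, 3, 2). As a quick sanity check (a sketch, reusing `a` and `c` from above), here are the exact posterior means the sampler should approximate:

# In[ ]:

# posterior parameters and their normalized values, i.e. the exact means
alpha_post = a + c
alpha_post / alpha_post.sum()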
# In[35]:

summary = pm.summary(trace)
summary.index = animals
summary

# We can also use `plot_posterior` to get a better view of the results.

# In[36]:

ax = pm.plot_posterior(trace, varnames=['ps']);

for i, label in enumerate(animals):
    ax[i].set_title(label)
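# Finally, the predictive probability that the next animal is a bear is the posterior mean of `ps[2]`. Here's a sketch that estimates it from the trace (assuming `trace['ps']` has one row per draw and one column per species, which is how `plot_trace_cdfs` uses it); it should be close to the exact value 2/9:

# In[ ]:

# posterior mean of the bear prevalence, estimated from the samples
trace['ps'][:, 2].mean()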