#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import seaborn as sns
sns.set()
from pymc3 import *

np.random.seed(42)


# Extract data

# In[6]:


data = np.reshape([2005, 52, 30, 21, 1, 40280, 1.29, 0.74, 0.52, 2006, 55, 33, 19, 3, 42615, 1.29, 0.77, 0.45, 
            2007, 46, 25, 21, 0, 43199, 1.06, 0.58, 0.49, 2008, 52, 21, 22, 9, 43237, 1.20, 0.49, 0.51, 
            2009, 52, 19, 29, 4, 41476, 1.25, 0.46, 0.70, 2010, 35, 14, 20, 1, 40147, 0.87, 0.35, 0.50, 
            2011, 32, 14, 18, 0, 40366, 0.79, 0.35, 0.45, 2012, 40, 11, 23, 6, 40571, 0.99, 0.27, 0.57, 
            2013, 33, 14, 17, 2, 40792, 0.81, 0.34, 0.42, 2014, 34, 12, 18, 4, 41788, 0.81, 0.29, 0.43], 
                  (10,9))


# In[13]:


year, total, early, late, late_late, births, *rates = data.T.astype(int)


# Build binomial model for GBS

# In[33]:


with Model() as simple_model:
    
    # Probabilities for 2005-2009
    p_early_0509 = Beta('p_early_0509', 1, 1)
    p_late_0509 = Beta('p_late_0509', 1, 1)
    
    # Difference between early and late in 2005-2009
    diff_early_late_0509 = Deterministic('diff_early_late_0509', 1000*(p_late_0509 - p_early_0509))
    
    # Probabilities for 2010-2014
    p_early_1014 = Beta('p_early_1014', 1, 1)
    p_late_1014 = Beta('p_late_1014', 1, 1)
    
    # Difference between early and late in 2010-2014
    diff_early_late_1014 = Deterministic('diff_early_late_1014', 1000*(p_late_1014 - p_early_1014))
    
    # Differences between periods within early group
    diff_early_0509_1014 = Deterministic('diff_early_0509_1014', 1000*(p_early_1014 - p_early_0509))
    # Differences between periods within late group
    diff_late_0509_1014 = Deterministic('diff_late_0509_1014', 1000*(p_late_1014 - p_late_0509))
    
    pre_2010 = year < 2010
    
    # Binomial likelihoods
    early_0509 = Binomial('early_0509', births[pre_2010], p_early_0509, observed=early[pre_2010])
    late_0509 = Binomial('late_0509', births[pre_2010], p_late_0509, observed=late[pre_2010])
    early_1014 = Binomial('early_1014', births[~pre_2010], p_early_1014, observed=early[~pre_2010])
    late_1014 = Binomial('late_1014', births[~pre_2010], p_late_1014, observed=late[~pre_2010])
    

# Run the model using Hamiltonian MC

# In[34]:


with simple_model:
    
    start = find_MAP()
    step = NUTS(scaling=start)
    trace = sample(5000, step, start)


# Estimates of binomial probabilities

# In[35]:


forestplot(trace, vars=['p_early_0509', 'p_late_0509', 'p_early_1014', 'p_late_1014']);


# Estimates of differences between probabilities

# In[36]:


summary(trace, vars=[diff_early_late_0509, diff_early_late_1014, diff_early_0509_1014, diff_late_0509_1014])


# ### Conclusions
# 
# There is a 95% probability the true difference between early rates in 2010-2014 versus 2005-2009 is between -0.421 and -0.155, with an estimated median difference of -0.288. This interval does not contain zero, so we are sure that the rate in 2010-14 is smaller than the rate in 2005-09.
# 
# There is a 95% probability the true difference between late rates in 2010-2014 versus 2005-2009 is between -0.217 and 0.072, with an estimated median difference of -0.060. This interval contains zero, so we are not confident that the estimated difference is real.
# 
# There is a 95% probability the true difference between early rate versus late rate in 2005-2009 is between -0.227 and 0.067, with an estimated median difference of -0.077. This interval contains zero, so we are not confident that the estimated difference is real.
# 
# There is a 95% probability the true difference between early rate versus late rate in 2010-2014 is between 0.032 and 0.278, with an estimated median difference of 0.151. This interval does not contain zero, so we are sure that the late rate is larger than the early rate in 2010-14.

# In[38]:


traceplot(trace, vars=['diff_early_0509_1014', 'diff_late_0509_1014',
                      'diff_early_late_0509', 'diff_early_late_1014']);


# In[39]:


n_white = 327017.2
n_black = 70874.54
eos_white = 148
los_white = 96
eos_black = 221
los_black = 132


# In[40]:


with Model() as race_model:
    
    p_eos_white = Beta('p_eos_white', 1, 1)
    p_los_white = Beta('p_los_white', 1, 1)
    p_eos_black = Beta('p_eos_black', 1, 1)
    p_los_black = Beta('p_los_black', 1, 1)
    
    eos_diff = Deterministic('eos_diff', 1000*(p_eos_white - p_eos_black))
    los_diff = Deterministic('los_diff', 1000*(p_los_white - p_los_black))
    
    y_eos_white = Binomial('y_eos_white', n_white, p_eos_white, observed=eos_white)
    y_los_white = Binomial('y_los_white', n_white, p_los_white, observed=eos_white)
    y_eos_black = Binomial('y_eos_black', n_black, p_eos_black, observed=eos_black)
    y_los_black = Binomial('y_los_black', n_black, p_los_black, observed=eos_black)


# In[41]:


with race_model:
    start = find_MAP()
    step = NUTS(scaling=start)
    race_trace = sample(5000, step, start)


# In[42]:


summary(race_trace, vars=['eos_diff', 'los_diff'])