#!/usr/bin/env python
# coding: utf-8

# Quick Notes:
# * Is this the same as the chisquare test I was using - https://www.itl.nist.gov/div898/handbook/prc/section4/prc46.htm
# * Traditional test for which ranked items are significantly greater - https://www.itl.nist.gov/div898/handbook/prc/section4/prc474.htm
# * Beta-Binomial Tutorial - http://sl8r000.github.io/ab_testing_statistics/use_a_hierarchical_model/
# * Multilevel Beta-Binomial - https://dsaber.com/2016/08/27/analyze-your-experiment-with-a-multilevel-logistic-regression-using-pymc3%E2%80%8B/
# * Another Beta Binomial Tutorial - https://blog.dominodatalab.com/ab-testing-with-hierarchical-models-in-python/
# * Pymc3 Beta Binomial Tutorial - https://docs.pymc.io/notebooks/GLM-hierarchical-binominal-model.html
# * Another pymc3 beta binomial - https://dsaber.com/2016/08/27/analyze-your-experiment-with-a-multilevel-logistic-regression-using-pymc3%E2%80%8B/
# * Note how the beta priors are created (the 5/2 thing). Test this in other regression with pymc3 project.
# * Multi-level beta binomial question - https://stats.stackexchange.com/questions/230034/extending-a-hierarchical-beta-binomial-model-to-account-for-higher-level-groups
# * Multilevel pymc3 - https://austinrochford.com/posts/2017-07-09-mrpymc3.html
# * Partial Pooling - https://docs.pymc.io/notebooks/hierarchical_partial_pooling.html

# In[1]:


get_ipython().run_cell_magic('bash', '', 'head ../processed_data/all_strikeouts_pa.out\n')


# In[2]:


import pandas as pd

DF = pd.read_csv('../processed_data/all_strikeouts_pa.out',
                 header=None,
                 names=['playerid', 'player_name', 'strikeouts', 'pa', 'year'])


# In[3]:


DF.head()


# In[4]:


import numpy as np

GROUPED_DF = (DF
              .groupby("year")
              .agg({'strikeouts': np.sum,
                    'pa': np.sum,
                   })
             )

GROUPED_DF["prop_strikeouts"] = GROUPED_DF["strikeouts"] / GROUPED_DF["pa"]


# In[5]:


GROUPED_DF.describe()


# In[6]:


get_ipython().run_line_magic('matplotlib', 'inline')
GROUPED_DF["prop_strikeouts"].hist();


# In[7]:


len(GROUPED_DF)


# In[9]:


GROUPED_DF.head()


# In[10]:


GROUPED_DF


# In[11]:


get_ipython().run_line_magic('matplotlib', 'inline')
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano.tensor as tt


# In[12]:


GROUPED_DF.shape


# In[13]:


with pm.Model() as bb_model:
 
    phi = pm.Uniform('phi', lower=0.0, upper=1.0)

    kappa_log = pm.Exponential('kappa_log', lam=1.5)
    kappa = pm.Deterministic('kappa', tt.exp(kappa_log))

    rates = pm.Beta('rates', alpha=phi*kappa, beta=(1.0-phi)*kappa, shape=len(GROUPED_DF))

 
    trials = np.array(GROUPED_DF["pa"])
    successes = np.array(GROUPED_DF["strikeouts"])
 
    obs = pm.Binomial('observed_values', trials, rates,
                      observed=successes)
    trace = pm.sample(2000, tune=1000, chains=2, cores=2, nuts_kwargs={'target_accept': .95})


# In[14]:


pm.traceplot(trace, varnames=['kappa_log']);


# In[15]:


pm.traceplot(trace, varnames=['kappa']);


# In[16]:


pm.traceplot(trace, varnames=['phi']);


# In[17]:


pm.traceplot(trace, varnames=['rates']);


# In[18]:


bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)));


# In[19]:


energy = trace['energy']
energy_diff = np.diff(energy)
plt.hist(energy - energy.mean(), label='energy', alpha=0.5)
plt.hist(energy_diff, label='energy diff', alpha=0.5)
plt.legend();


# In[20]:


GROUPED_DF.head()


# In[21]:


rate_means = trace['rates', 1000:].mean(axis=0)
rate_se = trace['rates', 1000:].std(axis=0)

mean_se = [(x, y, i) for i, x, y in zip(GROUPED_DF.index, rate_means, rate_se)]
sorted_means_se = sorted(mean_se, key=lambda x: x[0])
sorted_means = [x[0] for x in sorted_means_se]
sorted_se = [x[1] for x in sorted_means_se]

x = np.arange(len(sorted_means))

plt.plot(x, sorted_means, 'o');

for x_val, m, se in zip(x, sorted_means, sorted_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')


# In[22]:


plt.plot(GROUPED_DF.index, rate_means, 'o');
for x_val, m, se in zip(GROUPED_DF.index, rate_means, rate_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')


# In[23]:


sorted_means_se[:5]


# In[24]:


sorted_means_se[-5:]


# In[25]:


GROUPED_DF.loc[[x[2] for x in sorted_means_se[-5:]], :]


# In[26]:


GROUPED_DF.loc[[x[2] for x in sorted_means_se[:5]], :]


# In[27]:


pm.plots.forestplot(trace, varnames=["rates"])