#!/usr/bin/env python # coding: utf-8 # Quick Notes: # * Is this the same as the chisquare test I was using - https://www.itl.nist.gov/div898/handbook/prc/section4/prc46.htm # * Traditional test for which ranked items are significantly greater - https://www.itl.nist.gov/div898/handbook/prc/section4/prc474.htm # * Beta-Binomial Tutorial - http://sl8r000.github.io/ab_testing_statistics/use_a_hierarchical_model/ # * Multilevel Beta-Binomial - https://dsaber.com/2016/08/27/analyze-your-experiment-with-a-multilevel-logistic-regression-using-pymc3%E2%80%8B/ # * Another Beta Binomial Tutorial - https://blog.dominodatalab.com/ab-testing-with-hierarchical-models-in-python/ # * Pymc3 Beta Binomial Tutorial - https://docs.pymc.io/notebooks/GLM-hierarchical-binominal-model.html # * Another pymc3 beta binomial - https://dsaber.com/2016/08/27/analyze-your-experiment-with-a-multilevel-logistic-regression-using-pymc3%E2%80%8B/ # * Note how the beta priors are created (the 5/2 thing). Test this in other regression with pymc3 project. # * Multi-level beta binomial question - https://stats.stackexchange.com/questions/230034/extending-a-hierarchical-beta-binomial-model-to-account-for-higher-level-groups # * Multilevel pymc3 - https://austinrochford.com/posts/2017-07-09-mrpymc3.html # * Partial Pooling - https://docs.pymc.io/notebooks/hierarchical_partial_pooling.html # In[1]: get_ipython().run_cell_magic('bash', '', 'head ../processed_data/all_strikeouts_pa.out\n') # In[2]: import pandas as pd DF = pd.read_csv('../processed_data/all_strikeouts_pa.out', header=None, names=['playerid', 'player_name', 'strikeouts', 'pa', 'year']) # In[3]: DF.head() # In[4]: import numpy as np GROUPED_DF = (DF .groupby("year") .agg({'strikeouts': np.sum, 'pa': np.sum, }) ) GROUPED_DF["prop_strikeouts"] = GROUPED_DF["strikeouts"] / GROUPED_DF["pa"] # In[5]: GROUPED_DF.describe() # In[6]: get_ipython().run_line_magic('matplotlib', 'inline') GROUPED_DF["prop_strikeouts"].hist(); # In[7]: len(GROUPED_DF) # In[9]: GROUPED_DF.head() # In[10]: GROUPED_DF # In[11]: get_ipython().run_line_magic('matplotlib', 'inline') import pymc3 as pm import numpy as np import matplotlib.pyplot as plt import theano.tensor as tt # In[12]: GROUPED_DF.shape # In[13]: with pm.Model() as bb_model: phi = pm.Uniform('phi', lower=0.0, upper=1.0) kappa_log = pm.Exponential('kappa_log', lam=1.5) kappa = pm.Deterministic('kappa', tt.exp(kappa_log)) rates = pm.Beta('rates', alpha=phi*kappa, beta=(1.0-phi)*kappa, shape=len(GROUPED_DF)) trials = np.array(GROUPED_DF["pa"]) successes = np.array(GROUPED_DF["strikeouts"]) obs = pm.Binomial('observed_values', trials, rates, observed=successes) trace = pm.sample(2000, tune=1000, chains=2, cores=2, nuts_kwargs={'target_accept': .95}) # In[14]: pm.traceplot(trace, varnames=['kappa_log']); # In[15]: pm.traceplot(trace, varnames=['kappa']); # In[16]: pm.traceplot(trace, varnames=['phi']); # In[17]: pm.traceplot(trace, varnames=['rates']); # In[18]: bfmi = pm.bfmi(trace) max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values()) (pm.energyplot(trace, figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr))); # In[19]: energy = trace['energy'] energy_diff = np.diff(energy) plt.hist(energy - energy.mean(), label='energy', alpha=0.5) plt.hist(energy_diff, label='energy diff', alpha=0.5) plt.legend(); # In[20]: GROUPED_DF.head() # In[21]: rate_means = trace['rates', 1000:].mean(axis=0) rate_se = trace['rates', 1000:].std(axis=0) mean_se = [(x, y, i) for i, x, y in zip(GROUPED_DF.index, rate_means, rate_se)] sorted_means_se = sorted(mean_se, key=lambda x: x[0]) sorted_means = [x[0] for x in sorted_means_se] sorted_se = [x[1] for x in sorted_means_se] x = np.arange(len(sorted_means)) plt.plot(x, sorted_means, 'o'); for x_val, m, se in zip(x, sorted_means, sorted_se): plt.plot([x_val, x_val], [m-se, m+se], 'b-') # In[22]: plt.plot(GROUPED_DF.index, rate_means, 'o'); for x_val, m, se in zip(GROUPED_DF.index, rate_means, rate_se): plt.plot([x_val, x_val], [m-se, m+se], 'b-') # In[23]: sorted_means_se[:5] # In[24]: sorted_means_se[-5:] # In[25]: GROUPED_DF.loc[[x[2] for x in sorted_means_se[-5:]], :] # In[26]: GROUPED_DF.loc[[x[2] for x in sorted_means_se[:5]], :] # In[27]: pm.plots.forestplot(trace, varnames=["rates"])