Quick Notes:
%%bash
head ../processed_data/all_strikeouts_pa.out
import pandas as pd
DF = pd.read_csv('../processed_data/all_strikeouts_pa.out',
header=None,
names=['playerid', 'player_name', 'strikeouts', 'pa', 'year'])
DF.head()
import numpy as np
GROUPED_DF = (DF
.groupby("year")
.agg({'strikeouts': np.sum,
'pa': np.sum,
})
)
GROUPED_DF["prop_strikeouts"] = GROUPED_DF["strikeouts"] / GROUPED_DF["pa"]
GROUPED_DF.describe()
%matplotlib inline
GROUPED_DF["prop_strikeouts"].hist();
len(GROUPED_DF)
GROUPED_DF.head()
GROUPED_DF
%matplotlib inline
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano.tensor as tt
GROUPED_DF.shape
with pm.Model() as bb_model:
phi = pm.Uniform('phi', lower=0.0, upper=1.0)
kappa_log = pm.Exponential('kappa_log', lam=1.5)
kappa = pm.Deterministic('kappa', tt.exp(kappa_log))
rates = pm.Beta('rates', alpha=phi*kappa, beta=(1.0-phi)*kappa, shape=len(GROUPED_DF))
trials = np.array(GROUPED_DF["pa"])
successes = np.array(GROUPED_DF["strikeouts"])
obs = pm.Binomial('observed_values', trials, rates,
observed=successes)
trace = pm.sample(2000, tune=1000, chains=2, cores=2, nuts_kwargs={'target_accept': .95})
pm.traceplot(trace, varnames=['kappa_log']);
pm.traceplot(trace, varnames=['kappa']);
pm.traceplot(trace, varnames=['phi']);
pm.traceplot(trace, varnames=['rates']);
bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)));
energy = trace['energy']
energy_diff = np.diff(energy)
plt.hist(energy - energy.mean(), label='energy', alpha=0.5)
plt.hist(energy_diff, label='energy diff', alpha=0.5)
plt.legend();
GROUPED_DF.head()
rate_means = trace['rates', 1000:].mean(axis=0)
rate_se = trace['rates', 1000:].std(axis=0)
mean_se = [(x, y, i) for i, x, y in zip(GROUPED_DF.index, rate_means, rate_se)]
sorted_means_se = sorted(mean_se, key=lambda x: x[0])
sorted_means = [x[0] for x in sorted_means_se]
sorted_se = [x[1] for x in sorted_means_se]
x = np.arange(len(sorted_means))
plt.plot(x, sorted_means, 'o');
for x_val, m, se in zip(x, sorted_means, sorted_se):
plt.plot([x_val, x_val], [m-se, m+se], 'b-')
plt.plot(GROUPED_DF.index, rate_means, 'o');
for x_val, m, se in zip(GROUPED_DF.index, rate_means, rate_se):
plt.plot([x_val, x_val], [m-se, m+se], 'b-')
sorted_means_se[:5]
sorted_means_se[-5:]
GROUPED_DF.loc[[x[2] for x in sorted_means_se[-5:]], :]
GROUPED_DF.loc[[x[2] for x in sorted_means_se[:5]], :]
pm.plots.forestplot(trace, varnames=["rates"])