In [1]:
%%bash
# Preview the raw input file: one row per player-season,
# fields separated by ", " (comma followed by a space).
head ../processed_data/all_walks_pa.out
aaroh101, Hank Aaron, 74, 598, 1970
aarot101, Tommie Aaron, 3, 66, 1970
abert101, Ted Abernathy, 0, 20, 1970
adaij101, Jerry Adair, 5, 33, 1970
ageet101, Tommie Agee, 55, 696, 1970
aguih101, Hank Aguirre, 0, 2, 1970
akerj102, Jack Aker, 0, 16, 1970
alcal101, Luis Alcaraz, 4, 127, 1970
alleb105, Bernie Allen, 43, 305, 1970
alled101, Dick Allen, 71, 533, 1970
In [2]:
import pandas as pd

# The raw file is ", "-delimited (comma + space, see the `head` preview above).
# Without skipinitialspace the string fields (playerid, player_name) keep a
# leading space; numeric fields are unaffected either way.
DF = pd.read_csv('../processed_data/all_walks_pa.out',
                 header=None,
                 names=['playerid', 'player_name', 'walks', 'pa', 'year'],
                 skipinitialspace=True)
In [3]:
DF.head()
Out[3]:
playerid player_name walks pa year
0 aaroh101 Hank Aaron 74 598 1970
1 aarot101 Tommie Aaron 3 66 1970
2 abert101 Ted Abernathy 0 20 1970
3 adaij101 Jerry Adair 5 33 1970
4 ageet101 Tommie Agee 55 696 1970
In [4]:
import numpy as np

# Season-level totals: sum walks and plate appearances over all players in
# each year. Use the string aggregation name 'sum' rather than passing the
# np.sum function — passing numpy functions to .agg is deprecated in pandas.
GROUPED_DF = (DF
              .groupby("year")
              .agg({'walks': 'sum',
                    'pa': 'sum',
                   })
             )

# League-wide walk rate per season = total walks / total plate appearances.
GROUPED_DF["prop_walk"] = GROUPED_DF["walks"] / GROUPED_DF["pa"]
In [5]:
GROUPED_DF.describe()
Out[5]:
walks pa prop_walk
count 49.000000 49.000000 49.000000
mean 14434.530612 168774.551020 0.085571
std 1707.544986 19009.189224 0.003909
min 8868.000000 105892.000000 0.076225
25% 13518.000000 160033.000000 0.083789
50% 14227.000000 161922.000000 0.085222
75% 15806.000000 185553.000000 0.087354
max 18237.000000 190261.000000 0.095853
In [6]:
%matplotlib inline
# Distribution of the season-level walk rates across all seasons in the table.
GROUPED_DF["prop_walk"].hist();
In [7]:
len(GROUPED_DF)
Out[7]:
49
In [8]:
GROUPED_DF.head()
Out[8]:
walks pa prop_walk
year
1970 13727 149332 0.091923
1971 12484 145958 0.085531
1972 11715 139815 0.083789
1973 13094 148637 0.088094
1974 12963 148855 0.087085
In [9]:
GROUPED_DF
Out[9]:
walks pa prop_walk
year
1970 13727 149332 0.091923
1971 12484 145958 0.085531
1972 11715 139815 0.083789
1973 13094 148637 0.088094
1974 12963 148855 0.087085
1975 13402 148621 0.090176
1976 12390 147598 0.083944
1977 13757 161547 0.085158
1978 13566 159192 0.085218
1979 13601 160378 0.084806
1980 13190 161210 0.081819
1981 8868 105892 0.083746
1982 13302 161104 0.082568
1983 13518 160615 0.084164
1984 13320 160566 0.082957
1985 13838 160320 0.086315
1986 14227 160858 0.088444
1987 14389 161922 0.088864
1988 12984 159380 0.081466
1989 13528 160033 0.084533
1990 13852 160316 0.086404
1991 13984 160746 0.086994
1992 13682 160545 0.085222
1993 15110 174564 0.086559
1994 11131 124483 0.089418
1995 14240 156703 0.090873
1996 16093 177261 0.090787
1997 15666 175541 0.089244
1998 16447 188280 0.087354
1999 17891 189692 0.094316
2000 18237 190261 0.095853
2001 15806 186976 0.084535
2002 16246 186615 0.087056
2003 15889 187449 0.084764
2004 16222 188539 0.086041
2005 15207 186292 0.081630
2006 15847 188071 0.084261
2007 16079 188623 0.085244
2008 16337 187631 0.087070
2009 16620 187079 0.088839
2010 15778 185553 0.085032
2011 15018 185245 0.081071
2012 14709 184179 0.079863
2013 14640 184873 0.079189
2014 14020 183929 0.076225
2015 14073 183627 0.076639
2016 15090 184613 0.081739
2017 15829 185295 0.085426
2018 15686 185139 0.084726
In [10]:
%matplotlib inline
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano.tensor as tt
In [11]:
GROUPED_DF.shape
Out[11]:
(49, 3)
In [12]:
# Beta-binomial hierarchical model: each season's walk rate is partially
# pooled toward a common league-wide rate phi with concentration kappa.
with pm.Model() as bb_model:

    # Pooled prior on the league-wide walk probability.
    phi = pm.Uniform('phi', lower=0.0, upper=1.0)

    # Concentration parameter, sampled on the log scale for better geometry.
    kappa_log = pm.Exponential('kappa_log', lam=1.5)
    kappa = pm.Deterministic('kappa', tt.exp(kappa_log))

    # One partially pooled walk rate per season (Beta mean/concentration form).
    rates = pm.Beta('rates', alpha=phi*kappa, beta=(1.0-phi)*kappa, shape=len(GROUPED_DF))

    trials = np.array(GROUPED_DF["pa"])
    successes = np.array(GROUPED_DF["walks"])

    # Binomial likelihood: observed walks out of plate appearances per season.
    obs = pm.Binomial('observed_values', trials, rates,
                      observed=successes)
    # random_seed pins the otherwise stochastic sampler so the notebook is
    # reproducible under Restart & Run All.
    trace = pm.sample(2000, tune=1000, chains=2, cores=2, random_seed=42,
                      nuts_kwargs={'target_accept': .95})
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [rates, kappa_log, phi]
Sampling 2 chains: 100%|██████████| 6000/6000 [00:38<00:00, 187.94draws/s]
In [13]:
pm.traceplot(trace, varnames=['kappa_log']);
In [14]:
pm.traceplot(trace, varnames=['kappa']);
In [15]:
pm.traceplot(trace, varnames=['phi']);
In [16]:
pm.traceplot(trace, varnames=['rates']);
In [17]:
# Sampler diagnostics: BFMI (energy-based fraction of missing information)
# and the worst (max) Gelman-Rubin statistic over all parameters, both
# annotated onto the energy plot's title.
bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)));
In [18]:
# Energy diagnostic by hand: overlay the centered marginal energy
# distribution with the energy-transition distribution; a large mismatch
# between the two would indicate poor HMC exploration.
energy = trace['energy']
energy_diff = np.diff(energy)
plt.hist(energy - energy.mean(), label='energy', alpha=0.5)
plt.hist(energy_diff, label='energy diff', alpha=0.5)
plt.legend();
In [19]:
GROUPED_DF.head()
Out[19]:
walks pa prop_walk
year
1970 13727 149332 0.091923
1971 12484 145958 0.085531
1972 11715 139815 0.083789
1973 13094 148637 0.088094
1974 12963 148855 0.087085
In [20]:
# Posterior mean and standard deviation of each season's walk rate, using
# draws after the first 1000 (extra burn-in on top of the tuning samples).
rate_means = trace['rates', 1000:].mean(axis=0)
rate_se = trace['rates', 1000:].std(axis=0)

# Build (mean, sd, year) triples and sort seasons by posterior mean rate.
# NOTE: these names (sorted_means_se, rate_means, rate_se) are reused by
# later cells — do not rename.
mean_se = [(x, y, i) for i, x, y in zip(GROUPED_DF.index, rate_means, rate_se)]
sorted_means_se = sorted(mean_se, key=lambda x: x[0])
sorted_means = [x[0] for x in sorted_means_se]
sorted_se = [x[1] for x in sorted_means_se]

x = np.arange(len(sorted_means))

# One dot per season (ordered by rate) with +/- 1 sd vertical error bars.
plt.plot(x, sorted_means, 'o');

for x_val, m, se in zip(x, sorted_means, sorted_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')
In [21]:
# Same posterior means and sds, now plotted against year (chronological order).
plt.plot(GROUPED_DF.index, rate_means, 'o');
for x_val, m, se in zip(GROUPED_DF.index, rate_means, rate_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')
In [22]:
sorted_means_se[:5]
Out[22]:
[(0.07646209726671635, 0.0005918003330566895, 2014),
 (0.07688963877749279, 0.0006224337494514535, 2015),
 (0.07937218775493034, 0.0006088604882029617, 2013),
 (0.08001400543286034, 0.0006207386019788478, 2012),
 (0.0811766723382111, 0.0006115530774205621, 2011)]
In [23]:
sorted_means_se[-5:]
Out[23]:
[(0.09066824445979997, 0.0006691094857225917, 1996),
 (0.09071123822616874, 0.0007035829860009472, 1995),
 (0.0917172719443447, 0.0007240624312056991, 1970),
 (0.09409286147240523, 0.0006469790042051997, 1999),
 (0.0955956759518374, 0.0006744226015336069, 2000)]
In [24]:
GROUPED_DF.loc[[x[2] for x in sorted_means_se[-5:]], :]
Out[24]:
walks pa prop_walk
year
1996 16093 177261 0.090787
1995 14240 156703 0.090873
1970 13727 149332 0.091923
1999 17891 189692 0.094316
2000 18237 190261 0.095853
In [25]:
GROUPED_DF.loc[[x[2] for x in sorted_means_se[:5]], :]
Out[25]:
walks pa prop_walk
year
2014 14020 183929 0.076225
2015 14073 183627 0.076639
2013 14640 184873 0.079189
2012 14709 184179 0.079863
2011 15018 185245 0.081071
In [26]:
pm.plots.forestplot(trace, varnames=["rates"])
Out[26]:
GridSpec(1, 2, width_ratios=[3, 1])