In [1]:
%%bash
head ../processed_data/all_strikeouts_pa.out
aaroh101, Hank Aaron, 63, 598, 1970
aarot101, Tommie Aaron, 10, 66, 1970
abert101, Ted Abernathy, 7, 20, 1970
adaij101, Jerry Adair, 3, 33, 1970
ageet101, Tommie Agee, 156, 696, 1970
aguih101, Hank Aguirre, 2, 2, 1970
akerj102, Jack Aker, 6, 16, 1970
alcal101, Luis Alcaraz, 13, 127, 1970
alleb105, Bernie Allen, 21, 305, 1970
alled101, Dick Allen, 118, 533, 1970
In [2]:
import pandas as pd

DF = pd.read_csv('../processed_data/all_strikeouts_pa.out',
                 header=None,
                 names=['playerid', 'player_name', 'strikeouts', 'pa', 'year'])
In [3]:
DF.head()
Out[3]:
playerid player_name strikeouts pa year
0 aaroh101 Hank Aaron 63 598 1970
1 aarot101 Tommie Aaron 10 66 1970
2 abert101 Ted Abernathy 7 20 1970
3 adaij101 Jerry Adair 3 33 1970
4 ageet101 Tommie Agee 156 696 1970
In [4]:
import numpy as np

GROUPED_DF = (DF
              .groupby("year")
              .agg({'strikeouts': np.sum,
                    'pa': np.sum,
                   })
             )

GROUPED_DF["prop_strikeouts"] = GROUPED_DF["strikeouts"] / GROUPED_DF["pa"]
In [5]:
GROUPED_DF.describe()
Out[5]:
strikeouts pa prop_strikeouts
count 49.000000 49.000000 49.000000
mean 27411.285714 168774.551020 0.160451
std 6811.431867 19009.189224 0.025676
min 13237.000000 105892.000000 0.124924
25% 21716.000000 160033.000000 0.140129
50% 25425.000000 161922.000000 0.158785
75% 32189.000000 185553.000000 0.170653
max 41207.000000 190261.000000 0.222573
In [6]:
%matplotlib inline
GROUPED_DF["prop_strikeouts"].hist();
In [7]:
len(GROUPED_DF)
Out[7]:
49
In [9]:
GROUPED_DF.head()
Out[9]:
strikeouts pa prop_strikeouts
year
1970 22374 149332 0.149827
1971 20852 145958 0.142863
1972 20698 139815 0.148038
1973 20338 148637 0.136830
1974 19494 148855 0.130960
In [10]:
GROUPED_DF
Out[10]:
strikeouts pa prop_strikeouts
year
1970 22374 149332 0.149827
1971 20852 145958 0.142863
1972 20698 139815 0.148038
1973 20338 148637 0.136830
1974 19494 148855 0.130960
1975 19280 148621 0.129726
1976 18745 147598 0.127000
1977 21722 161547 0.134462
1978 20058 159192 0.125999
1979 20035 160378 0.124924
1980 20212 161210 0.125377
1981 13237 105892 0.125005
1982 21221 161104 0.131722
1983 21716 160615 0.135205
1984 22500 160566 0.140129
1985 22451 160320 0.140039
1986 24706 160858 0.153589
1987 25100 161922 0.155013
1988 23355 159380 0.146537
1989 23650 160033 0.147782
1990 23853 160316 0.148787
1991 24390 160746 0.151730
1992 23538 160545 0.146613
1993 26310 174564 0.150718
1994 19766 124483 0.158785
1995 25425 156703 0.162250
1996 29308 177261 0.165338
1997 29937 175541 0.170541
1998 31893 188280 0.169391
1999 31119 189692 0.164050
2000 31356 190261 0.164805
2001 32404 186976 0.173306
2002 31394 186615 0.168229
2003 30801 187449 0.164317
2004 31828 188539 0.168814
2005 30644 186292 0.164494
2006 31655 188071 0.168314
2007 32189 188623 0.170653
2008 32884 187631 0.175259
2009 33591 187079 0.179555
2010 34306 185553 0.184885
2011 34488 185245 0.186175
2012 36426 184179 0.197775
2013 36710 184873 0.198569
2014 37441 183929 0.203562
2015 37446 183627 0.203924
2016 38996 184613 0.211231
2017 40104 185295 0.216433
2018 41207 185139 0.222573
In [11]:
%matplotlib inline
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano.tensor as tt
In [12]:
GROUPED_DF.shape
Out[12]:
(49, 3)
In [13]:
with pm.Model() as bb_model:
 
    phi = pm.Uniform('phi', lower=0.0, upper=1.0)

    kappa_log = pm.Exponential('kappa_log', lam=1.5)
    kappa = pm.Deterministic('kappa', tt.exp(kappa_log))

    rates = pm.Beta('rates', alpha=phi*kappa, beta=(1.0-phi)*kappa, shape=len(GROUPED_DF))

 
    trials = np.array(GROUPED_DF["pa"])
    successes = np.array(GROUPED_DF["strikeouts"])
 
    obs = pm.Binomial('observed_values', trials, rates,
                      observed=successes)
    trace = pm.sample(2000, tune=1000, chains=2, cores=2, nuts_kwargs={'target_accept': .95})
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [rates, kappa_log, phi]
Sampling 2 chains: 100%|██████████| 6000/6000 [00:42<00:00, 141.15draws/s]
In [14]:
pm.traceplot(trace, varnames=['kappa_log']);
In [15]:
pm.traceplot(trace, varnames=['kappa']);
In [16]:
pm.traceplot(trace, varnames=['phi']);
In [17]:
pm.traceplot(trace, varnames=['rates']);
In [18]:
bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)));
In [19]:
energy = trace['energy']
energy_diff = np.diff(energy)
plt.hist(energy - energy.mean(), label='energy', alpha=0.5)
plt.hist(energy_diff, label='energy diff', alpha=0.5)
plt.legend();
In [20]:
GROUPED_DF.head()
Out[20]:
strikeouts pa prop_strikeouts
year
1970 22374 149332 0.149827
1971 20852 145958 0.142863
1972 20698 139815 0.148038
1973 20338 148637 0.136830
1974 19494 148855 0.130960
In [21]:
rate_means = trace['rates', 1000:].mean(axis=0)
rate_se = trace['rates', 1000:].std(axis=0)

mean_se = [(x, y, i) for i, x, y in zip(GROUPED_DF.index, rate_means, rate_se)]
sorted_means_se = sorted(mean_se, key=lambda x: x[0])
sorted_means = [x[0] for x in sorted_means_se]
sorted_se = [x[1] for x in sorted_means_se]

x = np.arange(len(sorted_means))

plt.plot(x, sorted_means, 'o');

for x_val, m, se in zip(x, sorted_means, sorted_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')
In [22]:
plt.plot(GROUPED_DF.index, rate_means, 'o');
for x_val, m, se in zip(GROUPED_DF.index, rate_means, rate_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')
In [23]:
sorted_means_se[:5]
Out[23]:
[(0.1249820765535394, 0.0008238973206443722, 1979),
 (0.12506506488490912, 0.0009923066059299719, 1981),
 (0.12542282812494432, 0.0008059046311561689, 1980),
 (0.12605604984825827, 0.0008119045915262948, 1978),
 (0.12706977300786318, 0.0008777024824528197, 1976)]
In [24]:
sorted_means_se[-5:]
Out[24]:
[(0.2035089636423048, 0.0009040547071524512, 2014),
 (0.20388652796345932, 0.00092543883685226, 2015),
 (0.21117310785744006, 0.0009592761649375174, 2016),
 (0.21637284104817386, 0.0009636338688238112, 2017),
 (0.22248726258065324, 0.0009928259568675018, 2018)]
In [25]:
GROUPED_DF.loc[[x[2] for x in sorted_means_se[-5:]], :]
Out[25]:
strikeouts pa prop_strikeouts
year
2014 37441 183929 0.203562
2015 37446 183627 0.203924
2016 38996 184613 0.211231
2017 40104 185295 0.216433
2018 41207 185139 0.222573
In [26]:
GROUPED_DF.loc[[x[2] for x in sorted_means_se[:5]], :]
Out[26]:
strikeouts pa prop_strikeouts
year
1979 20035 160378 0.124924
1981 13237 105892 0.125005
1980 20212 161210 0.125377
1978 20058 159192 0.125999
1976 18745 147598 0.127000
In [27]:
pm.plots.forestplot(trace, varnames=["rates"])
Out[27]:
GridSpec(1, 2, width_ratios=[3, 1])