In [1]:
%%bash
head ../processed_data/all_errors_bip.out
aaroh101, Hank Aaron, 4, 496, 1968
aarot101, Tommie Aaron, 2, 234, 1968
abert101, Ted Abernathy, 0, 5, 1968
adaij101, Jerry Adair, 2, 180, 1968
adamm101, Mike Adamson, 0, 1, 1968
adled101, Dave Adlesh, 1, 72, 1968
ageet101, Tommie Agee, 8, 265, 1968
aguih101, Hank Aguirre, 0, 1, 1968
akerj102, Jack Aker, 0, 2, 1968
alcal101, Luis Alcaraz, 1, 83, 1968
In [2]:
import pandas as pd

DF = pd.read_csv('../processed_data/all_errors_bip.out',
                 header=None,
                 names=['playerid', 'player_name', 'errors', 'bip', 'year'])
In [3]:
DF.head()
Out[3]:
playerid player_name errors bip year
0 aaroh101 Hank Aaron 4 496 1968
1 aarot101 Tommie Aaron 2 234 1968
2 abert101 Ted Abernathy 0 5 1968
3 adaij101 Jerry Adair 2 180 1968
4 adamm101 Mike Adamson 0 1 1968
In [4]:
import numpy as np

GROUPED_DF = (DF
              .groupby("year")
              .agg({'errors': np.sum,
                    'bip': np.sum,
                   })
             )

GROUPED_DF["prop_error"] = GROUPED_DF["errors"] / GROUPED_DF["bip"]
In [5]:
GROUPED_DF.describe()
Out[5]:
errors bip prop_error
count 51.000000 51.000000 51.000000
mean 1708.039216 121771.960784 0.014140
std 184.392620 12333.240302 0.001840
min 1244.000000 81229.000000 0.010923
25% 1596.000000 118019.500000 0.012417
50% 1723.000000 122928.000000 0.014327
75% 1851.000000 131132.000000 0.015566
max 2036.000000 135936.000000 0.017586
In [7]:
%matplotlib inline
GROUPED_DF["prop_error"].hist();
In [8]:
len(GROUPED_DF)
Out[8]:
51
In [9]:
GROUPED_DF.head()
Out[9]:
errors bip prop_error
year
1968 1534 88471 0.017339
1969 1862 108814 0.017112
1970 1845 109765 0.016809
1971 1635 109018 0.014998
1972 1640 103990 0.015771
In [10]:
GROUPED_DF
Out[10]:
errors bip prop_error
year
1968 1534 88471 0.017339
1969 1862 108814 0.017112
1970 1845 109765 0.016809
1971 1635 109018 0.014998
1972 1640 103990 0.015771
1973 1874 111876 0.016751
1974 1850 112762 0.016406
1975 1973 112192 0.017586
1976 1790 112780 0.015872
1977 1946 122253 0.015918
1978 1852 121509 0.015242
1979 1819 122757 0.014818
1980 1904 123948 0.015361
1981 1244 81229 0.015315
1982 1883 122928 0.015318
1983 1977 121822 0.016229
1984 2036 121329 0.016781
1985 1918 120624 0.015901
1986 1877 118400 0.015853
1987 1825 118995 0.015337
1988 1723 119094 0.014468
1989 1775 118606 0.014966
1990 1728 118395 0.014595
1991 1616 117644 0.013736
1992 1512 118414 0.012769
1993 1868 127970 0.014597
1994 1284 90015 0.014264
1995 1511 112511 0.013430
1996 1817 126822 0.014327
1997 1675 124864 0.013415
1998 1826 134327 0.013594
1999 1810 134835 0.013424
2000 1972 135934 0.014507
2001 1829 133826 0.013667
2002 1772 134188 0.013205
2003 1699 135936 0.012499
2004 1719 135523 0.012684
2005 1638 135691 0.012072
2006 1638 135686 0.012072
2007 1607 135594 0.011852
2008 1600 133830 0.011955
2009 1547 132258 0.011697
2010 1606 131047 0.012255
2011 1637 131217 0.012476
2012 1592 128825 0.012358
2013 1413 129360 0.010923
2014 1460 128173 0.011391
2015 1511 128041 0.011801
2016 1501 126594 0.011857
2017 1481 125463 0.011804
2018 1459 124225 0.011745
In [11]:
%matplotlib inline
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano.tensor as tt
In [12]:
GROUPED_DF.shape
Out[12]:
(51, 3)
In [13]:
with pm.Model() as bb_model:
 
    phi = pm.Uniform('phi', lower=0.0, upper=1.0)

    kappa_log = pm.Exponential('kappa_log', lam=1.5)
    kappa = pm.Deterministic('kappa', tt.exp(kappa_log))

    rates = pm.Beta('rates', alpha=phi*kappa, beta=(1.0-phi)*kappa, shape=len(GROUPED_DF))

 
    trials = np.array(GROUPED_DF["bip"])
    successes = np.array(GROUPED_DF["errors"])
 
    obs = pm.Binomial('observed_values', trials, rates,
                      observed=successes)
    trace = pm.sample(2000, tune=1000, chains=2, cores=2, nuts_kwargs={'target_accept': .95})
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [rates, kappa_log, phi]
Sampling 2 chains: 100%|██████████| 6000/6000 [00:30<00:00, 194.19draws/s]
In [14]:
pm.traceplot(trace, varnames=['kappa_log']);
In [15]:
pm.traceplot(trace, varnames=['kappa']);
In [16]:
pm.traceplot(trace, varnames=['phi']);
In [17]:
pm.traceplot(trace, varnames=['rates']);
In [18]:
bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)));
In [19]:
energy = trace['energy']
energy_diff = np.diff(energy)
plt.hist(energy - energy.mean(), label='energy', alpha=0.5)
plt.hist(energy_diff, label='energy diff', alpha=0.5)
plt.legend();
In [20]:
GROUPED_DF.head()
Out[20]:
errors bip prop_error
year
1968 1534 88471 0.017339
1969 1862 108814 0.017112
1970 1845 109765 0.016809
1971 1635 109018 0.014998
1972 1640 103990 0.015771
In [21]:
rate_means = trace['rates', 1000:].mean(axis=0)
rate_se = trace['rates', 1000:].std(axis=0)

mean_se = [(x, y, i) for i, x, y in zip(GROUPED_DF.index, rate_means, rate_se)]
sorted_means_se = sorted(mean_se, key=lambda x: x[0])
sorted_means = [x[0] for x in sorted_means_se]
sorted_se = [x[1] for x in sorted_means_se]

x = np.arange(len(sorted_means))

plt.plot(x, sorted_means, 'o');

for x_val, m, se in zip(x, sorted_means, sorted_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')
In [22]:
plt.plot(GROUPED_DF.index, rate_means, 'o');
for x_val, m, se in zip(GROUPED_DF.index, rate_means, rate_se):
    plt.plot([x_val, x_val], [m-se, m+se], 'b-')
In [23]:
sorted_means_se[:5]
Out[23]:
[(0.011011680496334737, 0.00028428180520349855, 2013),
 (0.011479085002037233, 0.0002934659490447185, 2014),
 (0.011769051057939574, 0.00029153206076100083, 2009),
 (0.011817645482632833, 0.000298330035691104, 2018),
 (0.011862491079040331, 0.0003085671391091956, 2015)]
In [24]:
sorted_means_se[-5:]
Out[24]:
[(0.016694408477005047, 0.00038003314701982334, 1984),
 (0.016724097724823316, 0.0003985186986550183, 1970),
 (0.01701073549632819, 0.00040345940085901254, 1969),
 (0.017198401082438704, 0.000456887853448958, 1968),
 (0.017469095683654505, 0.00037956537603057024, 1975)]
In [25]:
GROUPED_DF.loc[[x[2] for x in sorted_means_se[-5:]], :]
Out[25]:
errors bip prop_error
year
1984 2036 121329 0.016781
1970 1845 109765 0.016809
1969 1862 108814 0.017112
1968 1534 88471 0.017339
1975 1973 112192 0.017586
In [26]:
GROUPED_DF.loc[[x[2] for x in sorted_means_se[:5]], :]
Out[26]:
errors bip prop_error
year
2013 1413 129360 0.010923
2014 1460 128173 0.011391
2009 1547 132258 0.011697
2018 1459 124225 0.011745
2015 1511 128041 0.011801
In [27]:
pm.plots.forestplot(trace, varnames=["rates"])
Out[27]:
GridSpec(1, 2, width_ratios=[3, 1])