Quick notes: season-level MLB fielding error rates per ball in play (1968-2018), summarized with pandas and modelled with a hierarchical beta-binomial in PyMC3.
%%bash
# Peek at the raw aggregate file: playerid, player name, errors, balls-in-play, season.
head ../processed_data/all_errors_bip.out
aaroh101, Hank Aaron, 4, 496, 1968 aarot101, Tommie Aaron, 2, 234, 1968 abert101, Ted Abernathy, 0, 5, 1968 adaij101, Jerry Adair, 2, 180, 1968 adamm101, Mike Adamson, 0, 1, 1968 adled101, Dave Adlesh, 1, 72, 1968 ageet101, Tommie Agee, 8, 265, 1968 aguih101, Hank Aguirre, 0, 1, 1968 akerj102, Jack Aker, 0, 2, 1968 alcal101, Luis Alcaraz, 1, 83, 1968
import pandas as pd

# Player-season aggregates: one row per (player, season).
# NOTE(review): the raw file has a space after each comma, so string fields
# likely carry a leading blank -- consider skipinitialspace=True; confirm.
COL_NAMES = ['playerid', 'player_name', 'errors', 'bip', 'year']
DF = pd.read_csv('../processed_data/all_errors_bip.out',
                 header=None,
                 names=COL_NAMES)
DF.head()
playerid | player_name | errors | bip | year | |
---|---|---|---|---|---|
0 | aaroh101 | Hank Aaron | 4 | 496 | 1968 |
1 | aarot101 | Tommie Aaron | 2 | 234 | 1968 |
2 | abert101 | Ted Abernathy | 0 | 5 | 1968 |
3 | adaij101 | Jerry Adair | 2 | 180 | 1968 |
4 | adamm101 | Mike Adamson | 0 | 1 | 1968 |
import numpy as np

# League-wide totals per season: sum errors and balls-in-play across players.
# Use the 'sum' string alias rather than passing np.sum -- supplying numpy
# callables to DataFrame.agg is deprecated in modern pandas; result is identical.
GROUPED_DF = (DF
              .groupby("year")
              .agg({'errors': 'sum',
                    'bip': 'sum',
                    })
              )
# Empirical season-level error rate per ball in play.
GROUPED_DF["prop_error"] = GROUPED_DF["errors"] / GROUPED_DF["bip"]
GROUPED_DF.describe()
errors | bip | prop_error | |
---|---|---|---|
count | 51.000000 | 51.000000 | 51.000000 |
mean | 1708.039216 | 121771.960784 | 0.014140 |
std | 184.392620 | 12333.240302 | 0.001840 |
min | 1244.000000 | 81229.000000 | 0.010923 |
25% | 1596.000000 | 118019.500000 | 0.012417 |
50% | 1723.000000 | 122928.000000 | 0.014327 |
75% | 1851.000000 | 131132.000000 | 0.015566 |
max | 2036.000000 | 135936.000000 | 0.017586 |
%matplotlib inline
# Distribution of the 51 season-level error proportions.
GROUPED_DF["prop_error"].hist();
# Number of seasons in the aggregate (1968-2018 inclusive).
len(GROUPED_DF)
51
# First few seasons of the per-year totals.
GROUPED_DF.head()
errors | bip | prop_error | |
---|---|---|---|
year | |||
1968 | 1534 | 88471 | 0.017339 |
1969 | 1862 | 108814 | 0.017112 |
1970 | 1845 | 109765 | 0.016809 |
1971 | 1635 | 109018 | 0.014998 |
1972 | 1640 | 103990 | 0.015771 |
# Full per-season table: raw error/BIP totals and the empirical proportion.
GROUPED_DF
errors | bip | prop_error | |
---|---|---|---|
year | |||
1968 | 1534 | 88471 | 0.017339 |
1969 | 1862 | 108814 | 0.017112 |
1970 | 1845 | 109765 | 0.016809 |
1971 | 1635 | 109018 | 0.014998 |
1972 | 1640 | 103990 | 0.015771 |
1973 | 1874 | 111876 | 0.016751 |
1974 | 1850 | 112762 | 0.016406 |
1975 | 1973 | 112192 | 0.017586 |
1976 | 1790 | 112780 | 0.015872 |
1977 | 1946 | 122253 | 0.015918 |
1978 | 1852 | 121509 | 0.015242 |
1979 | 1819 | 122757 | 0.014818 |
1980 | 1904 | 123948 | 0.015361 |
1981 | 1244 | 81229 | 0.015315 |
1982 | 1883 | 122928 | 0.015318 |
1983 | 1977 | 121822 | 0.016229 |
1984 | 2036 | 121329 | 0.016781 |
1985 | 1918 | 120624 | 0.015901 |
1986 | 1877 | 118400 | 0.015853 |
1987 | 1825 | 118995 | 0.015337 |
1988 | 1723 | 119094 | 0.014468 |
1989 | 1775 | 118606 | 0.014966 |
1990 | 1728 | 118395 | 0.014595 |
1991 | 1616 | 117644 | 0.013736 |
1992 | 1512 | 118414 | 0.012769 |
1993 | 1868 | 127970 | 0.014597 |
1994 | 1284 | 90015 | 0.014264 |
1995 | 1511 | 112511 | 0.013430 |
1996 | 1817 | 126822 | 0.014327 |
1997 | 1675 | 124864 | 0.013415 |
1998 | 1826 | 134327 | 0.013594 |
1999 | 1810 | 134835 | 0.013424 |
2000 | 1972 | 135934 | 0.014507 |
2001 | 1829 | 133826 | 0.013667 |
2002 | 1772 | 134188 | 0.013205 |
2003 | 1699 | 135936 | 0.012499 |
2004 | 1719 | 135523 | 0.012684 |
2005 | 1638 | 135691 | 0.012072 |
2006 | 1638 | 135686 | 0.012072 |
2007 | 1607 | 135594 | 0.011852 |
2008 | 1600 | 133830 | 0.011955 |
2009 | 1547 | 132258 | 0.011697 |
2010 | 1606 | 131047 | 0.012255 |
2011 | 1637 | 131217 | 0.012476 |
2012 | 1592 | 128825 | 0.012358 |
2013 | 1413 | 129360 | 0.010923 |
2014 | 1460 | 128173 | 0.011391 |
2015 | 1511 | 128041 | 0.011801 |
2016 | 1501 | 126594 | 0.011857 |
2017 | 1481 | 125463 | 0.011804 |
2018 | 1459 | 124225 | 0.011745 |
%matplotlib inline
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import theano.tensor as tt
# 51 seasons x (errors, bip, prop_error).
GROUPED_DF.shape
(51, 3)
with pm.Model() as bb_model:
    # Hyperpriors: phi is the population-level error rate, kappa the
    # concentration of the beta prior (sampled on the log scale for stability).
    phi = pm.Uniform('phi', lower=0.0, upper=1.0)
    kappa_log = pm.Exponential('kappa_log', lam=1.5)
    kappa = pm.Deterministic('kappa', tt.exp(kappa_log))

    # One latent error rate per season, partially pooled through (phi, kappa).
    rates = pm.Beta('rates',
                    alpha=phi * kappa,
                    beta=(1.0 - phi) * kappa,
                    shape=len(GROUPED_DF))

    # Likelihood: observed errors out of balls-in-play, binomial per season.
    trials = GROUPED_DF["bip"].values
    successes = GROUPED_DF["errors"].values
    obs = pm.Binomial('observed_values', trials, rates,
                      observed=successes)

    trace = pm.sample(2000, tune=1000, chains=2, cores=2,
                      nuts_kwargs={'target_accept': .95})
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [rates, kappa_log, phi] Sampling 2 chains: 100%|██████████| 6000/6000 [00:30<00:00, 194.19draws/s]
# Trace plots for each latent quantity, one figure per variable.
for var_name in ['kappa_log', 'kappa', 'phi', 'rates']:
    pm.traceplot(trace, varnames=[var_name]);
# Sampler diagnostics: BFMI (energy-based fraction of missing information)
# and the worst Gelman-Rubin statistic across all variables (values near 1
# indicate the chains have converged).
# NOTE(review): pm.gelman_rubin / pm.energyplot are PyMC3 3.x names; later
# releases renamed these (e.g. rhat, plot_energy) -- confirm installed version.
bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)));
# Visual BFMI check: compare the marginal energy distribution against the
# distribution of energy transitions; similar histograms suggest the sampler
# explores the energy distribution adequately.
energy = trace['energy']
energy_diff = np.diff(energy)
centered_energy = energy - energy.mean()
plt.hist(centered_energy, label='energy', alpha=0.5)
plt.hist(energy_diff, label='energy diff', alpha=0.5)
plt.legend();
# Re-check the grouped table before summarising the posterior.
GROUPED_DF.head()
errors | bip | prop_error | |
---|---|---|---|
year | |||
1968 | 1534 | 88471 | 0.017339 |
1969 | 1862 | 108814 | 0.017112 |
1970 | 1845 | 109765 | 0.016809 |
1971 | 1635 | 109018 | 0.014998 |
1972 | 1640 | 103990 | 0.015771 |
# Posterior summaries of the per-season rates, discarding the first 1000
# retained draws as extra burn-in.
rate_means = trace['rates', 1000:].mean(axis=0)
rate_se = trace['rates', 1000:].std(axis=0)

# (posterior mean, posterior sd, year) triples, then ordered by the mean.
mean_se = [(m, s, yr) for yr, m, s in zip(GROUPED_DF.index, rate_means, rate_se)]
sorted_means_se = sorted(mean_se, key=lambda triple: triple[0])
sorted_means = [triple[0] for triple in sorted_means_se]
sorted_se = [triple[1] for triple in sorted_means_se]

# Rates sorted by magnitude, with +/- 1 posterior-sd bars drawn manually.
x = np.arange(len(sorted_means))
plt.plot(x, sorted_means, 'o');
for pos, m, s in zip(x, sorted_means, sorted_se):
    plt.plot([pos, pos], [m - s, m + s], 'b-')

# Same rates in chronological order.
plt.plot(GROUPED_DF.index, rate_means, 'o');
for yr, m, s in zip(GROUPED_DF.index, rate_means, rate_se):
    plt.plot([yr, yr], [m - s, m + s], 'b-')
# Five lowest posterior error rates as (mean, sd, year) triples.
sorted_means_se[:5]
[(0.011011680496334737, 0.00028428180520349855, 2013), (0.011479085002037233, 0.0002934659490447185, 2014), (0.011769051057939574, 0.00029153206076100083, 2009), (0.011817645482632833, 0.000298330035691104, 2018), (0.011862491079040331, 0.0003085671391091956, 2015)]
# Five highest posterior error rates as (mean, sd, year) triples.
sorted_means_se[-5:]
[(0.016694408477005047, 0.00038003314701982334, 1984), (0.016724097724823316, 0.0003985186986550183, 1970), (0.01701073549632819, 0.00040345940085901254, 1969), (0.017198401082438704, 0.000456887853448958, 1968), (0.017469095683654505, 0.00037956537603057024, 1975)]
# Raw totals for the five seasons with the highest posterior error rates.
GROUPED_DF.loc[[year for _, _, year in sorted_means_se[-5:]], :]
errors | bip | prop_error | |
---|---|---|---|
year | |||
1984 | 2036 | 121329 | 0.016781 |
1970 | 1845 | 109765 | 0.016809 |
1969 | 1862 | 108814 | 0.017112 |
1968 | 1534 | 88471 | 0.017339 |
1975 | 1973 | 112192 | 0.017586 |
# Raw totals for the five seasons with the lowest posterior error rates.
GROUPED_DF.loc[[year for _, _, year in sorted_means_se[:5]], :]
errors | bip | prop_error | |
---|---|---|---|
year | |||
2013 | 1413 | 129360 | 0.010923 |
2014 | 1460 | 128173 | 0.011391 |
2009 | 1547 | 132258 | 0.011697 |
2018 | 1459 | 124225 | 0.011745 |
2015 | 1511 | 128041 | 0.011801 |
# Forest plot of the per-season rate posteriors (credible intervals per year).
pm.plots.forestplot(trace, varnames=["rates"])
GridSpec(1, 2, width_ratios=[3, 1])