Repeating the calculations explained here in Python using pybaseball.
%matplotlib inline
import os

import numpy as np
import pandas as pd
import pybaseball
# the test season from Baseball Reference/Total Baseball
SEASON = 1982
TEAM = "ATL"
# the 1982 Phillies
#SEASON = 1982
#TEAM = "PHI"
# the 2018 Phillies
#SEASON = 2018
#TEAM = "PHI"
# local CSV cache for this team/season's schedule-and-record table
DATA_PATH = f'./data/{TEAM}_{SEASON}.csv'
# the Total Baseball reference values below are only checked for the 1982 ATL case
VERIFY = (SEASON == 1982) and (TEAM == "ATL")
# don't download the Lahman database every time the notebook is rerun (if possible)
# see https://github.com/jldbc/pybaseball/issues/29
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    df = pybaseball.schedule_and_record(season=SEASON, team=TEAM)
    # to_csv does not create missing parent directories, so make sure the
    # cache directory exists before writing on a first run.
    cache_dir = os.path.dirname(DATA_PATH)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    df.to_csv(DATA_PATH, index=False)
# lower-case the column names on both the cached and the freshly downloaded path
df = df.rename(columns=str.lower)
df.head()
date | tm | home_away | opp | w/l | r | ra | inn | w-l | rank | gb | win | loss | save | time | d/n | attendance | streak | orig. scheduled | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Tuesday, Apr 6 | ATL | @ | SDP | W | 1.0 | 0.0 | 9.0 | 1-0 | 1.0 | Tied | Mahler | Eichelberger | None | 2:30 | N | 30188.0 | 1 | NaN |
1 | Wednesday, Apr 7 | ATL | @ | SDP | W | 6.0 | 4.0 | 9.0 | 2-0 | 1.0 | Tied | Walk | Montefusco | Garber | 2:40 | D | 16684.0 | 2 | NaN |
2 | Friday, Apr 9 | ATL | Home | HOU | W | 6.0 | 2.0 | 9.0 | 3-0 | 1.0 | up 1.0 | Boggs | Sutton | Hrabosky | 2:43 | N | 33133.0 | 3 | NaN |
3 | Saturday, Apr 10 | ATL | Home | HOU | W | 8.0 | 6.0 | 9.0 | 4-0 | 1.0 | up 1.0 | McWilliams | Ruhle | Camp | 2:51 | N | 10885.0 | 4 | NaN |
4 | Sunday, Apr 11 | ATL | Home | HOU | W | 5.0 | 0.0 | 9.0 | 5-0 | 1.0 | up 1.5 | Mahler | Ryan | None | 2:14 | D | 11322.0 | 5 | NaN |
df['home_away'].value_counts()
Home 81 @ 81 Name: home_away, dtype: int64
df['w/l'].value_counts()
W 78 L 64 W-wo 11 L-wo 9 Name: w/l, dtype: int64
# flag home games and wins (walk-off results such as "W-wo" also start with "W")
df['is_home'] = df['home_away'] == "Home"
df['is_win'] = df['w/l'].str.startswith("W")
# Select the aggregated columns with a list: indexing a GroupBy with a bare
# sequence of names (['is_win', 'r', 'ra']) is an implicit tuple key, which
# pandas deprecated and pandas >= 2.0 rejects.
home_away_df = (
    df.groupby('is_home')[['is_win', 'r', 'ra']]
    .sum()
    .rename(columns={'is_win': 'wins'})
)
# total runs (scored + allowed) in each split
home_away_df['rtot'] = home_away_df[['r', 'ra']].sum(axis=1)
# games played per split; value_counts() aligns on the is_home index
home_away_df['gp'] = df['is_home'].value_counts()
home_away_df
wins | r | ra | rtot | gp | |
---|---|---|---|---|---|
is_home | |||||
False | 47.0 | 351.0 | 315.0 | 666.0 | 81 |
True | 42.0 | 388.0 | 387.0 | 775.0 | 81 |
home_away_pct = home_away_df['wins'] / home_away_df['gp']
home_away_pct
is_home False 0.580247 True 0.518519 dtype: float64
# ratio of (18.5 - home winning pct) to (18.5 - road losing pct)
# NOTE(review): the 18.5 constant is used as-is; confirm its derivation against the reference
IPC = (18.5 - home_away_pct[True]) / (18.5 - (1 - home_away_pct[False]))
def verify_agreement_with_total_baseball(val, true_val, name, atol=1e-3):
    """Check a computed value against the published Total Baseball figure.

    Args:
        val: the value computed in this notebook.
        true_val: the reference value to compare against.
        name: label for the quantity, used in the printed/raised message.
        atol: absolute tolerance passed to ``np.allclose`` (numpy's default
            relative tolerance still applies on top of it).

    Raises:
        AssertionError: if ``val`` is not close to ``true_val``.
    """
    # Raise explicitly rather than via a bare `assert`: the check then survives
    # `python -O`, and the message says which quantity diverged and by how much.
    if not np.allclose(val, true_val, atol=atol):
        raise AssertionError(
            f"{name} = {val!r} does not agree with Total Baseball "
            f"(expected {true_val!r}, atol={atol})"
        )
    print(f"{name} agrees with Total Baseball")
if VERIFY:
verify_agreement_with_total_baseball(IPC, 0.995, "IPC")
IPC agrees with Total Baseball
rpg = home_away_df['rtot'] / home_away_df['gp']
rpg
is_home False 8.222222 True 9.567901 dtype: float64
RFT = (rpg[True] / rpg[False]) / IPC
if VERIFY:
verify_agreement_with_total_baseball(RFT, 1.170, "RFT")
RFT agrees with Total Baseball
# NT: distinct opponents appearing in the schedule, plus one for this team
# NOTE(review): in a season with interleague play (e.g. the commented-out 2018
# option) 'opp' would presumably include other-league teams too — confirm NT
# is still the intended count there
NT = df['opp'].nunique() + 1
OPC = NT / (NT - 1 + RFT)
if VERIFY:
    verify_agreement_with_total_baseball(OPC, 0.986, "OPC")
OPC agrees with Total Baseball
SF = RFT * OPC
if VERIFY:
verify_agreement_with_total_baseball(SF, 1.154, "SF")
SF agrees with Total Baseball
SF1 = 1 - (SF - 1) / (NT - 1)
if VERIFY:
verify_agreement_with_total_baseball(SF1, 0.986, "SF1")
SF1 agrees with Total Baseball
home_away_df.head()
wins | r | ra | rtot | gp | |
---|---|---|---|---|---|
is_home | |||||
False | 47.0 | 351.0 | 315.0 | 666.0 | 81 |
True | 42.0 | 388.0 | 387.0 | 775.0 | 81 |
# NOTE(review): hard-coded constant that does not depend on SEASON/TEAM;
# presumably it only applies to the 1982 verification case — confirm and
# update it when switching to another season
RAL = 7947 / 972
# unpack the season totals in home_away_df row order (is_home False, then True);
# iterate_team_ratings below recomputes its own per-game versions of these names
RAT, RHT = home_away_df['r']
OAT, OHT = home_away_df['ra']
def iterate_team_ratings(TPR, RAL, NT, home_away_df):
    """Perform one update step of the (TBR, TPR) rating pair.

    Reads the module-level ``SF`` and ``SF1`` values in addition to its
    arguments.

    Args:
        TPR: current TPR estimate fed into the TBR update.
        RAL: league constant that both ratings are divided by.
        NT: team count; ``NT - 1`` scales the cross-adjustment term.
        home_away_df: frame with 'r', 'ra' and 'gp' columns whose two rows are
            unpacked in index order.

    Returns:
        Tuple ``(TBR, TPR_)`` where ``TPR_`` is computed from the new ``TBR``.
    """
    # the positional unpacking below relies on the rows being in ascending index order
    assert home_away_df.index.is_monotonic_increasing
    runs_first, runs_second = home_away_df['r'] / home_away_df['gp']
    allowed_first, allowed_second = home_away_df['ra'] / home_away_df['gp']

    # first row is scaled by SF1, second row by SF
    scaled_runs = runs_first / SF1 + runs_second / SF
    new_tbr = scaled_runs * (1 + (TPR - 1) / (NT - 1)) / RAL

    scaled_allowed = allowed_first / SF1 + allowed_second / SF
    new_tpr = scaled_allowed * (1 + (new_tbr - 1) / (NT - 1)) / RAL
    return new_tbr, new_tpr
def calculate_team_ratings(RAL, NT, home_away_df, TPR=1, iter=4):
    """Run ``iterate_team_ratings`` repeatedly and return the final pair.

    Args:
        RAL, NT, home_away_df: passed through unchanged to
            ``iterate_team_ratings`` on every pass.
        TPR: starting TPR value for the first pass.
        iter: number of passes to run; must be at least 1. (The name shadows
            the builtin but is kept because callers pass it by keyword.)

    Returns:
        Tuple ``(TBR, TPR)`` from the last pass.

    Raises:
        ValueError: if ``iter`` is less than 1.
    """
    # With zero passes TBR would never be assigned and the return statement
    # would fail with an UnboundLocalError; reject that case up front.
    if iter < 1:
        raise ValueError(f"iter must be at least 1, got {iter}")
    for _ in range(iter):
        TBR, TPR = iterate_team_ratings(TPR, RAL, NT, home_away_df)
    return TBR, TPR
TBR_1iter, TPR_1iter = calculate_team_ratings(RAL, NT, home_away_df, iter=1)
if VERIFY:
verify_agreement_with_total_baseball(TBR_1iter, 1.044, "TBR", atol=1e-2)
verify_agreement_with_total_baseball(TPR_1iter, 0.993, "TPR", atol=1e-2)
TBR agrees with Total Baseball TPR agrees with Total Baseball
# final ratings from the default number of passes (iter=4)
TBR, TPR = calculate_team_ratings(RAL, NT, home_away_df)
# BPF is adjusted with TPR and PPF with TBR (the cross-over is as written)
BPF = (SF + SF1) / (2 * (1 + (TPR - 1) / (NT - 1)))
PPF = (SF + SF1) / (2 * (1 + (TBR - 1) / (NT - 1)))
# only BPF is compared against a reference value here
if VERIFY:
    verify_agreement_with_total_baseball(BPF, 1.07, "BPF")
BPF agrees with Total Baseball