In [1]:
from __future__ import division, print_function

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
In [2]:
def scoring_team(s):
    """
    Identifies the team with the next score. If no more scoring,
    returns NaN."""
    if s['PTS'] > 0:
        return s['OFF']
    elif s['PTS'] < 0:
        return s['DEF']
    else:
        return np.nan
    
def score_adjust(s):
    """
    Flips sign of next score if ball with team is not team to score next.
    Returns 0 if the half or game ends without any more scoring."""
    if (s['next_res'] == 'ENDQ') & ((s['QTR'] == 2) | (s['QTR'] == 4)):
        return 0
    if (s['next_score'] > 0) & (s['next_score_team'] != s['OFF']):
        return -1 * s['next_score']
    else:
        return s['next_score']
In [3]:
# Read data in (from Armchair Analysis)
games = pd.read_csv('GAMES.csv', index_col='GID')
core = pd.read_csv('CORE.csv', index_col='PID')
fgxp = pd.read_csv('FGXP.csv', index_col='PID')
scoring = pd.read_csv('SCORING.csv', index_col='PID')
drives = pd.read_csv('DRIVES.csv', index_col='FPID')

# Restrict to the regular season

regular_season = games.index[games.WEEK <= 17]
core = core[core.GID.isin(regular_season)]

core = pd.merge(core, games[['SEAS']], left_on='GID', right_index=True)

# Remove extra point plays and non-plays
df = core.join(scoring, how='left')
df = df.join(fgxp['FGXP'], how='left')
df = df[df.FGXP != 'XP']
df = df[df.DWN > 0] # Eliminate kickoffs, only interested in where play starts
df = df.drop(['FGXP', 'TIMO', 'TIMD', 'ZONE', 'OLID', 'LEN'], axis=1)

df = df.join(drives['RES'], how='left') # To figure out if negative or positive points needed
df['scoring_team'] = df.apply(lambda x: scoring_team(x), axis=1)
In [4]:
# Group data by game (so that 'next score' can't occur in another game)

df_g = df.groupby('GID')
next_score = pd.Series(df_g.PTS.bfill())
next_res = pd.Series(df_g.RES.ffill())
next_score_team = pd.Series(df_g.scoring_team.bfill())
df['next_score'] = next_score
df['next_res'] = next_res
df['next_score_team'] = next_score_team
In [5]:
# Adjust the scoring to have the correct sign and value

df['next_score_adj'] = df.apply(lambda x: score_adjust(x), axis=1)
df['next_score_adj'] = df.next_score_adj.fillna(0)
In [6]:
df_small = df[['SEAS', 'DWN', 'YTG', 'YFOG', 'next_score_adj']]
df_g_seas = df_small.groupby(['SEAS', 'DWN', 'YTG', 'YFOG'])
df_g = df_small.groupby(['DWN', 'YTG', 'YFOG'])
seas_ep = df_g_seas.mean()
ep = df_g.mean()
In [7]:
fifty = seas_ep.query('DWN == 1 & YTG == 1 & YFOG == 99')
fifty.head()
Out[7]:
next_score_adj
SEAS DWN YTG YFOG
2000 1 1 99 6.042254
2001 1 1 99 6.180451
2002 1 1 99 5.756522
2003 1 1 99 6.158273
2004 1 1 99 6.030534
In [8]:
first = df_small.query('DWN == 1').groupby(['YFOG']).mean()
first = first.reset_index()
In [9]:
# Remove large point differentials and garbage time
df['score_diff'] = df.PTSO - df.PTSD
df_close = df[(abs(df.score_diff) <= 10) & ((df.QTR == 1) | (df.QTR == 3))]

df_small_close = df_close[['SEAS', 'DWN', 'YTG', 'YFOG', 'next_score_adj']]
df_g_seas_close = df_small_close.groupby(['SEAS', 'DWN', 'YTG', 'YFOG'])
df_g_close = df_small_close.groupby(['DWN', 'YTG', 'YFOG'])
seas_ep_close = df_g_seas_close.mean()
ep_close = df_g_close.mean()

first_close = df_small_close.query('DWN == 1').groupby(['YFOG']).mean()
first_close = first_close.reset_index()
In [10]:
fig, ax = plt.subplots()
ax.set(xlim=[1, 99], ylim=[-2, 7])
ax.set_xlabel('Yards from own goal', fontsize=16)
ax.set_ylabel('Expected points', fontsize=16)
ax.set_title('Expected points, first down, all plays included', fontsize=16)
ax.plot(first.YFOG.values, first.next_score_adj.values, 'b', lw=0.5, label='All data')
ax.plot(first_close.YFOG.values, first_close.next_score_adj.values, 'r', lw=0.5, label='Trimmed data')
ax.vlines(x=50, ymin=-2, ymax=7, lw=0.25)
ax.set_xticks(range(0, 101, 10))
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.legend(loc='best', shadow=False, frameon=False)
ax.grid()
fig.set_size_inches(16, 9)
fig.savefig('first_downs.png')
In [11]:
def bootstrap(s, n_resamples=1000):
    """
    Returns a bootstrapped 95% confidence interval for the mean."""
    means = np.empty(n_resamples)
    for i in range(n_resamples):
        boot_sample = np.random.choice(s.values, 
                                       s.shape[0])
        means[i] = np.mean(boot_sample)
    sorted_means = np.sort(means)
    return np.percentile(sorted_means, [2.5, 97.5])
In [12]:
first_ci = df_small.query('DWN == 1')
first_ci.head()

cis = dict()

for yfog in first_ci.YFOG.unique():
    cis[yfog] = bootstrap(first_ci[first_ci.YFOG == yfog].next_score_adj)
In [13]:
ci_df = pd.DataFrame.from_dict(cis, orient='index')
ci_df.columns = ['2.5%', '97.5%']
ci_df.tail()
Out[13]:
2.5% 97.5%
95 5.247813 5.575515
96 5.364489 5.689685
97 5.428547 5.783730
98 5.706743 6.031955
99 5.971299 6.204406
In [14]:
fig, ax = plt.subplots()
ax.set(xlim=[1, 99], ylim=[-2, 7])
ax.set_xlabel('Yards from own goal', fontsize=16)
ax.set_ylabel('Expected points', fontsize=16)
ax.set_title('Expected points, first down, all plays included', fontsize=16)
ax.plot(first.YFOG.values, first.next_score_adj.values, 'b', lw=0.5, label='All data')
ax.plot(first_close.YFOG.values, first_close.next_score_adj.values, 'r', lw=0.5, label='Trimmed data')
ax.fill_between(ci_df.index.values, ci_df['2.5%'].values, ci_df['97.5%'].values,
                alpha=0.1)
ax.vlines(x=50, ymin=-2, ymax=7, lw=0.25)
ax.set_xticks(range(0, 101, 10))
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.legend(loc='best', shadow=False, frameon=False)
ax.grid()
fig.set_size_inches(16, 9)
fig.savefig('first_downs_ci.png')