from IPython.display import HTML
HTML('<style>div.text_cell_render{font-size:130%;}</style>')
%load_ext version_information
%version_information pandas
Software | Version |
---|---|
Python | 3.6.8 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] |
IPython | 7.2.0 |
OS | Darwin 18.7.0 x86_64 i386 64bit |
pandas | 0.23.4 |
Thu Jan 14 23:50:49 2021 EST |
%load_ext watermark
%watermark -a "Alex Galea" -v -p matplotlib,seaborn,pandas
Alex Galea CPython 3.6.8 IPython 7.2.0 matplotlib 3.0.2 seaborn 0.9.0 pandas 0.23.4
import pandas as pd
import numpy as np
import os
import re
import datetime
import time
import glob
from tqdm import tqdm_notebook
from colorama import Fore, Style
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='retina'
sns.set() # Revert to matplotlib defaults
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['legend.fancybox'] = True
plt.style.use('fivethirtyeight')
plt.rcParams['font.family'] = 'Marion'
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 14, 16, 20
plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=SMALL_SIZE)
plt.rc('axes', labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=SMALL_SIZE)
plt.rc('ytick', labelsize=SMALL_SIZE)
plt.rc('legend', fontsize=MEDIUM_SIZE)
plt.rc('axes', titlesize=BIGGER_SIZE)
plt.rcParams['grid.alpha'] = 0.2
plt.rcParams['axes.labelpad'] = 10
plt.rcParams['axes.labelpad'] = 20
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['savefig.facecolor'] = 'white'
plt.rcParams['xtick.major.pad'] = 15
plt.rcParams['xtick.minor.pad'] = 15
plt.rcParams['ytick.major.pad'] = 10
plt.rcParams['ytick.minor.pad'] = 10
# %config InlineBackend.figure_format='retina'
# sns.set() # Revert to matplotlib defaults
# plt.rcParams['figure.figsize'] = (12, 8)
# plt.style.use('fivethirtyeight')
# plt.rcParams['grid.alpha'] = 0.2
# plt.rcParams['axes.labelpad'] = 20
# plt.rcParams['ytick.labelsize'] = 14
# plt.rcParams['xtick.labelsize'] = 14
# plt.rcParams['axes.labelsize'] = 16
# plt.rcParams['axes.facecolor'] = 'white'
# plt.rcParams['figure.facecolor'] = 'white'
# plt.rcParams['xtick.major.pad'] = 15
# plt.rcParams['xtick.minor.pad'] = 15
# plt.rcParams['ytick.major.pad'] = 10
# plt.rcParams['ytick.minor.pad'] = 10
def savefig(name):
plt.savefig(f'../../figures/{name}.png', bbox_inches='tight', dpi=300)
# To debug UserWarning: findfont: Font family [‘Marion’] not found.
# import matplotlib.font_manager
# matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
# matplotlib.font_manager._rebuild()
ls ../../data/processed/
csv/ pkl/
ls ../../data/processed/pkl
20032004_goalie_pulls_2019-03-01.pkl 20122013_goalie_pulls_2019-04-25.pkl
20052006_goalie_pulls_2019-03-01.pkl 20132014_goalie_pulls_2019-04-25.pkl
20062007_goalie_pulls_2019-03-01.pkl 20142015_goalie_pulls_2019-04-25.pkl
20072008_goalie_pulls_2019-04-25.pkl 20152016_goalie_pulls_2019-04-25.pkl
20082009_goalie_pulls_2019-04-25.pkl 20162017_goalie_pulls_2019-04-25.pkl
20092010_goalie_pulls_2019-04-25.pkl 20172018_goalie_pulls_2019-04-25.pkl
20102011_goalie_pulls_2019-04-25.pkl 20182019_goalie_pulls_2019-04-25.pkl
20112012_goalie_pulls_2019-04-25.pkl tmp/
files = sorted(glob.glob('../../data/processed/pkl/*'))
files
['../../data/processed/pkl/20032004_goalie_pulls_2019-03-01.pkl', '../../data/processed/pkl/20052006_goalie_pulls_2019-03-01.pkl', '../../data/processed/pkl/20062007_goalie_pulls_2019-03-01.pkl', '../../data/processed/pkl/20072008_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20082009_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20092010_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20102011_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20112012_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20122013_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20132014_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20142015_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20152016_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20162017_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20172018_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20182019_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/tmp']
def load_data():
files = [
'../../data/processed/pkl/20032004_goalie_pulls_2019-03-01.pkl',
'../../data/processed/pkl/20052006_goalie_pulls_2019-03-01.pkl',
'../../data/processed/pkl/20062007_goalie_pulls_2019-03-01.pkl',
'../../data/processed/pkl/20072008_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20082009_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20092010_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20102011_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20112012_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20122013_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20132014_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20142015_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20152016_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20162017_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20172018_goalie_pulls_2019-04-25.pkl',
'../../data/processed/pkl/20182019_goalie_pulls_2019-04-25.pkl',
]
return pd.concat((pd.read_pickle(f) for f in files), sort=False)
def clean_df(df):
_df = df.copy()
len_0 = _df.shape[0]
print('Removing goal_for_time < 15 mins')
_df = _df[~(_df.goal_for_time < datetime.timedelta(seconds=15*60))]
print(f'Removed {len_0 - _df.shape[0]} total rows')
if 'game_end_time' in df.columns:
len_0 = _df.shape[0]
print('Removing game_end_time < 15 mins')
_df = _df[~(_df.game_end_time < datetime.timedelta(seconds=60*15))]
print(f'Removed {len_0 - _df.shape[0]} total rows')
return _df
df = load_data()
df = clean_df(df)
Removing goal_for_time < 15 mins Removed 115 total rows Removing game_end_time < 15 mins Removed 0 total rows
# Label the outcomes
df['label'] = ''
label_masks = {
'goal_for': ~(df.goal_for_time.isnull()),
'goal_against': ~(df.goal_against_time.isnull()),
'no_goals': ~(df.game_end_timedelta.isnull()),
}
for label, mask in label_masks.items():
df.loc[mask, 'label'] = label
df.loc[df.label == '', 'label'] = float('nan')
df.label.isnull().sum()
0
df.head()
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20032004 | 710 | CHI | 2004-01-21 | 3 | 00:19:44 | NaT | 00:19:47 | NaT | 00:00:03 | NaT | NaT | goal_against |
1 | 20032004 | 205 | MIN | 2003-11-08 | 3 | 00:19:58 | NaT | NaT | NaT | NaT | 00:00:02 | NaT | no_goals |
2 | 20032004 | 1230 | S.J | 2004-04-04 | 3 | 00:18:39 | 00:19:40 | NaT | 00:01:01 | NaT | NaT | NaT | goal_for |
3 | 20032004 | 655 | PHX | 2004-01-13 | 3 | 00:19:03 | NaT | 00:19:25 | NaT | 00:00:22 | NaT | NaT | goal_against |
4 | 20032004 | 985 | PHX | 2004-03-02 | 3 | 00:18:34 | 00:18:44 | NaT | 00:00:10 | NaT | NaT | NaT | goal_for |
df.tail()
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
901 | 20182019 | 1265 | WSH | 2019-04-06 | 3 | 00:16:40 | NaT | 00:17:27 | NaT | 00:00:47 | NaT | NaT | goal_against |
902 | 20182019 | 1266 | CHI | 2019-04-06 | 3 | 00:17:39 | NaT | 00:17:51 | NaT | 00:00:12 | NaT | NaT | goal_against |
903 | 20182019 | 1268 | CGY | 2019-04-06 | 3 | 00:19:34 | NaT | NaT | NaT | NaT | 00:00:26 | 00:20:00 | no_goals |
904 | 20182019 | 1270 | VGK | 2019-04-06 | 3 | 00:18:37.500000 | NaT | 00:19:05 | NaT | 00:00:27.500000 | NaT | NaT | goal_against |
905 | 20182019 | 1271 | COL | 2019-04-06 | 3 | 00:17:07 | NaT | 00:19:15 | NaT | 00:02:08 | NaT | NaT | goal_against |
df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
pull_period | 11168 | 3 | 0 | 3 | 3 | 3 | 3 | 3 |
pull_time | 11168 | 0 days 00:18:32.484195 | 0 days 00:00:52.298570 | 0 days 00:07:13 | 0 days 00:18:10 | 0 days 00:18:40 | 0 days 00:19:05.500000 | 0 days 00:19:59.500000 |
goal_for_time | 1496 | 0 days 00:18:58.131684 | 0 days 00:00:50.558413 | 0 days 00:15:10 | 0 days 00:18:35 | 0 days 00:19:07.500000 | 0 days 00:19:35 | 0 days 00:19:59 |
goal_against_time | 3736 | 0 days 00:19:02.648019 | 0 days 00:00:54.536523 | 0 days 00:08:14 | 0 days 00:18:43.750000 | 0 days 00:19:15 | 0 days 00:19:38 | 0 days 00:19:59 |
goal_for_timedelta | 1496 | 0 days 00:00:45.730614 | 0 days 00:00:34.889763 | 0 days 00:00:00 | 0 days 00:00:19.500000 | 0 days 00:00:39 | 0 days 00:01:03 | 0 days 00:04:06 |
goal_against_timedelta | 3736 | 0 days 00:00:48.343549 | 0 days 00:00:35.784661 | 0 days 00:00:00 | 0 days 00:00:21.500000 | 0 days 00:00:40 | 0 days 00:01:06.500000 | 0 days 00:04:51 |
game_end_timedelta | 5937 | 0 days 00:01:11.010948 | 0 days 00:00:41.067886 | 0 days 00:00:00.500000 | 0 days 00:00:42.500000 | 0 days 00:01:07 | 0 days 00:01:32 | 0 days 00:05:22.500000 |
game_end_time | 4632 | 0 days 00:20:00 | 0 days 00:00:00 | 0 days 00:20:00 | 0 days 00:20:00 | 0 days 00:20:00 | 0 days 00:20:00 | 0 days 00:20:00 |
df.dtypes
season object game_number object team_name object date datetime64[ns] pull_period int64 pull_time timedelta64[ns] goal_for_time timedelta64[ns] goal_against_time timedelta64[ns] goal_for_timedelta timedelta64[ns] goal_against_timedelta timedelta64[ns] game_end_timedelta timedelta64[ns] game_end_time timedelta64[ns] label object dtype: object
df.isnull().sum() / df.shape[0]
season 0.000000 game_number 0.000000 team_name 0.000000 date 0.000090 pull_period 0.000000 pull_time 0.000000 goal_for_time 0.866046 goal_against_time 0.665473 goal_for_timedelta 0.866046 goal_against_timedelta 0.665473 game_end_timedelta 0.468392 game_end_time 0.585244 label 0.000000 dtype: float64
df.columns
Index(['season', 'game_number', 'team_name', 'date', 'pull_period', 'pull_time', 'goal_for_time', 'goal_against_time', 'goal_for_timedelta', 'goal_against_timedelta', 'game_end_timedelta', 'game_end_time', 'label'], dtype='object')
df.date.hist(
color='b',
bins=500,
histtype='stepfilled')
ax = plt.gca()
ax.set_yticklabels([])
plt.text(x='2003', y=78,
s='Goalie Pulls by Date',
fontsize=24, color='black', weight='bold')
plt.text(x='2018-5', y=78,
s='alexgalea.ca',
fontsize=14, color='black', style='italic')
savefig('goalie_pulls_2003-2019')
df.groupby('season').size().sort_index(ascending=True).rename('counts').reset_index()
season | counts | |
---|---|---|
0 | 20032004 | 716 |
1 | 20052006 | 702 |
2 | 20062007 | 710 |
3 | 20072008 | 683 |
4 | 20082009 | 735 |
5 | 20092010 | 703 |
6 | 20102011 | 712 |
7 | 20112012 | 733 |
8 | 20122013 | 426 |
9 | 20132014 | 749 |
10 | 20142015 | 792 |
11 | 20152016 | 917 |
12 | 20162017 | 811 |
13 | 20172018 | 879 |
14 | 20182019 | 900 |
fig, ax = plt.subplots()
# Calculate number of pulls per season
s = df.groupby('season').size().sort_index(ascending=True).rename('counts').reset_index()
# Add data for the missing season
s = (s.append({'season': '20042005', 'counts': 0}, ignore_index=True)
.sort_values('season', ascending=True).reset_index(drop=True))
s.plot(marker='o', lw=0, ax=ax, color='b', ms=10)
ax.set_ylim(600, 1000)
plt.legend([])
plt.ylabel('Total Goalie Pulls')
# Assign tick names
label_map = {str(i): s for i, s in enumerate(s.season.tolist())}
fig.canvas.draw()
labels = [lab.get_text() for lab in ax.get_xticklabels()]
season_label = lambda x: '{}/{}'.format(x[:4], x[4:])
ax.set_xticklabels(
[season_label(label_map.get(lab, ''))
for lab in labels]
)
plt.text(x=-0.7, y=1032,
s='Goalie Pulls by Season',
fontsize=24, color='black', weight='bold')
plt.text(x=-0.7, y=1015,
s='Goalie pulls have been trending up in the last 10 years.',
fontsize=14, color='black', style='italic')
plt.text(x=14.1, y=1015,
s='alexgalea.ca',
fontsize=14, color='black', style='italic')
savefig('goalie_pulls_by_season')
Plot goalie pulls per season
games_per_season = {}
for folder in sorted(glob.glob('../../data/raw/html/*')):
files = glob.glob(os.path.join(folder, '*.html'))
print(folder, len(files))
games_per_season[os.path.split(folder)[-1]] = len(files)
../../data/raw/html/20032004 1230 ../../data/raw/html/20042005 0 ../../data/raw/html/20052006 1228 ../../data/raw/html/20062007 1230 ../../data/raw/html/20072008 1230 ../../data/raw/html/20082009 1230 ../../data/raw/html/20092010 1230 ../../data/raw/html/20102011 1230 ../../data/raw/html/20112012 1230 ../../data/raw/html/20122013 720 ../../data/raw/html/20132014 1230 ../../data/raw/html/20142015 1230 ../../data/raw/html/20152016 1230 ../../data/raw/html/20162017 1230 ../../data/raw/html/20172018 1271 ../../data/raw/html/20182019 1271
s['counts_per_game'] = s.season.map(games_per_season)
fig, ax = plt.subplots()
# Calculate number of pulls per season
s = df.groupby('season').size().sort_index(ascending=True).rename('counts').reset_index()
# Add data for the missing season
s = (s.append({'season': '20042005', 'counts': 0}, ignore_index=True)
.sort_values('season', ascending=True).reset_index(drop=True))
# Convert to counts per game
s['games'] = s['season'].apply(lambda x: games_per_season.get(x, 0))
s['counts'] = (s['counts'] / s['games']).fillna(0)
s.plot(marker='o', lw=0, ax=ax, color='b', ms=10)
ax.set_ylim(0.5, 0.8)
plt.legend([])
plt.ylabel('Average Goalie Pulls Per Game')
# Assign tick names
label_map = {str(i): s for i, s in enumerate(s.season.tolist())}
fig.canvas.draw()
labels = [lab.get_text() for lab in ax.get_xticklabels()]
season_label = lambda x: '{}/{}'.format(x[:4], x[4:])
ax.set_xticklabels(
[season_label(label_map.get(lab, ''))
for lab in labels]
)
plt.text(x=-0.05, y=0.823,
s='Goalie Pulls by Season',
fontsize=24, color='black', weight='bold')
plt.text(x=-0.05, y=0.81,
s='Goalie pulls have been trending up in the last 10 years.',
fontsize=14, color='black', style='italic')
plt.text(x=11.1, y=0.81,
s='alexgalea.ca | Source: NHL.com',
fontsize=14, color='black', style='italic')
savefig('goalie_pulls_per_game_by_season')
fig, ax = plt.subplots()
iterables = zip(['green', 'orange', 'red'],
['goal_for', 'no_goals', 'goal_against'])
legend_map = {
'no_goals': 'No Goals',
'goal_against': 'Goal Against',
'goal_for': 'Goal For'
}
axes = []
for c, label in iterables:
m = df.label==label
# Calculate the counts
s = (df[m].groupby('season').size()
.sort_index(ascending=True)
.rename(legend_map[label]).reset_index())
# Add data for the missing season
s = (s.append({'season': '20042005', legend_map[label]: -999}, ignore_index=True)
.sort_values('season', ascending=True).reset_index(drop=True))
s.loc[s.season == '20122013', legend_map[label]] = -999
s.plot(marker='o', lw=0, ax=ax, ms=10, color=c, label=legend_map[label])
plt.legend()
# ax.set_xticklabels(s.season.tolist());
ax.set_ylim(0, 550)
plt.ylabel('Total Counts')
# Assign tick names
label_map = {str(i): s for i, s in enumerate(s.season.tolist())}
fig.canvas.draw()
labels = [lab.get_text() for lab in ax.get_xticklabels()]
season_label = lambda x: '{}/{}'.format(x[:4], x[4:])
ax.set_xticklabels(
[season_label(label_map.get(lab, ''))
for lab in labels]
)
plt.text(x=-0.7, y=608,
s='Goalie Pull Outcomes',
fontsize=24, color='black', weight='bold')
plt.text(x=-0.7, y=567,
s='Good outcomes (green dots) have been on the rise, but the\nbad outcomes (red dots) are rising much faster, doubling in the last 10 years.',
fontsize=14, color='black', style='italic')
plt.text(x=14.1, y=566,
s='alexgalea.ca',
fontsize=14, color='black', style='italic')
savefig('goalie_pull_outcomes_by_season')
Plot average pull time by season
df['pull_time_remaining'] = (
df['pull_time']
.apply(lambda x: datetime.timedelta(seconds=60*20) - x)
.astype('timedelta64[s]')
) / 60
sns.boxplot(x='season', y='pull_time_remaining', data=df, color='b')
plt.ylabel('Time Remaining when Goalie Pulled (minutes)')
plt.xlabel('Season')
plt.xticks(rotation=45)
plt.ylim(-0.1, 4)
plt.text(x=-0.5, y=4.29,
s='Goalie Pull Times',
fontsize=24, color='black', weight='bold')
plt.text(x=-0.5, y=4.13,
s='Goalies are being pulled earlier in the 3rd period in recent years.',
fontsize=14, color='black', style='italic')
plt.text(x=13.1, y=4.13,
s='alexgalea.ca',
fontsize=14, color='black', style='italic')
savefig('goalie_pull_times_by_season')
df.groupby('season').pull_time_remaining.mean()
season 20032004 1.191504 20052006 1.210992 20062007 1.208146 20072008 1.200683 20082009 1.252222 20092010 1.231840 20102011 1.287406 20112012 1.321487 20122013 1.269014 20132014 1.403227 20142015 1.699053 20152016 1.839477 20162017 1.627312 20172018 1.753963 20182019 1.912778 Name: pull_time_remaining, dtype: float64
col = 'pull_time'
(df[col].astype('timedelta64[s]') / 60)\
.plot.hist(bins=100,
color='b',
histtype='stepfilled')
plt.xlabel('Time elapsed in 3rd period (minutes)')
ax = plt.gca()
ax.set_yticklabels([])
plt.xlim(14, 20)
plt.text(x=14, y=1043,
s='Goalie Pull Time Distribution',
fontsize=24, color='black', weight='bold')
plt.text(x=14, y=976,
s='Goalie pulls start gradually ramping up\nafter the 16 minute mark in the 3rd period.',
fontsize=14, color='black', style='italic')
plt.text(x=19.4, y=971,
s='alexgalea.ca',
fontsize=14, color='black', style='italic')
savefig('goalie_pull_game_times_hist')
We're interested in knowing about the outcome, given the pull time. This way we can look at the odds of scoring as a function of game time elapsed.
df.head()
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | label | pull_time_remaining | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20032004 | 710 | CHI | 2004-01-21 | 3 | 00:19:44 | NaT | 00:19:47 | NaT | 00:00:03 | NaT | NaT | goal_against | 0.266667 |
1 | 20032004 | 205 | MIN | 2003-11-08 | 3 | 00:19:58 | NaT | NaT | NaT | NaT | 00:00:02 | NaT | no_goals | 0.033333 |
2 | 20032004 | 1230 | S.J | 2004-04-04 | 3 | 00:18:39 | 00:19:40 | NaT | 00:01:01 | NaT | NaT | NaT | goal_for | 1.350000 |
3 | 20032004 | 655 | PHX | 2004-01-13 | 3 | 00:19:03 | NaT | 00:19:25 | NaT | 00:00:22 | NaT | NaT | goal_against | 0.950000 |
4 | 20032004 | 985 | PHX | 2004-03-02 | 3 | 00:18:34 | 00:18:44 | NaT | 00:00:10 | NaT | NaT | NaT | goal_for | 1.433333 |
# ax = plt.subplot(111)
# ax.set_prop_cycle(color=['red', 'green', 'orange'])
df['pull_time_seconds'] = df['pull_time'].astype('timedelta64[s]') / 60
fig, ax = plt.subplots()
iterables = zip(['green', 'orange', 'red'],
[3, 2, 1],
['goal_for', 'no_goals', 'goal_against'])
legend_map = {
'no_goals': 'No Goals',
'goal_against': 'Goal Against',
'goal_for': 'Goal For'
}
for c, z, label in iterables:
(df[df.label==label]['pull_time_seconds']
.plot.hist(bins=60,
alpha=0.5,
color=c,
histtype='stepfilled',
label=legend_map[label],
zorder=z))
plt.xlabel('Time elapsed in 3rd period (minutes)')
plt.gca().set_yticklabels([])
plt.xlim(14, 20)
plt.legend()
plt.text(x=14, y=609,
s='Goalie Pull Time Distribution',
fontsize=24, color='black', weight='bold')
plt.text(x=14, y=567,
s='Late goalie pulls tend to be no-goal outcomes (yellow bars).\nGoal outcomes (green and red bars) are similarly distributed to each other.',
fontsize=14, color='black', style='italic')
plt.text(x=19.4, y=565,
s='alexgalea.ca',
fontsize=14, color='black', style='italic')
savefig('goalie_pull_outcomes_game_times_hist')
del df['pull_time_seconds']
cols = ['goal_for_time', 'goal_against_time']
(df[cols].astype('timedelta64[s]') / 60)\
.plot.hist(bins=100,
alpha=0.5,
color=['green', 'red'],
histtype='stepfilled')
plt.xlabel('Time elapsed in 3rd period (minutes)')
plt.yticks([])
plt.xlim(14, 20)
savefig('5_on_6_goals')
cols = ['goal_for_time', 'goal_against_time']
legend_map = {
'goal_for_time': 'Goal For Time',
'goal_against_time': 'Goal Against Time',
}
(df[cols]
.rename(columns=legend_map)
.astype('timedelta64[s]') / 60)\
.plot.hist(bins=100,
alpha=0.5,
density='normed',
color=['green', 'red'],
histtype='stepfilled')
plt.xlabel('Time elapsed in 3rd period (minutes)')
plt.ylabel('Frequency (normed)')
plt.gca().set_yticklabels([])
plt.xlim(14, 20)
plt.text(x=14, y=1.108,
s='Goalie Pull Time Distribution',
fontsize=24, color='black', weight='bold')
plt.text(x=14, y=1.035,
s='Following a goalie pull, goals for (green bars)\ntend to occur slightly earlier than goals against (red bars).',
fontsize=14, color='black', style='italic')
plt.text(x=19.4, y=1.035,
s='alexgalea.ca',
fontsize=14, color='black', style='italic')
savefig('5_on_6_goals_normed')
print('Number of goals found:')
(~df[['goal_for_time', 'goal_against_time']].isnull()).sum()
Number of goals found:
goal_for_time 1496 goal_against_time 3736 dtype: int64
print('Total goals found:')
(~df[['goal_for_time', 'goal_against_time']].isnull()).sum().sum()
Total goals found:
5232
We also want to model the time between goalie pull and goal (i.e. the timedelta).
cols = ['game_end_timedelta', 'goal_against_timedelta', 'goal_for_timedelta', ]
(df[cols].astype('timedelta64[s]') / 60)\
.plot.hist(bins=50, alpha=0.5,
color=['blue', 'red','green'],
histtype='stepfilled')
plt.xlabel('Time since goalie pull (minutes)')
plt.yticks([])
# savefig('5_on_6_goalie_pull_outcomes')
([], <a list of 0 Text yticklabel objects>)
cols = ['goal_against_timedelta', 'goal_for_timedelta', ]
(df[cols].astype('timedelta64[s]') / 60)\
.plot.hist(bins=50, alpha=0.5,
color=['red', 'green',],
histtype='stepfilled')
plt.xlabel('Time since goalie pull (minutes)')
plt.yticks([])
# savefig('5_on_6_goalie_pull_goal_timedeltas')
([], <a list of 0 Text yticklabel objects>)
cols = ['goal_against_timedelta', 'goal_for_timedelta', ]
(df[cols].astype('timedelta64[s]') / 60)\
.plot.hist(bins=50, alpha=0.5,
density='normed',
color=['red', 'green'],
histtype='stepfilled')
plt.xlabel('Time since goalie pull (minutes)')
plt.yticks([])
# savefig('5_on_6_goalie_pull_goal_timedeltas_normed')
([], <a list of 0 Text yticklabel objects>)
The mean/median number of seconds until a goal (after pulling the goalie)
(df[cols].astype('timedelta64[s]')).mean()
goal_against_timedelta 48.181745 goal_for_timedelta 45.568182 dtype: float64
(df[cols].astype('timedelta64[s]')).median()
goal_against_timedelta 40.0 goal_for_timedelta 39.0 dtype: float64
(df['game_end_timedelta'].astype('timedelta64[s]') / 60).plot.hist(bins=50, color='b', histtype='stepfilled')
plt.xlabel('Time since goalie pull (minutes)')
plt.yticks([])
# savefig('5_on_6_game_end_timedeltas')
([], <a list of 0 Text yticklabel objects>)
df = load_data()
Any non 3rd period pulls?
df[df['pull_period'] != 3]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | game_end_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta |
---|
Any bugs not in the last 15 minutes?
mask = df.pull_time < datetime.timedelta(seconds=15*60)
df[mask].shape[0]
0
I noticed some goal for timedelats less than 0.. which makes no sense. I'll have to look into that
mask = df.goal_for_time < datetime.timedelta(seconds=15*60)
df[mask]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | game_end_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
576 | 20052006 | 591 | NYI | 2006-01-04 | 3 | 00:19:51 | 00:02:15 | NaT | 00:02:15 | -1 days +23:42:24 | NaT | -1 days +23:42:24 |
This game is weird... http://www.nhl.com/scores/htmlreports/20052006/PL020591.HTM
We'll drop this point before modeling.
df[df.goal_for_timedelta < datetime.timedelta(0)]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | game_end_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
29 | 20032004 | 907 | ATL | 2004-02-21 | 3 | 00:19:14 | 00:18:59 | NaT | 00:20:00 | -1 days +23:59:45 | NaT | 00:00:46 |
320 | 20032004 | 367 | CHI | 2003-12-03 | 3 | 00:18:19 | 00:18:01 | NaT | 00:19:43 | -1 days +23:59:42 | NaT | 00:01:24 |
462 | 20032004 | 754 | PHX | 2004-01-27 | 3 | 00:19:22 | 00:17:52 | NaT | 00:19:54 | -1 days +23:58:30 | NaT | 00:00:32 |
192 | 20062007 | 665 | BOS | 2007-01-13 | 3 | 00:17:23 | 00:17:13 | NaT | 00:19:33 | -1 days +23:59:50 | NaT | 00:02:10 |
df.game_end_time.astype('timedelta64[s]').plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1a262006d8>
Games should end at 20 mins. Let's throw out the early times (this must be overtime or something).
mask = df.game_end_time < datetime.timedelta(seconds=60*20)
mask.sum(), df.shape[0]
(1273, 2172)
mask = df.game_end_time < datetime.timedelta(seconds=60*15)
mask.sum(), df.shape[0]
(44, 2172)
Obviously the game will end at 20:00, this column corresponds to the last row parsed.
2019-04-25
df = load_data()
Any non 3rd period pulls?
df[df['pull_period'] != 3]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 20072008 | 751 | NSH | 2008-01-24 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:44 | 00:00:44 |
3 | 20072008 | 1158 | WSH | 2008-03-27 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:52 | 00:00:52 |
4 | 20072008 | 696 | CBJ | 2008-01-17 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:27 | 00:01:27 |
5 | 20072008 | 946 | STL | 2008-02-24 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:47 | 00:00:47 |
6 | 20072008 | 1070 | VAN | 2008-03-13 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:14 | 00:01:14 |
9 | 20072008 | 570 | DET | 2007-12-29 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:16 | 00:00:16 |
12 | 20072008 | 618 | ANA | 2008-01-05 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:18 | 00:00:18 |
20 | 20072008 | 1084 | EDM | 2008-03-15 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:07 | 00:00:07 |
21 | 20072008 | 14 | STL | 2007-10-04 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:09 | 00:00:09 |
23 | 20072008 | 526 | VAN | 2007-12-22 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:12 | 00:00:12 |
26 | 20072008 | 96 | EDM | 2007-10-18 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:40 | 00:00:40 |
29 | 20072008 | 910 | CGY | 2008-02-19 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:59 | 00:00:59 |
31 | 20072008 | 269 | S.J | 2007-11-15 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:35 | 00:00:35 |
33 | 20072008 | 1056 | ANA | 2008-03-11 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:21 | 00:00:21 |
34 | 20072008 | 872 | DAL | 2008-02-14 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:32 | 00:00:32 |
38 | 20072008 | 311 | PIT | 2007-11-22 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
39 | 20072008 | 33 | BOS | 2007-10-06 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:13 | 00:00:13 |
40 | 20072008 | 645 | BUF | 2008-01-10 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
41 | 20072008 | 1133 | ANA | 2008-03-22 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:26 | 00:01:26 |
42 | 20072008 | 1099 | STL | 2008-03-18 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
43 | 20072008 | 1020 | MTL | 2008-03-06 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:15 | 00:00:15 |
44 | 20072008 | 404 | L.A | 2007-12-05 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:19 | 00:00:19 |
45 | 20072008 | 238 | PIT | 2007-11-10 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:31 | 00:00:31 |
49 | 20072008 | 613 | T.B | 2008-01-05 | 4 | 00:03:17 | NaT | NaT | NaT | NaT | 00:00:00 | 00:03:17 |
50 | 20072008 | 1149 | S.J | 2008-03-25 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:16 | 00:01:16 |
51 | 20072008 | 310 | L.A | 2007-11-21 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:37 | 00:00:37 |
57 | 20072008 | 849 | NSH | 2008-02-10 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:35 | 00:00:35 |
58 | 20072008 | 609 | WSH | 2008-01-05 | 4 | 00:01:27 | NaT | NaT | NaT | NaT | 00:00:00 | 00:01:27 |
65 | 20072008 | 562 | OTT | 2007-12-29 | 2 | 00:10:05 | 00:10:06 | NaT | 00:00:01 | NaT | NaT | NaT |
72 | 20072008 | 1127 | BOS | 2008-03-22 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1657 | 20182019 | 549 | MTL | 2018-12-22 | 4 | 00:01:59 | NaT | NaT | NaT | NaT | 00:00:00 | 00:01:59 |
1664 | 20182019 | 375 | DAL | 2018-11-27 | 4 | 00:02:42 | NaT | NaT | NaT | NaT | 00:00:00 | 00:02:42 |
1668 | 20182019 | 230 | BUF | 2018-11-08 | 2 | 00:05:31.500000 | 00:05:34 | NaT | 00:00:02.500000 | NaT | NaT | NaT |
1669 | 20182019 | 230 | BUF | 2018-11-08 | 4 | 00:01:38 | NaT | NaT | NaT | NaT | 00:00:00 | 00:01:38 |
1676 | 20182019 | 363 | N.J | 2018-11-26 | 4 | 00:01:28 | NaT | NaT | NaT | NaT | 00:00:00 | 00:01:28 |
1680 | 20182019 | 1213 | DAL | 2019-03-30 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1691 | 20182019 | 334 | NYI | 2018-11-23 | 4 | 00:03:40 | NaT | NaT | NaT | NaT | 00:00:00 | 00:03:40 |
1693 | 20182019 | 1244 | NYI | 2019-04-04 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1695 | 20182019 | 206 | PHI | 2018-11-03 | 4 | 00:00:13 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:13 |
1705 | 20182019 | 986 | PHI | 2019-02-28 | 4 | 00:04:27 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:27 |
1710 | 20182019 | 1264 | NYR | 2019-04-06 | 4 | 00:02:09 | NaT | NaT | NaT | NaT | 00:00:00 | 00:02:09 |
1711 | 20182019 | 251 | NYR | 2018-11-10 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1715 | 20182019 | 197 | EDM | 2018-11-03 | 2 | 00:10:00 | 00:01:15 | NaT | -1 days +23:51:15 | NaT | NaT | NaT |
1730 | 20182019 | 928 | BOS | 2019-02-20 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1731 | 20182019 | 181 | PIT | 2018-11-01 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1737 | 20182019 | 1225 | COL | 2019-04-01 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1738 | 20182019 | 210 | DAL | 2018-11-05 | 4 | 00:04:29 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:29 |
1740 | 20182019 | 705 | MTL | 2019-01-14 | 4 | 00:00:15 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:15 |
1742 | 20182019 | 1049 | STL | 2019-03-09 | 4 | 00:03:21 | NaT | NaT | NaT | NaT | 00:00:00 | 00:03:21 |
1746 | 20182019 | 990 | DAL | 2019-02-28 | 4 | 00:03:19 | NaT | NaT | NaT | NaT | 00:00:00 | 00:03:19 |
1752 | 20182019 | 791 | TOR | 2019-02-02 | 2 | 00:18:20 | NaT | NaT | NaT | NaT | 00:01:40 | 00:20:00 |
1753 | 20182019 | 841 | COL | 2019-02-09 | 4 | 00:02:23 | NaT | NaT | NaT | NaT | 00:00:00 | 00:02:23 |
1776 | 20182019 | 816 | VGK | 2019-02-05 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1779 | 20182019 | 953 | PIT | 2019-02-23 | 4 | 00:01:59 | NaT | NaT | NaT | NaT | 00:00:00 | 00:01:59 |
1781 | 20182019 | 400 | WPG | 2018-12-01 | 4 | 00:04:43 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:43 |
1784 | 20182019 | 380 | DAL | 2018-11-28 | 4 | 00:00:24 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:24 |
1793 | 20182019 | 416 | TOR | 2018-12-04 | 4 | 00:04:57 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:57 |
1797 | 20182019 | 292 | DET | 2018-11-17 | 4 | 00:04:09 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:09 |
1804 | 20182019 | 154 | BUF | 2018-10-27 | 4 | 00:00:31 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:31 |
1812 | 20182019 | 1024 | MIN | 2019-03-05 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
3334 rows × 12 columns
df.pull_period.value_counts().sort_index()
1 205 2 340 3 14532 4 1453 5 1336 Name: pull_period, dtype: int64
None of these make sense.. we'll want to drop them
Any bugs not in the last 15 minutes?
mask = df.pull_time < datetime.timedelta(seconds=15*60)
df[mask].shape[0]
4241
df[mask]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 20072008 | 751 | NSH | 2008-01-24 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:44 | 00:00:44 |
3 | 20072008 | 1158 | WSH | 2008-03-27 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:52 | 00:00:52 |
4 | 20072008 | 696 | CBJ | 2008-01-17 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:27 | 00:01:27 |
5 | 20072008 | 946 | STL | 2008-02-24 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:47 | 00:00:47 |
6 | 20072008 | 1070 | VAN | 2008-03-13 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:14 | 00:01:14 |
9 | 20072008 | 570 | DET | 2007-12-29 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:16 | 00:00:16 |
12 | 20072008 | 618 | ANA | 2008-01-05 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:18 | 00:00:18 |
20 | 20072008 | 1084 | EDM | 2008-03-15 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:07 | 00:00:07 |
21 | 20072008 | 14 | STL | 2007-10-04 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:09 | 00:00:09 |
22 | 20072008 | 422 | MTL | 2007-12-08 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
23 | 20072008 | 526 | VAN | 2007-12-22 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:12 | 00:00:12 |
26 | 20072008 | 96 | EDM | 2007-10-18 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:40 | 00:00:40 |
29 | 20072008 | 910 | CGY | 2008-02-19 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:59 | 00:00:59 |
31 | 20072008 | 269 | S.J | 2007-11-15 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:35 | 00:00:35 |
33 | 20072008 | 1056 | ANA | 2008-03-11 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:21 | 00:00:21 |
34 | 20072008 | 872 | DAL | 2008-02-14 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:32 | 00:00:32 |
38 | 20072008 | 311 | PIT | 2007-11-22 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
39 | 20072008 | 33 | BOS | 2007-10-06 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:13 | 00:00:13 |
40 | 20072008 | 645 | BUF | 2008-01-10 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
41 | 20072008 | 1133 | ANA | 2008-03-22 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:26 | 00:01:26 |
42 | 20072008 | 1099 | STL | 2008-03-18 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
43 | 20072008 | 1020 | MTL | 2008-03-06 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:15 | 00:00:15 |
44 | 20072008 | 404 | L.A | 2007-12-05 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:19 | 00:00:19 |
45 | 20072008 | 238 | PIT | 2007-11-10 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:31 | 00:00:31 |
49 | 20072008 | 613 | T.B | 2008-01-05 | 4 | 00:03:17 | NaT | NaT | NaT | NaT | 00:00:00 | 00:03:17 |
50 | 20072008 | 1149 | S.J | 2008-03-25 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:01:16 | 00:01:16 |
51 | 20072008 | 310 | L.A | 2007-11-21 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:37 | 00:00:37 |
57 | 20072008 | 849 | NSH | 2008-02-10 | 1 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:35 | 00:00:35 |
58 | 20072008 | 609 | WSH | 2008-01-05 | 4 | 00:01:27 | NaT | NaT | NaT | NaT | 00:00:00 | 00:01:27 |
65 | 20072008 | 562 | OTT | 2007-12-29 | 2 | 00:10:05 | 00:10:06 | NaT | 00:00:01 | NaT | NaT | NaT |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1693 | 20182019 | 1244 | NYI | 2019-04-04 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1694 | 20182019 | 271 | TOR | 2018-11-13 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1695 | 20182019 | 206 | PHI | 2018-11-03 | 4 | 00:00:13 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:13 |
1705 | 20182019 | 986 | PHI | 2019-02-28 | 4 | 00:04:27 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:27 |
1708 | 20182019 | 601 | BUF | 2018-12-31 | 3 | 00:12:55 | NaT | 00:19:28 | NaT | 00:06:33 | NaT | NaT |
1710 | 20182019 | 1264 | NYR | 2019-04-06 | 4 | 00:02:09 | NaT | NaT | NaT | NaT | 00:00:00 | 00:02:09 |
1711 | 20182019 | 251 | NYR | 2018-11-10 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1715 | 20182019 | 197 | EDM | 2018-11-03 | 2 | 00:10:00 | 00:01:15 | NaT | -1 days +23:51:15 | NaT | NaT | NaT |
1722 | 20182019 | 482 | T.B | 2018-12-13 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1730 | 20182019 | 928 | BOS | 2019-02-20 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1731 | 20182019 | 181 | PIT | 2018-11-01 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1737 | 20182019 | 1225 | COL | 2019-04-01 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1738 | 20182019 | 210 | DAL | 2018-11-05 | 4 | 00:04:29 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:29 |
1740 | 20182019 | 705 | MTL | 2019-01-14 | 4 | 00:00:15 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:15 |
1742 | 20182019 | 1049 | STL | 2019-03-09 | 4 | 00:03:21 | NaT | NaT | NaT | NaT | 00:00:00 | 00:03:21 |
1746 | 20182019 | 990 | DAL | 2019-02-28 | 4 | 00:03:19 | NaT | NaT | NaT | NaT | 00:00:00 | 00:03:19 |
1753 | 20182019 | 841 | COL | 2019-02-09 | 4 | 00:02:23 | NaT | NaT | NaT | NaT | 00:00:00 | 00:02:23 |
1776 | 20182019 | 816 | VGK | 2019-02-05 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
1779 | 20182019 | 953 | PIT | 2019-02-23 | 4 | 00:01:59 | NaT | NaT | NaT | NaT | 00:00:00 | 00:01:59 |
1781 | 20182019 | 400 | WPG | 2018-12-01 | 4 | 00:04:43 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:43 |
1784 | 20182019 | 380 | DAL | 2018-11-28 | 4 | 00:00:24 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:24 |
1790 | 20182019 | 553 | WSH | 2018-12-22 | 3 | 00:09:04 | NaT | NaT | NaT | NaT | 00:10:56 | 00:20:00 |
1793 | 20182019 | 416 | TOR | 2018-12-04 | 4 | 00:04:57 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:57 |
1797 | 20182019 | 292 | DET | 2018-11-17 | 4 | 00:04:09 | NaT | NaT | NaT | NaT | 00:00:00 | 00:04:09 |
1798 | 20182019 | 768 | VGK | 2019-01-23 | 3 | 00:11:25 | NaT | NaT | NaT | NaT | 00:08:35 | 00:20:00 |
1802 | 20182019 | 504 | PHI | 2018-12-15 | 3 | 00:10:00 | NaT | 00:15:29 | NaT | 00:05:29 | NaT | NaT |
1804 | 20182019 | 154 | BUF | 2018-10-27 | 4 | 00:00:31 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:31 |
1808 | 20182019 | 912 | OTT | 2019-02-18 | 3 | 00:08:45 | 00:09:01 | NaT | 00:00:16 | NaT | NaT | NaT |
1810 | 20182019 | 441 | STL | 2018-12-07 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1812 | 20182019 | 1024 | MIN | 2019-03-05 | 5 | 00:00:00 | NaT | NaT | NaT | NaT | 00:00:00 | 00:00:00 |
4241 rows × 12 columns
I noticed some goal for timedelats less than 0.. which makes no sense. I'll have to look into that
mask = df.goal_for_time < datetime.timedelta(seconds=15*60)
df[mask]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
557 | 20052006 | 591 | NYI | 2006-01-04 | 3 | 00:19:51 | 00:02:15 | NaT | -1 days +23:42:24 | NaT | NaT | NaT |
65 | 20072008 | 562 | OTT | 2007-12-29 | 2 | 00:10:05 | 00:10:06 | NaT | 00:00:01 | NaT | NaT | NaT |
111 | 20072008 | 175 | MTL | 2007-11-01 | 1 | 00:08:36 | 00:08:53 | NaT | 00:00:17 | NaT | NaT | NaT |
19 | 20082009 | 588 | N.J | 2009-01-06 | 3 | 00:11:40.500000 | 00:12:12 | NaT | 00:00:31.500000 | NaT | NaT | NaT |
71 | 20082009 | 1042 | BUF | 2009-03-17 | 1 | 00:03:57 | 00:04:02 | NaT | 00:00:05 | NaT | NaT | NaT |
89 | 20082009 | 1167 | PHI | 2009-04-04 | 2 | 00:02:56 | 00:02:58 | NaT | 00:00:02 | NaT | NaT | NaT |
95 | 20082009 | 959 | OTT | 2009-03-05 | 2 | 00:03:57.500000 | 00:04:03 | NaT | 00:00:05.500000 | NaT | NaT | NaT |
50 | 20092010 | 907 | PHI | 2010-02-13 | 3 | 00:10:53 | 00:10:53 | NaT | 00:00:00 | NaT | 00:09:07 | 00:20:00 |
69 | 20092010 | 815 | CGY | 2010-01-30 | 1 | 00:13:06 | 00:13:20 | NaT | 00:00:14 | NaT | NaT | NaT |
75 | 20092010 | 1123 | TOR | 2010-03-28 | 2 | 00:14:30.500000 | 00:14:35 | NaT | 00:00:04.500000 | NaT | NaT | NaT |
227 | 20092010 | 459 | WSH | 2009-12-11 | 3 | 00:02:30.500000 | 00:02:52 | NaT | 00:00:21.500000 | NaT | NaT | NaT |
237 | 20092010 | 1102 | CBJ | 2010-03-25 | 2 | 00:11:00 | 00:11:02 | NaT | 00:00:02 | NaT | NaT | NaT |
342 | 20092010 | 584 | BUF | 2009-12-29 | 2 | 00:12:43.500000 | 00:12:46 | NaT | 00:00:02.500000 | NaT | NaT | NaT |
374 | 20092010 | 984 | CBJ | 2010-03-09 | 1 | 00:08:42 | 00:08:55 | NaT | 00:00:13 | NaT | NaT | NaT |
471 | 20092010 | 624 | OTT | 2010-01-03 | 3 | 00:02:54.500000 | 00:02:57 | NaT | 00:00:02.500000 | NaT | NaT | NaT |
569 | 20092010 | 151 | TOR | 2009-10-26 | 3 | 00:06:07 | 00:06:11 | NaT | 00:00:04 | NaT | NaT | NaT |
572 | 20092010 | 916 | NYR | 2010-02-14 | 2 | 00:10:00 | 00:01:40 | NaT | -1 days +23:51:40 | NaT | NaT | NaT |
653 | 20092010 | 605 | L.A | 2009-12-31 | 2 | 00:10:00 | 00:03:17 | NaT | -1 days +23:53:17 | NaT | NaT | NaT |
665 | 20092010 | 202 | PIT | 2009-11-03 | 2 | 00:09:43.500000 | 00:10:02 | NaT | 00:00:18.500000 | NaT | NaT | NaT |
719 | 20092010 | 86 | STL | 2009-10-15 | 2 | 00:03:17 | 00:03:22 | NaT | 00:00:05 | NaT | NaT | NaT |
749 | 20092010 | 1112 | BUF | 2010-03-27 | 2 | 00:06:43.500000 | 00:06:57 | NaT | 00:00:13.500000 | NaT | NaT | NaT |
762 | 20092010 | 333 | STL | 2009-11-23 | 1 | 00:12:06.500000 | 00:12:31 | NaT | 00:00:24.500000 | NaT | NaT | NaT |
787 | 20092010 | 46 | S.J | 2009-10-08 | 3 | 00:07:07 | 00:07:08 | NaT | 00:00:01 | NaT | NaT | NaT |
869 | 20092010 | 411 | ANA | 2009-12-04 | 3 | 00:02:52.500000 | 00:03:00 | NaT | 00:00:07.500000 | NaT | NaT | NaT |
940 | 20092010 | 597 | S.J | 2009-12-30 | 3 | 00:08:07.500000 | 00:08:08 | NaT | 00:00:00.500000 | NaT | NaT | NaT |
975 | 20092010 | 26 | OTT | 2009-10-06 | 2 | 00:07:12 | 00:07:22 | NaT | 00:00:10 | NaT | NaT | NaT |
997 | 20092010 | 88 | BUF | 2009-10-16 | 1 | 00:14:41.500000 | 00:14:42 | NaT | 00:00:00.500000 | NaT | NaT | NaT |
1126 | 20092010 | 365 | S.J | 2009-11-27 | 3 | 00:08:14 | 00:08:28 | NaT | 00:00:14 | NaT | NaT | NaT |
1129 | 20092010 | 1215 | BUF | 2010-04-10 | 2 | 00:08:56.500000 | 00:08:57 | NaT | 00:00:00.500000 | NaT | NaT | NaT |
1247 | 20092010 | 159 | PIT | 2009-10-28 | 2 | 00:12:34 | 00:12:58 | NaT | 00:00:24 | NaT | NaT | NaT |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1679 | 20172018 | 482 | N.J | 2017-12-14 | 1 | 00:00:00 | 00:01:21 | NaT | 00:01:21 | NaT | NaT | NaT |
1686 | 20172018 | 302 | NSH | 2017-11-18 | 3 | 00:11:45.500000 | 00:11:51 | NaT | 00:00:05.500000 | NaT | NaT | NaT |
1704 | 20172018 | 61 | OTT | 2017-10-13 | 3 | 00:04:53.500000 | 00:05:18 | NaT | 00:00:24.500000 | NaT | NaT | NaT |
1711 | 20172018 | 791 | VGK | 2018-02-02 | 3 | 00:04:33.500000 | 00:12:38 | NaT | 00:08:04.500000 | NaT | NaT | NaT |
1724 | 20172018 | 683 | DAL | 2018-01-15 | 3 | 00:19:56 | 00:02:59 | NaT | -1 days +23:43:03 | NaT | NaT | NaT |
2 | 20182019 | 205 | L.A | 2018-11-03 | 3 | 00:08:43 | 00:08:43 | NaT | 00:00:00 | NaT | 00:11:17 | 00:20:00 |
127 | 20182019 | 97 | NYI | 2018-10-18 | 3 | 00:09:05.500000 | 00:09:20 | NaT | 00:00:14.500000 | NaT | NaT | NaT |
170 | 20182019 | 458 | ANA | 2018-12-09 | 1 | 00:13:32 | 00:13:39 | NaT | 00:00:07 | NaT | NaT | NaT |
232 | 20182019 | 674 | PHI | 2019-01-10 | 2 | 00:08:10 | 00:08:51 | NaT | 00:00:41 | NaT | NaT | NaT |
261 | 20182019 | 635 | VGK | 2019-01-04 | 1 | 00:00:00 | 00:06:52 | NaT | 00:06:52 | NaT | NaT | NaT |
351 | 20182019 | 59 | OTT | 2018-10-13 | 3 | 00:10:44 | 00:11:03 | NaT | 00:00:19 | NaT | NaT | NaT |
353 | 20182019 | 382 | NYI | 2018-11-29 | 1 | 00:00:00 | 00:12:17 | NaT | 00:12:17 | NaT | NaT | NaT |
496 | 20182019 | 68 | STL | 2018-10-13 | 2 | 00:12:26.500000 | 00:12:35 | NaT | 00:00:08.500000 | NaT | NaT | NaT |
688 | 20182019 | 783 | T.B | 2019-02-01 | 1 | 00:00:00 | 00:00:00 | NaT | 00:00:00 | NaT | 00:00:00 | 00:00:00 |
708 | 20182019 | 957 | DET | 2019-02-24 | 2 | 00:07:26.500000 | 00:08:00 | NaT | 00:00:33.500000 | NaT | NaT | NaT |
868 | 20182019 | 1016 | S.J | 2019-03-03 | 3 | 00:08:10.500000 | 00:08:16 | NaT | 00:00:05.500000 | NaT | NaT | NaT |
938 | 20182019 | 775 | CBJ | 2019-01-29 | 1 | 00:02:45.500000 | 00:03:17 | NaT | 00:00:31.500000 | NaT | NaT | NaT |
974 | 20182019 | 535 | CBJ | 2018-12-20 | 2 | 00:01:00 | 00:01:15 | NaT | 00:00:15 | NaT | NaT | NaT |
1031 | 20182019 | 368 | PHI | 2018-11-27 | 2 | 00:03:14.500000 | 00:03:22 | NaT | 00:00:07.500000 | NaT | NaT | NaT |
1036 | 20182019 | 542 | OTT | 2018-12-21 | 2 | 00:11:25 | 00:11:53 | NaT | 00:00:28 | NaT | NaT | NaT |
1142 | 20182019 | 257 | WPG | 2018-11-11 | 2 | 00:07:47.500000 | 00:07:54 | NaT | 00:00:06.500000 | NaT | NaT | NaT |
1216 | 20182019 | 955 | NYI | 2019-02-23 | 1 | 00:13:30 | 00:14:05 | NaT | 00:00:35 | NaT | NaT | NaT |
1229 | 20182019 | 555 | COL | 2018-12-22 | 3 | 00:07:32.500000 | 00:07:39 | NaT | 00:00:06.500000 | NaT | NaT | NaT |
1315 | 20182019 | 1038 | BUF | 2019-03-07 | 2 | 00:03:02.500000 | 00:03:16 | NaT | 00:00:13.500000 | NaT | NaT | NaT |
1342 | 20182019 | 1079 | NSH | 2019-03-12 | 3 | 00:10:00 | 00:14:16 | NaT | 00:04:16 | NaT | NaT | NaT |
1523 | 20182019 | 1068 | S.J | 2019-03-11 | 3 | 00:10:01 | 00:10:01 | NaT | 00:00:00 | NaT | 00:09:59 | 00:20:00 |
1629 | 20182019 | 973 | CBJ | 2019-02-26 | 2 | 00:04:39.500000 | 00:04:55 | NaT | 00:00:15.500000 | NaT | NaT | NaT |
1668 | 20182019 | 230 | BUF | 2018-11-08 | 2 | 00:05:31.500000 | 00:05:34 | NaT | 00:00:02.500000 | NaT | NaT | NaT |
1715 | 20182019 | 197 | EDM | 2018-11-03 | 2 | 00:10:00 | 00:01:15 | NaT | -1 days +23:51:15 | NaT | NaT | NaT |
1808 | 20182019 | 912 | OTT | 2019-02-18 | 3 | 00:08:45 | 00:09:01 | NaT | 00:00:16 | NaT | NaT | NaT |
263 rows × 12 columns
We'll have to drop these as well
df[df.goal_for_timedelta < datetime.timedelta(0)]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
557 | 20052006 | 591 | NYI | 2006-01-04 | 3 | 00:19:51 | 00:02:15 | NaT | -1 days +23:42:24 | NaT | NaT | NaT |
572 | 20092010 | 916 | NYR | 2010-02-14 | 2 | 00:10:00 | 00:01:40 | NaT | -1 days +23:51:40 | NaT | NaT | NaT |
653 | 20092010 | 605 | L.A | 2009-12-31 | 2 | 00:10:00 | 00:03:17 | NaT | -1 days +23:53:17 | NaT | NaT | NaT |
1044 | 20102011 | 1055 | PHI | 2011-03-17 | 2 | 00:19:57 | 00:18:32 | NaT | -1 days +23:58:35 | NaT | NaT | NaT |
27 | 20112012 | 244 | CAR | 2011-11-14 | 2 | 00:12:44 | 00:04:36 | NaT | -1 days +23:51:52 | NaT | NaT | NaT |
48 | 20112012 | 585 | TOR | 2012-01-05 | 2 | 00:10:00 | 00:01:22 | NaT | -1 days +23:51:22 | NaT | NaT | NaT |
924 | 20112012 | 217 | FLA | 2011-11-10 | 2 | 00:10:00 | 00:04:53 | NaT | -1 days +23:54:53 | NaT | NaT | NaT |
1197 | 20112012 | 728 | STL | 2012-01-24 | 3 | 00:10:00 | 00:05:04 | NaT | -1 days +23:55:04 | NaT | NaT | NaT |
1529 | 20112012 | 292 | PHX | 2011-11-21 | 2 | 00:10:00 | 00:07:34 | NaT | -1 days +23:57:34 | NaT | NaT | NaT |
514 | 20122013 | 450 | PHX | 2013-03-21 | 2 | 00:10:00 | 00:02:26 | NaT | -1 days +23:52:26 | NaT | NaT | NaT |
958 | 20132014 | 715 | STL | 2014-01-16 | 2 | 00:10:00 | 00:05:55 | NaT | -1 days +23:55:55 | NaT | NaT | NaT |
1068 | 20132014 | 934 | TOR | 2014-03-05 | 2 | 00:10:00 | 00:01:44 | NaT | -1 days +23:51:44 | NaT | NaT | NaT |
1430 | 20132014 | 764 | CHI | 2014-01-23 | 1 | 00:19:54 | 00:19:28 | NaT | -1 days +23:59:34 | NaT | NaT | NaT |
1468 | 20132014 | 752 | DAL | 2014-01-21 | 2 | 00:19:58 | 00:11:22 | NaT | -1 days +23:51:24 | NaT | NaT | NaT |
175 | 20142015 | 675 | DET | 2015-01-18 | 3 | 00:10:00 | 00:01:59 | NaT | -1 days +23:51:59 | NaT | NaT | NaT |
594 | 20142015 | 1132 | NSH | 2015-03-29 | 2 | 00:17:44.500000 | 00:09:55 | NaT | -1 days +23:52:10.500000 | NaT | NaT | NaT |
953 | 20142015 | 755 | CAR | 2015-02-03 | 3 | 00:10:00 | 00:04:01 | NaT | -1 days +23:54:01 | NaT | NaT | NaT |
88 | 20152016 | 679 | COL | 2016-01-18 | 1 | 00:11:37 | 00:01:16 | NaT | -1 days +23:49:39 | NaT | NaT | NaT |
665 | 20152016 | 1124 | NYI | 2016-03-26 | 3 | 00:10:00 | 00:00:30 | NaT | -1 days +23:50:30 | NaT | NaT | NaT |
1397 | 20152016 | 948 | COL | 2016-03-01 | 2 | 00:10:00 | 00:03:25 | NaT | -1 days +23:53:25 | NaT | NaT | NaT |
1579 | 20152016 | 656 | VAN | 2016-01-15 | 4 | 00:10:00 | 00:03:25 | NaT | -1 days +23:53:25 | NaT | NaT | NaT |
1632 | 20152016 | 139 | BUF | 2015-10-29 | 2 | 00:19:58 | 00:17:34 | NaT | -1 days +23:57:36 | NaT | NaT | NaT |
765 | 20162017 | 28 | ARI | 2016-10-15 | 4 | 00:10:00 | 00:03:12 | NaT | -1 days +23:53:12 | NaT | NaT | NaT |
149 | 20172018 | 862 | EDM | 2018-02-12 | 2 | 00:10:00 | 00:02:46 | NaT | -1 days +23:52:46 | NaT | NaT | NaT |
288 | 20172018 | 80 | DET | 2017-10-16 | 1 | 00:19:45.500000 | 00:02:30 | NaT | -1 days +23:42:44.500000 | NaT | NaT | NaT |
368 | 20172018 | 1122 | CBJ | 2018-03-20 | 2 | 00:10:00 | 00:09:28 | NaT | -1 days +23:59:28 | NaT | NaT | NaT |
1318 | 20172018 | 109 | ANA | 2017-10-20 | 3 | 00:10:00 | 00:05:58 | NaT | -1 days +23:55:58 | NaT | NaT | NaT |
1724 | 20172018 | 683 | DAL | 2018-01-15 | 3 | 00:19:56 | 00:02:59 | NaT | -1 days +23:43:03 | NaT | NaT | NaT |
1243 | 20182019 | 502 | N.J | 2018-12-15 | 1 | 00:19:02.500000 | 00:18:29 | NaT | -1 days +23:59:26.500000 | NaT | NaT | NaT |
1715 | 20182019 | 197 | EDM | 2018-11-03 | 2 | 00:10:00 | 00:01:15 | NaT | -1 days +23:51:15 | NaT | NaT | NaT |
df.game_end_time.astype('timedelta64[s]').plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1a19fd9048>
Games should end at 20 mins. Let's throw out the early times (this must be overtime or something).
Obviously the game will end at 20:00, this column corresponds to the last row parsed.
Let's look at the spike in goalie pull times around 10 minutes
mask = (df.pull_time <= datetime.timedelta(seconds=15*60))\
& (df.pull_time >= datetime.timedelta(seconds=5*60))
df[mask].pull_time.astype('timedelta64[s]').plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1a1b60ee10>
df[df.pull_time == datetime.timedelta(seconds=10*60)]
season | game_number | team_name | date | pull_period | pull_time | goal_for_time | goal_against_time | goal_for_timedelta | goal_against_timedelta | game_end_timedelta | game_end_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
22 | 20072008 | 422 | MTL | 2007-12-08 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
91 | 20082009 | 914 | S.J | 2009-02-26 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
129 | 20082009 | 338 | MTL | 2008-11-29 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
13 | 20092010 | 747 | NYR | 2010-01-21 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
28 | 20092010 | 1158 | PHI | 2010-04-02 | 2 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
54 | 20092010 | 1174 | ANA | 2010-04-03 | 2 | 00:10:00 | 00:18:28 | NaT | 00:08:28 | NaT | NaT | NaT |
71 | 20092010 | 1089 | VAN | 2010-03-23 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
91 | 20092010 | 550 | MTL | 2009-12-23 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
123 | 20092010 | 198 | BOS | 2009-11-03 | 2 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
151 | 20092010 | 970 | PHI | 2010-03-07 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
189 | 20092010 | 622 | EDM | 2010-01-02 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
194 | 20092010 | 623 | MTL | 2010-01-03 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
259 | 20092010 | 875 | ANA | 2010-02-08 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
289 | 20092010 | 910 | OTT | 2010-02-13 | 3 | 00:10:00 | NaT | 00:19:11 | NaT | 00:09:11 | NaT | NaT |
301 | 20092010 | 551 | EDM | 2009-12-23 | 3 | 00:10:00 | NaT | 00:19:54 | NaT | 00:09:54 | NaT | NaT |
341 | 20092010 | 8 | STL | 2009-10-02 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
373 | 20092010 | 75 | COL | 2009-10-13 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
409 | 20092010 | 537 | N.J | 2009-12-21 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
428 | 20092010 | 219 | NYI | 2009-11-06 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
492 | 20092010 | 468 | PHI | 2009-12-12 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
518 | 20092010 | 429 | BUF | 2009-12-07 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
531 | 20092010 | 517 | NYR | 2009-12-19 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
533 | 20092010 | 393 | OTT | 2009-12-01 | 3 | 00:10:00 | NaT | 00:19:44 | NaT | 00:09:44 | NaT | NaT |
536 | 20092010 | 686 | CBJ | 2010-01-12 | 3 | 00:10:00 | NaT | 00:19:50 | NaT | 00:09:50 | NaT | NaT |
541 | 20092010 | 1060 | STL | 2010-03-20 | 2 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
564 | 20092010 | 444 | CAR | 2009-12-09 | 3 | 00:10:00 | NaT | 00:19:59 | NaT | 00:09:59 | NaT | NaT |
572 | 20092010 | 916 | NYR | 2010-02-14 | 2 | 00:10:00 | 00:01:40 | NaT | -1 days +23:51:40 | NaT | NaT | NaT |
580 | 20092010 | 853 | CGY | 2010-02-05 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
583 | 20092010 | 629 | FLA | 2010-01-05 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
591 | 20092010 | 1132 | ANA | 2010-03-29 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1021 | 20182019 | 554 | CAR | 2018-12-22 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1042 | 20182019 | 1062 | BOS | 2019-03-10 | 3 | 00:10:00 | 00:18:59 | NaT | 00:08:59 | NaT | NaT | NaT |
1045 | 20182019 | 954 | ANA | 2019-02-23 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1107 | 20182019 | 27 | ARI | 2018-10-06 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1110 | 20182019 | 1263 | CAR | 2019-04-06 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1123 | 20182019 | 651 | ANA | 2019-01-06 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1157 | 20182019 | 304 | N.J | 2018-11-18 | 2 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1170 | 20182019 | 646 | VGK | 2019-01-06 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1176 | 20182019 | 353 | CBJ | 2018-11-24 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1199 | 20182019 | 1034 | CBJ | 2019-03-07 | 3 | 00:10:00 | NaT | 00:19:29 | NaT | 00:09:29 | NaT | NaT |
1270 | 20182019 | 1002 | BUF | 2019-03-02 | 3 | 00:10:00 | NaT | 00:18:03 | NaT | 00:08:03 | NaT | NaT |
1294 | 20182019 | 426 | EDM | 2018-12-05 | 3 | 00:10:00 | 00:19:04 | NaT | 00:09:04 | NaT | NaT | NaT |
1342 | 20182019 | 1079 | NSH | 2019-03-12 | 3 | 00:10:00 | 00:14:16 | NaT | 00:04:16 | NaT | NaT | NaT |
1421 | 20182019 | 1199 | N.J | 2019-03-29 | 3 | 00:10:00 | NaT | 00:17:25 | NaT | 00:07:25 | NaT | NaT |
1423 | 20182019 | 905 | STL | 2019-02-17 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1497 | 20182019 | 362 | BOS | 2018-11-26 | 3 | 00:10:00 | NaT | 00:18:25 | NaT | 00:08:25 | NaT | NaT |
1504 | 20182019 | 1091 | NSH | 2019-03-14 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1510 | 20182019 | 661 | PHI | 2019-01-08 | 3 | 00:10:00 | 00:16:33 | NaT | 00:06:33 | NaT | NaT | NaT |
1553 | 20182019 | 1013 | OTT | 2019-03-03 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1577 | 20182019 | 876 | VAN | 2019-02-13 | 2 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1623 | 20182019 | 532 | PHI | 2018-12-20 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1637 | 20182019 | 1100 | NYI | 2019-03-16 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1650 | 20182019 | 1252 | S.J | 2019-04-04 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1656 | 20182019 | 549 | MTL | 2018-12-22 | 3 | 00:10:00 | 00:18:35 | NaT | 00:08:35 | NaT | NaT | NaT |
1677 | 20182019 | 733 | TOR | 2019-01-18 | 3 | 00:10:00 | NaT | 00:18:14 | NaT | 00:08:14 | NaT | NaT |
1694 | 20182019 | 271 | TOR | 2018-11-13 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1715 | 20182019 | 197 | EDM | 2018-11-03 | 2 | 00:10:00 | 00:01:15 | NaT | -1 days +23:51:15 | NaT | NaT | NaT |
1722 | 20182019 | 482 | T.B | 2018-12-13 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
1802 | 20182019 | 504 | PHI | 2018-12-15 | 3 | 00:10:00 | NaT | 00:15:29 | NaT | 00:05:29 | NaT | NaT |
1810 | 20182019 | 441 | STL | 2018-12-07 | 3 | 00:10:00 | NaT | NaT | NaT | NaT | 00:10:00 | 00:20:00 |
622 rows × 12 columns
plt.ylabel('Goalie Pulls')
plt.yticks([])
df.date.hist(
color='b',
bins=100,
histtype='stepfilled')
# savefig(plt, 'goalie_pulls_2003-2019')
df.groupby('season').size()
season 20032004 716 20052006 702 20062007 710 20072008 41 20082009 46 20092010 703 20102011 712 20112012 733 20122013 426 20132014 749 20142015 792 20152016 917 20162017 811 20172018 879 20182019 900 dtype: int64
Too few pulls from 2007-2009. I wonder how many HTML files I have for each
!ls ../../data/raw/html/
20032004 20052006 20072008 20092010 20112012 20132014 20152016 20172018 20042005 20062007 20082009 20102011 20122013 20142015 20162017 20182019
import os, glob
def nested_count(folder_path, level=0):
for folder in sorted(glob.glob('{}/'.format(os.path.join(folder_path, '*')))):
print('{:}{:}: {:,}'.format(' '*level, os.path.split(os.path.split(folder)[-2])[-1], len(glob.glob(os.path.join(folder, '*')))))
nested_count(folder, level+1)
nested_count('../../data/')
external: 0 interim: 0 processed: 2 csv: 16 tmp: 0 pkl: 16 tmp: 17 raw: 1 html: 16 20032004: 1,230 20042005: 0 20052006: 1,228 20062007: 1,230 20072008: 1,230 20082009: 1,230 20092010: 1,230 20102011: 1,230 20112012: 1,230 20122013: 720 20132014: 1,230 20142015: 1,230 20152016: 1,230 20162017: 1,230 20172018: 1,271 20182019: 1,271
I will need to look at the parsing for 2007-2009. Something is up
df_ = df[(df.season == '20072008') | (df.season == '20082009')]
len(df_)
87
df_.label.value_counts()
no_goals 51 goal_against 27 goal_for 9 Name: label, dtype: int64
It turns out that some tables just aren't being parsed properly... I need to use html.parser instead of lxml in some cases
fig, ax = plt.subplots()
s = df.groupby('season').size().sort_index(ascending=True).rename('counts').reset_index()
ax = s.plot(marker='o', lw=0, ax=ax, color='b', ms=10)
ax.set_xticklabels(s.season.tolist());
# ax.set_ylim(600, 1100)
plt.legend([])
plt.ylabel('Total Goalie Pulls')
# savefig('goalie_pulls_by_season')
Text(0, 0.5, 'Total Goalie Pulls')
label_map
{'0': '20032004', '1': '20052006', '2': '20062007', '3': '20072008', '4': '20082009', '5': '20092010', '6': '20102011', '7': '20112012', '8': '20122013', '9': '20132014', '10': '20142015', '11': '20152016', '12': '20162017', '13': '20172018', '14': '20182019'}
xticks
['', '20032004', '20062007', '20082009', '20102011', '20122013', '20142015', '20162017', '20182019']
fig, ax = plt.subplots()
iterables = zip(['orange', 'red', 'green'],
['no_goals', 'goal_against', 'goal_for'])
axes = []
for c, label in iterables:
m = df.label==label
# m = m & (df.season != '20122013')
df_ = df[m].copy()
df_.loc[(df.season == '20122013'), label] = 0
s = df_.groupby('season').size().sort_index(ascending=True).rename(label).reset_index()
s.plot(marker='o', lw=0, ax=ax, ms=10, color=c, label=label)
plt.legend()
ax = plt.gca()
ax.set_ylim(0, 600)
ax.set_xlim(0, 15)
label_map = {str(i): season for i, season in enumerate(df.season.drop_duplicates().sort_values(ascending=True).tolist())}
xticks = [label_map.get(str(round(tick)), '') for tick in ax.get_xticks().tolist()]
ax.set_xticklabels(xticks)
plt.ylabel('Goalie Pulls')
# savefig('goalie_pull_outcomes_by_season')
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-335-e4ebc6d64236> in <module> 8 # m = m & (df.season != '20122013') 9 df_ = df[m].copy() ---> 10 df_.loc[(df.season == '20122013'), label] = 0 11 s = df_.groupby('season').size().sort_index(ascending=True).rename(label).reset_index() 12 s.plot(marker='o', lw=0, ax=ax, ms=10, color=c, label=label) /anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __setitem__(self, key, value) 186 else: 187 key = com._apply_if_callable(key, self.obj) --> 188 indexer = self._get_setitem_indexer(key) 189 self._setitem_with_indexer(indexer, value) 190 /anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _get_setitem_indexer(self, key) 164 if isinstance(key, tuple): 165 try: --> 166 return self._convert_tuple(key, is_setter=True) 167 except IndexingError: 168 pass /anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _convert_tuple(self, key, is_setter) 245 if i >= self.obj.ndim: 246 raise IndexingError('Too many indexers') --> 247 idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) 248 keyidx.append(idx) 249 return tuple(keyidx) /anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter) 1298 1299 if com.is_bool_indexer(obj): -> 1300 obj = check_bool_indexer(labels, obj) 1301 inds, = obj.nonzero() 1302 return inds /anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in check_bool_indexer(ax, key) 2351 result = key 2352 if isinstance(key, ABCSeries) and not key.index.equals(ax): -> 2353 result = result.reindex(ax) 2354 mask = isna(result._values) 2355 if mask.any(): /anaconda3/lib/python3.7/site-packages/pandas/core/series.py in reindex(self, index, **kwargs) 3323 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs) 3324 def reindex(self, index=None, **kwargs): -> 3325 return super(Series, self).reindex(index=index, **kwargs) 3326 3327 def drop(self, labels=None, axis=0, index=None, columns=None, /anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs) 3687 # perform the reindex on the axes 3688 return self._reindex_axes(axes, level, limit, tolerance, method, -> 3689 fill_value, copy).__finalize__(self) 3690 3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, /anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy) 3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]}, 3706 fill_value=fill_value, -> 3707 copy=copy, allow_dups=False) 3708 3709 return obj /anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups) 3808 fill_value=fill_value, 3809 allow_dups=allow_dups, -> 3810 copy=copy) 3811 3812 if copy and new_data is self._data: /anaconda3/lib/python3.7/site-packages/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy) 4412 # some axes don't allow reindexing with dups 4413 if not allow_dups: -> 4414 self.axes[axis]._can_reindex(indexer) 4415 4416 if axis >= self.ndim: /anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer) 3574 # trying to reindex on an axis with duplicates 3575 if not self.is_unique and len(indexer): -> 3576 raise ValueError("cannot reindex from a duplicate axis") 3577 3578 def reindex(self, target, method=None, level=None, limit=None, ValueError: cannot reindex from a duplicate axis
Text(0, 0.5, 'Goalie Pulls')
Something is up with 2009, where I don't seem to collect enough data.
The issue is my labels. That should be the 2012/2013 season, which was shortened by a lockout
ax
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ecb0cc0>
[t.get_text() for t in ax.get_xticklabels()]
['', '20032004', '20062007', '20082009', '20102011', '20122013', '20142015', '20162017', '20182019', '']
for tick in ax.get_xticklabels():
tick.set_text(label_map.get(tick.get_text(), ''))
tick = ax.get_xticklabels()[0]
tick.set_text('')
label_map
{'0': '20032004', '1': '20052006', '2': '20062007', '3': '20072008', '4': '20082009', '5': '20092010', '6': '20102011', '7': '20112012', '8': '20122013', '9': '20132014', '10': '20142015', '11': '20152016', '12': '20162017', '13': '20172018', '14': '20182019'}
label_map = {str(i): season for i, season in enumerate(df.groupby('season').size().sort_index(ascending=True).index.tolist())}
# xticklabels = [label_map.get(t.get_text(), '') for t in ax.get_xticklabels()]
# ax.set_xticklabels(xticklabels)
# print([t.get_text() for t in ax.get_xticklabels()])
# ticks = [t.get_text() for t in ax.get_xticklabels()]
# ax.set_xticklabels(ticks)
fig, ax = plt.subplots()
iterables = zip(['orange', 'red', 'green'],
['no_goals', 'goal_against', 'goal_for'])
axes = []
for c, label in iterables:
m = df.label==label
s = df[m].groupby('season').size().sort_index(ascending=True).rename(label).reset_index()
axes.append(s.plot(marker='o', lw=0, ax=ax, ms=10, color=c, label=label))
plt.legend()
ax.set_xticklabels(s.season.tolist());
ax.set_ylim(0, 600)
plt.ylabel('Total Counts')
# savefig('goalie_pull_outcomes_by_season')
# fig, ax = plt.subplots()
# iterables = zip(['orange', 'red', 'green'],
# ['no_goals', 'goal_against', 'goal_for'])
# axes = []
# for c, label in iterables:
# m = df.label==label
# s = df[m].groupby('season').size().sort_index(ascending=True).rename(label).reset_index()
# s.plot(marker='o', lw=0, ax=ax, ms=10, color=c, label=label)
# plt.legend()
# ax.set_ylim(0, 600)
label_map = {str(i): season for i, season in enumerate(df.groupby('season').size().sort_index(ascending=True).index.tolist())}
# xticklabels = [label_map.get(t.get_text(), '') for t in ax.get_xticklabels()]
# ax.set_xticklabels(xticklabels)
# print([t.get_text() for t in ax.get_xticklabels()])
# ticks = [t.get_text() for t in ax.get_xticklabels()]
# ax.set_xticklabels(ticks)
# plt.ylabel('Total Counts')
# # savefig('goalie_pull_outcomes_by_season')
['', '', '', '', '', '', '', '', '', '']
Text(0, 0.5, 'Total Counts')
label_map
{'0': '20032004', '1': '20052006', '2': '20062007', '3': '20072008', '4': '20082009', '5': '20092010', '6': '20102011', '7': '20112012', '8': '20122013', '9': '20132014', '10': '20142015', '11': '20152016', '12': '20162017', '13': '20172018', '14': '20182019'}
[t.get_text() for t in ax.get_xticklabels()]
['−2', '0', '2', '4', '6', '8', '10', '12', '14', '16']
ax.set_xticklabels([t.get_text() for t in ax.get_xticklabels()])
[Text(-2.0, 0, '−2'), Text(0.0, 0, '0'), Text(2.0, 0, '2'), Text(4.0, 0, '4'), Text(6.0, 0, '6'), Text(8.0, 0, '8'), Text(10.0, 0, '10'), Text(12.0, 0, '12'), Text(14.0, 0, '14'), Text(16.0, 0, '16')]