import scrape_functions
from datetime import date, timedelta
import json_pbp
import html_pbp
import seaborn as sbs
import espn_pbp
import json_shifts
import html_shifts
import playing_roster
import json_schedule
import pandas as pd
import time
import numpy as np
import datetime
import warnings
import shared
import pickle
#pip install mysql-connector-python-rf
import mysql.connector
from mysql.connector import Error
from sqlalchemy import create_engine
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 999)
pd.set_option('display.max_rows', None)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import thinkbayes2 as tb
import thinkbayes as tb0
class RinkAdjust(object):
    """Per-rink shot-location bias adjustment based on quantile matching of CDFs.

    For every team (home rink) four CDF objects are stored: the X and Y
    distributions of shot coordinates recorded in that rink, and the X and Y
    distributions recorded everywhere else. The CDF objects only need to
    provide ``Prob(value)``, ``Value(prob)`` and ``Render()``; ``addTeam``
    builds them with the ``thinkbayes2`` helpers imported by this file as ``tb``.
    """

    def __init__(self):
        # Each dict maps team -> CDF object.
        self.teamxcdf, self.teamycdf, self.otherxcdf, self.otherycdf = {}, {}, {}, {}

    def addCDFs(self, team, this_x_cdf, this_y_cdf, other_x_cdf, other_y_cdf):
        """Register ready-made CDFs for ``team`` (replaces any earlier entry)."""
        self.teamxcdf[team] = this_x_cdf
        self.teamycdf[team] = this_y_cdf
        self.otherxcdf[team] = other_x_cdf
        self.otherycdf[team] = other_y_cdf

    @staticmethod
    def _cdf_from_values(values):
        """Build an empirical CDF from an iterable of coordinates."""
        return tb.MakeCdfFromPmf(tb.MakePmfFromList(values))

    def addTeam(self, team, this_team, rest_of_league):
        """Build and register the four CDFs for ``team``.

        ``this_team`` and ``rest_of_league`` must expose ``X_unadj`` and
        ``Y_unadj`` attributes (e.g. DataFrame columns) holding the unadjusted
        coordinates of shots in this rink and in all other rinks respectively.
        """
        self.addCDFs(
            team,
            self._cdf_from_values(this_team.X_unadj),
            self._cdf_from_values(this_team.Y_unadj),
            self._cdf_from_values(rest_of_league.X_unadj),
            self._cdf_from_values(rest_of_league.Y_unadj),
        )

    def PlotTeamCDFs(self, team, savefig=False):
        """Plot this rink's X/Y CDFs against the rest of the league.

        When ``savefig`` is true the chart is also written to
        'Rink bias CDF chart <team>.png' in the current working directory.
        Raises KeyError if ``team`` has not been registered.
        """
        this_x_cdf = self.teamxcdf[team]
        this_y_cdf = self.teamycdf[team]
        other_x_cdf = self.otherxcdf[team]
        other_y_cdf = self.otherycdf[team]

        f, axx = plt.subplots(1, 2, sharey='col')
        f.set_size_inches(14, 8)

        panels = (
            (axx[0], this_x_cdf, other_x_cdf, 'CDF of X'),
            (axx[1], this_y_cdf, other_y_cdf, 'CDF of Y'),
        )
        for axis, rink_cdf, league_cdf, xlabel in panels:
            rink_values, rink_probs = rink_cdf.Render()
            league_values, league_probs = league_cdf.Render()
            axis.plot(rink_values, rink_probs, color='blue', label='@%s' % team)
            axis.plot(league_values, league_probs, color='brown', label='@Rest of League')
            axis.set_xlabel(xlabel)
            axis.legend()

        f.suptitle('Cumulative Density Function for Shot Location Rink Bias Adjustment')

        # Save BEFORE showing: once plt.show() returns the figure may already
        # have been closed, in which case a later savefig writes an empty image.
        if savefig:
            f.savefig('Rink bias CDF chart %s.png' % team)
        plt.show()

    def rink_bias_adjust(self, x, y, team):
        """Convert a shot location measured in ``team``'s rink to its league equivalent.

        For a shot location in a specific rink, look up the cumulative
        probabilities of its x and y in that rink's CDFs, then return the
        rest-of-league x and y that have those same cumulative probabilities:

            xprob = this_x_cdf.Prob( xmeas )    # cum prob of seeing xmeas in this rink
            xadj  = other_x_cdf.Value( xprob )  # value with the same prob in rest of league

        and analogously for y.

        NOTE(review): the original author's notes say the equivalency CDFs are
        meant to be built from visiting-team shots only, to avoid home-team
        bias. This class does no such filtering -- it uses whatever data the
        caller registered -- so confirm against the caller.

        This is the author's adaptation of Schuckers' rink bias adjustment as
        described in Appendix A here:
        http://www.sloansportsconference.com/wp-content/uploads/2013/Total%20Hockey%20Rating%20(THoR)%20A%20comprehensive%20statistical%20rating%20of%20National%20Hockey%20League%20forwards%20and%20defensemen%20based%20upon%20all%20on-ice%20events.pdf

        The Cdf/Pmf code comes from Allen Downey's "Think Bayes".

        Returns:
            (newx, newy) tuple of adjusted coordinates.

        Raises:
            KeyError: if ``team`` has not been registered via addTeam/addCDFs.
        """
        xprob = self.teamxcdf[team].Prob(x)
        newx = self.otherxcdf[team].Value(xprob)
        yprob = self.teamycdf[team].Prob(y)
        newy = self.otherycdf[team].Value(yprob)
        return newx, newy
NHL play-by-play (PBP) data comes from the Hockey-Scraper package: https://github.com/HarryShomer/Hockey-Scraper
def transform_data(data):
    """Turn raw play-by-play events into a rink-adjusted, feature-rich shot table.

    Steps, in order:
      1. derive season / season2 / Season_Type / season_model from Date and Game_Id,
         de-duplicate and sort events, drop regular-season period 5 (shootouts);
      2. collapse GIVE/TAKE into TURN and merge similar shot types;
      3. attach the previous event (type, zone, time) within each season/game/period;
      4. keep shot attempts and derive binary flags, shot direction, unadjusted
         coordinates and time-since-last-event features;
      5. drop blocks and rink-adjust X/Y per home rink with ``RinkAdjust``;
      6. derive distance, angle and rebound geometry features.

    Args:
        data: DataFrame of play-by-play events. The code reads the columns
            Date, Game_Id, Period, Seconds_Elapsed, Event, Type, Ev_Team,
            Ev_Zone, Home_Team, Away_Team, Home_Goalie, Away_Goalie, xC, yC.
            The input frame is not modified.

    Returns:
        DataFrame with one row per SHOT/GOAL/MISS event, ordered by home rink
        (alphabetically) and chronologically within each rink. The original
        index labels are kept, so they are not guaranteed to be unique.
    """
    import warnings
    # NOTE: this silences warnings for the whole process, not just this call;
    # kept because callers may rely on the quieter output.
    warnings.simplefilter("ignore")

    season_blocks = {
        '20102011': '2011_2012', '20112012': '2011_2012',
        '20122013': '2013_2014', '20132014': '2013_2014',
        '20142015': '2015_2016', '20152016': '2015_2016',
        '20162017': '2017_2018', '20172018': '2017_2018',
    }
    # Game_Id restarts every season, so every per-game grouping needs 'season'.
    game_period = ['season', 'Game_Id', 'Period']
    shooter_group = game_period + ['Home_Shooter']

    # Work on a copy so the caller's frame does not silently gain columns.
    pbp_df = data.copy()
    print("All events and columns: " + str(pbp_df.shape))

    ## Season labels: games played before September belong to the season that
    ## started in the previous calendar year.
    game_dates = pd.to_datetime(pbp_df['Date'])
    start_year = np.where(game_dates.dt.month < 9, game_dates.dt.year - 1, game_dates.dt.year)
    pbp_df['season'] = [str(year) + str(year + 1) for year in start_year]
    is_regular_season = (pbp_df['Game_Id'] < 30000).values
    pbp_df['season2'] = np.where(is_regular_season,
                                 pbp_df['season'].values,
                                 (pbp_df['season'] + "p").values)
    pbp_df['Season_Type'] = np.where(is_regular_season, 'RS', 'PO')
    # Seasons outside the known blocks get 0.
    pbp_df['season_model'] = pbp_df['season'].map(lambda s: season_blocks.get(s, 0))

    pbp_df = pbp_df.drop_duplicates(['season', 'Game_Id', 'Period', 'Ev_Team', 'Seconds_Elapsed'])
    pbp_df = pbp_df.sort_values(['season', 'Game_Id', 'Period', 'Seconds_Elapsed'], ascending=True)

    # Remove SOs (period 5 of a regular-season game)
    is_shootout = (pbp_df['Period'] == 5) & (pbp_df['Season_Type'] == "RS")
    pbp_df = pbp_df.loc[~is_shootout, :]

    # Group Give/Take together and merge similar shot types
    pbp_df['Event'] = pbp_df['Event'].replace({"GIVE": "TURN", "TAKE": "TURN"})
    pbp_df['Type'] = pbp_df['Type'].replace({"TIP-IN": "DEFLECTED", "SNAP SHOT": "WRIST SHOT"})

    ## Previous event within the same season/game/period, so lags never cross a
    ## period, a game, or (because Game_Id repeats) a season boundary.
    lag_columns = ['Event', 'Ev_Zone', 'Seconds_Elapsed']
    lagged = pbp_df.groupby(game_period)[lag_columns].shift(1)
    for column in lag_columns:
        pbp_df['lagged_' + column] = lagged[column].values

    #############################################
    ### Subset to just shots
    #############################################
    pbp_df = pbp_df.loc[pbp_df['Event'].isin(["SHOT", "GOAL", "MISS", "BLOCK"]), :].copy()
    print("All shots/blocks and columns: " + str(pbp_df.shape))

    ## Binary flags
    seconds_since_last = pbp_df['Seconds_Elapsed'] - pbp_df['lagged_Seconds_Elapsed']
    pbp_df['Goal'] = (pbp_df['Event'] == "GOAL").astype(int)
    away_shot_on_empty_net = pbp_df['Home_Goalie'].isnull() & (pbp_df['Ev_Team'] == pbp_df['Away_Team'])
    home_shot_on_empty_net = pbp_df['Away_Goalie'].isnull() & (pbp_df['Ev_Team'] == pbp_df['Home_Team'])
    pbp_df['EmptyNet_SA'] = (away_shot_on_empty_net | home_shot_on_empty_net).astype(int)
    pbp_df['is_Rebound'] = (pbp_df['lagged_Event'].isin(["SHOT"])
                            & (seconds_since_last <= 2)).astype(int)
    pbp_df['is_Bounce'] = (pbp_df['lagged_Event'].isin(["BLOCK", "MISS"])
                           & (seconds_since_last <= 2)).astype(int)
    pbp_df['is_Rush'] = ((pbp_df['Ev_Zone'] != pbp_df['lagged_Ev_Zone'])
                         & (seconds_since_last <= 6)).astype(int)

    # Replace every occurrence of PHX with ARI and of ATL with WPG
    relabelled_teams = {'PHX': 'ARI', 'ATL': 'WPG'}
    for column in ('Home_Team', 'Away_Team', 'Ev_Team'):
        pbp_df[column] = pbp_df[column].replace(relabelled_teams)

    # add a 'Direction' column to indicate the primary direction for shots. The heuristic to determine
    # direction is the sign of the median of the X coordinate of shots in each period. This then lets us filter
    # out shots that originate from back in the defensive zone when the signs don't match
    pbp_df['Home_Shooter'] = (pbp_df['Ev_Team'] == pbp_df['Home_Team']).astype(int)
    median_x = pbp_df.groupby(shooter_group)['xC'].transform('median')
    pbp_df['Direction'] = np.sign(median_x.values)

    # Mirror coordinates so every team attacks toward positive X; rows whose
    # direction is zero or unknown are mirrored too, as before.
    orientation = np.where((pbp_df['Direction'] > 0).values, 1.0, -1.0)
    pbp_df['X_unadj'] = pbp_df['xC'].values * orientation
    pbp_df['Y_unadj'] = pbp_df['yC'].values * orientation
    pbp_df['LS_Shot'] = (pbp_df['Y_unadj'] < 0).astype(int)

    ## Logged Last Event Time: 0 for non-positive gaps, NaN when there is no previous event
    gap_seconds = seconds_since_last.values.astype(float)
    has_log = ~(seconds_since_last <= 0).values
    ln_last_event_time = np.zeros(len(gap_seconds))
    ln_last_event_time[has_log] = np.log(gap_seconds[has_log] + 0.001)
    pbp_df['LN_Last_Event_Time'] = ln_last_event_time

    # Last Event: logged time since the previous event, split by zone and previous event type
    previous_event_groups = (
        ('Faceoff', ["FAC"]),
        ('Shot', ["SHOT", "MISS", "BLOCK"]),
        ('Give', ["TURN"]),
    )
    for label, events in previous_event_groups:
        follows_event = pbp_df['lagged_Event'].isin(events)
        for zone in ('Off', 'Def', 'Neu'):
            applies = ((pbp_df['Ev_Zone'] == zone) & follows_event).values
            pbp_df['LastEV_%s_%s' % (zone, label)] = np.where(applies, ln_last_event_time, 0.0)

    ## Adjust X, Y coordinates by Rink, using CDF of shot attempts only (remove blocks since they skew data)
    pbp_df = pbp_df.loc[pbp_df['Event'].isin(["SHOT", "GOAL", "MISS"]), :].copy()

    adjuster = RinkAdjust()
    adjusted_rinks = []
    for team in sorted(pbp_df['Home_Team'].unique()):
        ## Split shots into team arena and all other rinks
        at_this_rink = pbp_df['Home_Team'] == team
        rink_shots = pbp_df[at_this_rink].copy()
        rest_of_league = pbp_df[~at_this_rink]

        ## Create teamxcdf and otherxcdf for rink adjustment
        adjuster.addTeam(team, rink_shots, rest_of_league)

        ## For each shot in rink adjust coordinates based on other rinks
        adjusted = [adjuster.rink_bias_adjust(x, y, team)
                    for x, y in zip(rink_shots['X_unadj'], rink_shots['Y_unadj'])]
        rink_shots['X'] = [xy[0] for xy in adjusted]
        rink_shots['Y'] = [xy[1] for xy in adjusted]
        adjusted_rinks.append(rink_shots)

    if adjusted_rinks:
        pbp_df_adj = pd.concat(adjusted_rinks)
    else:
        # No shots at all: keep the expected columns on an empty frame.
        pbp_df_adj = pbp_df.assign(X=pbp_df['X_unadj'], Y=pbp_df['Y_unadj'])
    print ("All shots columns, rink adjusted: " + str(pbp_df_adj.shape))

    ## Distance/angle to the net, which the formulas place at (89, 0)
    x_unadj = pbp_df_adj['X_unadj'].values.astype(float)
    y_unadj = pbp_df_adj['Y_unadj'].values.astype(float)
    x_adj = pbp_df_adj['X'].values.astype(float)
    y_adj = pbp_df_adj['Y'].values.astype(float)
    with np.errstate(divide='ignore', invalid='ignore'):
        shot_distance = ((89 - x_adj) ** 2 + (y_adj ** 2)) ** 0.5
        raw_angle = np.arctan(np.abs(89 - x_adj) / np.abs(y_adj)) * (180 / np.pi)
        shot_angle = np.where(y_adj != 0, raw_angle, 90.0)
    pbp_df_adj['Shot_Distance_Unadj'] = ((89 - x_unadj) ** 2 + (y_unadj ** 2)) ** 0.5
    pbp_df_adj['Shot_Distance'] = shot_distance
    pbp_df_adj['Shot_Angle'] = shot_angle

    ## Previous shot by the same side within the same season/game/period
    previous_shot = pbp_df_adj.groupby(shooter_group)[['Shot_Distance', 'Shot_Angle', 'LS_Shot']].shift(1)
    pbp_df_adj['Last_Shot_Distance'] = previous_shot['Shot_Distance'].values
    pbp_df_adj['Last_Shot_Angle'] = previous_shot['Shot_Angle'].values
    pbp_df_adj['Last_LS_Shot'] = previous_shot['LS_Shot'].values

    ## Rebound geometry (all zero for non-rebounds)
    is_rebound = (pbp_df_adj['is_Rebound'] == 1).values
    same_side = (pbp_df_adj['Last_LS_Shot'] == pbp_df_adj['LS_Shot']).values
    last_distance = pbp_df_adj['Last_Shot_Distance'].values.astype(float)
    last_angle = pbp_df_adj['Last_Shot_Angle'].values.astype(float)
    with np.errstate(divide='ignore', invalid='ignore'):
        distance_change = np.where(is_rebound, last_distance + shot_distance, 0.0)
        # Same side of the ice: angle difference; opposite sides: the puck
        # crossed the middle, so the two angles are taken off 180 degrees.
        angle_change = np.where(same_side,
                                np.abs(last_angle - shot_angle),
                                180 - last_angle - shot_angle)
        angle_change = np.where(is_rebound, angle_change, 0.0)

        distance_per_degree = np.zeros(len(angle_change))
        usable_angle = angle_change > 0
        distance_per_degree[usable_angle] = distance_change[usable_angle] / angle_change[usable_angle]

        ln_distance_per_degree = np.zeros(len(angle_change))
        usable_ratio = distance_per_degree > 0
        ln_distance_per_degree[usable_ratio] = np.log(distance_per_degree[usable_ratio])
    pbp_df_adj['Rebound_Distance_Change'] = distance_change
    pbp_df_adj['Rebound_Angle_Change'] = angle_change
    pbp_df_adj['Rebound_Distance_Traveled_byAngle'] = distance_per_degree
    pbp_df_adj['LN_Rebound_Distance_Traveled_byAngle'] = ln_distance_per_degree

    print ("All shots columns, final calcuations: " + str(pbp_df_adj.shape))
    return pbp_df_adj
# Column dtypes passed to read_csv for every season file: shot coordinates as
# float64 and Game_Id as int.
types = {'xC': np.float64,
'yC': np.float64,
'X': np.float64,
'X_unadj': np.float64,
'Y': np.float64,
'Y_unadj': np.float64,
'Game_Id': int}
# One play-by-play CSV per season, 2010-11 through 2017-18.
# NOTE(review): the paths are absolute and machine-specific.
nhl_pbp20102011 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20102011.csv', dtype=types)
nhl_pbp20112012 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20112012.csv', dtype=types)
nhl_pbp20122013 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20122013.csv', dtype=types)
nhl_pbp20132014 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20132014.csv', dtype=types)
nhl_pbp20142015 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20142015.csv', dtype=types)
nhl_pbp20152016 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20152016.csv', dtype=types)
nhl_pbp20162017 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20162017.csv', dtype=types)
nhl_pbp20172018 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20172018.csv', dtype=types)
# Stack all seasons into one frame. ignore_index is not set, so each season's
# own row labels are kept and index labels repeat across seasons.
nhl_pbp = pd.concat([nhl_pbp20102011, nhl_pbp20112012, nhl_pbp20122013, nhl_pbp20132014,
nhl_pbp20142015, nhl_pbp20152016, nhl_pbp20162017, nhl_pbp20172018])
# Drop every column whose name starts with 'Unna' (presumably the 'Unnamed: N'
# columns pandas creates for a saved index -- confirm against the CSVs).
unwanted = nhl_pbp.columns[nhl_pbp.columns.str.startswith('Unna')]
nhl_pbp.drop(unwanted, axis=1, inplace=True)
# Notebook preview of the first rows.
nhl_pbp.head()
Away_Coach | Away_Goalie | Away_Goalie_Id | Away_Players | Away_Score | Away_Team | Date | Description | Ev_Team | Ev_Zone | Event | Game_Id | Home_Coach | Home_Goalie | Home_Goalie_Id | Home_Players | Home_Score | Home_Team | Home_Zone | Period | Seconds_Elapsed | Strength | Time_Elapsed | Type | awayPlayer1 | awayPlayer1_id | awayPlayer2 | awayPlayer2_id | awayPlayer3 | awayPlayer3_id | awayPlayer4 | awayPlayer4_id | awayPlayer5 | awayPlayer5_id | awayPlayer6 | awayPlayer6_id | homePlayer1 | homePlayer1_id | homePlayer2 | homePlayer2_id | homePlayer3 | homePlayer3_id | homePlayer4 | homePlayer4_id | homePlayer5 | homePlayer5_id | homePlayer6 | homePlayer6_id | p1_ID | p1_name | p2_ID | p2_name | p3_ID | p3_name | xC | yC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | PAUL MAURICE | CAM WARD | 8470320.0 | 6 | 0 | CAR | 2010-10-07 | Period Start- Local time: 7:10 EET | NaN | NaN | PSTR | 20003 | TODD RICHARDS | NIKLAS BACKSTROM | 8473404.0 | 6 | 0 | MIN | NaN | 1 | 0.0 | 5x5 | 0:00 | NaN | JEFF SKINNER | 8475784.0 | TUOMO RUUTU | 8469462.0 | JUSSI JOKINEN | 8469638.0 | JONI PITKANEN | 8470137.0 | JOE CORVO | 8466215.0 | CAM WARD | 8470320.0 | MIKKO KOIVU | 8469459.0 | ANTTI MIETTINEN | 8468704.0 | ANDREW BRUNETTE | 8459596.0 | GREG ZANON | 8468636.0 | CAM BARKER | 8471216.0 | NIKLAS BACKSTROM | 8473404.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | PAUL MAURICE | CAM WARD | 8470320.0 | 6 | 0 | CAR | 2010-10-07 | MIN won Neu. Zone - CAR #36 JOKINEN vs MIN #9 KOIVU | MIN | Neu | FAC | 20003 | TODD RICHARDS | NIKLAS BACKSTROM | 8473404.0 | 6 | 0 | MIN | Neu | 1 | 0.0 | 5x5 | 0:00 | NaN | JEFF SKINNER | 8475784.0 | TUOMO RUUTU | 8469462.0 | JUSSI JOKINEN | 8469638.0 | JONI PITKANEN | 8470137.0 | JOE CORVO | 8466215.0 | CAM WARD | 8470320.0 | MIKKO KOIVU | 8469459.0 | ANTTI MIETTINEN | 8468704.0 | ANDREW BRUNETTE | 8459596.0 | GREG ZANON | 8468636.0 | CAM BARKER | 8471216.0 | NIKLAS BACKSTROM | 8473404.0 | 8469459.0 | MIKKO KOIVU | 8469638.0 | JUSSI JOKINEN | NaN | NaN | 0.0 | 0.0 |
2 | PAUL MAURICE | CAM WARD | 8470320.0 | 6 | 0 | CAR | 2010-10-07 | OFFSIDE | NaN | NaN | STOP | 20003 | TODD RICHARDS | NIKLAS BACKSTROM | 8473404.0 | 6 | 0 | MIN | NaN | 1 | 8.0 | 5x5 | 0:08 | NaN | JEFF SKINNER | 8475784.0 | TUOMO RUUTU | 8469462.0 | JUSSI JOKINEN | 8469638.0 | JONI PITKANEN | 8470137.0 | JOE CORVO | 8466215.0 | CAM WARD | 8470320.0 | MIKKO KOIVU | 8469459.0 | ANTTI MIETTINEN | 8468704.0 | ANDREW BRUNETTE | 8459596.0 | GREG ZANON | 8468636.0 | CAM BARKER | 8471216.0 | NIKLAS BACKSTROM | 8473404.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | PAUL MAURICE | CAM WARD | 8470320.0 | 6 | 0 | CAR | 2010-10-07 | MIN won Neu. Zone - CAR #36 JOKINEN vs MIN #9 KOIVU | MIN | Neu | FAC | 20003 | TODD RICHARDS | NIKLAS BACKSTROM | 8473404.0 | 6 | 0 | MIN | Neu | 1 | 8.0 | 5x5 | 0:08 | NaN | JEFF SKINNER | 8475784.0 | TUOMO RUUTU | 8469462.0 | JUSSI JOKINEN | 8469638.0 | JONI PITKANEN | 8470137.0 | JOE CORVO | 8466215.0 | CAM WARD | 8470320.0 | MIKKO KOIVU | 8469459.0 | ANTTI MIETTINEN | 8468704.0 | ANDREW BRUNETTE | 8459596.0 | GREG ZANON | 8468636.0 | CAM BARKER | 8471216.0 | NIKLAS BACKSTROM | 8473404.0 | 8469459.0 | MIKKO KOIVU | 8469638.0 | JUSSI JOKINEN | NaN | NaN | 20.0 | -22.0 |
4 | PAUL MAURICE | CAM WARD | 8470320.0 | 6 | 0 | CAR | 2010-10-07 | CAR ONGOAL - #12 STAAL, Snap, Off. Zone, 37 ft. | CAR | Off | SHOT | 20003 | TODD RICHARDS | NIKLAS BACKSTROM | 8473404.0 | 6 | 0 | MIN | Def | 1 | 65.0 | 5x5 | 1:05 | SNAP SHOT | ERIC STAAL | 8470595.0 | CHAD LAROSE | 8469812.0 | ERIK COLE | 8467396.0 | JONI PITKANEN | 8470137.0 | JOE CORVO | 8466215.0 | CAM WARD | 8470320.0 | MATT CULLEN | 8464989.0 | CAL CLUTTERBUCK | 8473504.0 | MARTIN HAVLAT | 8467899.0 | BRENT BURNS | 8470613.0 | NICK SCHULTZ | 8468513.0 | NIKLAS BACKSTROM | 8473404.0 | 8470595.0 | ERIC STAAL | NaN | NaN | NaN | NaN | 56.0 | -15.0 |
# Roster lookups used by lookups_data_clean().
# NOTE(review): `engine` must already be bound to a database connection/engine;
# it is not created anywhere in the visible part of this file.
player_lookup = pd.read_sql(con=engine, sql ="SELECT * FROM `nhl_all`.`hockey_roster_info` AS B")
# One row per player, preferring the record with the most games played.
# NOTE(review): groupby().first() takes the first non-null value per column,
# so a player's row can combine values from several roster records -- confirm
# that is acceptable.
player_lookup = player_lookup.sort_values('gamesPlayed', ascending=False).groupby(['playerId']).first().reset_index(). \
    loc[:, ['playerBirthDate', 'playerPositionCode', 'playerShootsCatches', 'playerId']]
# Skaters only, renamed by explicit mapping (not by position) to the names the
# shot table joins on. rename() returns a new frame, so the assignment below
# does not write into a slice of player_lookup.
skater_lookup = player_lookup.loc[player_lookup.playerPositionCode != "G", :].rename(
    columns={'playerBirthDate': 'shooterDOB',
             'playerPositionCode': 'Player_Position',
             'playerShootsCatches': 'Shoots',
             'playerId': 'p1_ID'})
# IDs are compared as strings in lookups_data_clean().
skater_lookup['p1_ID'] = skater_lookup['p1_ID'].astype(str)
goalie_lookup = pd.read_sql(con=engine, sql = "SELECT DISTINCT playerId as SA_Goalie_Id, playerShootsCatches as Catches, playerBirthDate as goalieDOB FROM `nhl_all`.`hockey_goalies_roster` AS A")
goalie_lookup['SA_Goalie_Id'] = goalie_lookup['SA_Goalie_Id'].astype(str)
def _skater_count(frame, side, player_prefix):
    """Return the on-ice player count for one side, excluding its goalie if on the ice.

    ``side`` is 'Away' or 'Home'; ``player_prefix`` is 'away' or 'home'.
    Expects the ID columns to already be strings, with '0' meaning "missing".
    """
    goalie_id = frame[side + '_Goalie_Id']
    goalie_on_ice = np.zeros(len(frame), dtype=bool)
    for slot in (3, 4, 5, 6):
        slot_id = frame['%sPlayer%d_id' % (player_prefix, slot)]
        goalie_on_ice |= (goalie_id == slot_id).values
    # A missing goalie ('0') must not match a missing player slot (also '0');
    # otherwise a short-handed team with its goalie pulled loses a skater.
    goalie_on_ice &= (goalie_id != '0').values
    players = frame[side + '_Players']
    return np.where(goalie_on_ice, (players - 1).values, players.values)


def lookups_data_clean(data):
    """Add goalie, manpower and handedness columns to the shot table.

    Converts the ID columns to strings ('0' for missing), works out which
    goalie faced each shot, the skater counts for both sides, a collapsed
    game-state label, then left-joins the module-level ``skater_lookup`` (on
    p1_ID) and ``goalie_lookup`` (on SA_Goalie_Id) and derives handedness and
    position classes.

    Args:
        data: shot-level DataFrame; it is copied, not modified.

    Returns:
        A new DataFrame with the added columns and a fresh 0..n-1 index
        (a consequence of the merges).
    """
    data = data.copy()

    id_columns = ['Game_Id', 'Away_Goalie_Id', 'Home_Goalie_Id', 'p1_ID', 'p2_ID', 'p3_ID',
                  'awayPlayer1_id', 'awayPlayer2_id', 'awayPlayer3_id',
                  'awayPlayer4_id', 'awayPlayer5_id', 'awayPlayer6_id',
                  'homePlayer1_id', 'homePlayer2_id', 'homePlayer3_id',
                  'homePlayer4_id', 'homePlayer5_id', 'homePlayer6_id']
    for col in id_columns:
        data[col] = data[col].fillna(0).astype(int).astype(str)

    # The goalie facing the shot belongs to the team that did not shoot.
    home_shooting = (data['Ev_Team'] == data['Home_Team']).values
    data['SA_Goalie'] = np.where(home_shooting, data['Away_Goalie'].values, data['Home_Goalie'].values)
    data['SA_Goalie_Id'] = np.where(home_shooting, data['Away_Goalie_Id'].values, data['Home_Goalie_Id'].values)

    data['Away_State'] = _skater_count(data, 'Away', 'away')
    data['Home_State'] = _skater_count(data, 'Home', 'home')

    # Whether the NEXT row is flagged as a rebound. This is a plain row shift,
    # so it follows the frame's current row order.
    data['Results_inRebound'] = data['is_Rebound'].shift(periods=-1)

    away_state = data['Away_State'].values
    home_state = data['Home_State'].values
    shooter_state = np.where(home_shooting, home_state, away_state)
    goalie_state = np.where(home_shooting, away_state, home_state)
    data['Shooter_State'] = shooter_state
    data['Goalie_State'] = goalie_state

    # Label is "<defending side>v<shooting side>", then collapsed into buckets.
    collapsed_states = {"3v5": "SH_SA", "3v4": "SH_SA", "3v6": "SH_SA",
                        "4v5": "SH_SA", "4v6": "SH_SA", "5v6": "SH_SA",
                        "6v3": "PP_2p_SA", "6v4": "PP_2p_SA", "5v3": "PP_2p_SA",
                        "5v5": "5v5", "6v6": "5v5"}
    raw_states = [str(defending) + "v" + str(shooting)
                  for defending, shooting in zip(goalie_state, shooter_state)]
    data['Game_State'] = [collapsed_states.get(state, state) for state in raw_states]

    data['State_Space'] = data['Goalie_State'] + data['Shooter_State']
    data['Shooter_State_Advantage'] = data['Shooter_State'] - data['Goalie_State']

    data = data.merge(skater_lookup, on=['p1_ID'], how = 'left')
    data = data.merge(goalie_lookup, on=['SA_Goalie_Id'], how = 'left')

    # 'U' (unknown) whenever the lookup did not supply a usable value.
    data['Shooter_Handedness'] = np.where(data['Shoots'].isin(["L", "R"]).values, data['Shoots'].values, "U")
    data['Handed_Class'] = data['Shoots'].str.cat(data['Catches'], sep='')
    data['Handed_Class2'] = np.where(data['Handed_Class'].isin(["LL", "RR"]).values, "Same",
                                     np.where(data['Handed_Class'].isin(["LR", "RL"]).values, "Opposite", "U"))
    # Anything that is not a defenseman is treated as a forward.
    data['Player_Position2'] = np.where((data['Player_Position'] == "D").values, "D", "F")
    return data
def cumulative_shooting_talent(data):
    """Add each shooter's prior shooting record and a regressed shooting index.

    For every row this adds:
      * Cum_Goal  -- goals by the same p1_ID on earlier rows;
      * Cum_Shots -- number of earlier rows with the same p1_ID;
      * Regressed_Shooting_Indexed -- the shooter's prior shooting percentage,
        regressed toward the mean of their position group (Player_Position2)
        by padding with 275 (D) or 375 (F) average shots, then divided by that
        position mean, so a shooter with no history scores 1.0.

    "Earlier" means earlier in the frame's current row order.
    NOTE(review): the frame produced by transform_data is ordered by home rink
    first, not purely by time -- confirm that ordering is intended here.

    Args:
        data: DataFrame with columns p1_ID, Goal, Event and Player_Position2.
            It is modified in place.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    stabilizer_d = 275.0
    stabilizer_f = 375.0

    # Mean shooting percentage per position group, looked up by label so the
    # result does not depend on which groups happen to be present.
    shooting_percentage = data.groupby(['Player_Position2'])['Goal'].mean()
    pct_d = shooting_percentage.get("D", np.nan)
    pct_f = shooting_percentage.get("F", np.nan)

    by_shooter = data.groupby(['p1_ID'])
    # cumsum includes the current row, so take the current goal back out.
    prior_goals = by_shooter['Goal'].cumsum() - (data['Event'] == "GOAL").astype(int)
    prior_shots = by_shooter.cumcount()
    data['Cum_Goal'] = prior_goals
    data['Cum_Shots'] = prior_shots

    is_defense = (data['Player_Position2'] == "D").values
    stabilizer = np.where(is_defense, stabilizer_d, stabilizer_f)
    position_pct = np.where(is_defense, pct_d, pct_f)
    data['Regressed_Shooting_Indexed'] = ((data['Cum_Goal'] + (stabilizer * position_pct)) /
                                          (data['Cum_Shots'] + stabilizer)) / position_pct
    return data
For each categorical variable, create dummy variables. For shot distance and shot angle, create third-degree polynomial terms.
def feature_generation(data,
                       id_vars = ["season"],
                       target_vars = ['Goal','Results_inRebound'],
                       num_vars = ["EmptyNet_SA","is_Rebound","is_Rush","LN_Last_Event_Time","LastEV_Off_Faceoff",
                                   "LastEV_Def_Faceoff","LastEV_Neu_Faceoff","LastEV_Off_Shot","LastEV_Def_Shot","LastEV_Neu_Shot",
                                   "LastEV_Off_Give","LastEV_Def_Give","LastEV_Neu_Give","LN_Rebound_Distance_Traveled_byAngle",
                                   "Regressed_Shooting_Indexed"],
                       cat_vars = ["Type","Shooter_State","Goalie_State","Handed_Class2","Player_Position2"],
                       poly_vars = ["Shot_Distance","Shot_Angle"],
                       model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time',
                                     'LastEV_Off_Faceoff', 'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff',
                                     'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
                                     'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
                                     'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
                                     'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
                                     'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State',
                                     'Handed_Class2_Opposite',
                                     'Player_Position2_F', 'Shot_Distance',
                                     'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2',
                                     'Shot_Angle^3'],
                       poly_degree = 3):
    """Assemble the modelling table: ids, targets, numeric, dummy and polynomial columns.

    Output columns, in order: ``id_vars``, ``target_vars``, ``num_vars`` (NaN
    replaced by 0), one ``pd.get_dummies`` expansion per entry of ``cat_vars``
    (numeric columns pass through unchanged), and for every entry of
    ``poly_vars`` the column itself followed by '<name>^2' ... '<name>^<poly_degree>'.
    All pieces share ``data``'s index, so rows stay aligned whatever that index is.

    Args:
        data: source DataFrame containing every referenced column.
        id_vars, target_vars, num_vars, cat_vars, poly_vars: lists of column
            names. The list defaults are never mutated here.
        model_vars: accepted for backward compatibility; not used by this function.
        poly_degree: highest power generated for each ``poly_vars`` column (>= 1).

    Returns:
        The assembled DataFrame. Its shape is printed before returning.

    Raises:
        ValueError: if ``poly_degree`` is smaller than 1.
    """
    if poly_degree < 1:
        raise ValueError("poly_degree must be at least 1")

    ## Numeric variables
    numeric_part = data[num_vars].fillna(0)

    ## Dummy variables, one column at a time so the output keeps cat_vars order
    dummy_parts = [pd.get_dummies(data.loc[:, [name]]) for name in cat_vars]

    ## Polynomial variables: x, x*x, x*x*x, ...
    poly_parts = []
    for name in poly_vars:
        base = data[name].astype(float)
        term = base
        poly_parts.append(term.rename(name))
        for exponent in range(2, poly_degree + 1):
            term = term * base
            poly_parts.append(term.rename('%s^%d' % (name, exponent)))

    model_data = pd.concat([data[id_vars], data[target_vars], numeric_part] + dummy_parts + poly_parts,
                           axis=1)
    print(model_data.shape)
    return model_data
# Build the shot table from the combined play-by-play frame; transform_data
# prints the frame shape at each stage.
shot_data_all = transform_data(nhl_pbp)
All events and columns: (2878182, 56) All shots/blocks and columns: (1014120, 63) All shots columns, rink adjusted: (753814, 85) All shots columns, final calcuations: (753814, 95)
## Check shot distance adjustment: mean adjusted vs. unadjusted shot distance
## per home rink for the 2017-18 season, sorted by the unadjusted mean.
shot_data_all.loc[shot_data_all.season == "20172018", :].groupby(['Home_Team'])[['Shot_Distance','Shot_Distance_Unadj']]\
.mean().sort_values(['Shot_Distance_Unadj'])
Shot_Distance | Shot_Distance_Unadj | |
---|---|---|
Home_Team | ||
NYR | 37.967128 | 33.186499 |
DET | 34.291843 | 34.282345 |
CHI | 35.837164 | 34.322037 |
STL | 36.458697 | 34.468851 |
N.J | 34.590668 | 34.594601 |
ANA | 36.826106 | 35.160489 |
PIT | 35.505403 | 35.351800 |
NSH | 35.671859 | 35.385298 |
L.A | 34.941428 | 35.561865 |
MTL | 34.586403 | 35.622235 |
WSH | 34.732211 | 35.654820 |
DAL | 36.015360 | 35.655394 |
CAR | 36.121785 | 35.702802 |
TOR | 32.979968 | 35.775225 |
CGY | 36.037519 | 36.258758 |
T.B | 34.318686 | 36.449372 |
VGK | 34.737839 | 36.491532 |
NYI | 37.686836 | 36.756656 |
COL | 34.988536 | 36.862748 |
ARI | 35.588372 | 36.873783 |
FLA | 36.392375 | 36.958453 |
CBJ | 38.090698 | 37.245836 |
EDM | 38.198247 | 37.329866 |
VAN | 36.457876 | 37.376298 |
BOS | 33.265149 | 37.584670 |
S.J | 37.499523 | 37.637475 |
WPG | 39.252447 | 39.239246 |
BUF | 37.702187 | 39.468667 |
MIN | 37.545626 | 39.570156 |
PHI | 37.190102 | 39.699116 |
OTT | 38.729001 | 40.723540 |
# Join roster lookups and manpower/handedness columns, then add each shooter's
# cumulative shooting record.
shot_data_all = lookups_data_clean(shot_data_all)
shot_data_all2 = cumulative_shooting_talent(shot_data_all)
# Feature names passed to feature_generation below.
model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time',
'LastEV_Off_Faceoff', 'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff',
'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State',
'Handed_Class2_Opposite',
'Player_Position2_F', 'Shot_Distance',
'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2',
'Shot_Angle^3']
# feature_generation prints the shape of the table it returns.
model_data = feature_generation(shot_data_all2, model_vars = model_vars)
# An earlier run gave (744586, 30); the run recorded in this file printed (753814, 36).
model_data.head()
(753814, 36)
season | Goal | Results_inRebound | EmptyNet_SA | is_Rebound | is_Rush | LN_Last_Event_Time | LastEV_Off_Faceoff | LastEV_Def_Faceoff | LastEV_Neu_Faceoff | LastEV_Off_Shot | LastEV_Def_Shot | LastEV_Neu_Shot | LastEV_Off_Give | LastEV_Def_Give | LastEV_Neu_Give | LN_Rebound_Distance_Traveled_byAngle | Regressed_Shooting_Indexed | Type_BACKHAND | Type_DEFLECTED | Type_SLAP SHOT | Type_WRAP-AROUND | Type_WRIST SHOT | Shooter_State | Goalie_State | Handed_Class2_Opposite | Handed_Class2_Same | Handed_Class2_U | Player_Position2_D | Player_Position2_F | Shot_Distance | Shot_Distance^2 | Shot_Distance^3 | Shot_Angle | Shot_Angle^2 | Shot_Angle^3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 2.197336 | 2.197336 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 1 | 0 | 0 | 5 | 5 | 1 | 0 | 0 | 0 | 1 | 40.311289 | 1625.0 | 65505.844205 | 66.614779 | 4437.528774 | 295604.998305 |
1 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 1.098946 | 0.000000 | 0.0 | 0.0 | 1.098946 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 1 | 0 | 0 | 5 | 5 | 0 | 1 | 0 | 1 | 0 | 58.137767 | 3380.0 | 196505.653863 | 63.434949 | 4023.992732 | 255261.773029 |
2 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 2.302685 | 2.302685 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 0 | 0 | 1 | 5 | 5 | 0 | 1 | 0 | 0 | 1 | 31.016125 | 962.0 | 29837.512095 | 1.847610 | 3.413664 | 6.307120 |
3 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 3.637612 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.99734 | 0 | 0 | 1 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 1 | 40.311289 | 1625.0 | 65505.844205 | 82.874984 | 6868.262915 | 569207.176806 |
4 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 1.386544 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.386544 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 0 | 0 | 1 | 3 | 5 | 1 | 0 | 0 | 0 | 1 | 37.443290 | 1402.0 | 52495.493216 | 55.885527 | 3123.192134 | 174541.238511 |
Modeling two seasons at a time lets the model adjust to changing goalie performance and shot-recorder bias; modeling the entire period at once results in fewer goals relative to expected in later seasons. Two seasons are used rather than one so that the block containing 2017-18 has enough data to be stable.
# Label every row with its two-season modelling block; seasons outside
# 2010-11 .. 2017-18 get 0.
model_data['season_model'] = model_data.apply(lambda x: '2011_2012' if x.season in ['20102011','20112012'] else
'2013_2014' if x.season in ['20122013','20132014'] else
'2015_2016' if x.season in ['20142015','20152016'] else
'2017_2018' if x.season in ['20162017','20172018'] else 0, axis = 1)
# Preview; the frame now carries one extra column (season_model).
model_data.head()
season | Goal | Results_inRebound | EmptyNet_SA | is_Rebound | is_Rush | LN_Last_Event_Time | LastEV_Off_Faceoff | LastEV_Def_Faceoff | LastEV_Neu_Faceoff | LastEV_Off_Shot | LastEV_Def_Shot | LastEV_Neu_Shot | LastEV_Off_Give | LastEV_Def_Give | LastEV_Neu_Give | LN_Rebound_Distance_Traveled_byAngle | Regressed_Shooting_Indexed | Type_BACKHAND | Type_DEFLECTED | Type_SLAP SHOT | Type_WRAP-AROUND | Type_WRIST SHOT | Shooter_State | Goalie_State | Handed_Class2_Opposite | Handed_Class2_Same | Handed_Class2_U | Player_Position2_D | Player_Position2_F | Shot_Distance | Shot_Distance^2 | Shot_Distance^3 | Shot_Angle | Shot_Angle^2 | Shot_Angle^3 | season_model | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 2.197336 | 2.197336 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 1 | 0 | 0 | 5 | 5 | 1 | 0 | 0 | 0 | 1 | 40.311289 | 1625.0 | 65505.844205 | 66.614779 | 4437.528774 | 295604.998305 | 2011_2012 |
1 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 1.098946 | 0.000000 | 0.0 | 0.0 | 1.098946 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 1 | 0 | 0 | 5 | 5 | 0 | 1 | 0 | 1 | 0 | 58.137767 | 3380.0 | 196505.653863 | 63.434949 | 4023.992732 | 255261.773029 | 2011_2012 |
2 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 2.302685 | 2.302685 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 0 | 0 | 1 | 5 | 5 | 0 | 1 | 0 | 0 | 1 | 31.016125 | 962.0 | 29837.512095 | 1.847610 | 3.413664 | 6.307120 | 2011_2012 |
3 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 3.637612 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.99734 | 0 | 0 | 1 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 1 | 40.311289 | 1625.0 | 65505.844205 | 82.874984 | 6868.262915 | 569207.176806 | 2011_2012 |
4 | 20102011 | 0 | 0.0 | 0 | 0 | 0 | 1.386544 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.386544 | 0.0 | 0.0 | 0.0 | 1.00000 | 0 | 0 | 0 | 0 | 1 | 3 | 5 | 1 | 0 | 0 | 0 | 1 | 37.443290 | 1402.0 | 52495.493216 | 55.885527 | 3123.192134 | 174541.238511 | 2011_2012 |
## Check block counts: number of rows with a non-null Goal in each two-season block
model_data.groupby(['season_model'])['Goal'].count()
season_model 2011_2012 218979 2013_2014 175529 2015_2016 217610 2017_2018 141696 Name: Goal, dtype: int64
Create a function that fits and scores the xG model on model_vars and returns its ROC AUC, then fits and scores the xR (rebound) model and returns its ROC AUC. Save both models and print their coefficients. Both models use logistic regression with 10-fold cross-validation.
def _new_logit_cv():
    """Build the cross-validated logistic regression used for both xG and xR."""
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # sklearn.model_selection has provided KFold since 0.18.
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.model_selection import KFold

    # NOTE(review): tol=10 is an unusually loose stopping tolerance; it is
    # kept unchanged so the fitted models stay comparable -- confirm intent.
    return LogisticRegressionCV(
        Cs=list(np.power(10.0, np.arange(-10, 10))),
        penalty='l2',
        scoring='roc_auc',
        cv=KFold(n_splits=10, shuffle=True, random_state=777),
        random_state=777,
        max_iter=10000,
        fit_intercept=True,
        solver='newton-cg',
        tol=10,
    )


def _pickle_model(model, filename):
    """Serialize a fitted model to `filename`, closing the file afterwards."""
    import pickle

    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)


def All_Model_Scoring(model_data, data, szn):
    """Fit, save and apply the xG and xR models for one season block.

    The xG model is a 10-fold cross-validated L2 logistic regression of
    ``Goal`` on ``model_vars``. The xR model uses the same estimator settings
    to regress ``Results_inRebound`` on the same features plus the in-sample
    xG prediction (``xG_raw``).

    Parameters
    ----------
    model_data : pandas.DataFrame
        Feature table; must contain ``season_model``, ``Goal``,
        ``Results_inRebound`` and every column named in ``model_vars``.
        Missing values in the selected block are replaced with 0.
    data : pandas.DataFrame
        Shot-level table with a ``season_model`` and a ``Goal`` column; its
        rows for the block are written out next to the predictions.
    szn : str
        Value of ``season_model`` identifying the block, e.g. ``'2017_2018'``.

    Returns
    -------
    pandas.DataFrame
        Transposed table of the xR model coefficients (rows ``Variable`` and
        ``Coef``), ordered by descending coefficient.

    Side effects
    ------------
    Prints summary statistics, pickles the two fitted models to
    ``xG_Model_<szn>_obj.sav`` and ``xR_Model_<szn>_obj.sav``, and writes
    ``scored_data<szn>.csv``.
    """
    print(szn)

    model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time',
                  'LastEV_Off_Faceoff', 'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff',
                  'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
                  'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
                  'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
                  'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
                  'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State',
                  'Handed_Class2_Opposite',
                  'Player_Position2_F', 'Shot_Distance',
                  'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2',
                  'Shot_Angle^3']
    # The rebound model sees the xG prediction first, then the same features.
    rebound_vars = ['xG_raw'] + model_vars

    ## Subset data to block, create model data, convert to matrix
    szn_data = data.loc[data.season_model == szn, :]
    szn_model_data = model_data.loc[model_data.season_model == szn, :].fillna(0)
    # `.values` / builtin `float` replace the removed `as_matrix()` / `np.float`.
    szn_model_mat = szn_model_data.loc[:, model_vars].values.astype(float)

    ### Train xG Model
    goal = szn_model_data.Goal
    print(str(szn) + ' seasons dimensions: ' + str(szn_model_mat.shape))
    print(str(szn) + ' seasons shooting%: ' + str(sum(goal) / len(goal)))

    xG_model_CV = _new_logit_cv()
    xG_model_CV.fit(szn_model_mat, goal)
    _pickle_model(xG_model_CV, 'xG_Model_' + str(szn) + '_obj.sav')

    # scores_ is keyed by class label; key 1 holds the per-fold, per-C AUCs.
    print(str(szn) + 'Max auc_roc:', xG_model_CV.scores_[1].max())

    ## Score Model (in-sample probability of the positive class)
    xG_raw = xG_model_CV.predict_proba(szn_model_mat)[:, 1]
    print(str(szn) + ' seasons goals: ' + str(sum(goal)) + ', season xG: ' + str(sum(xG_raw)))

    ### Assemble data and train xRebound Model
    # szn_model_data was already fillna(0)'d, so no further NaN handling needed.
    rebound = szn_model_data.Results_inRebound
    print(str(szn) + ' goals scored: ' + str(sum(szn_data.Goal)))
    print(str(szn) + ' xG scored: ' + str(sum(xG_raw)))
    # Printed before the matrix is rebuilt, so this is still the xG feature shape.
    print(str(szn) + ' seasons dimensions: ' + str(szn_model_mat.shape))
    print(str(szn) + ' seasons rebound%: ' + str(sum(rebound) / len(rebound)))

    xg_frame = pd.DataFrame(xG_raw, columns=['xG_raw'])
    szn_model_mat = pd.concat(
        [szn_model_data.reset_index(drop=True), xg_frame],
        axis=1,
    ).loc[:, rebound_vars].values

    xR_model_CV = _new_logit_cv()
    xR_model_CV.fit(szn_model_mat, rebound)
    _pickle_model(xR_model_CV, 'xR_Model_' + str(szn) + '_obj.sav')

    print(str(szn) + ' Max auc_roc:', xR_model_CV.scores_[1].max())

    xR_raw = xR_model_CV.predict_proba(szn_model_mat)[:, 1]
    print(str(szn) + ' seasons rebounds: ' + str(sum(rebound)) + ', season xR: ' + str(sum(xR_raw)))

    # coef_ has shape (1, n_features); transposing pairs each variable with a
    # one-element array, which is how the coefficients have always been shown.
    coefs = pd.DataFrame(list(zip(np.array(rebound_vars), xR_model_CV.coef_.T)),
                         columns=['Variable', 'Coef']).sort_values(['Coef'], ascending=False)

    # Predictions are joined positionally -- assumes `data` and `model_data`
    # hold the block's rows in the same order; TODO confirm upstream.
    scored_data = pd.concat([
        xg_frame,
        pd.DataFrame(xR_raw, columns=['xR']),
        szn_data.reset_index(drop=True),
    ], axis=1)
    scored_data.to_csv("scored_data" + str(szn) + ".csv", index=False)

    return coefs.T
# Fit, save and score the xG and xR models for the 2017_2018 block; the returned xR coefficient table is the cell output.
All_Model_Scoring(model_data, shot_data_all2, '2017_2018')
2017_2018 2017_2018 seasons dimensions: (141696, 30) 2017_2018 seasons shooting%: 0.0633539408311 2017_2018Max auc_roc: 0.775398676368 2017_2018 seasons goals: 8977, season xG: 8976.17408044 2017_2018 goals scored: 8977 2017_2018 xG scored: 8976.17408044 2017_2018 seasons dimensions: (141696, 30) 2017_2018 seasons rebound%: 0.0323932926829 2017_2018 Max auc_roc: 0.679599683931 2017_2018 seasons rebounds: 4590.0, season xR: 4591.12339643
21 | 2 | 19 | 9 | 14 | 18 | 3 | 20 | 15 | 23 | 8 | 5 | 11 | 26 | 29 | 30 | 27 | 28 | 4 | 25 | 12 | 16 | 6 | 0 | 24 | 22 | 10 | 13 | 1 | 17 | 7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Variable | Shooter_State | is_Rebound | Type_WRAP-AROUND | LastEV_Def_Shot | LN_Rebound_Distance_Traveled_byAngle | Type_SLAP SHOT | is_Rush | Type_WRIST SHOT | Regressed_Shooting_Indexed | Handed_Class2_Opposite | LastEV_Off_Shot | LastEV_Off_Faceoff | LastEV_Off_Give | Shot_Distance^2 | Shot_Angle^2 | Shot_Angle^3 | Shot_Distance^3 | Shot_Angle | LN_Last_Event_Time | Shot_Distance | LastEV_Def_Give | Type_BACKHAND | LastEV_Def_Faceoff | xG_raw | Player_Position2_F | Goalie_State | LastEV_Neu_Shot | LastEV_Neu_Give | EmptyNet_SA | Type_DEFLECTED | LastEV_Neu_Faceoff |
Coef | [0.443230762862] | [0.242593069594] | [0.236305225426] | [0.196529997088] | [0.0938102952462] | [0.0880709042958] | [0.0704929456784] | [0.0687385433112] | [0.0545326544149] | [0.0411686094917] | [0.0204143408809] | [0.00932446948272] | [0.00491453331725] | [0.000910514894216] | [0.000151931517311] | [-4.32632277469e-07] | [-3.29846116297e-06] | [-0.00738119577147] | [-0.0102381420226] | [-0.0786000143722] | [-0.122034340806] | [-0.141906765043] | [-0.144939916331] | [-0.172781889777] | [-0.215843836221] | [-0.317256602872] | [-0.377557564797] | [-0.392747493842] | [-0.512845561177] | [-0.549577304022] | [-0.716628355712] |
# Fit, save and score the xG and xR models for the 2015_2016 block; the returned xR coefficient table is the cell output.
All_Model_Scoring(model_data, shot_data_all2, '2015_2016')
2015_2016 2015_2016 seasons dimensions: (217610, 30) 2015_2016 seasons shooting%: 0.0625292955287 2015_2016Max auc_roc: 0.777002681465 2015_2016 seasons goals: 13607, season xG: 13605.6948619 2015_2016 goals scored: 13607 2015_2016 xG scored: 13605.6948619 2015_2016 seasons dimensions: (217610, 30) 2015_2016 seasons rebound%: 0.030462754469 2015_2016 Max auc_roc: 0.676234656964 2015_2016 seasons rebounds: 6629.0, season xR: 6629.73767749
21 | 2 | 19 | 18 | 14 | 20 | 23 | 3 | 28 | 5 | 26 | 30 | 27 | 29 | 4 | 11 | 9 | 8 | 15 | 25 | 16 | 12 | 6 | 24 | 0 | 22 | 13 | 10 | 1 | 17 | 7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Variable | Shooter_State | is_Rebound | Type_WRAP-AROUND | Type_SLAP SHOT | LN_Rebound_Distance_Traveled_byAngle | Type_WRIST SHOT | Handed_Class2_Opposite | is_Rush | Shot_Angle | LastEV_Off_Faceoff | Shot_Distance^2 | Shot_Angle^3 | Shot_Distance^3 | Shot_Angle^2 | LN_Last_Event_Time | LastEV_Off_Give | LastEV_Def_Shot | LastEV_Off_Shot | Regressed_Shooting_Indexed | Shot_Distance | Type_BACKHAND | LastEV_Def_Give | LastEV_Def_Faceoff | Player_Position2_F | xG_raw | Goalie_State | LastEV_Neu_Give | LastEV_Neu_Shot | EmptyNet_SA | Type_DEFLECTED | LastEV_Neu_Faceoff |
Coef | [0.434352784035] | [0.217387469412] | [0.214124051205] | [0.171990320473] | [0.13015520102] | [0.115009004209] | [0.0473993716049] | [0.016660341811] | [0.00603524793123] | [0.00288536951206] | [0.000794841008729] | [4.33940532904e-07] | [-2.53664345805e-06] | [-4.37267199967e-05] | [-0.00227667337587] | [-0.00596937422759] | [-0.0164444195784] | [-0.0333510073401] | [-0.0632778157823] | [-0.0745497498176] | [-0.0918397745913] | [-0.137538495271] | [-0.222625390936] | [-0.229699315968] | [-0.261292995408] | [-0.355609845331] | [-0.376252326754] | [-0.405605383995] | [-0.509204826468] | [-0.535699799812] | [-0.566667016696] |
# Fit, save and score the xG and xR models for the 2013_2014 block; the returned xR coefficient table is the cell output.
All_Model_Scoring(model_data, shot_data_all2, '2013_2014')
2013_2014 2013_2014 seasons dimensions: (175529, 30) 2013_2014 seasons shooting%: 0.0626904955876 2013_2014Max auc_roc: 0.770487378959 2013_2014 seasons goals: 11004, season xG: 11002.8547599 2013_2014 goals scored: 11004 2013_2014 xG scored: 11002.8547599 2013_2014 seasons dimensions: (175529, 30) 2013_2014 seasons rebound%: 0.0305818411772 2013_2014 Max auc_roc: 0.654241854268 2013_2014 seasons rebounds: 5368.0, season xR: 5369.87685727
21 | 19 | 18 | 20 | 14 | 2 | 6 | 5 | 11 | 8 | 28 | 26 | 29 | 30 | 27 | 4 | 3 | 23 | 9 | 16 | 25 | 12 | 15 | 0 | 24 | 13 | 22 | 7 | 10 | 1 | 17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Variable | Shooter_State | Type_WRAP-AROUND | Type_SLAP SHOT | Type_WRIST SHOT | LN_Rebound_Distance_Traveled_byAngle | is_Rebound | LastEV_Def_Faceoff | LastEV_Off_Faceoff | LastEV_Off_Give | LastEV_Off_Shot | Shot_Angle | Shot_Distance^2 | Shot_Angle^2 | Shot_Angle^3 | Shot_Distance^3 | LN_Last_Event_Time | is_Rush | Handed_Class2_Opposite | LastEV_Def_Shot | Type_BACKHAND | Shot_Distance | LastEV_Def_Give | Regressed_Shooting_Indexed | xG_raw | Player_Position2_F | LastEV_Neu_Give | Goalie_State | LastEV_Neu_Faceoff | LastEV_Neu_Shot | EmptyNet_SA | Type_DEFLECTED |
Coef | [0.421541250626] | [0.195991297068] | [0.180731390782] | [0.14615833415] | [0.101966515186] | [0.0995412193419] | [0.0874812352645] | [0.0320194794607] | [0.00778680101523] | [0.00497227573604] | [0.00173310331803] | [0.000885809270975] | [0.000119358110981] | [-9.5847370416e-07] | [-2.92774036083e-06] | [-0.0104735349129] | [-0.0149812990545] | [-0.0217751440318] | [-0.0265930399084] | [-0.0665984909775] | [-0.0780029052968] | [-0.0879427972701] | [-0.129339881986] | [-0.199616920194] | [-0.203785946043] | [-0.276553511017] | [-0.292844672162] | [-0.310902202645] | [-0.32141440785] | [-0.395679564539] | [-0.623100975278] |
# Fit, save and score the xG and xR models for the 2011_2012 block; the returned xR coefficient table is the cell output.
All_Model_Scoring(model_data, shot_data_all2, '2011_2012')
2011_2012 2011_2012 seasons dimensions: (218979, 30) 2011_2012 seasons shooting%: 0.0630745413944 2011_2012Max auc_roc: 0.781397035575 2011_2012 seasons goals: 13812, season xG: 13811.9586764 2011_2012 goals scored: 13812 2011_2012 xG scored: 13811.9586764 2011_2012 seasons dimensions: (218979, 30) 2011_2012 seasons rebound%: 0.0304732417264 2011_2012 Max auc_roc: 0.662957428347 2011_2012 seasons rebounds: 6673.0, season xR: 6674.41284226
21 | 18 | 20 | 19 | 14 | 23 | 2 | 5 | 11 | 8 | 26 | 29 | 30 | 27 | 28 | 9 | 12 | 4 | 15 | 3 | 25 | 16 | 6 | 0 | 24 | 22 | 13 | 10 | 1 | 7 | 17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Variable | Shooter_State | Type_SLAP SHOT | Type_WRIST SHOT | Type_WRAP-AROUND | LN_Rebound_Distance_Traveled_byAngle | Handed_Class2_Opposite | is_Rebound | LastEV_Off_Faceoff | LastEV_Off_Give | LastEV_Off_Shot | Shot_Distance^2 | Shot_Angle^2 | Shot_Angle^3 | Shot_Distance^3 | Shot_Angle | LastEV_Def_Shot | LastEV_Def_Give | LN_Last_Event_Time | Regressed_Shooting_Indexed | is_Rush | Shot_Distance | Type_BACKHAND | LastEV_Def_Faceoff | xG_raw | Player_Position2_F | Goalie_State | LastEV_Neu_Give | LastEV_Neu_Shot | EmptyNet_SA | LastEV_Neu_Faceoff | Type_DEFLECTED |
Coef | [0.383646161431] | [0.256764086948] | [0.142488000303] | [0.113918135507] | [0.0975769536292] | [0.0559406316352] | [0.0344296756424] | [0.0182337373267] | [0.0143922911474] | [0.00225758886072] | [0.00085221817346] | [0.000243772347851] | [-1.78506003457e-06] | [-2.72353916895e-06] | [-0.00345344999923] | [-0.00861552976001] | [-0.0195882791768] | [-0.0211591668439] | [-0.0297224028101] | [-0.0498855393577] | [-0.0789407644771] | [-0.0912119624353] | [-0.128691218921] | [-0.204174082684] | [-0.271887137976] | [-0.290430989778] | [-0.366768312254] | [-0.389375254075] | [-0.397079300866] | [-0.510863084948] | [-0.526980726973] |