#!/usr/bin/env python
# coding: utf-8

# In[1]:

import scrape_functions
from datetime import date, timedelta
import json_pbp
import html_pbp
import seaborn as sbs
import espn_pbp
import json_shifts
import html_shifts
import playing_roster
import json_schedule
import pandas as pd
import time
import numpy as np
import datetime
import warnings
import shared
import pickle

# pip install mysql-connector-python-rf
import mysql.connector
from mysql.connector import Error
from sqlalchemy import create_engine

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 999)
pd.set_option('display.max_rows', None)


# ### Create Rink Adjust Object

# In[2]:

import matplotlib.pyplot as plt
import thinkbayes2 as tb


class RinkAdjust(object):

    def __init__(self):
        self.teamxcdf, self.teamycdf, self.otherxcdf, self.otherycdf = {}, {}, {}, {}

    def addCDFs(self, team, this_x_cdf, this_y_cdf, other_x_cdf, other_y_cdf):
        self.teamxcdf[team] = this_x_cdf
        self.teamycdf[team] = this_y_cdf
        self.otherxcdf[team] = other_x_cdf
        self.otherycdf[team] = other_y_cdf

    def addTeam(self, team, this_team, rest_of_league):
        this_x_cdf = tb.MakeCdfFromPmf(tb.MakePmfFromList(this_team.X_unadj))
        this_y_cdf = tb.MakeCdfFromPmf(tb.MakePmfFromList(this_team.Y_unadj))
        other_x_cdf = tb.MakeCdfFromPmf(tb.MakePmfFromList(rest_of_league.X_unadj))
        other_y_cdf = tb.MakeCdfFromPmf(tb.MakePmfFromList(rest_of_league.Y_unadj))
        self.addCDFs(team, this_x_cdf, this_y_cdf, other_x_cdf, other_y_cdf)

    def PlotTeamCDFs(self, team, savefig=False):
        this_x_cdf = self.teamxcdf[team]
        this_y_cdf = self.teamycdf[team]
        other_x_cdf = self.otherxcdf[team]
        other_y_cdf = self.otherycdf[team]

        f, axx = plt.subplots(1, 2, sharey='col')
        f.set_size_inches(14, 8)

        xx1, yx1 = this_x_cdf.Render()
        xx2, yx2 = other_x_cdf.Render()
        axx[0].plot(xx1, yx1, color='blue', label='@%s' % team)
        axx[0].plot(xx2, yx2, color='brown', label='@Rest of League')
        axx[0].set_xlabel('CDF of X')
        axx[0].legend()

        xy1, yy1 = this_y_cdf.Render()
        xy2, yy2 = other_y_cdf.Render()
        axx[1].plot(xy1, yy1, color='blue', label='@%s' % team)
        axx[1].plot(xy2, yy2, color='brown', label='@Rest of League')
        axx[1].set_xlabel('CDF of Y')
        axx[1].legend()

        f.suptitle('Cumulative Density Function for Shot Location Rink Bias Adjustment')
        plt.show()

        if savefig:
            #f.set_tight_layout( True )
            plt.savefig('Rink bias CDF chart %s.png' % team)

    def rink_bias_adjust(self, x, y, team):
        """Convert a shot location measured in a specific rink to a league-equivalent location.

        For a given shot location in a specific rink, find the cumulative
        probabilities of that x and y in that rink, then calculate the
        league-equivalent x and y that have those same cumulative
        probabilities in the rest of the league.

        The equivalency CDFs are built from visiting teams only, which gives
        both the single rink and the league-wide sample as wide a mix of
        teams as possible while avoiding any home-team bias, so they can be
        assumed unbiased enough to be representative (at least for
        standardization purposes).

        This is an adaptation of Schuckers' method for rink bias adjustment
        as described in Appendix A here:
        http://www.sloansportsconference.com/wp-content/uploads/2013/Total%20Hockey%20Rating%20(THoR)%20A%20comprehensive%20statistical%20rating%20of%20National%20Hockey%20League%20forwards%20and%20defensemen%20based%20upon%20all%20on-ice%20events.pdf

        For example, if a shot x coordinate is measured as xmeas in a rink:

            xprob = this_x_cdf.Prob(xmeas)    # cum prob of seeing xmeas in this rink
            xadj  = other_x_cdf.Value(xprob)  # value with same cum prob in rest of league

        The process for y is analogous. The code for Cdf/Pmf creation and
        manipulation is taken directly from Allen Downey's code for "Think Bayes".
        """
        xprob = self.teamxcdf[team].Prob(x)
        newx = self.otherxcdf[team].Value(xprob)
        yprob = self.teamycdf[team].Prob(y)
        newy = self.otherycdf[team].Value(yprob)

        return newx, newy
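
# In[ ]:

## A minimal usage sketch of RinkAdjust on synthetic shot locations (toy data;
## the team key 'TOR' and the normal distributions are illustrative only, and
## real usage appears inside transform_data below):
rng = np.random.RandomState(0)
toy_rink = pd.DataFrame({'X_unadj': rng.normal(60, 10, 500).round(),
                         'Y_unadj': rng.normal(2, 12, 500).round()})
toy_league = pd.DataFrame({'X_unadj': rng.normal(58, 10, 5000).round(),
                           'Y_unadj': rng.normal(0, 12, 5000).round()})

toy_adjuster = RinkAdjust()
toy_adjuster.addTeam('TOR', toy_rink, toy_league)

## Map a shot recorded at (60, 2) in this rink to its league-equivalent location
newx, newy = toy_adjuster.rink_bias_adjust(60, 2, 'TOR')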


# ## Create Function to Transform Raw NHL PBP Data
#
# NHL PBP data from scraper: https://github.com/HarryShomer/Hockey-Scraper

# In[3]:

def transform_data(data):

    import warnings
    warnings.simplefilter("ignore")

    pbp_df = data
    print("All events and columns: " + str(pbp_df.shape))

    ## Tag seasons: games before September belong to the season that started the prior year
    pbp_df['season'] = pbp_df.apply(
        lambda x: str(pd.to_datetime(x.Date).year - 1) + str(pd.to_datetime(x.Date).year)
        if pd.to_datetime(x.Date).month < 9
        else str(pd.to_datetime(x.Date).year) + str(pd.to_datetime(x.Date).year + 1), axis=1)

    ## Game_Id >= 30000 indicates a playoff game
    pbp_df['season2'] = pbp_df.apply(lambda x: x.season if x.Game_Id < 30000 else str(x.season) + "p", axis=1)
    pbp_df['Season_Type'] = pbp_df.apply(lambda x: 'RS' if x.Game_Id < 30000 else 'PO', axis=1)

    pbp_df['season_model'] = pbp_df.apply(lambda x: '2011_2012' if x.season in ['20102011', '20112012'] else
                                                    '2013_2014' if x.season in ['20122013', '20132014'] else
                                                    '2015_2016' if x.season in ['20142015', '20152016'] else
                                                    '2017_2018' if x.season in ['20162017', '20172018'] else 0, axis=1)

    pbp_df = pbp_df.drop_duplicates(['season', 'Game_Id', 'Period', 'Ev_Team', 'Seconds_Elapsed'])
    pbp_df = pbp_df.sort_values(['season', 'Game_Id', 'Period', 'Seconds_Elapsed'], ascending=True)

    ## Remove shootouts (period 5 of regular-season games)
    pbp_df = pbp_df.loc[~((pbp_df.Period == 5) & (pbp_df.Season_Type == "RS")), :]

    ## Group giveaways/takeaways together, and collapse similar shot types
    pbp_df['Event'] = pbp_df['Event'].apply(lambda x: 'TURN' if x in ["GIVE", "TAKE"] else x)
    pbp_df['Type'] = pbp_df['Type'].apply(lambda x: 'DEFLECTED' if x in ["DEFLECTED", "TIP-IN"] else
                                                    'WRIST SHOT' if x in ["WRIST SHOT", "SNAP SHOT"] else x)

    ## Lag events within game and period so lag times never cross periods
    pbp_df = pbp_df.sort_values(['season', 'Game_Id', 'Period', 'Seconds_Elapsed'], ascending=True)
    pbp_df['lagged_Event'] = pbp_df.groupby(['Game_Id', 'Period'])['Event'].shift(1)
    pbp_df['lagged_Ev_Zone'] = pbp_df.groupby(['Game_Id', 'Period'])['Ev_Zone'].shift(1)
    pbp_df['lagged_Seconds_Elapsed'] = pbp_df.groupby(['Game_Id', 'Period'])['Seconds_Elapsed'].shift(1)

    #############################################
    ### Subset to just shots
    #############################################
    pbp_df = pbp_df.loc[pbp_df.Event.isin(["SHOT", "GOAL", "MISS", "BLOCK"]), :]
    print("All shots/blocks and columns: " + str(pbp_df.shape))

    ## Binary flags
    pbp_df['Goal'] = pbp_df.apply(lambda x: 1 if x.Event == "GOAL" else 0, axis=1)

    pbp_df['EmptyNet_SA'] = pbp_df.apply(lambda x: 1 if ((pd.isnull(x.Home_Goalie)) & (x.Ev_Team == x.Away_Team)) |
                                                        ((pd.isnull(x.Away_Goalie)) & (x.Ev_Team == x.Home_Team))
                                                   else 0, axis=1)

    pbp_df['is_Rebound'] = pbp_df.apply(lambda x: 1 if (x.lagged_Event in ["SHOT"]) &
                                                       ((x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 2)
                                                  else 0, axis=1)

    pbp_df['is_Bounce'] = pbp_df.apply(lambda x: 1 if (x.lagged_Event in ["BLOCK", "MISS"]) &
                                                      ((x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 2)
                                                 else 0, axis=1)

    pbp_df['is_Rush'] = pbp_df.apply(lambda x: 1 if (x.Ev_Zone != x.lagged_Ev_Zone) &
                                                    ((x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 6)
                                               else 0, axis=1)
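
    ## Worked example of the flags above: a SHOT at 30s followed by another
    ## attempt at 31s of the same period gets is_Rebound = 1; an attempt within
    ## 6 seconds of an event in a different zone (e.g. a neutral-zone turnover
    ## followed quickly by a shot) gets is_Rush = 1.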

    ## Replace every occurrence of PHX with ARI
    pbp_df['Home_Team'] = pbp_df.apply(lambda x: x.Home_Team if x.Home_Team != 'PHX' else 'ARI', axis=1)
    pbp_df['Away_Team'] = pbp_df.apply(lambda x: x.Away_Team if x.Away_Team != 'PHX' else 'ARI', axis=1)
    pbp_df['Ev_Team'] = pbp_df.apply(lambda x: x.Ev_Team if x.Ev_Team != 'PHX' else 'ARI', axis=1)

    ## Replace every occurrence of ATL with WPG
    pbp_df['Home_Team'] = pbp_df.apply(lambda x: x.Home_Team if x.Home_Team != 'ATL' else 'WPG', axis=1)
    pbp_df['Away_Team'] = pbp_df.apply(lambda x: x.Away_Team if x.Away_Team != 'ATL' else 'WPG', axis=1)
    pbp_df['Ev_Team'] = pbp_df.apply(lambda x: x.Ev_Team if x.Ev_Team != 'ATL' else 'WPG', axis=1)

    ## Add a 'Direction' column to indicate the primary direction of shots. The
    ## heuristic is the sign of the median X coordinate of each side's shots in
    ## each period, which lets us flip shots that originate back in the
    ## defensive zone when the signs don't match.
    pbp_df['Home_Shooter'] = pbp_df.apply(lambda x: 1 if x.Ev_Team == x.Home_Team else 0, axis=1)

    game_period_locations = pbp_df.groupby(by=['season', 'Game_Id', 'Period', 'Home_Shooter'])[['xC', 'yC']]
    game_period_medians = game_period_locations.transform(np.median)
    pbp_df['Direction'] = np.sign(game_period_medians['xC'])

    ## Normalize all shots to attack in the positive-x direction
    ## (should cache to CSV here, as everything up to this point is the
    ## performance-intensive part)
    pbp_df['X_unadj'], pbp_df['Y_unadj'] = zip(*pbp_df.apply(
        lambda x: (x.xC, x.yC) if x.Direction > 0 else (-x.xC, -x.yC), axis=1))

    pbp_df['LS_Shot'] = pbp_df.apply(lambda x: 1 if x.Y_unadj < 0 else 0, axis=1)

    ## Logged time since last event
    pbp_df['LN_Last_Event_Time'] = pbp_df.apply(
        lambda x: 0 if (x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 0
        else np.log(x.Seconds_Elapsed - x.lagged_Seconds_Elapsed + 0.001), axis=1)

    ## Last-event interactions: logged elapsed time, split by event zone and
    ## last event type (faceoff / shot attempt / turnover)
    last_ev_types = {'Faceoff': ['FAC'], 'Shot': ['SHOT', 'MISS', 'BLOCK'], 'Give': ['TURN']}
    for zone in ['Off', 'Def', 'Neu']:
        for name, events in last_ev_types.items():
            pbp_df['LastEV_{}_{}'.format(zone, name)] = pbp_df.apply(
                lambda x, z=zone, ev=events: x.LN_Last_Event_Time
                if (x.Ev_Zone == z) & (x.lagged_Event in ev) else 0, axis=1)

    ## Adjust X,Y coordinates by rink, using the CDFs of unblocked shot
    ## attempts only (blocks are recorded at the block location and skew the data)
    pbp_df = pbp_df.loc[pbp_df.Event.isin(["SHOT", "GOAL", "MISS"]), :]

    ## Call RinkAdjust class
    adjuster = RinkAdjust()

    ## New dataframe of adjusted shots for each home rink
    pbp_df_adj = pd.DataFrame()

    ## For each home rink
    for team in sorted(pbp_df.Home_Team.unique()):

        ## Split shots into this team's arena and all other rinks
        rink_shots = pbp_df.loc[pbp_df.Home_Team == team].copy()
        rest_of_league = pbp_df.loc[pbp_df.Home_Team != team]

        ## Create team and rest-of-league CDFs for rink adjustment
        adjuster.addTeam(team, rink_shots, rest_of_league)

        ## For each shot in this rink, adjust coordinates based on other rinks
        Xadj = []
        Yadj = []
        for row in rink_shots.itertuples():
            newx, newy = adjuster.rink_bias_adjust(row.X_unadj, row.Y_unadj, row.Home_Team)
            Xadj.append(newx)
            Yadj.append(newy)

        rink_shots['X'] = Xadj
        rink_shots['Y'] = Yadj

        pbp_df_adj = pbp_df_adj.append(rink_shots)

    print("All shots columns, rink adjusted: " + str(pbp_df_adj.shape))

    ## Distance and angle features, applied only after the x,y CDF adjustment
    ## (net at x = 89, y = 0)
    pbp_df_adj['Shot_Distance_Unadj'] = pbp_df_adj.apply(
        lambda x: ((89 - x.X_unadj) ** 2 + (x.Y_unadj ** 2)) ** 0.5, axis=1)
    pbp_df_adj['Shot_Distance'] = pbp_df_adj.apply(
        lambda x: ((89 - x.X) ** 2 + (x.Y ** 2)) ** 0.5, axis=1)
    pbp_df_adj['Shot_Angle'] = pbp_df_adj.apply(
        lambda x: np.arctan(abs(89 - x.X) / abs(0 - x.Y)) * (180 / np.pi) if x.Y != 0 else 90, axis=1)

    ## Rebound geometry: change in distance and angle from the prior shot
    pbp_df_adj['Last_Shot_Distance'] = pbp_df_adj.groupby(['Game_Id', 'Period', 'Home_Shooter'])['Shot_Distance'].shift(1)
    pbp_df_adj['Last_Shot_Angle'] = pbp_df_adj.groupby(['Game_Id', 'Period', 'Home_Shooter'])['Shot_Angle'].shift(1)
    pbp_df_adj['Last_LS_Shot'] = pbp_df_adj.groupby(['Game_Id', 'Period', 'Home_Shooter'])['LS_Shot'].shift(1)

    pbp_df_adj['Rebound_Distance_Change'] = pbp_df_adj.apply(
        lambda x: x.Last_Shot_Distance + x.Shot_Distance if x.is_Rebound == 1 else 0, axis=1)

    ## If both shots came from the same side of the ice, the angle change is the
    ## difference of the two angles; otherwise the puck crossed the midline and
    ## the angle change is 180 minus both angles
    pbp_df_adj['Rebound_Angle_Change'] = pbp_df_adj.apply(
        lambda x: 0 if x.is_Rebound == 0
        else abs(x.Last_Shot_Angle - x.Shot_Angle) if (x.is_Rebound == 1) & (x.Last_LS_Shot == x.LS_Shot)
        else (180 - x.Last_Shot_Angle - x.Shot_Angle), axis=1)

    pbp_df_adj['Rebound_Distance_Traveled_byAngle'] = pbp_df_adj.apply(
        lambda x: x.Rebound_Distance_Change / x.Rebound_Angle_Change
        if x.Rebound_Angle_Change > 0 else 0, axis=1)

    pbp_df_adj['LN_Rebound_Distance_Traveled_byAngle'] = pbp_df_adj.apply(
        lambda x: np.log(x.Rebound_Distance_Traveled_byAngle)
        if x.Rebound_Distance_Traveled_byAngle > 0 else 0, axis=1)

    print("All shots columns, final calculations: " + str(pbp_df_adj.shape))

    return pbp_df_adj
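
# In[ ]:

## Quick sanity check of the geometry features above (worked example, not from
## the original notebook): a shot from (X, Y) = (54, 20) is
## sqrt((89 - 54)^2 + 20^2) = sqrt(1625), about 40.3 ft from the net, at an
## angle of arctan(35 / 20) * 180 / pi, about 60.3 degrees.
sx, sy = 54, 20
assert abs(((89 - sx) ** 2 + sy ** 2) ** 0.5 - 40.31) < 0.01
assert abs(np.arctan(abs(89 - sx) / abs(sy)) * (180 / np.pi) - 60.26) < 0.01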


# ### Read-in and Stack

# In[4]:

types = {'xC': np.float64, 'yC': np.float64,
         'X': np.float64, 'X_unadj': np.float64,
         'Y': np.float64, 'Y_unadj': np.float64,
         'Game_Id': int}


# In[30]:

nhl_pbp20102011 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20102011.csv', dtype=types)
nhl_pbp20112012 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20112012.csv', dtype=types)
nhl_pbp20122013 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20122013.csv', dtype=types)
nhl_pbp20132014 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20132014.csv', dtype=types)
nhl_pbp20142015 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20142015.csv', dtype=types)
nhl_pbp20152016 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20152016.csv', dtype=types)
nhl_pbp20162017 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20162017.csv', dtype=types)
nhl_pbp20172018 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20172018.csv', dtype=types)

nhl_pbp = pd.concat([nhl_pbp20102011, nhl_pbp20112012, nhl_pbp20122013, nhl_pbp20132014,
                     nhl_pbp20142015, nhl_pbp20152016, nhl_pbp20162017, nhl_pbp20172018])

## Drop index columns written by the scraper ('Unnamed: 0', etc.)
unwanted = nhl_pbp.columns[nhl_pbp.columns.str.startswith('Unna')]
nhl_pbp.drop(unwanted, axis=1, inplace=True)
nhl_pbp.head()


# ## Load Goalie/Skater Roster with Handedness
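
# In[ ]:

## The roster queries below use a SQLAlchemy engine named `engine`, which is
## never created in this notebook. A minimal sketch, assuming a local MySQL
## instance (host, user, and password are placeholders, not from the original):
engine = create_engine('mysql+mysqlconnector://user:password@localhost/nhl_all')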

# In[31]:

player_lookup = pd.read_sql(con=engine, sql="SELECT * FROM `nhl_all`.`hockey_roster_info` AS B")
player_lookup = player_lookup.sort_values('gamesPlayed', ascending=False) \
                             .groupby(['playerId']).first().reset_index() \
                             .loc[:, ['playerBirthDate', 'playerPositionCode', 'playerShootsCatches', 'playerId']]

skater_lookup = player_lookup.loc[player_lookup.playerPositionCode != "G", :]
skater_lookup.columns = ['shooterDOB', 'Player_Position', 'Shoots', 'p1_ID']
skater_lookup['p1_ID'] = skater_lookup['p1_ID'].astype(str)

goalie_lookup = pd.read_sql(con=engine, sql="SELECT DISTINCT playerId as SA_Goalie_Id, "
                                            "playerShootsCatches as Catches, playerBirthDate as goalieDOB "
                                            "FROM `nhl_all`.`hockey_goalies_roster` AS A")
goalie_lookup['SA_Goalie_Id'] = goalie_lookup['SA_Goalie_Id'].astype(str)


# ## Lookup Players, Generate More Features

# In[33]:

def lookups_data_clean(data):

    ## Normalize ID columns to strings for merging
    for col in ['Game_Id', 'Away_Goalie_Id', 'Home_Goalie_Id', 'p1_ID', 'p2_ID', 'p3_ID',
                'awayPlayer1_id', 'awayPlayer2_id', 'awayPlayer3_id', 'awayPlayer4_id', 'awayPlayer5_id', 'awayPlayer6_id',
                'homePlayer1_id', 'homePlayer2_id', 'homePlayer3_id', 'homePlayer4_id', 'homePlayer5_id', 'homePlayer6_id']:
        data[col] = data[col].fillna(0).astype(int).astype(str)

    ## Goalie the shot is against
    data['SA_Goalie'] = data.apply(lambda x: x.Away_Goalie if x.Ev_Team == x.Home_Team else x.Home_Goalie, axis=1)
    data['SA_Goalie_Id'] = data.apply(lambda x: x.Away_Goalie_Id if x.Ev_Team == x.Home_Team else x.Home_Goalie_Id, axis=1)

    ## Skater counts; don't count the goalie as a skater
    data['Away_State'] = data.apply(lambda x: x.Away_Players - 1 if x.Away_Goalie_Id in
                                    [x.awayPlayer6_id, x.awayPlayer5_id, x.awayPlayer4_id, x.awayPlayer3_id]
                                    else x.Away_Players, axis=1)
    data['Home_State'] = data.apply(lambda x: x.Home_Players - 1 if x.Home_Goalie_Id in
                                    [x.homePlayer6_id, x.homePlayer5_id, x.homePlayer4_id, x.homePlayer3_id]
                                    else x.Home_Players, axis=1)

    ## Whether the next event is a rebound shot (look-ahead one row)
    data['Results_inRebound'] = data['is_Rebound'].shift(periods=-1)

    data['Shooter_State'] = data.apply(lambda x: x.Away_State if x.Ev_Team != x.Home_Team else x.Home_State, axis=1)
    data['Goalie_State'] = data.apply(lambda x: x.Away_State if x.Ev_Team == x.Home_Team else x.Home_State, axis=1)

    data['Game_State'] = data.apply(lambda x: str(x.Away_State) + "v" + str(x.Home_State) if x.Ev_Team == x.Home_Team
                                    else str(x.Home_State) + "v" + str(x.Away_State), axis=1)

    data['Game_State'] = data.apply(lambda x: "SH_SA" if x.Game_State in ["3v5", "3v4", "3v6", "4v5", "4v6", "5v6"] else
                                              "PP_2p_SA" if x.Game_State in ["6v3", "6v4", "5v3"] else
                                              "5v5" if x.Game_State in ["5v5", "6v6"] else x.Game_State, axis=1)

    data['State_Space'] = data['Goalie_State'] + data['Shooter_State']
    data['Shooter_State_Advantage'] = data['Shooter_State'] - data['Goalie_State']

    ## Attach shooter and goalie handedness
    data = data.merge(skater_lookup, on=['p1_ID'], how='left')
    data = data.merge(goalie_lookup, on=['SA_Goalie_Id'], how='left')

    data['Shooter_Handedness'] = data.apply(lambda x: "L" if x.Shoots == "L" else
                                                      "R" if x.Shoots == "R" else "U", axis=1)

    data['Handed_Class'] = data['Shoots'].str.cat(data['Catches'], sep='')
    data['Handed_Class2'] = data.apply(lambda x: "Same" if x.Handed_Class in ["LL", "RR"] else
                                                 "Opposite" if x.Handed_Class in ["LR", "RL"] else "U", axis=1)

    data['Player_Position2'] = data.apply(lambda x: "D" if x.Player_Position == "D" else "F", axis=1)

    return data


# ## Cumulative Shooting Function

# In[34]:

def cumulative_shooting_talent(data):

    ## League-average shooting% by position group (index: ['D', 'F'])
    shooting_percentage = data.groupby(['Player_Position2'])['Goal'].mean()

    ## Career-to-date goals and shots for each shooter, excluding the current shot
    data['Cum_Goal'] = data.groupby(['p1_ID'])['Goal'].cumsum()
    data['Cum_Shots'] = data.groupby(['p1_ID']).cumcount()
    data['Cum_Goal'] = data.apply(lambda x: x.Cum_Goal - 1 if x.Event == "GOAL" else x.Cum_Goal, axis=1)

    ## KR-21 stabilization constants (shots), by position
    kr21_stabilizer_F = 375.0
    kr21_stabilizer_D = 275.0

    ## Shooting talent regressed to the positional mean, indexed so 1.0 = league average
    data['Regressed_Shooting_Indexed'] = data.apply(
        lambda x: ((x.Cum_Goal + (kr21_stabilizer_D * shooting_percentage['D'])) /
                   (x.Cum_Shots + kr21_stabilizer_D)) / shooting_percentage['D']
        if x.Player_Position2 == "D"
        else ((x.Cum_Goal + (kr21_stabilizer_F * shooting_percentage['F'])) /
              (x.Cum_Shots + kr21_stabilizer_F)) / shooting_percentage['F'], axis=1)

    return data
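
# In[ ]:

## Worked example of the regression above (toy numbers, not from the original
## notebook): a forward with 30 career goals on 300 prior shots, assuming a
## league forward shooting% of 9%, is indexed at
## ((30 + 375 * 0.09) / (300 + 375)) / 0.09, roughly 1.05, about 5% above average.
assert abs(((30 + 375 * 0.09) / (300 + 375)) / 0.09 - 1.049) < 0.001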


# ## Create Dummy and Polynomial Variables
#
# For each categorical variable, create dummies.
# For shot distance and angle, create 3rd-degree polynomial terms.

# In[35]:

def feature_generation(data,
                       id_vars=["season"],
                       target_vars=['Goal', 'Results_inRebound'],
                       num_vars=["EmptyNet_SA", "is_Rebound", "is_Rush", "LN_Last_Event_Time", "LastEV_Off_Faceoff",
                                 "LastEV_Def_Faceoff", "LastEV_Neu_Faceoff", "LastEV_Off_Shot", "LastEV_Def_Shot",
                                 "LastEV_Neu_Shot", "LastEV_Off_Give", "LastEV_Def_Give", "LastEV_Neu_Give",
                                 "LN_Rebound_Distance_Traveled_byAngle", "Regressed_Shooting_Indexed"],
                       cat_vars=["Type", "Shooter_State", "Goalie_State", "Handed_Class2", "Player_Position2"],
                       poly_vars=["Shot_Distance", "Shot_Angle"],
                       model_vars=['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time', 'LastEV_Off_Faceoff',
                                   'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff', 'LastEV_Off_Shot', 'LastEV_Def_Shot',
                                   'LastEV_Neu_Shot', 'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
                                   'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
                                   'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
                                   'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State', 'Handed_Class2_Opposite',
                                   'Player_Position2_F', 'Shot_Distance', 'Shot_Distance^2', 'Shot_Distance^3',
                                   'Shot_Angle', 'Shot_Angle^2', 'Shot_Angle^3']):

    from sklearn.preprocessing import PolynomialFeatures

    ## Dummy variables
    model_data = data[num_vars].fillna(0)
    for i in cat_vars:
        var_dummies = pd.get_dummies(data.loc[:, [i]])
        model_data = pd.concat([model_data, var_dummies], axis=1)

    ## Polynomial variables; get_feature_names() yields names like 'Shot_Distance^2'
    for i in poly_vars:
        poly_data = data.loc[:, [i]]
        poly = PolynomialFeatures(degree=3, interaction_only=False).fit(poly_data)
        poly_names = poly.get_feature_names(poly_data.columns)
        poly_output = poly.transform(data.loc[:, [i]])
        ## Drop the bias column ('1'), keep the degree 1-3 terms
        model_data = pd.DataFrame(pd.concat([model_data,
                                             pd.DataFrame(poly_output, columns=poly_names).iloc[:, 1:]], axis=1))

    #model_mat = model_data.loc[:, model_vars].as_matrix()
    model_data = pd.concat([data[id_vars], data[target_vars], model_data], axis=1)
    print(model_data.shape)

    return model_data
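
# In[ ]:

## A quick illustration of the polynomial feature naming used in model_vars
## below (toy data; assumes the old scikit-learn API used throughout this
## notebook, where PolynomialFeatures still exposes get_feature_names()):
from sklearn.preprocessing import PolynomialFeatures
toy_poly_data = pd.DataFrame({'Shot_Distance': [10.0, 40.0]})
toy_poly = PolynomialFeatures(degree=3).fit(toy_poly_data)
print(toy_poly.get_feature_names(toy_poly_data.columns))
## -> ['1', 'Shot_Distance', 'Shot_Distance^2', 'Shot_Distance^3']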


# ## Data Pipeline

# In[36]:

shot_data_all = transform_data(nhl_pbp)


# In[48]:

## Check shot distance adjustment by home rink
shot_data_all.loc[shot_data_all.season == "20172018", :] \
    .groupby(['Home_Team'])[['Shot_Distance', 'Shot_Distance_Unadj']] \
    .mean().sort_values(['Shot_Distance_Unadj'])


# In[37]:

shot_data_all = lookups_data_clean(shot_data_all)


# In[38]:

shot_data_all2 = cumulative_shooting_talent(shot_data_all)


# In[40]:

model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time', 'LastEV_Off_Faceoff',
              'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff', 'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
              'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give', 'LN_Rebound_Distance_Traveled_byAngle',
              'Regressed_Shooting_Indexed', 'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
              'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State', 'Handed_Class2_Opposite', 'Player_Position2_F',
              'Shot_Distance', 'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2', 'Shot_Angle^3']

model_data = feature_generation(shot_data_all2, model_vars=model_vars)
#(744586, 30)
model_data.head()


# ## Break Data into 2 Season Blocks
#
# Modeling two seasons at a time lets the model adjust to changing goalie
# performance and shot-recorder bias: fitting the entire period at once
# produces fewer goals relative to expected in the later seasons. Two-season
# blocks are used rather than single seasons to give 2017-18 some stability.

# In[41]:

model_data['season_model'] = model_data.apply(lambda x: '2011_2012' if x.season in ['20102011', '20112012'] else
                                                        '2013_2014' if x.season in ['20122013', '20132014'] else
                                                        '2015_2016' if x.season in ['20142015', '20152016'] else
                                                        '2017_2018' if x.season in ['20162017', '20172018'] else 0, axis=1)
#(744586, 30)
model_data.head()


# In[49]:

## Check block counts
model_data.groupby(['season_model'])['Goal'].count()


# ## Fit and Score xG and xR Models
#
# Create a function that fits the xG model on model_vars and reports its ROC AUC,
# then fits the xR (expected rebound) model, which adds the raw xG as a predictor,
# and reports its ROC AUC. Both models are saved to disk and the xR coefficients
# are returned. Both models use logistic regression with 10-fold cross-validation.

# In[43]:

def All_Model_Scoring(model_data, data, szn):

    print(szn)

    ## Old scikit-learn (pre-0.18) API: KFold(n, n_folds=...) in sklearn.cross_validation
    from sklearn.cross_validation import KFold
    from sklearn.linear_model import LogisticRegressionCV
    import pickle

    model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time', 'LastEV_Off_Faceoff',
                  'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff', 'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
                  'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give', 'LN_Rebound_Distance_Traveled_byAngle',
                  'Regressed_Shooting_Indexed', 'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
                  'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State', 'Handed_Class2_Opposite', 'Player_Position2_F',
                  'Shot_Distance', 'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2', 'Shot_Angle^3']

    ## Same predictors plus the raw xG probability
    rebound_vars = ['xG_raw'] + model_vars

    ## Subset data to the block, create model data, convert to matrix
    szn_data = data.loc[data.season_model == szn, :]
    szn_model_data = model_data.loc[model_data.season_model == szn, :].fillna(0)
    szn_model_mat = szn_model_data.loc[szn_model_data.season_model == szn, model_vars].as_matrix().astype(np.float)

    ### Train xG model
    goal = szn_model_data.Goal
    print(str(szn) + ' seasons dimensions: ' + str(szn_model_mat.shape))
    print(str(szn) + ' seasons shooting%: ' + str(sum(goal) / len(goal)))

    ## xG model
    fold = KFold(len(goal), n_folds=10, shuffle=True, random_state=777)
    xG_model_CV = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),
                                       penalty='l2',
                                       scoring='roc_auc',
                                       cv=fold,
                                       random_state=777,
                                       max_iter=10000,
                                       fit_intercept=True,
                                       solver='newton-cg',
                                       tol=10)

    ## Fit model
    xG_model_CV.fit(szn_model_mat, goal)

    ## Save model
    filename = 'xG_Model_' + str(szn) + '_obj.sav'
    pickle.dump(xG_model_CV, open(filename, 'wb'))

    print(str(szn) + ' max auc_roc: ' + str(xG_model_CV.scores_[1].max()))

    ## Score model
    xG_raw = xG_model_CV.predict_proba(szn_model_mat)[:, 1]
    print(str(szn) + ' seasons goals: ' + str(sum(goal)) + ', season xG: ' + str(sum(xG_raw)))

    ### Assemble data and train xRebound model
    rebound = szn_model_data.Results_inRebound.fillna(0)
    print(str(szn) + ' goals scored: ' + str(sum(szn_data.Goal)))
    print(str(szn) + ' xG scored: ' + str(sum(xG_raw)))
    print(str(szn) + ' seasons dimensions: ' + str(szn_model_mat.shape))
    print(str(szn) + ' seasons rebound%: ' + str(sum(rebound) / len(rebound)))

    fold = KFold(len(rebound), n_folds=10, shuffle=True, random_state=777)

    ## Add the raw xG as a predictor of rebounds
    szn_model_mat = pd.concat([szn_model_data.reset_index(drop=True),
                               pd.DataFrame(xG_raw, columns=['xG_raw']).reset_index(drop=True)],
                              axis=1).loc[:, rebound_vars].as_matrix()

    xR_model_CV = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),
                                       penalty='l2',
                                       scoring='roc_auc',
                                       cv=fold,
                                       random_state=777,
                                       max_iter=10000,
                                       fit_intercept=True,
                                       solver='newton-cg',
                                       tol=10)
    xR_model_CV.fit(szn_model_mat, rebound)

    filename = 'xR_Model_' + str(szn) + '_obj.sav'
    pickle.dump(xR_model_CV, open(filename, 'wb'))

    print(str(szn) + ' max auc_roc: ' + str(xR_model_CV.scores_[1].max()))

    xR_raw = xR_model_CV.predict_proba(szn_model_mat)[:, 1]
    print(str(szn) + ' seasons rebounds: ' + str(sum(rebound)) + ', season xR: ' + str(sum(xR_raw)))

    coefs = pd.DataFrame(list(zip(np.array(rebound_vars), xR_model_CV.coef_.T)),
                         columns=['Variable', 'Coef']).sort_values(['Coef'], ascending=False)

    ## Write scored shots for this block to disk
    scored_data = pd.concat([pd.DataFrame(xG_raw, columns=['xG_raw']).reset_index(drop=True),
                             pd.DataFrame(xR_raw, columns=['xR']).reset_index(drop=True),
                             szn_data.reset_index(drop=True)], axis=1)
    scored_data.to_csv("scored_data" + str(szn) + ".csv", index=False)

    return coefs.T


# In[44]:

All_Model_Scoring(model_data, shot_data_all2, '2017_2018')


# In[45]:

All_Model_Scoring(model_data, shot_data_all2, '2015_2016')


# In[46]:

All_Model_Scoring(model_data, shot_data_all2, '2013_2014')


# In[47]:

All_Model_Scoring(model_data, shot_data_all2, '2011_2012')
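
# In[ ]:

## The fitted models are pickled above with filenames like
## 'xG_Model_2017_2018_obj.sav'. A minimal sketch of re-loading one to score
## new shots later (`new_model_mat` is a placeholder for a feature matrix
## built with the same model_vars columns, in the same order):
with open('xG_Model_2017_2018_obj.sav', 'rb') as f:
    xG_model = pickle.load(f)
#xG = xG_model.predict_proba(new_model_mat)[:, 1]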


# ## Fin