In [1]:
import scrape_functions
from datetime import date, timedelta
import json_pbp
import html_pbp
import seaborn as sbs
import espn_pbp
import json_shifts
import html_shifts
import playing_roster
import json_schedule
import pandas as pd
import time
import numpy as np
import datetime
import warnings
import shared
import pickle
#pip install mysql-connector-python-rf
import mysql.connector
from mysql.connector import Error
from sqlalchemy import create_engine

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 999)
pd.set_option('display.max_rows', None)

Create Rink Adjust Object

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import thinkbayes2 as tb
import thinkbayes as tb0


class RinkAdjust(object):
    
    def __init__( self ):
        self.teamxcdf, self.teamycdf, self.otherxcdf, self.otherycdf  = {}, {}, {}, {}

        
    def addCDFs( self, team, this_x_cdf, this_y_cdf, other_x_cdf, other_y_cdf ):
        self.teamxcdf[team] = this_x_cdf        
        self.teamycdf[team] = this_y_cdf        
        self.otherxcdf[team] = other_x_cdf        
        self.otherycdf[team] = other_y_cdf


    def addTeam( self, team, this_team, rest_of_league ):
        this_x_cdf = tb.MakeCdfFromPmf( tb.MakePmfFromList( this_team.X_unadj ) )
        this_y_cdf = tb.MakeCdfFromPmf( tb.MakePmfFromList( this_team.Y_unadj ) )
        other_x_cdf = tb.MakeCdfFromPmf( tb.MakePmfFromList( rest_of_league.X_unadj ) )
        other_y_cdf = tb.MakeCdfFromPmf( tb.MakePmfFromList( rest_of_league.Y_unadj ) )
        self.addCDFs( team, this_x_cdf, this_y_cdf, other_x_cdf, other_y_cdf )


    def PlotTeamCDFs( self, team, savefig=False ):
        this_x_cdf = self.teamxcdf[team]
        this_y_cdf = self.teamycdf[team]
        other_x_cdf = self.otherxcdf[team]        
        other_y_cdf = self.otherycdf[team]

        f, axx = plt.subplots( 1, 2, sharey='col' )
        f.set_size_inches( 14, 8 )
    
        xx1, yx1 = this_x_cdf.Render()
        xx2, yx2 = other_x_cdf.Render()

        axx[0].plot( xx1, yx1, color='blue', label='@%s' % team )
        axx[0].plot( xx2, yx2, color='brown', label='@Rest of League' )
        axx[0].set_xlabel( 'CDF of X' )
        axx[0].legend()
    
        xy1, yy1 = this_y_cdf.Render()
        xy2, yy2 = other_y_cdf.Render()
    
        axx[1].plot( xy1, yy1, color='blue', label='@%s' % team )
        axx[1].plot( xy2, yy2, color='brown', label='@Rest of League' )
        axx[1].set_xlabel( 'CDF of Y' )
        axx[1].legend()
    
        f.suptitle( 'Cumulative Density Function for Shot Location Rink Bias Adjustment' )
        
        plt.show()
        
        if savefig:
            #f.set_tight_layout( True )
            plt.savefig( 'Rink bias CDF chart %s.png' % team )


    def rink_bias_adjust( self, x, y, team ):
        """ this method implements the actual location conversion from biased to "unbiased" shot location
        
         the way it works for rink bias adjustment is that for a given shot location in a specific rink,
         you find the cumulative probabilities for that x and y in that rink. Then you calculate the league 
         equivalent x and y that have the same probabilities as the one measured in the specific rink
         
         The equivalency CDFs are calculated using only visiting teams, which ensures that both single rink and
         league wide rinks have as wide a sample of teams as possible but avoid any possible home team bias.
         All of which lets us assume that they are then unbiased enough to be representative (at least enough 
         for standardization purposes)
         
         This is (my adaption of my understanding of) Shuckers' method for rink bias adjustment as described in Appendix A here:
         http://www.sloansportsconference.com/wp-content/uploads/2013/Total%20Hockey%20Rating%20(THoR)%20A%20comprehensive%20statistical%20rating%20of%20National%20Hockey%20League%20forwards%20and%20defensemen%20based%20upon%20all%20on-ice%20events.pdf
    
         for example, if a shot x coordinate is measured as xmeas in a rink
         
             xprob = this_x_cdf.Prob( xmeas )  # cum prob of seeing xmeas in this rink
             xadj = other_x_cdf.Value( xprob ) # value associated with same prob in rest of league        
    
        analogous process for y
        
        The code for Cdf/Pmf creation and manipulation is taken directly from Allan Downey's code for "Think Bayes"
        """
         
        xprob = self.teamxcdf[team].Prob( x )
        newx = self.otherxcdf[team].Value( xprob )
        
        yprob = self.teamycdf[team].Prob( y )
        newy = self.otherycdf[team].Value( yprob )
        
        return newx, newy

Create Function to Transform Raw NHL PBP Data

NHL PBP Data from scraper: https://github.com/HarryShomer/Hockey-Scraper

In [3]:
def transform_data(data):

    import warnings
    warnings.simplefilter("ignore")
    
    from sqlalchemy import create_engine
    
    pbp_df = data

    print("All events and columns: " + str(pbp_df.shape))
        
    ## Remove shootouts
    pbp_df['season'] = pbp_df.apply( lambda x: str(pd.to_datetime(x.Date).year-1) + str(pd.to_datetime(x.Date).year) if pd.to_datetime(x.Date).month < 9 else str(pd.to_datetime(x.Date).year) + str(pd.to_datetime(x.Date).year + 1), axis=1 )
    
    

    pbp_df['season2'] = pbp_df.apply( lambda x: x.season if x.Game_Id < 30000 else str(x.season) + "p", axis=1 )

    pbp_df['Season_Type'] = pbp_df.apply( lambda x: 'RS' if x.Game_Id < 30000 else 'PO', axis=1 )

    pbp_df['season_model'] = pbp_df.apply(lambda x: '2011_2012' if x.season in ['20102011','20112012'] else
                                                       '2013_2014' if x.season in ['20122013','20132014'] else
                                                       '2015_2016' if x.season in ['20142015','20152016'] else
                                                       '2017_2018' if x.season in ['20162017','20172018'] else 0, axis = 1)

    pbp_df = pbp_df.drop_duplicates(['season','Game_Id','Period','Ev_Team','Seconds_Elapsed'])

    pbp_df = pbp_df.sort_values(['season','Game_Id','Period','Seconds_Elapsed'], ascending=True)

    # Remove SOs
    pbp_df = pbp_df.loc[((pbp_df.Period == 5) & (pbp_df.Season_Type == "RS")) != True,:]

    # Group Give/Take together
    pbp_df['Event'] = pbp_df['Event'].apply( lambda x: 'TURN' if x in ["GIVE","TAKE"] else x )

    pbp_df['Type'] = pbp_df['Type'].apply( lambda x: 'DEFLECTED' if x in ["DEFLECTED","TIP-IN"] else \
                                                      'WRIST SHOT' if x in ["WRIST SHOT","SNAP SHOT"] else x )

    ## Check Lag Time doesn't Cross Periods
    pbp_df = pbp_df.sort_values(['season','Game_Id','Period','Seconds_Elapsed'], ascending=True)

    pbp_df['lagged_Event'] = pbp_df.groupby(['Game_Id','Period'])['Event'].shift(1)
    pbp_df['lagged_Ev_Zone'] = pbp_df.groupby(['Game_Id','Period'])['Ev_Zone'].shift(1)
    pbp_df['lagged_Seconds_Elapsed'] = pbp_df.groupby(['Game_Id','Period'])['Seconds_Elapsed'].shift(1)
    
    #############################################
    ### Subset to just shots
    #############################################
    pbp_df = pbp_df.loc[pbp_df.Event.isin(["SHOT","GOAL","MISS","BLOCK"]),:]

    print("All shots/blocks and columns: " + str(pbp_df.shape))

    ## Binary
    pbp_df['Goal'] = pbp_df.apply( lambda x: 1 if x.Event == "GOAL" else 0, axis = 1 )
    
    pbp_df['EmptyNet_SA'] = pbp_df.apply( lambda x: 1 if ((pd.isnull(x.Home_Goalie)) & (x.Ev_Team == x.Away_Team)) | \
                                                         ((pd.isnull(x.Away_Goalie)) & (x.Ev_Team == x.Home_Team)) else 0, axis = 1)

    pbp_df['is_Rebound'] = pbp_df.apply( lambda x: 1 if (x.lagged_Event in ["SHOT"]) & \
                                        ((x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 2) else 0, axis = 1 ) 
    
    pbp_df['is_Bounce'] = pbp_df.apply( lambda x: 1 if (x.lagged_Event in ["BLOCK","MISS"]) & \
                                        ((x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 2) else 0, axis = 1 ) 

    pbp_df['is_Rush'] = pbp_df.apply( lambda x: 1 if (x.Ev_Zone != x.lagged_Ev_Zone) & \
                                        ((x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 6) else 0, axis = 1 ) 


    # Replace every occurrence of PHX with ARI
    pbp_df['Home_Team'] = pbp_df.apply( lambda x: x.Home_Team if x.Home_Team !='PHX' else 'ARI', axis=1 )
    pbp_df['Away_Team'] = pbp_df.apply( lambda x: x.Away_Team if x.Away_Team !='PHX' else 'ARI', axis=1 )
    pbp_df['Ev_Team'] = pbp_df.apply( lambda x: x.Ev_Team if x.Ev_Team !='PHX' else 'ARI', axis=1 )
    # Replace every occurrence of ATL with WPG
    pbp_df['Home_Team'] = pbp_df.apply( lambda x: x.Home_Team if x.Home_Team !='ATL' else 'WPG', axis=1 )
    pbp_df['Away_Team'] = pbp_df.apply( lambda x: x.Away_Team if x.Away_Team !='ATL' else 'WPG', axis=1 )
    pbp_df['Ev_Team'] = pbp_df.apply( lambda x: x.Ev_Team if x.Ev_Team !='ATL' else 'WPG', axis=1 )
    
    # add a 'Direction' column to indicate the primary direction for shots. The heuristic to determine
    # direction is the sign of the median of the X coordinate of shots in each period. This then lets us filter
    # out shots that originate from back in the defensive zone when the signs don't match
    pbp_df['Home_Shooter'] = pbp_df.apply( lambda x: 1 if x.Ev_Team == x.Home_Team else 0, axis = 1)

    game_period_locations = pbp_df.groupby( by=['season', 'Game_Id', 'Period','Home_Shooter'] )['xC','yC']
    
    game_period_medians = game_period_locations.transform(np.median)

    pbp_df['Direction'] = np.sign( game_period_medians['xC'] )

    # should actually write this to a CSV as up to here is the performance intensive part
    pbp_df['X_unadj'], pbp_df['Y_unadj'] = zip( *pbp_df.apply( lambda x: (x.xC, x.yC) if x.Direction > 0  else (-x.xC,-x.yC), axis = 1 ) )

    pbp_df['LS_Shot'] = pbp_df.apply( lambda x: 1 if x.Y_unadj < 0 else 0, axis = 1)

    ## Logged Last Event Time
    pbp_df['LN_Last_Event_Time'] = pbp_df.apply( lambda x: 0 if (x.Seconds_Elapsed - x.lagged_Seconds_Elapsed) <= 0 \
                                                        else np.log(x.Seconds_Elapsed - x.lagged_Seconds_Elapsed + 0.001), axis = 1)

    # Last Event
    pbp_df['LastEV_Off_Faceoff'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Off') & (x.lagged_Event == 'FAC') else 0, axis = 1)
    pbp_df['LastEV_Def_Faceoff'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Def') & (x.lagged_Event == 'FAC') else 0, axis = 1)
    pbp_df['LastEV_Neu_Faceoff'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Neu') & (x.lagged_Event == 'FAC') else 0, axis = 1)
    pbp_df['LastEV_Off_Shot'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Off') & (x.lagged_Event in ["SHOT","MISS","BLOCK"]) else 0, axis = 1)
    pbp_df['LastEV_Def_Shot'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Def') & (x.lagged_Event in ["SHOT","MISS","BLOCK"]) else 0, axis = 1)
    pbp_df['LastEV_Neu_Shot'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Neu') & (x.lagged_Event in ["SHOT","MISS","BLOCK"]) else 0, axis = 1)
    pbp_df['LastEV_Off_Give'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Off') & (x.lagged_Event == 'TURN') else 0, axis = 1)
    pbp_df['LastEV_Def_Give'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Def') & (x.lagged_Event == 'TURN') else 0, axis = 1)
    pbp_df['LastEV_Neu_Give'] = pbp_df.apply( lambda x: x.LN_Last_Event_Time if (x.Ev_Zone == 'Neu') & (x.lagged_Event == 'TURN') else 0, axis = 1)

    ## Adjust X, Y coordinates by Rink, using CDF of shot attempts only (remove blocks since they skew data)
    pbp_df = pbp_df.loc[pbp_df.Event.isin(["SHOT","GOAL","MISS"]),:]

    ## Call RinkAdjust class
    adjuster = RinkAdjust()

    ## New dataframe of adjusted shots for each home rink
    pbp_df_adj = pd.DataFrame()

    ## For each home rink
    for team in sorted(pbp_df.Home_Team.unique()):

        ## Split shots into team arena and all other rinks
        shot_data = pbp_df
        rink_shots = shot_data[ shot_data.Home_Team == team ]
        rest_of_league = shot_data[ shot_data.Home_Team != team ]

        ## Create teamxcdf and otherxcdf for rink adjustment
        adjuster.addTeam( team, rink_shots, rest_of_league )
        
        ## Adjusted coordinates
        Xadj = []
        Yadj = []

        ## For each shot in rink adjust coordinates based on other rinks
        for row in rink_shots.itertuples():
            newx, newy = adjuster.rink_bias_adjust( row.X_unadj, row.Y_unadj, row.Home_Team )

            Xadj.append(newx)
            Yadj.append(newy)

        rink_shots['X'] = Xadj
        rink_shots['Y'] = Yadj

        pbp_df_adj = pbp_df_adj.append(rink_shots)

    print ("All shots columns, rink adjusted: " + str(pbp_df_adj.shape))

    ## Apply only to season level data after x,y CDF adjustment
    pbp_df_adj['Shot_Distance_Unadj'] = pbp_df_adj.apply( lambda x: ((89 - x.X_unadj)**2 + (x.Y_unadj ** 2)) ** 0.5, axis = 1 )
    pbp_df_adj['Shot_Distance'] = pbp_df_adj.apply( lambda x: ((89 - x.X)**2 + (x.Y ** 2)) ** 0.5, axis = 1 )
    pbp_df_adj['Shot_Angle'] = pbp_df_adj.apply( lambda x: np.arctan(abs(89 - x.X) / abs(0 - x.Y)) * (180 / np.pi) if x.Y != 0 \
                                                    else 90, axis = 1 )                      

    pbp_df_adj['Last_Shot_Distance'] = pbp_df_adj.groupby(['Game_Id','Period','Home_Shooter'])['Shot_Distance'].shift(1)
    pbp_df_adj['Last_Shot_Angle'] = pbp_df_adj.groupby(['Game_Id','Period','Home_Shooter'])['Shot_Angle'].shift(1)
    pbp_df_adj['Last_LS_Shot'] = pbp_df_adj.groupby(['Game_Id','Period','Home_Shooter'])['LS_Shot'].shift(1)

    pbp_df_adj['Rebound_Distance_Change'] = pbp_df_adj.apply( lambda x: x.Last_Shot_Distance + x.Shot_Distance if x.is_Rebound == 1 else 0, axis = 1 )
    pbp_df_adj['Rebound_Angle_Change'] = pbp_df_adj.apply( lambda x: 0 if x.is_Rebound == 0 \
                                                        else abs(x.Last_Shot_Angle - x.Shot_Angle) \
                                                               if x.is_Rebound == 1 & (x.Last_LS_Shot == x.LS_Shot) else \
                                                          (180 - x.Last_Shot_Angle - x.Shot_Angle), axis = 1 )

    pbp_df_adj['Rebound_Distance_Traveled_byAngle'] = pbp_df_adj. \
                    apply( lambda x: x.Rebound_Distance_Change / x.Rebound_Angle_Change \
                              if x.Rebound_Angle_Change > 0 else 0, axis = 1)

    pbp_df_adj['LN_Rebound_Distance_Traveled_byAngle'] = pbp_df_adj. \
                    apply(lambda x: np.log(x.Rebound_Distance_Traveled_byAngle) \
                              if x.Rebound_Distance_Traveled_byAngle > 0 else 0, axis = 1)

    print ("All shots columns, final calcuations: " + str(pbp_df_adj.shape))
    
    return pbp_df_adj

Read-in and Stack

In [4]:
types = {'xC': np.float64,
'yC': np.float64,
'X': np.float64,
'X_unadj': np.float64,
'Y': np.float64,
'Y_unadj': np.float64,
'Game_Id': int}
In [30]:
nhl_pbp20102011 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20102011.csv', dtype=types)
nhl_pbp20112012 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20112012.csv', dtype=types)
nhl_pbp20122013 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20122013.csv', dtype=types)
nhl_pbp20132014 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20132014.csv', dtype=types)
nhl_pbp20142015 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20142015.csv', dtype=types)
nhl_pbp20152016 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20152016.csv', dtype=types)
nhl_pbp20162017 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20162017.csv', dtype=types)

nhl_pbp20172018 = pd.read_csv('/Users/colander1/Documents/CWA/HockeyScrape/nhl_pbp20172018.csv', dtype=types)

nhl_pbp = pd.concat([nhl_pbp20102011, nhl_pbp20112012, nhl_pbp20122013, nhl_pbp20132014,
                    nhl_pbp20142015, nhl_pbp20152016, nhl_pbp20162017, nhl_pbp20172018])

unwanted = nhl_pbp.columns[nhl_pbp.columns.str.startswith('Unna')]

nhl_pbp.drop(unwanted, axis=1, inplace=True)

nhl_pbp.head()
Out[30]:
Away_Coach Away_Goalie Away_Goalie_Id Away_Players Away_Score Away_Team Date Description Ev_Team Ev_Zone Event Game_Id Home_Coach Home_Goalie Home_Goalie_Id Home_Players Home_Score Home_Team Home_Zone Period Seconds_Elapsed Strength Time_Elapsed Type awayPlayer1 awayPlayer1_id awayPlayer2 awayPlayer2_id awayPlayer3 awayPlayer3_id awayPlayer4 awayPlayer4_id awayPlayer5 awayPlayer5_id awayPlayer6 awayPlayer6_id homePlayer1 homePlayer1_id homePlayer2 homePlayer2_id homePlayer3 homePlayer3_id homePlayer4 homePlayer4_id homePlayer5 homePlayer5_id homePlayer6 homePlayer6_id p1_ID p1_name p2_ID p2_name p3_ID p3_name xC yC
0 PAUL MAURICE CAM WARD 8470320.0 6 0 CAR 2010-10-07 Period Start- Local time: 7:10 EET NaN NaN PSTR 20003 TODD RICHARDS NIKLAS BACKSTROM 8473404.0 6 0 MIN NaN 1 0.0 5x5 0:00 NaN JEFF SKINNER 8475784.0 TUOMO RUUTU 8469462.0 JUSSI JOKINEN 8469638.0 JONI PITKANEN 8470137.0 JOE CORVO 8466215.0 CAM WARD 8470320.0 MIKKO KOIVU 8469459.0 ANTTI MIETTINEN 8468704.0 ANDREW BRUNETTE 8459596.0 GREG ZANON 8468636.0 CAM BARKER 8471216.0 NIKLAS BACKSTROM 8473404.0 NaN NaN NaN NaN NaN NaN NaN NaN
1 PAUL MAURICE CAM WARD 8470320.0 6 0 CAR 2010-10-07 MIN won Neu. Zone - CAR #36 JOKINEN vs MIN #9 KOIVU MIN Neu FAC 20003 TODD RICHARDS NIKLAS BACKSTROM 8473404.0 6 0 MIN Neu 1 0.0 5x5 0:00 NaN JEFF SKINNER 8475784.0 TUOMO RUUTU 8469462.0 JUSSI JOKINEN 8469638.0 JONI PITKANEN 8470137.0 JOE CORVO 8466215.0 CAM WARD 8470320.0 MIKKO KOIVU 8469459.0 ANTTI MIETTINEN 8468704.0 ANDREW BRUNETTE 8459596.0 GREG ZANON 8468636.0 CAM BARKER 8471216.0 NIKLAS BACKSTROM 8473404.0 8469459.0 MIKKO KOIVU 8469638.0 JUSSI JOKINEN NaN NaN 0.0 0.0
2 PAUL MAURICE CAM WARD 8470320.0 6 0 CAR 2010-10-07 OFFSIDE NaN NaN STOP 20003 TODD RICHARDS NIKLAS BACKSTROM 8473404.0 6 0 MIN NaN 1 8.0 5x5 0:08 NaN JEFF SKINNER 8475784.0 TUOMO RUUTU 8469462.0 JUSSI JOKINEN 8469638.0 JONI PITKANEN 8470137.0 JOE CORVO 8466215.0 CAM WARD 8470320.0 MIKKO KOIVU 8469459.0 ANTTI MIETTINEN 8468704.0 ANDREW BRUNETTE 8459596.0 GREG ZANON 8468636.0 CAM BARKER 8471216.0 NIKLAS BACKSTROM 8473404.0 NaN NaN NaN NaN NaN NaN NaN NaN
3 PAUL MAURICE CAM WARD 8470320.0 6 0 CAR 2010-10-07 MIN won Neu. Zone - CAR #36 JOKINEN vs MIN #9 KOIVU MIN Neu FAC 20003 TODD RICHARDS NIKLAS BACKSTROM 8473404.0 6 0 MIN Neu 1 8.0 5x5 0:08 NaN JEFF SKINNER 8475784.0 TUOMO RUUTU 8469462.0 JUSSI JOKINEN 8469638.0 JONI PITKANEN 8470137.0 JOE CORVO 8466215.0 CAM WARD 8470320.0 MIKKO KOIVU 8469459.0 ANTTI MIETTINEN 8468704.0 ANDREW BRUNETTE 8459596.0 GREG ZANON 8468636.0 CAM BARKER 8471216.0 NIKLAS BACKSTROM 8473404.0 8469459.0 MIKKO KOIVU 8469638.0 JUSSI JOKINEN NaN NaN 20.0 -22.0
4 PAUL MAURICE CAM WARD 8470320.0 6 0 CAR 2010-10-07 CAR ONGOAL - #12 STAAL, Snap, Off. Zone, 37 ft. CAR Off SHOT 20003 TODD RICHARDS NIKLAS BACKSTROM 8473404.0 6 0 MIN Def 1 65.0 5x5 1:05 SNAP SHOT ERIC STAAL 8470595.0 CHAD LAROSE 8469812.0 ERIK COLE 8467396.0 JONI PITKANEN 8470137.0 JOE CORVO 8466215.0 CAM WARD 8470320.0 MATT CULLEN 8464989.0 CAL CLUTTERBUCK 8473504.0 MARTIN HAVLAT 8467899.0 BRENT BURNS 8470613.0 NICK SCHULTZ 8468513.0 NIKLAS BACKSTROM 8473404.0 8470595.0 ERIC STAAL NaN NaN NaN NaN 56.0 -15.0

Load Goalie/Skater Roster with Handedness

In [31]:
player_lookup = pd.read_sql(con=engine, sql ="SELECT * FROM `nhl_all`.`hockey_roster_info` AS B")

player_lookup = player_lookup.sort_values('gamesPlayed',ascending=False).groupby(['playerId']).first().reset_index(). \
                    loc[:, ['playerBirthDate', 'playerPositionCode', 'playerShootsCatches','playerId']]
    
skater_lookup = player_lookup.loc[player_lookup.playerPositionCode != "G", :]
skater_lookup.columns = ['shooterDOB','Player_Position','Shoots','p1_ID']
skater_lookup['p1_ID'] = skater_lookup['p1_ID'].astype(str)


goalie_lookup = pd.read_sql(con=engine, sql = "SELECT DISTINCT playerId as SA_Goalie_Id, playerShootsCatches as Catches, playerBirthDate as goalieDOB FROM `nhl_all`.`hockey_goalies_roster` AS A") 
goalie_lookup['SA_Goalie_Id'] = goalie_lookup['SA_Goalie_Id'].astype(str)

Lookup Players, Generate More Features

In [33]:
def lookups_data_clean(data):
    
      
    for col in ['Game_Id','Away_Goalie_Id','Home_Goalie_Id','p1_ID','p2_ID','p3_ID',
            'awayPlayer1_id','awayPlayer2_id','awayPlayer3_id','awayPlayer4_id','awayPlayer5_id','awayPlayer6_id',
             'homePlayer1_id','homePlayer2_id','homePlayer3_id','homePlayer4_id','homePlayer5_id','homePlayer6_id']:
        data[col] = data[col].fillna(0).astype(int).astype(str)

    data['SA_Goalie'] = data.apply( lambda x: x.Away_Goalie if x.Ev_Team == x.Home_Team else x.Home_Goalie, axis=1 )
    data['SA_Goalie_Id'] = data.apply( lambda x: x.Away_Goalie_Id if x.Ev_Team == x.Home_Team else x.Home_Goalie_Id, axis=1 )
    
    data['Away_State'] = data.apply( lambda x: x.Away_Players - 1 if x.Away_Goalie_Id in [x.awayPlayer6_id, x.awayPlayer5_id, x.awayPlayer4_id, x.awayPlayer3_id] else x.Away_Players, axis=1 )
    data['Home_State'] = data.apply( lambda x: x.Home_Players - 1 if x.Home_Goalie_Id in [x.homePlayer6_id, x.homePlayer5_id, x.homePlayer4_id, x.homePlayer3_id] else x.Home_Players, axis=1 )
 
    data['Away_State'] = data.apply( lambda x: x.Away_Players - 1 if x.Away_Goalie_Id in [x.awayPlayer6_id, x.awayPlayer5_id, x.awayPlayer4_id, x.awayPlayer3_id] else x.Away_Players, axis=1 )
    data['Home_State'] = data.apply( lambda x: x.Home_Players - 1 if x.Home_Goalie_Id in [x.homePlayer6_id, x.homePlayer5_id, x.homePlayer4_id, x.homePlayer3_id] else x.Home_Players, axis=1 )
    
    data['Results_inRebound'] = data['is_Rebound'].shift(periods=-1)
                
    data['Shooter_State'] = data.apply( lambda x: x.Away_State if x.Ev_Team != x.Home_Team else x.Home_State, axis=1 )
    data['Goalie_State'] = data.apply( lambda x: x.Away_State if x.Ev_Team == x.Home_Team else x.Home_State, axis=1 )
   
    data['Game_State'] = data.apply( lambda x: str(x.Away_State) + "v" + str(x.Home_State) if x.Ev_Team == x.Home_Team else \
                                                 str(x.Home_State) + "v" + str(x.Away_State) , axis=1 )
    data['Game_State'] = data.apply( lambda x: "SH_SA" if x.Game_State in ["3v5","3v4","3v6","4v5","4v6","5v6"] else \
                                                 "PP_2p_SA" if x.Game_State in ["6v3","6v4","5v3"] else \
                                                  "5v5" if x.Game_State in ["5v5","6v6"] else x.Game_State, axis=1 )
    
    data['State_Space'] = data['Goalie_State'] + data['Shooter_State']
    data['Shooter_State_Advantage'] = data['Shooter_State'] - data['Goalie_State']
        
    data = data.merge(skater_lookup, on=['p1_ID'], how = 'left')
    data = data.merge(goalie_lookup, on=['SA_Goalie_Id'], how = 'left')
    

    data['Shooter_Handedness'] = data.apply( lambda x: "L" if x.Shoots == "L" else \
                                                 "R" if x.Shoots == "R" else "U", axis=1 )
    
    data['Handed_Class'] = data['Shoots'].str.cat(data['Catches'], sep='')
    
    data['Handed_Class2'] = data.apply( lambda x: "Same" if x.Handed_Class in ["LL","RR"] else \
                                                     "Opposite" if x.Handed_Class in ["LR","RL"] else "U", axis = 1)
    
    data['Player_Position2'] = data.apply( lambda x: "D" if x.Player_Position == "D" else "F", axis=1 )
 
    return data
    

Cumulative Shooting Function

In [34]:
def cumulative_shooting_talent(data):
    
    shooting_percentage = data.groupby(['Player_Position2'])['Goal'].mean()
  
    data['Cum_Goal'] = data.groupby(['p1_ID'])['Goal'].cumsum()
    data['Cum_Shots'] = data.groupby(['p1_ID']).cumcount()
    
    data['Cum_Goal'] = data.apply( lambda x: x.Cum_Goal - 1 if x.Event == "GOAL" else x.Cum_Goal, axis = 1)
   
    kr21_stabilizer_F = pd.to_numeric(375.0)
    kr21_stabilizer_D = pd.to_numeric(275.0)

    data['Regressed_Shooting_Indexed'] = data.apply( lambda x: ((x.Cum_Goal + (kr21_stabilizer_D * shooting_percentage[0])) /\
                                                    (x.Cum_Shots + kr21_stabilizer_D)) / shooting_percentage[0]\
                                        if x.Player_Position2 == "D" else ((x.Cum_Goal + (kr21_stabilizer_F * shooting_percentage[1])) /\
                                                    (x.Cum_Shots + kr21_stabilizer_F)) / shooting_percentage[1], axis = 1)
    
    return data
    

Create Dummy and Polynomial Variables

For each category variable, create dummies For shot distance and angle 3rd degree polynomial

In [35]:
def feature_generation(data, 
                id_vars = ["season"],
                target_vars = ['Goal','Results_inRebound'],
                num_vars = ["EmptyNet_SA","is_Rebound","is_Rush","LN_Last_Event_Time","LastEV_Off_Faceoff",
                        "LastEV_Def_Faceoff","LastEV_Neu_Faceoff","LastEV_Off_Shot","LastEV_Def_Shot","LastEV_Neu_Shot",
                        "LastEV_Off_Give","LastEV_Def_Give","LastEV_Neu_Give","LN_Rebound_Distance_Traveled_byAngle",
                        "Regressed_Shooting_Indexed"],
                cat_vars = ["Type","Shooter_State","Goalie_State","Handed_Class2","Player_Position2"], 
                poly_vars = ["Shot_Distance","Shot_Angle"],
                model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time',
       'LastEV_Off_Faceoff', 'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff',
       'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
       'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
       'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
       'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
       'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State',
       'Handed_Class2_Opposite',
       'Player_Position2_F', 'Shot_Distance',
       'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2',
       'Shot_Angle^3']):

    from sklearn.preprocessing import PolynomialFeatures
    ## Dummy Variables
    model_data = data[num_vars].fillna(0)

    for i in cat_vars:
        var_dummies = pd.get_dummies(data.loc[:,[i]])

        model_data = pd.concat([model_data, var_dummies], axis=1)

    ## Polynomial Variables
    for i in poly_vars:

        poly_data = data.loc[:,[i]]

        poly = PolynomialFeatures(degree=3,interaction_only=False).fit(poly_data)
        poly_names = poly.get_feature_names(poly_data.columns)


        poly_output = poly.transform(data.loc[:,[i]])

        model_data = pd.DataFrame(pd.concat([ model_data,                                
                                pd.DataFrame(poly_output,
                                                        columns = poly_names ).iloc[:,1:]], axis=1))

    #model_mat = model_data.loc[:, model_vars].as_matrix()
    model_data = pd.concat([data[id_vars],data[target_vars], model_data], axis=1)
                              
    
    print(model_data.shape)
    
    return model_data

Data Pipeline

In [36]:
shot_data_all = transform_data(nhl_pbp)
All events and columns: (2878182, 56)
All shots/blocks and columns: (1014120, 63)
All shots columns, rink adjusted: (753814, 85)
All shots columns, final calcuations: (753814, 95)
In [48]:
## Check shot distance adjustment
shot_data_all.loc[shot_data_all.season == "20172018", :].groupby(['Home_Team'])[['Shot_Distance','Shot_Distance_Unadj']]\
        .mean().sort_values(['Shot_Distance_Unadj'])
Out[48]:
Shot_Distance Shot_Distance_Unadj
Home_Team
NYR 37.967128 33.186499
DET 34.291843 34.282345
CHI 35.837164 34.322037
STL 36.458697 34.468851
N.J 34.590668 34.594601
ANA 36.826106 35.160489
PIT 35.505403 35.351800
NSH 35.671859 35.385298
L.A 34.941428 35.561865
MTL 34.586403 35.622235
WSH 34.732211 35.654820
DAL 36.015360 35.655394
CAR 36.121785 35.702802
TOR 32.979968 35.775225
CGY 36.037519 36.258758
T.B 34.318686 36.449372
VGK 34.737839 36.491532
NYI 37.686836 36.756656
COL 34.988536 36.862748
ARI 35.588372 36.873783
FLA 36.392375 36.958453
CBJ 38.090698 37.245836
EDM 38.198247 37.329866
VAN 36.457876 37.376298
BOS 33.265149 37.584670
S.J 37.499523 37.637475
WPG 39.252447 39.239246
BUF 37.702187 39.468667
MIN 37.545626 39.570156
PHI 37.190102 39.699116
OTT 38.729001 40.723540
In [37]:
shot_data_all = lookups_data_clean(shot_data_all)
In [38]:
shot_data_all2 = cumulative_shooting_talent(shot_data_all)
In [40]:
model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time',
       'LastEV_Off_Faceoff', 'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff',
       'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
       'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
       'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
       'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
       'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State',
       'Handed_Class2_Opposite',
       'Player_Position2_F', 'Shot_Distance',
       'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2',
       'Shot_Angle^3']

model_data = feature_generation(shot_data_all2, model_vars = model_vars)
#(744586, 30)

model_data.head()
(753814, 36)
Out[40]:
season Goal Results_inRebound EmptyNet_SA is_Rebound is_Rush LN_Last_Event_Time LastEV_Off_Faceoff LastEV_Def_Faceoff LastEV_Neu_Faceoff LastEV_Off_Shot LastEV_Def_Shot LastEV_Neu_Shot LastEV_Off_Give LastEV_Def_Give LastEV_Neu_Give LN_Rebound_Distance_Traveled_byAngle Regressed_Shooting_Indexed Type_BACKHAND Type_DEFLECTED Type_SLAP SHOT Type_WRAP-AROUND Type_WRIST SHOT Shooter_State Goalie_State Handed_Class2_Opposite Handed_Class2_Same Handed_Class2_U Player_Position2_D Player_Position2_F Shot_Distance Shot_Distance^2 Shot_Distance^3 Shot_Angle Shot_Angle^2 Shot_Angle^3
0 20102011 0 0.0 0 0 0 2.197336 2.197336 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 1.00000 0 0 1 0 0 5 5 1 0 0 0 1 40.311289 1625.0 65505.844205 66.614779 4437.528774 295604.998305
1 20102011 0 0.0 0 0 0 1.098946 0.000000 0.0 0.0 1.098946 0.0 0.0 0.000000 0.0 0.0 0.0 1.00000 0 0 1 0 0 5 5 0 1 0 1 0 58.137767 3380.0 196505.653863 63.434949 4023.992732 255261.773029
2 20102011 0 0.0 0 0 0 2.302685 2.302685 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 1.00000 0 0 0 0 1 5 5 0 1 0 0 1 31.016125 962.0 29837.512095 1.847610 3.413664 6.307120
3 20102011 0 0.0 0 0 0 3.637612 0.000000 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.99734 0 0 1 0 0 5 4 1 0 0 0 1 40.311289 1625.0 65505.844205 82.874984 6868.262915 569207.176806
4 20102011 0 0.0 0 0 0 1.386544 0.000000 0.0 0.0 0.000000 0.0 0.0 1.386544 0.0 0.0 0.0 1.00000 0 0 0 0 1 3 5 1 0 0 0 1 37.443290 1402.0 52495.493216 55.885527 3123.192134 174541.238511

Break Data into 2 Season Blocks

Modeling two seasons at a time allows model to adjust to changing goalie performance/shot recorder bias as modeling entire period results in fewer goals relative to expected in later seasons. Two seasons are used rather than 1 as to get 2017-18 some stability.

In [41]:
model_data['season_model'] = model_data.apply(lambda x: '2011_2012' if x.season in ['20102011','20112012'] else
                                                       '2013_2014' if x.season in ['20122013','20132014'] else
                                                       '2015_2016' if x.season in ['20142015','20152016'] else
                                                       '2017_2018' if x.season in ['20162017','20172018'] else 0, axis = 1)

#(744586, 30)
model_data.head()
Out[41]:
season Goal Results_inRebound EmptyNet_SA is_Rebound is_Rush LN_Last_Event_Time LastEV_Off_Faceoff LastEV_Def_Faceoff LastEV_Neu_Faceoff LastEV_Off_Shot LastEV_Def_Shot LastEV_Neu_Shot LastEV_Off_Give LastEV_Def_Give LastEV_Neu_Give LN_Rebound_Distance_Traveled_byAngle Regressed_Shooting_Indexed Type_BACKHAND Type_DEFLECTED Type_SLAP SHOT Type_WRAP-AROUND Type_WRIST SHOT Shooter_State Goalie_State Handed_Class2_Opposite Handed_Class2_Same Handed_Class2_U Player_Position2_D Player_Position2_F Shot_Distance Shot_Distance^2 Shot_Distance^3 Shot_Angle Shot_Angle^2 Shot_Angle^3 season_model
0 20102011 0 0.0 0 0 0 2.197336 2.197336 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 1.00000 0 0 1 0 0 5 5 1 0 0 0 1 40.311289 1625.0 65505.844205 66.614779 4437.528774 295604.998305 2011_2012
1 20102011 0 0.0 0 0 0 1.098946 0.000000 0.0 0.0 1.098946 0.0 0.0 0.000000 0.0 0.0 0.0 1.00000 0 0 1 0 0 5 5 0 1 0 1 0 58.137767 3380.0 196505.653863 63.434949 4023.992732 255261.773029 2011_2012
2 20102011 0 0.0 0 0 0 2.302685 2.302685 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 1.00000 0 0 0 0 1 5 5 0 1 0 0 1 31.016125 962.0 29837.512095 1.847610 3.413664 6.307120 2011_2012
3 20102011 0 0.0 0 0 0 3.637612 0.000000 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.99734 0 0 1 0 0 5 4 1 0 0 0 1 40.311289 1625.0 65505.844205 82.874984 6868.262915 569207.176806 2011_2012
4 20102011 0 0.0 0 0 0 1.386544 0.000000 0.0 0.0 0.000000 0.0 0.0 1.386544 0.0 0.0 0.0 1.00000 0 0 0 0 1 3 5 1 0 0 0 1 37.443290 1402.0 52495.493216 55.885527 3123.192134 174541.238511 2011_2012
In [49]:
## Check block counts
model_data.groupby(['season_model'])['Goal'].count()
Out[49]:
season_model
2011_2012    218979
2013_2014    175529
2015_2016    217610
2017_2018    141696
Name: Goal, dtype: int64

Fit and Score xG and xR Models

Create function to score xG model based on model_vars, return ROC AUC, then score xR model, return ROC AUC. Save both models and print coefficients. Both models use Logitstic Regression, 10-fold cross-validation

In [43]:
def All_Model_Scoring(model_data, data, szn):
    print (szn)
        
    from sklearn.cross_validation import KFold
    from sklearn.linear_model import LogisticRegression
    from sklearn.grid_search import GridSearchCV
    from sklearn.linear_model import LogisticRegressionCV
    import pickle

    model_vars = ['EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time',
       'LastEV_Off_Faceoff', 'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff',
       'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
       'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
       'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
       'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
       'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State',
       'Handed_Class2_Opposite',
       'Player_Position2_F', 'Shot_Distance',
       'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2',
       'Shot_Angle^3']

    rebound_vars = ['xG_raw', 'EmptyNet_SA', 'is_Rebound', 'is_Rush', 'LN_Last_Event_Time',
           'LastEV_Off_Faceoff', 'LastEV_Def_Faceoff', 'LastEV_Neu_Faceoff',
           'LastEV_Off_Shot', 'LastEV_Def_Shot', 'LastEV_Neu_Shot',
           'LastEV_Off_Give', 'LastEV_Def_Give', 'LastEV_Neu_Give',
           'LN_Rebound_Distance_Traveled_byAngle', 'Regressed_Shooting_Indexed',
           'Type_BACKHAND', 'Type_DEFLECTED', 'Type_SLAP SHOT', 'Type_WRAP-AROUND',
           'Type_WRIST SHOT', 'Shooter_State', 'Goalie_State',
           'Handed_Class2_Opposite',
           'Player_Position2_F', 'Shot_Distance',
           'Shot_Distance^2', 'Shot_Distance^3', 'Shot_Angle', 'Shot_Angle^2',
           'Shot_Angle^3']

    ## Subset data to block, create model data, convert to matrix
    szn_data = data.loc[data.season_model == szn, :]
    szn_model_data = model_data.loc[model_data.season_model == szn, :].fillna(0)
    szn_model_mat = szn_model_data.loc[szn_model_data.season_model == szn, model_vars].as_matrix().astype(np.float)

    ### Train xG Model
    goal = szn_model_data.Goal
    print (str(szn) + ' seasons dimensions: ' + str(szn_model_mat.shape))
    print (str(szn) + ' seasons shooting%: ' + str(sum(goal) / len(goal)))
    
    ## xG Model
    fold = KFold(len(goal), n_folds=10, shuffle=True, random_state=777)

    xG_model_CV = LogisticRegressionCV(
        Cs=list(np.power(10.0, np.arange(-10, 10)))
        ,penalty='l2'
        ,scoring='roc_auc'
        ,cv=fold
        ,random_state=777
        ,max_iter=10000
        ,fit_intercept=True
        ,solver='newton-cg'
        ,tol=10
    )    
    
    ## Fit model
    xG_model_CV.fit(szn_model_mat, goal)
    
    ## Save Model
    filename = 'xG_Model_' + str(szn) + '_obj.sav'
    pickle.dump(xG_model_CV, open(filename, 'wb'))
    
    print (str(szn) + 'Max auc_roc:', xG_model_CV.scores_[1].max())
    
    ## Score Model
    xG_raw = xG_model_CV.predict_proba(szn_model_mat)[:,1]

    print (str(szn) + ' seasons goals: ' + str(sum(goal)) + ', season xG: ' + str(sum(xG_raw)))

    ### Assemble data and train xRebound Model
    rebound = szn_model_data.Results_inRebound.fillna(0)
    print (str(szn) + ' goals scored: ' + str(sum(szn_data.Goal)))
    print (str(szn) + ' xG scored: ' + str(sum(xG_raw)))

    print (str(szn) + ' seasons dimensions: ' + str(szn_model_mat.shape))
    print (str(szn) + ' seasons rebound%: ' + str(sum(rebound) / len(rebound)))
    
    fold = KFold(len(rebound), n_folds=10, shuffle=True, random_state=777)

    szn_model_mat = pd.concat([szn_model_data.reset_index(drop=True), 
             pd.DataFrame(xG_raw, columns = ['xG_raw']).reset_index(drop=True)], axis=1).loc[:,rebound_vars].as_matrix()

    xR_model_CV = LogisticRegressionCV(
        Cs=list(np.power(10.0, np.arange(-10, 10)))
        ,penalty='l2'
        ,scoring='roc_auc'
        ,cv=fold
        ,random_state=777
        ,max_iter=10000
        ,fit_intercept=True
        ,solver='newton-cg'
        ,tol=10
    )    
    

    xR_model_CV.fit(szn_model_mat, rebound)
    
    filename = 'xR_Model_' + str(szn) + '_obj.sav'
    pickle.dump(xR_model_CV, open(filename, 'wb'))
    
    print (str(szn) + ' Max auc_roc:', xR_model_CV.scores_[1].max())
    
    xR_raw = xR_model_CV.predict_proba(szn_model_mat)[:,1]

    print (str(szn) + ' seasons rebounds: ' + str(sum(rebound)) + ', season xR: ' + str(sum(xR_raw)))

    coefs = pd.DataFrame(list(zip(np.array(rebound_vars),xR_model_CV.coef_.T)), 
                   columns = ['Variable','Coef']).sort_values(['Coef'], ascending=False)
    
    scored_data = pd.concat([
             pd.DataFrame(xG_raw, columns = ['xG_raw']).reset_index(drop=True),
             pd.DataFrame(xR_raw, columns = ['xR']).reset_index(drop=True),
                            szn_data.reset_index(drop=True)
                                ], axis=1)

    scored_data.to_csv("scored_data" + str(szn) + ".csv", index=False)
        
    return coefs.T
In [44]:
All_Model_Scoring(model_data, shot_data_all2, '2017_2018')
2017_2018
2017_2018 seasons dimensions: (141696, 30)
2017_2018 seasons shooting%: 0.0633539408311
2017_2018Max auc_roc: 0.775398676368
2017_2018 seasons goals: 8977, season xG: 8976.17408044
2017_2018 goals scored: 8977
2017_2018 xG scored: 8976.17408044
2017_2018 seasons dimensions: (141696, 30)
2017_2018 seasons rebound%: 0.0323932926829
2017_2018 Max auc_roc: 0.679599683931
2017_2018 seasons rebounds: 4590.0, season xR: 4591.12339643
Out[44]:
21 2 19 9 14 18 3 20 15 23 8 5 11 26 29 30 27 28 4 25 12 16 6 0 24 22 10 13 1 17 7
Variable Shooter_State is_Rebound Type_WRAP-AROUND LastEV_Def_Shot LN_Rebound_Distance_Traveled_byAngle Type_SLAP SHOT is_Rush Type_WRIST SHOT Regressed_Shooting_Indexed Handed_Class2_Opposite LastEV_Off_Shot LastEV_Off_Faceoff LastEV_Off_Give Shot_Distance^2 Shot_Angle^2 Shot_Angle^3 Shot_Distance^3 Shot_Angle LN_Last_Event_Time Shot_Distance LastEV_Def_Give Type_BACKHAND LastEV_Def_Faceoff xG_raw Player_Position2_F Goalie_State LastEV_Neu_Shot LastEV_Neu_Give EmptyNet_SA Type_DEFLECTED LastEV_Neu_Faceoff
Coef [0.443230762862] [0.242593069594] [0.236305225426] [0.196529997088] [0.0938102952462] [0.0880709042958] [0.0704929456784] [0.0687385433112] [0.0545326544149] [0.0411686094917] [0.0204143408809] [0.00932446948272] [0.00491453331725] [0.000910514894216] [0.000151931517311] [-4.32632277469e-07] [-3.29846116297e-06] [-0.00738119577147] [-0.0102381420226] [-0.0786000143722] [-0.122034340806] [-0.141906765043] [-0.144939916331] [-0.172781889777] [-0.215843836221] [-0.317256602872] [-0.377557564797] [-0.392747493842] [-0.512845561177] [-0.549577304022] [-0.716628355712]
In [45]:
All_Model_Scoring(model_data, shot_data_all2, '2015_2016')
2015_2016
2015_2016 seasons dimensions: (217610, 30)
2015_2016 seasons shooting%: 0.0625292955287
2015_2016Max auc_roc: 0.777002681465
2015_2016 seasons goals: 13607, season xG: 13605.6948619
2015_2016 goals scored: 13607
2015_2016 xG scored: 13605.6948619
2015_2016 seasons dimensions: (217610, 30)
2015_2016 seasons rebound%: 0.030462754469
2015_2016 Max auc_roc: 0.676234656964
2015_2016 seasons rebounds: 6629.0, season xR: 6629.73767749
Out[45]:
21 2 19 18 14 20 23 3 28 5 26 30 27 29 4 11 9 8 15 25 16 12 6 24 0 22 13 10 1 17 7
Variable Shooter_State is_Rebound Type_WRAP-AROUND Type_SLAP SHOT LN_Rebound_Distance_Traveled_byAngle Type_WRIST SHOT Handed_Class2_Opposite is_Rush Shot_Angle LastEV_Off_Faceoff Shot_Distance^2 Shot_Angle^3 Shot_Distance^3 Shot_Angle^2 LN_Last_Event_Time LastEV_Off_Give LastEV_Def_Shot LastEV_Off_Shot Regressed_Shooting_Indexed Shot_Distance Type_BACKHAND LastEV_Def_Give LastEV_Def_Faceoff Player_Position2_F xG_raw Goalie_State LastEV_Neu_Give LastEV_Neu_Shot EmptyNet_SA Type_DEFLECTED LastEV_Neu_Faceoff
Coef [0.434352784035] [0.217387469412] [0.214124051205] [0.171990320473] [0.13015520102] [0.115009004209] [0.0473993716049] [0.016660341811] [0.00603524793123] [0.00288536951206] [0.000794841008729] [4.33940532904e-07] [-2.53664345805e-06] [-4.37267199967e-05] [-0.00227667337587] [-0.00596937422759] [-0.0164444195784] [-0.0333510073401] [-0.0632778157823] [-0.0745497498176] [-0.0918397745913] [-0.137538495271] [-0.222625390936] [-0.229699315968] [-0.261292995408] [-0.355609845331] [-0.376252326754] [-0.405605383995] [-0.509204826468] [-0.535699799812] [-0.566667016696]
In [46]:
All_Model_Scoring(model_data, shot_data_all2, '2013_2014')
2013_2014
2013_2014 seasons dimensions: (175529, 30)
2013_2014 seasons shooting%: 0.0626904955876
2013_2014Max auc_roc: 0.770487378959
2013_2014 seasons goals: 11004, season xG: 11002.8547599
2013_2014 goals scored: 11004
2013_2014 xG scored: 11002.8547599
2013_2014 seasons dimensions: (175529, 30)
2013_2014 seasons rebound%: 0.0305818411772
2013_2014 Max auc_roc: 0.654241854268
2013_2014 seasons rebounds: 5368.0, season xR: 5369.87685727
Out[46]:
21 19 18 20 14 2 6 5 11 8 28 26 29 30 27 4 3 23 9 16 25 12 15 0 24 13 22 7 10 1 17
Variable Shooter_State Type_WRAP-AROUND Type_SLAP SHOT Type_WRIST SHOT LN_Rebound_Distance_Traveled_byAngle is_Rebound LastEV_Def_Faceoff LastEV_Off_Faceoff LastEV_Off_Give LastEV_Off_Shot Shot_Angle Shot_Distance^2 Shot_Angle^2 Shot_Angle^3 Shot_Distance^3 LN_Last_Event_Time is_Rush Handed_Class2_Opposite LastEV_Def_Shot Type_BACKHAND Shot_Distance LastEV_Def_Give Regressed_Shooting_Indexed xG_raw Player_Position2_F LastEV_Neu_Give Goalie_State LastEV_Neu_Faceoff LastEV_Neu_Shot EmptyNet_SA Type_DEFLECTED
Coef [0.421541250626] [0.195991297068] [0.180731390782] [0.14615833415] [0.101966515186] [0.0995412193419] [0.0874812352645] [0.0320194794607] [0.00778680101523] [0.00497227573604] [0.00173310331803] [0.000885809270975] [0.000119358110981] [-9.5847370416e-07] [-2.92774036083e-06] [-0.0104735349129] [-0.0149812990545] [-0.0217751440318] [-0.0265930399084] [-0.0665984909775] [-0.0780029052968] [-0.0879427972701] [-0.129339881986] [-0.199616920194] [-0.203785946043] [-0.276553511017] [-0.292844672162] [-0.310902202645] [-0.32141440785] [-0.395679564539] [-0.623100975278]
In [47]:
All_Model_Scoring(model_data, shot_data_all2, '2011_2012')
2011_2012
2011_2012 seasons dimensions: (218979, 30)
2011_2012 seasons shooting%: 0.0630745413944
2011_2012Max auc_roc: 0.781397035575
2011_2012 seasons goals: 13812, season xG: 13811.9586764
2011_2012 goals scored: 13812
2011_2012 xG scored: 13811.9586764
2011_2012 seasons dimensions: (218979, 30)
2011_2012 seasons rebound%: 0.0304732417264
2011_2012 Max auc_roc: 0.662957428347
2011_2012 seasons rebounds: 6673.0, season xR: 6674.41284226
Out[47]:
21 18 20 19 14 23 2 5 11 8 26 29 30 27 28 9 12 4 15 3 25 16 6 0 24 22 13 10 1 7 17
Variable Shooter_State Type_SLAP SHOT Type_WRIST SHOT Type_WRAP-AROUND LN_Rebound_Distance_Traveled_byAngle Handed_Class2_Opposite is_Rebound LastEV_Off_Faceoff LastEV_Off_Give LastEV_Off_Shot Shot_Distance^2 Shot_Angle^2 Shot_Angle^3 Shot_Distance^3 Shot_Angle LastEV_Def_Shot LastEV_Def_Give LN_Last_Event_Time Regressed_Shooting_Indexed is_Rush Shot_Distance Type_BACKHAND LastEV_Def_Faceoff xG_raw Player_Position2_F Goalie_State LastEV_Neu_Give LastEV_Neu_Shot EmptyNet_SA LastEV_Neu_Faceoff Type_DEFLECTED
Coef [0.383646161431] [0.256764086948] [0.142488000303] [0.113918135507] [0.0975769536292] [0.0559406316352] [0.0344296756424] [0.0182337373267] [0.0143922911474] [0.00225758886072] [0.00085221817346] [0.000243772347851] [-1.78506003457e-06] [-2.72353916895e-06] [-0.00345344999923] [-0.00861552976001] [-0.0195882791768] [-0.0211591668439] [-0.0297224028101] [-0.0498855393577] [-0.0789407644771] [-0.0912119624353] [-0.128691218921] [-0.204174082684] [-0.271887137976] [-0.290430989778] [-0.366768312254] [-0.389375254075] [-0.397079300866] [-0.510863084948] [-0.526980726973]

Fin