In [1]:
import pandas as pd
import numpy as np
import math
import sys
pd.set_option("display.max_columns",50)
In [2]:
# download data from here: https://s3.amazonaws.com/pbpstats/db_dumps/raw_shots.csv.zip,
# extract it, and put raw_shots.csv in the working directory
# GameId is read as a string because later cells index into it by character position
all_shots = pd.read_csv('raw_shots.csv', dtype={'GameId': str})
In [3]:
# Keep only the shots that have an X coordinate.
# The explicit .copy() makes all_shots an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
all_shots = all_shots.loc[all_shots['X'].notna()].copy()
In [4]:
# fill in missing putback column with False
# The result is assigned back to the frame instead of calling fillna(inplace=True) on
# all_shots.Putback: that form mutates an intermediate Series, which pandas deprecates and which
# no longer updates all_shots under Copy-on-Write.
# astype(bool) keeps Putback a plain boolean column without relying on fillna inferring the dtype.
all_shots['Putback'] = all_shots['Putback'].fillna(False).astype(bool)
In [5]:
def is_regular_season(row):
    """Return True when the third character of the row's GameId is '2', else False.

    The notebook treats a '2' in that position as the marker for a regular season game.
    """
    game_type_digit = row['GameId'][2]
    return game_type_digit == '2'

# Same check as is_regular_season (third GameId character == '2'), done for the whole column at
# once through the .str accessor instead of one Python call per row.
all_shots['IsRegularSeason'] = all_shots['GameId'].str[2] == '2'
In [6]:
def get_season(row):
    """Build the integer season code from the row's GameId.

    The code is the first GameId character followed by characters 3-4, parsed as an int:
      - 2017-18 season will be 17, 2016-17 will be 16...
      - 2017-18 season for gleague will be 217
      - 2018 season for wnba will be 118
    """
    game_id = row['GameId']
    league_prefix = game_id[0]
    season_digits = game_id[3:5]
    return int(league_prefix + season_digits)

# Same code as get_season (first GameId character + characters 3-4, parsed as an integer),
# built with vectorized string slicing instead of one Python call per row.
all_shots['Season'] = (all_shots['GameId'].str[0] + all_shots['GameId'].str[3:5]).astype('int64')
In [7]:
# add play start type - start type for possessions with no oreb, missed shot type for shots after orebs
def get_play_start_type(row):
    """Return the play start type for a shot.

    When the row has no OrebShotType, the row's StartType is returned unchanged.
    Otherwise the value is derived from OrebShotType:
      - 'Arc3Blocked' / 'Corner3Blocked' -> 'Off3BlockedOreb'
      - 'Corner3' / 'Arc3'               -> 'Off3Oreb'
      - anything else                    -> 'Off' + OrebShotType + 'Oreb'
    """
    oreb_shot_type = row['OrebShotType']
    # pd.isna rather than `is np.nan`: the identity test only matches the np.nan singleton, so a
    # missing value stored as None, float('nan') or a numpy float would fall through and fail on
    # the string concatenation below.
    if pd.isna(oreb_shot_type):
        return row['StartType']
    if oreb_shot_type in ('Arc3Blocked', 'Corner3Blocked'):
        # combine off blocked 3 orebs due to sample size issues
        return 'Off3BlockedOreb'
    if oreb_shot_type in ('Corner3', 'Arc3'):
        # combine off 3 orebs because model does better when they are combined
        return 'Off3Oreb'
    return 'Off' + oreb_shot_type + 'Oreb'

# derive PlayStartType for every shot by calling get_play_start_type on each row
all_shots['PlayStartType'] = all_shots.apply(get_play_start_type, axis=1)    
In [8]:
def get_time_since_play_started(row):
    """Return the time elapsed between the start of the play and the shot.

    When the row has no OrebShotType this is StartTime - Time; otherwise the row's
    SecondsSinceOReb is returned.
    """
    # pd.isna rather than `is np.nan`: the identity test only matches the np.nan singleton and
    # would misclassify a missing value stored as None, float('nan') or a numpy float.
    if pd.isna(row['OrebShotType']):
        return row['StartTime'] - row['Time']
    # use time since oreb if off oreb
    return row['SecondsSinceOReb']

# Same rule as get_time_since_play_started, evaluated column-wise instead of row by row:
# keep SecondsSinceOReb where an OrebShotType is present, otherwise use StartTime - Time.
all_shots['SecondsSincePlayStarted'] = all_shots['SecondsSinceOReb'].where(
    all_shots['OrebShotType'].notna(),
    all_shots['StartTime'] - all_shots['Time'],
)
In [9]:
def get_shot_distance(row):
    """Return the straight-line distance of (X, Y) from the coordinate origin, in feet.

    The result is rounded to one decimal place.
    """
    # unit for distance is off by factor of 10, divide by 10 to convert to feet
    distance_in_feet = math.sqrt(row['X'] ** 2 + row['Y'] ** 2) / 10
    return round(distance_in_feet, 1)

# compute ShotDistance for every shot by calling get_shot_distance on each row
all_shots['ShotDistance'] = all_shots.apply(get_shot_distance, axis=1)    
In [10]:
def get_shot_angle_from_centre(row):
    """Return the shot angle in degrees, rounded to one decimal place.

    0 is straightaway, 90 is from the corners. The sign of X is ignored, and angles
    of 90 degrees or more are mirrored with 180 - angle so the result stays within 0-90.
    """
    degrees_off_centre = abs(math.degrees(math.atan2(row['X'], row['Y'])))
    if degrees_off_centre >= 90:
        # mirror angles of 90 or more back into the 0-90 range
        degrees_off_centre = 180 - degrees_off_centre
    return round(degrees_off_centre, 1)

# compute ShotAngle for every shot by calling get_shot_angle_from_centre on each row
all_shots['ShotAngle'] = all_shots.apply(get_shot_angle_from_centre, axis=1)    
In [11]:
def make_all_ot_periods_5(row):
    """Return the row's Period, with any value above 5 replaced by 5."""
    period = row['Period']
    # small sample size with Period > 5, make all OT periods 5
    return 5 if period > 5 else period

# small sample size with Period > 5, so cap every period above 5 at 5
# (column-wise equivalent of calling make_all_ot_periods_5 on each row)
all_shots['Period'] = all_shots['Period'].clip(upper=5)

base_features = ['Season', 'Made', 'Period', 'StartScoreDifferential', 'ShotValue', 'Time', 'Putback', 'IsRegularSeason', 'PlayStartType', 'SecondsSincePlayStarted', 'ShotDistance', 'ShotAngle']
# Keep only the model features and one-hot encode PlayStartType into is_<PlayStartType> columns.
# columns=[...] names the encoded column explicitly, so no other feature (e.g. Putback) is
# encoded if pandas ever infers it as object dtype.
# dtype=np.uint8 keeps the indicators as 0/1 integers on pandas versions whose default
# indicator dtype is bool.
all_shots = pd.get_dummies(all_shots[base_features], columns=['PlayStartType'], prefix='is', dtype=np.uint8)
In [12]:
# Group related start types into coarser indicators, to compare which level of start-type detail works better

def _exactly_one_flag_set(row, flag_columns):
    """Return 1 when the values of flag_columns in row add up to exactly 1, else 0."""
    total = row[flag_columns[0]]
    for column in flag_columns[1:]:
        total = total + row[column]
    return 1 if total == 1 else 0


def combine_off_block_orebs(row):
    """Return 1 if the row's start type is any of the blocked-shot offensive rebound types, else 0."""
    return _exactly_one_flag_set(row, [
        'is_Off3BlockedOreb',
        'is_OffAtRimBlockedOreb',
        'is_OffLongMidRangeBlockedOreb',
        'is_OffShortMidRangeBlockedOreb',
    ])


def combine_off_blocks(row):
    """Return 1 if the row's start type is any of the off-block types, else 0."""
    return _exactly_one_flag_set(row, [
        'is_OffArc3Block',
        'is_OffCorner3Block',
        'is_OffAtRimBlock',
        'is_OffLongMidRangeBlock',
        'is_OffShortMidRangeBlock',
    ])


def combine_off_makes(row):
    """Return 1 if the row's start type is any of the off-made-field-goal types, else 0."""
    return _exactly_one_flag_set(row, [
        'is_OffArc3Make',
        'is_OffCorner3Make',
        'is_OffAtRimMake',
        'is_OffLongMidRangeMake',
        'is_OffShortMidRangeMake',
    ])


def combine_off_orebs(row):
    """Return 1 if the row's start type is any of the (unblocked) field goal offensive rebound types, else 0."""
    return _exactly_one_flag_set(row, [
        'is_Off3Oreb',
        'is_OffAtRimOreb',
        'is_OffLongMidRangeOreb',
        'is_OffShortMidRangeOreb',
    ])


def combine_off_missed_fg(row):
    """Return 1 if the row's start type is any of the off-missed-field-goal types, else 0."""
    return _exactly_one_flag_set(row, [
        'is_OffArc3Miss',
        'is_OffCorner3Miss',
        'is_OffAtRimMiss',
        'is_OffLongMidRangeMiss',
        'is_OffShortMidRangeMiss',
    ])
In [13]:
# add one grouped indicator column per start-type family; each combine_* function returns 1
# when exactly one of the family's detailed is_* columns is set for the row, else 0
all_shots['is_OffBlockedOreb'] = all_shots.apply(combine_off_block_orebs, axis=1)
all_shots['is_OffBlock'] = all_shots.apply(combine_off_blocks, axis=1)
all_shots['is_OffMadeFG'] = all_shots.apply(combine_off_makes, axis=1)
all_shots['is_OffOreb'] = all_shots.apply(combine_off_orebs, axis=1)
all_shots['is_OffMissedFG'] = all_shots.apply(combine_off_missed_fg, axis=1)
In [14]:
# preview the first rows of the feature table, including the grouped start-type columns
all_shots.head()
Out[14]:
Season Made Period StartScoreDifferential ShotValue Time Putback IsRegularSeason SecondsSincePlayStarted ShotDistance ShotAngle is_Off3BlockedOreb is_Off3Oreb is_OffArc3Block is_OffArc3Make is_OffArc3Miss is_OffAtRimBlock is_OffAtRimBlockedOreb is_OffAtRimMake is_OffAtRimMiss is_OffAtRimOreb is_OffCorner3Block is_OffCorner3Make is_OffCorner3Miss is_OffDeadball is_OffFTMake is_OffFTMiss is_OffFTOreb is_OffLiveBallTurnover is_OffLongMidRangeBlock is_OffLongMidRangeBlockedOreb is_OffLongMidRangeMake is_OffLongMidRangeMiss is_OffLongMidRangeOreb is_OffShortMidRangeBlock is_OffShortMidRangeBlockedOreb is_OffShortMidRangeMake is_OffShortMidRangeMiss is_OffShortMidRangeOreb is_OffTeamBlockedOreb is_OffTeamOreb is_OffTimeout is_OffBlockedOreb is_OffBlock is_OffMadeFG is_OffOreb is_OffMissedFG
0 7 False 4 19 2 299.0 False False 23.0 21.7 42.2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
1 7 True 4 -19 2 281.0 False False 16.0 0.0 0.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
2 7 True 4 18 2 231.0 False False 16.0 13.5 64.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 7 False 4 -20 3 195.0 False False 7.0 24.7 61.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 7 False 4 20 2 180.0 False False 14.0 18.8 74.9 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
In [15]:
# write the feature table to data_for_model.csv in the working directory, without the index column
all_shots.to_csv('data_for_model.csv', index=False)
In [16]:
# record the pandas version this notebook was run with
print(pd.__version__)
0.23.3
In [17]:
# record the numpy version this notebook was run with
print(np.__version__)
1.14.5
In [18]:
# record the Python interpreter version this notebook was run with
print(sys.version_info)
sys.version_info(major=3, minor=6, micro=1, releaselevel='final', serial=0)
In [ ]: