import pandas as pd
import numpy as np
import math
import sys
pd.set_option("display.max_columns", 50)
# Download the data from https://s3.amazonaws.com/pbpstats/db_dumps/raw_shots.csv.zip
# and put raw_shots.csv (extracted from the zip) in the working directory.
# GameId is read as a string so that it can be sliced by character position later.
all_shots = pd.read_csv('raw_shots.csv', dtype={'GameId': str})
# Keep only shots that have an X coordinate. The explicit copy makes the
# filtered frame independent of the raw one, so the column assignments that
# follow are not chained assignments on a slice.
all_shots = all_shots[all_shots['X'].notna()].copy()
# Fill in the missing Putback values with False. The result is assigned back
# instead of calling fillna(..., inplace=True) on the column accessor, which
# does not reliably update the parent frame. astype(bool) pins the dtype so
# the column is not treated as an object column by get_dummies later on.
all_shots['Putback'] = all_shots['Putback'].fillna(False).astype(bool)
def is_regular_season(row):
    """Return True when the third character of the row's GameId is '2'.

    ``row`` is any mapping-like object with a string ``'GameId'`` entry;
    every other value of that character yields False.
    """
    season_type_digit = row['GameId'][2]
    return season_type_digit == '2'
# Evaluate is_regular_season once per shot; axis='columns' passes each row to the function.
all_shots['IsRegularSeason'] = all_shots.apply(is_regular_season, axis='columns')
def get_season(row):
    """Build an integer season code from the row's GameId string.

    The code is the first character of the GameId followed by its fourth
    and fifth characters, converted to int:
      * 2017-18 season will be 17, 2016-17 will be 16...
      * 2017-18 season for gleague will be 217
      * 2018 season for wnba will be 118
    """
    game_id = row['GameId']
    league_digit = game_id[0]
    season_digits = game_id[3:5]
    return int(league_digit + season_digits)
# Derive the season code for every shot, one row at a time.
all_shots['Season'] = all_shots.apply(get_season, axis='columns')
# add play start type - start type for possessions with no oreb, missed shot type for shots after orebs
def get_play_start_type(row):
    """Return the play start type label for one shot.

    When ``row['OrebShotType']`` is missing, the label is ``row['StartType']``.
    Otherwise the label is derived from the type of the shot that was
    offensively rebounded:

      * 'Arc3Blocked' / 'Corner3Blocked' -> 'Off3BlockedOreb'
      * 'Corner3' / 'Arc3'               -> 'Off3Oreb'
      * anything else                    -> 'Off' + OrebShotType + 'Oreb'
    """
    oreb_shot_type = row['OrebShotType']
    # pd.isna recognises every missing-value marker (np.nan, float('nan'),
    # None). An identity test against np.nan only matches that one object, so
    # any other missing marker would reach the string concatenation below
    # and raise a TypeError.
    if pd.isna(oreb_shot_type):
        return row['StartType']
    if oreb_shot_type in ('Arc3Blocked', 'Corner3Blocked'):
        # combine off blocked 3 orebs due to sample size issues
        return 'Off3BlockedOreb'
    if oreb_shot_type in ('Corner3', 'Arc3'):
        # combine off 3 orebs because model does better when they are combined
        return 'Off3Oreb'
    return 'Off' + oreb_shot_type + 'Oreb'
# Label every shot with its play start type, one row at a time.
all_shots['PlayStartType'] = all_shots.apply(get_play_start_type, axis='columns')
def get_time_since_play_started(row):
    """Return the time elapsed between the start of the play and the shot.

    When ``row['OrebShotType']`` is missing, the result is
    ``row['StartTime'] - row['Time']``. Otherwise the play is treated as
    starting at the offensive rebound and ``row['SecondsSinceOReb']`` is
    returned.
    """
    # pd.isna recognises every missing-value marker (np.nan, float('nan'),
    # None); an identity test against np.nan only matches that one object and
    # would silently send other missing markers down the rebound branch.
    if pd.isna(row['OrebShotType']):
        return row['StartTime'] - row['Time']
    # use time since oreb if off oreb
    return row['SecondsSinceOReb']
# Compute the elapsed time since the play started for every shot, one row at a time.
all_shots['SecondsSincePlayStarted'] = all_shots.apply(get_time_since_play_started, axis='columns')
def get_shot_distance(row):
    """Return the shot's distance from the coordinate origin, in feet.

    The Euclidean length of (X, Y) is divided by 10 because the unit for
    distance is off by a factor of 10; the result is rounded to one decimal.
    """
    raw_distance = math.sqrt(row['X'] ** 2 + row['Y'] ** 2)
    distance_in_feet = raw_distance / 10
    return round(distance_in_feet, 1)
# Compute the shot distance for every shot, one row at a time.
all_shots['ShotDistance'] = all_shots.apply(get_shot_distance, axis='columns')
def get_shot_angle_from_centre(row):
    """Return the shot angle in degrees, rounded to one decimal.

    0 is straightaway, 90 is from the corners. The absolute angle of
    atan2(X, Y) is used, and angles of 90 degrees or more are reflected
    (180 - angle) so the result always lies between 0 and 90.
    """
    absolute_angle = abs(math.degrees(math.atan2(row['X'], row['Y'])))
    folded_angle = absolute_angle if absolute_angle < 90 else 180 - absolute_angle
    return round(folded_angle, 1)
# Compute the shot angle for every shot, one row at a time.
all_shots['ShotAngle'] = all_shots.apply(get_shot_angle_from_centre, axis='columns')
def make_all_ot_periods_5(row):
    """Return the row's Period, with every value above 5 replaced by 5.

    Small sample size with Period > 5, so all OT periods are made 5.
    """
    period = row['Period']
    return 5 if period > 5 else period
# Overwrite Period with the capped value for every shot, one row at a time.
all_shots['Period'] = all_shots.apply(make_all_ot_periods_5, axis='columns')
# Columns carried forward; everything else in all_shots is dropped here.
base_features = [
    'Season',
    'Made',
    'Period',
    'StartScoreDifferential',
    'ShotValue',
    'Time',
    'Putback',
    'IsRegularSeason',
    'PlayStartType',
    'SecondsSincePlayStarted',
    'ShotDistance',
    'ShotAngle',
]
# Replace the selected frame with its dummy-encoded version; the indicator
# columns that get_dummies creates are named with the 'is' prefix.
all_shots = pd.get_dummies(all_shots[base_features], prefix='is')
# group together start types to see what level of detail on start type does better
def combine_off_block_orebs(row):
    """Return 1 when exactly one blocked-shot offensive rebound flag is set, else 0.

    The flags summed are the 3, at-rim, long mid-range and short mid-range
    'BlockedOreb' indicator columns of ``row``.
    """
    blocked_oreb_columns = (
        'is_Off3BlockedOreb',
        'is_OffAtRimBlockedOreb',
        'is_OffLongMidRangeBlockedOreb',
        'is_OffShortMidRangeBlockedOreb',
    )
    flag_total = sum(row[column] for column in blocked_oreb_columns)
    return 1 if flag_total == 1 else 0
def combine_off_blocks(row):
    """Return 1 when exactly one 'Block' start-type flag is set, else 0.

    The flags summed are the arc 3, corner 3, at-rim, long mid-range and
    short mid-range 'Block' indicator columns of ``row``.
    """
    block_columns = (
        'is_OffArc3Block',
        'is_OffCorner3Block',
        'is_OffAtRimBlock',
        'is_OffLongMidRangeBlock',
        'is_OffShortMidRangeBlock',
    )
    flag_total = sum(row[column] for column in block_columns)
    return 1 if flag_total == 1 else 0
def combine_off_makes(row):
    """Return 1 when exactly one 'Make' start-type flag is set, else 0.

    The flags summed are the arc 3, corner 3, at-rim, long mid-range and
    short mid-range 'Make' indicator columns of ``row``.
    """
    make_columns = (
        'is_OffArc3Make',
        'is_OffCorner3Make',
        'is_OffAtRimMake',
        'is_OffLongMidRangeMake',
        'is_OffShortMidRangeMake',
    )
    flag_total = sum(row[column] for column in make_columns)
    return 1 if flag_total == 1 else 0
def combine_off_orebs(row):
    """Return 1 when exactly one offensive rebound start-type flag is set, else 0.

    The flags summed are the 3, at-rim, long mid-range and short mid-range
    'Oreb' indicator columns of ``row``.
    """
    oreb_columns = (
        'is_Off3Oreb',
        'is_OffAtRimOreb',
        'is_OffLongMidRangeOreb',
        'is_OffShortMidRangeOreb',
    )
    flag_total = sum(row[column] for column in oreb_columns)
    return 1 if flag_total == 1 else 0
def combine_off_missed_fg(row):
    """Return 1 when exactly one 'Miss' start-type flag is set, else 0.

    The flags summed are the arc 3, corner 3, at-rim, long mid-range and
    short mid-range 'Miss' indicator columns of ``row``.
    """
    miss_columns = (
        'is_OffArc3Miss',
        'is_OffCorner3Miss',
        'is_OffAtRimMiss',
        'is_OffLongMidRangeMiss',
        'is_OffShortMidRangeMiss',
    )
    flag_total = sum(row[column] for column in miss_columns)
    return 1 if flag_total == 1 else 0
# Add the coarser, grouped start-type indicators next to the detailed ones.
# Each combiner is evaluated once per shot (axis='columns' passes rows).
all_shots['is_OffBlockedOreb'] = all_shots.apply(combine_off_block_orebs, axis='columns')
all_shots['is_OffBlock'] = all_shots.apply(combine_off_blocks, axis='columns')
all_shots['is_OffMadeFG'] = all_shots.apply(combine_off_makes, axis='columns')
all_shots['is_OffOreb'] = all_shots.apply(combine_off_orebs, axis='columns')
all_shots['is_OffMissedFG'] = all_shots.apply(combine_off_missed_fg, axis='columns')
# Preview the first rows of the finished frame.
all_shots.head()
Season | Made | Period | StartScoreDifferential | ShotValue | Time | Putback | IsRegularSeason | SecondsSincePlayStarted | ShotDistance | ShotAngle | is_Off3BlockedOreb | is_Off3Oreb | is_OffArc3Block | is_OffArc3Make | is_OffArc3Miss | is_OffAtRimBlock | is_OffAtRimBlockedOreb | is_OffAtRimMake | is_OffAtRimMiss | is_OffAtRimOreb | is_OffCorner3Block | is_OffCorner3Make | is_OffCorner3Miss | is_OffDeadball | is_OffFTMake | is_OffFTMiss | is_OffFTOreb | is_OffLiveBallTurnover | is_OffLongMidRangeBlock | is_OffLongMidRangeBlockedOreb | is_OffLongMidRangeMake | is_OffLongMidRangeMiss | is_OffLongMidRangeOreb | is_OffShortMidRangeBlock | is_OffShortMidRangeBlockedOreb | is_OffShortMidRangeMake | is_OffShortMidRangeMiss | is_OffShortMidRangeOreb | is_OffTeamBlockedOreb | is_OffTeamOreb | is_OffTimeout | is_OffBlockedOreb | is_OffBlock | is_OffMadeFG | is_OffOreb | is_OffMissedFG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7 | False | 4 | 19 | 2 | 299.0 | False | False | 23.0 | 21.7 | 42.2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 7 | True | 4 | -19 | 2 | 281.0 | False | False | 16.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 7 | True | 4 | 18 | 2 | 231.0 | False | False | 16.0 | 13.5 | 64.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 7 | False | 4 | -20 | 3 | 195.0 | False | False | 7.0 | 24.7 | 61.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 7 | False | 4 | 20 | 2 | 180.0 | False | False | 14.0 | 18.8 | 74.9 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
# Write the prepared frame to data_for_model.csv in the working directory,
# leaving the row index out of the file.
all_shots.to_csv('data_for_model.csv', index=False)
# Print the pandas version in use.
print(pd.__version__)
0.23.3
# Print the numpy version in use.
print(np.__version__)
1.14.5
# Print the version of the running Python interpreter.
print(sys.version_info)
sys.version_info(major=3, minor=6, micro=1, releaselevel='final', serial=0)