#!/usr/bin/env python
# coding: utf-8
"""Prepare the pbpstats raw shot dump for shot-model training.

Download https://s3.amazonaws.com/pbpstats/db_dumps/raw_shots.csv.zip and put
the extracted ``raw_shots.csv`` in the working directory, then run this
script. It writes the engineered feature table to ``data_for_model.csv``.
"""

import math  # noqa: F401  (imported by the original notebook export)
import sys

import numpy as np
import pandas as pd

RAW_SHOTS_PATH = 'raw_shots.csv'
MODEL_DATA_PATH = 'data_for_model.csv'

# Columns kept for the model; PlayStartType is one-hot encoded afterwards.
BASE_FEATURES = [
    'Season', 'Made', 'Period', 'StartScoreDifferential', 'ShotValue', 'Time',
    'Putback', 'IsRegularSeason', 'PlayStartType', 'SecondsSincePlayStarted',
    'ShotDistance', 'ShotAngle',
]

# Group together start types to see what level of detail on start type does
# better: (combined column, detailed dummy columns folded into it).
START_TYPE_GROUPS = [
    ('is_OffBlockedOreb', [
        'is_Off3BlockedOreb', 'is_OffAtRimBlockedOreb',
        'is_OffLongMidRangeBlockedOreb', 'is_OffShortMidRangeBlockedOreb',
    ]),
    ('is_OffBlock', [
        'is_OffArc3Block', 'is_OffCorner3Block', 'is_OffAtRimBlock',
        'is_OffLongMidRangeBlock', 'is_OffShortMidRangeBlock',
    ]),
    ('is_OffMadeFG', [
        'is_OffArc3Make', 'is_OffCorner3Make', 'is_OffAtRimMake',
        'is_OffLongMidRangeMake', 'is_OffShortMidRangeMake',
    ]),
    ('is_OffOreb', [
        'is_Off3Oreb', 'is_OffAtRimOreb', 'is_OffLongMidRangeOreb',
        'is_OffShortMidRangeOreb',
    ]),
    ('is_OffMissedFG', [
        'is_OffArc3Miss', 'is_OffCorner3Miss', 'is_OffAtRimMiss',
        'is_OffLongMidRangeMiss', 'is_OffShortMidRangeMiss',
    ]),
]


def oreb_start_type(shot_type):
    """Return the play start type for a shot taken after an offensive rebound.

    ``shot_type`` is the ``OrebShotType`` value (the type of the missed shot
    that was rebounded) and must be a string.
    """
    if shot_type in ('Arc3Blocked', 'Corner3Blocked'):
        # combine off blocked 3 orebs due to sample size issues
        return 'Off3BlockedOreb'
    if shot_type in ('Corner3', 'Arc3'):
        # combine off 3 orebs because model does better when they are combined
        return 'Off3Oreb'
    return 'Off' + shot_type + 'Oreb'


def add_shot_features(all_shots):
    """Return a copy of ``all_shots`` with the engineered feature columns.

    Rows without an ``X`` coordinate are dropped. The input frame is not
    modified. Added or rewritten columns: ``Putback``, ``IsRegularSeason``,
    ``Season``, ``PlayStartType``, ``SecondsSincePlayStarted``,
    ``ShotDistance``, ``ShotAngle`` and ``Period``.
    """
    # Explicit copy so the assignments below never write through a filtered
    # view (chained assignment is a no-op under pandas Copy-on-Write).
    shots = all_shots[all_shots['X'].notna()].copy()

    # Fill in missing putback values with False. infer_objects() turns the
    # resulting all-boolean object column into a real bool column, so it is
    # not picked up by get_dummies on pandas versions that no longer
    # downcast in fillna().
    shots['Putback'] = shots['Putback'].fillna(False).infer_objects()

    game_id = shots['GameId']
    shots['IsRegularSeason'] = game_id.str[2] == '2'
    # 2017-18 season will be 17, 2016-17 will be 16...
    # 2017-18 season for gleague will be 217
    # 2018 season for wnba will be 118
    shots['Season'] = (game_id.str[0] + game_id.str[3:5]).astype('int64')

    # Play start type: start type for possessions with no oreb, missed shot
    # type for shots after orebs.
    oreb_type = shots['OrebShotType']
    no_oreb = oreb_type.isna()
    after_oreb = oreb_type.map(oreb_start_type, na_action='ignore')
    shots['PlayStartType'] = shots['StartType'].where(no_oreb, after_oreb)

    # Use time since oreb if the shot follows an offensive rebound.
    shots['SecondsSincePlayStarted'] = shots['SecondsSinceOReb'].where(
        ~no_oreb, shots['StartTime'] - shots['Time'])

    x = shots['X'].to_numpy(dtype=float)
    y = shots['Y'].to_numpy(dtype=float)

    # Unit for distance is off by factor of 10, divide by 10 to convert to
    # feet.
    shots['ShotDistance'] = np.round(np.sqrt(x ** 2 + y ** 2) / 10, 1)

    # 0 is straightaway, 90 is from the corners; angles past 90 are folded
    # back into the 0-90 range.
    angle = np.abs(np.degrees(np.arctan2(x, y)))
    shots['ShotAngle'] = np.round(
        np.where(angle < 90, angle, 180 - angle), 1)

    # Small sample size with Period > 5, make all OT periods 5.
    shots['Period'] = shots['Period'].clip(upper=5)
    return shots


def exactly_one_flag(frame, columns):
    """Return a 0/1 integer Series: 1 where exactly one of ``columns`` is set.

    Columns missing from ``frame`` (a start type that never occurs in the
    data) are treated as all zeros instead of raising ``KeyError``.
    """
    present = [name for name in columns if name in frame.columns]
    return (frame[present].sum(axis=1) == 1).astype('int64')


def build_model_data(all_shots):
    """Return the model-ready table built from the raw shots frame.

    Keeps ``BASE_FEATURES``, one-hot encodes the string columns with the
    ``is_`` prefix, and appends the combined start-type columns listed in
    ``START_TYPE_GROUPS``.
    """
    shots = add_shot_features(all_shots)
    model_data = pd.get_dummies(shots[BASE_FEATURES], prefix='is')
    for combined_name, detailed_names in START_TYPE_GROUPS:
        model_data[combined_name] = exactly_one_flag(
            model_data, detailed_names)
    return model_data


def main():
    """Read the raw shots, write the model data, and print library versions."""
    pd.set_option("display.max_columns", 50)
    model_data = build_model_data(
        pd.read_csv(RAW_SHOTS_PATH, dtype={'GameId': str}))
    model_data.to_csv(MODEL_DATA_PATH, index=False)

    print(pd.__version__)
    print(np.__version__)
    print(sys.version_info)


if __name__ == '__main__':
    main()