#!/usr/bin/env python
# coding: utf-8

# In[37]:

import pandas as pd
import numpy as np
from sdd_api.api import Api
from credentials import *
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_columns = None
api = Api(username=username, password=password, client_id=client_id, client_secret=client_secret)


# In[38]:

matchups = api.get_dataframe("matchups", season_start=2011)
matchups.sort_values(by="game_datetime")


# In[39]:

matchups.sort_values(by="game_datetime").groupby("season").agg(len)


# ### Let's Predict the Game Winner
# We'll use data from the 2011 season onward, since we have betting-line information for those games.

# In[41]:

# Keep only regular-season and playoff games
matchups = matchups[matchups['game_type'].isin(['Regular', 'Playoffs'])]
# Drop any unscored/unplayed games
matchups = matchups.dropna(subset=["home_pts", "away_pts"])
matchups = matchups.sort_values(by=["game_datetime"])
matchups.tail(20)


# In[42]:

matchups.describe()


# ### We Will Use 538's Elo Algorithm
# - [538's Elo Introduction](https://fivethirtyeight.com/datalab/introducing-nfl-elo-ratings/)
# - [More Elo Description](https://fivethirtyeight.com/datalab/nfl-elo-ratings-are-back/)
#
# ### The features for this model are:
# - game location (home, away, neutral)
# - team score
# - opponent score

# In[43]:

matchups['is_neutral'] = matchups['game_location'].apply(lambda x: x == "N")


# In[44]:

from collections import defaultdict

def silverK(MOV, elo_diff):
    """538's K factor: scales with the log of the margin of victory and
    shrinks as the winner's rating edge grows, damping autocorrelation."""
    K_0 = 20
    multiplier = np.log(abs(MOV) + 1) * (2.2 / (elo_diff * .001 + 2.2))
    return K_0 * multiplier, K_0 * multiplier

def silverS(home_score, away_score):
    """Actual result S: 1 for a win, 0 for a loss, 0.5 each for a tie."""
    S_home, S_away = 0, 0
    if home_score > away_score:
        S_home = 1
    elif away_score > home_score:
        S_away = 1
    else:
        S_home, S_away = .5, .5
    return S_home, S_away

def silver_elo_update(home_score, away_score, home_rating, away_rating, is_neutral):
    # Home advantage (worth about 65 Elo points) is credited at prediction
    # time in predict_home_margin below rather than in the rating update,
    # so is_neutral goes unused here.
    E_home = elo_prediction(home_rating, away_rating)
    E_away = 1 - E_home
    MOV = home_score - away_score
    S_home, S_away = silverS(home_score, away_score)
    if home_score >= away_score:
        elo_winner, elo_loser = home_rating, away_rating
    else:
        elo_winner, elo_loser = away_rating, home_rating
    K_home, K_away = silverK(MOV, elo_winner - elo_loser)
    return K_home * (S_home - E_home), K_away * (S_away - E_away)

def elo_prediction(home_rating, away_rating):
    """Expected home win probability: 1 / (1 + 10 ** ((away - home) / 400))."""
    return 1. / (1 + 10 ** ((away_rating - home_rating) / 400.))

def score_prediction(home_rating, away_rating):
    """538's rule of thumb: every 25 Elo points of edge is worth about one point of margin."""
    return (home_rating - away_rating) / 25.
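# For intuition, here's a quick sanity check of the functions above on a
# made-up game (the numbers are illustrative, not from the data): a
# 1600-rated home team hosts a 1500-rated visitor and wins 24-17.

# In[ ]:

home_rating, away_rating = 1600., 1500.
print(elo_prediction(home_rating, away_rating))    # ~0.64 expected home win probability
print(score_prediction(home_rating, away_rating))  # (1600 - 1500) / 25 = 4.0 point margin
home_shift, away_shift = silver_elo_update(24, 17, home_rating, away_rating, False)
print(home_shift, away_shift)  # ~+14.3 for home, ~-14.3 for away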
# In[ ]:

class HeadToHeadModel(object):
    def __init__(self, events, update_function, prediction_function=None):
        self.update_function = update_function
        self.events = events
        self.ratings = defaultdict(lambda: 1505)  # every team starts at the league mean
        self.prediction_function = prediction_function
        self.predictions = []
        self.curr_season = defaultdict(lambda: self.events[0][1]['season'])

    def compute_elo_ratings(self):
        for idx, event in self.events:
            new_year = event['season']
            label_i = event['home_name']
            label_j = event['away_name']
            # At the start of a new season, carry 75% of a team's rating and
            # regress the remaining 25% back toward the league mean of 1505
            if self.curr_season[label_i] != new_year:
                self.curr_season[label_i] = new_year
                self.ratings[label_i] = self.ratings[label_i] * .75 + 1505. * .25
            if self.curr_season[label_j] != new_year:
                self.curr_season[label_j] = new_year
                self.ratings[label_j] = self.ratings[label_j] * .75 + 1505. * .25
            # TODO: change below to just use event
            update = self.update_function(event['home_pts'], event['away_pts'],
                                          self.ratings[label_i], self.ratings[label_j],
                                          event['is_neutral'])
            self.ratings[label_i] += update[0]
            self.ratings[label_j] += update[1]

    def power_rankings(self):
        from operator import itemgetter
        # Sort the ratings dict by value to get a descending list of teams
        power_rankings = sorted(self.ratings.items(), key=itemgetter(1), reverse=True)
        power = []
        # Shift to 1-based ranks
        for i, x in enumerate(power_rankings):
            power.append((i + 1, x))
        return power

m = HeadToHeadModel(list(matchups.iterrows()), silver_elo_update, elo_prediction)
m.compute_elo_ratings()
m.power_rankings()


# ### What Good Are Rankings Without Testing?
# Elo is a robust algorithm for strength-of-schedule-based ratings. But what good are rankings without testing them? Let's test ours at picking winners, both straight up and against the spread.

# In[45]:

# Redo the Elo model, this time also recording each game's pre-game ratings
class HeadToHeadModel(object):
    def __init__(self, events, update_function, prediction_function=None):
        self.update_function = update_function
        self.events = events
        self.ratings = defaultdict(lambda: 1505)
        self.prediction_function = prediction_function
        self.predictions = []
        self.curr_season = defaultdict(lambda: self.events[0][1]['season'])
        self.elos = []

    def compute_elo_ratings(self):
        for idx, event in self.events:
            new_year = event['season']
            label_i = event['home_name']
            label_j = event['away_name']
            if self.curr_season[label_i] != new_year:
                self.curr_season[label_i] = new_year
                self.ratings[label_i] = self.ratings[label_i] * .75 + 1505. * .25
            if self.curr_season[label_j] != new_year:
                self.curr_season[label_j] = new_year
                self.ratings[label_j] = self.ratings[label_j] * .75 + 1505. * .25
            # TODO: change below to just use event
            update = self.update_function(event['home_pts'], event['away_pts'],
                                          self.ratings[label_i], self.ratings[label_j],
                                          event['is_neutral'])
            # Record ratings as they stood *before* this game's update
            self.elos.append({
                "home_elo": self.ratings[label_i],
                "away_elo": self.ratings[label_j],
                "index": idx,
            })
            self.ratings[label_i] += update[0]
            self.ratings[label_j] += update[1]

    def power_rankings(self):
        from operator import itemgetter
        # Sort the ratings dict by value to get a descending list of teams
        power_rankings = sorted(self.ratings.items(), key=itemgetter(1), reverse=True)
        power = []
        # Shift to 1-based ranks
        for i, x in enumerate(power_rankings):
            power.append((i + 1, x))
        return power

m = HeadToHeadModel(list(matchups.iterrows()), silver_elo_update, elo_prediction)
m.compute_elo_ratings()
m.power_rankings()


# In[46]:

elo = pd.DataFrame(m.elos).set_index("index").join(matchups)
elo.head(2)


# In[47]:

elo.describe()
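# Before scoring predictions, it can help to eyeball a rating trajectory.
# A minimal sketch (the team name below is just an example; substitute any
# value that actually appears in home_name):

# In[ ]:

team = "New England Patriots"  # hypothetical example; use any home_name value
trajectory = elo[elo['home_name'] == team].set_index('game_datetime')['home_elo']
trajectory.plot(title="Pre-game home Elo: " + team)
plt.ylabel("Elo rating")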
# ### Prediction and Testing
# Since we seed our rankings starting with the 2011 season, it would be unfair to expect the model to be right in the beginning. For our purposes we'll measure performance over weeks 4-15 of each season: the first few weeks are excluded so ratings have time to settle, and weeks 16 and 17 are excluded so we don't lose games where teams rest their starters.

# In[48]:

def predict_home_margin(row):
    # How many points the home team is expected to win by,
    # crediting home field advantage as 65 Elo points
    return score_prediction(row['home_elo'] + 65, row['away_elo'])

elo['predicted_home_margin'] = elo.apply(predict_home_margin, axis=1)


# In[49]:

elo.head(1)


# In[50]:

# 1 means the home team is predicted to win
elo['predicted_winner'] = elo.apply(lambda row: 1 if row['predicted_home_margin'] >= 0 else 0, axis=1)


# In[51]:

# The line assigns a negative value to the favorite, so a positive home line means Vegas favors the away team
elo['vegas_predicted_winner'] = elo.apply(lambda row: 0 if row['home_line'] > 0 else 1, axis=1)
elo['winner'] = elo.apply(lambda row: 1 if row['home_pts'] > row['away_pts'] else 0, axis=1)  # does not account for ties

# Since this is a binary decision problem (we either win or lose our bet), we will use [confusion matrices](https://en.wikipedia.org/wiki/Confusion_matrix) to judge the performance of our model.

# In[52]:

from sklearn.metrics import confusion_matrix

def print_confusion_matrix(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    # Accuracy: the diagonal (correct calls) over all games
    success_rate = np.trace(conf_matrix) / np.sum(conf_matrix)
    print(success_rate)
    print(conf_matrix)

test_period = elo[(elo['week_num'].between(4, 15)) &
                  (elo['game_type'] == 'Regular') &
                  (elo['season'].between(2012, 2016))]


# In[53]:

print_confusion_matrix(test_period['winner'], test_period['predicted_winner'])


# In[54]:

print_confusion_matrix(test_period['winner'], test_period['vegas_predicted_winner'])


# ### Vegas Is Pretty Good, Huh?
# We were good at predicting winners, but Vegas is slightly better... The best we've managed with a pure Elo approach, even using more years of training data, is 67%. Let's look at our against-the-spread performance and see if we can find an edge.

# In[55]:

elo.head(1)


# In[56]:

elo['home_margin'] = elo.apply(lambda x: x['home_pts'] - x['away_pts'], axis=1)
# Bet the home side when our predicted margin clears the spread
elo['home_bet'] = elo.apply(lambda x: (x['predicted_home_margin'] + x['home_line']) > 0, axis=1)
# The home team covers when its actual margin clears the spread
elo['home_covers'] = elo.apply(lambda x: (x['home_margin'] + x['home_line']) > 0, axis=1)
test_period = elo[(elo['week_num'].between(4, 15)) &
                  (elo['game_type'] == 'Regular') &
                  (elo['season'].between(2012, 2016))]
print_confusion_matrix(test_period['home_covers'], test_period['home_bet'])

# That won't win for us: under standard wagering you need to be right more than 52.4% of the time against the spread just to break even.

# ### Performance Each Season During the Test Period

# In[57]:

elo['predicted_winner_right'] = elo.apply(lambda x: 1 if x['winner'] == x['predicted_winner'] else 0, axis=1)
elo['vegas_favored_wins'] = elo.apply(lambda x: 1 if x['winner'] == x['vegas_predicted_winner'] else 0, axis=1)
elo['ats_right'] = elo.apply(lambda x: 1 if x['home_covers'] == x['home_bet'] else 0, axis=1)
test_period = elo[(elo['week_num'].between(4, 15)) &
                  (elo['game_type'] == 'Regular') &
                  (elo['season'].between(2012, 2016))]


# In[58]:

test_period[['predicted_winner_right', 'vegas_favored_wins', 'ats_right', 'season']].groupby("season").agg([np.mean, np.var])


# ### Results
# Against the spread we're no better than chance, but our straight-up winner picks run close to Vegas's favorites.

# In[59]:

# Across all seasons in our sample we do about as well as Vegas at picking winners, but no better
test_period[['predicted_winner_right', 'vegas_favored_wins', 'ats_right']].groupby(lambda x: 0).agg([np.mean, np.var])
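# The 52.4% break-even threshold quoted above falls straight out of standard
# -110 pricing: you risk $110 to win $100, so a strategy is profitable only
# when p * 100 > (1 - p) * 110.

# In[ ]:

risk, win = 110., 100.
break_even = risk / (risk + win)
print(break_even)  # ~0.524: beat this rate against the spread or don't bet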
# ### Next Steps
# So now you've seen how to build a strength-of-schedule (SOS) model. You might think you can make it better, and you probably can, but a better use of your time is to create orthogonal features that aren't based on SOS. Something like how a team plays, using:
# - offensive scheme: e.g. a vertical "Air Coryell" offense
# - base_defense: e.g. 3-4
# - rushing yards (and whether they are a great running team)
#
# Check out our team_season_log and team_game_logs tables to get the data you need to build these features; a small sketch follows the previews below.

# In[60]:

team_season_log = api.get_dataframe("team_season_log")
team_season_log.head()


# In[61]:

team_game_logs = api.get_dataframe("team_game_logs")
team_game_logs.head()
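# As a starting point for one such orthogonal feature, here is a minimal
# sketch of a trailing rushing-yards average. The column names 'team_name',
# 'game_datetime', and 'rush_yds' are assumptions for illustration; check
# the team_game_logs.head() output above for the actual schema.

# In[ ]:

logs = team_game_logs.sort_values('game_datetime')  # assumed column names throughout
# Rolling 5-game mean, shifted so each game only sees yards from earlier games
logs['rush_yds_trailing'] = (logs.groupby('team_name')['rush_yds']
                                 .transform(lambda s: s.rolling(5, min_periods=1).mean().shift(1)))
logs[['team_name', 'game_datetime', 'rush_yds_trailing']].head()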