"""Scrape NCAA Division 1 field hockey schedules and predict unplayed games.

The script reads team codes and schedules from FieldHockeyCorner, fits two
per-team ratings from the games already played (scoring margin and combined
score), and prints a predicted margin and total for the games still to come.

Running the file as a script performs the scraping and printing; importing it
only defines the functions so the rating code can be reused offline.
"""
import csv
import re
from collections import defaultdict
from time import sleep

import numpy as np

# Home field advantage in goals; the original author took this figure from
# the 2012 season.
HOME_FIELD_ADVANTAGE = .7

# Constant added to the two teams' total ratings when predicting a game's
# combined score.
# NOTE(review): presumably the league-average combined score -- confirm.
BASELINE_TOTAL = 4.7

# Seconds to wait for FieldHockeyCorner before giving up on a request.
REQUEST_TIMEOUT = 30

# First "<digits>-<digits>" pair in a result cell, e.g. the "3-2" in "W 3-2".
_SCORE_RE = re.compile(r'(\d+)-(\d+)')

# Offsets that turn "<Mon.> <day>" into an integer that sorts by date within
# one season (e.g. "Oct. 12" -> 1012). Checked in order; first match wins.
_MONTH_OFFSETS = (
    ('Sep. ', 900),
    ('Oct. ', 1000),
    ('Nov. ', 1100),
    ('Dec. ', 1200),
)

# Because some names are listed in multiple ways. Keys are matched against
# the title-cased name (see _clean_name).
name_clean = {
    'Boston C.': 'Boston College',
    'Boston U.': 'Boston University',
    'Vcu': 'Virginia Commonwealth',
    'Appalachian St.': 'Appalachian State',
    'Uc Davis': 'UC Davis',
}


def _fetch_page(url):
    """Return the body text of *url*."""
    # Imported here rather than at module level so the rating functions can
    # be imported and used without the HTTP dependency installed.
    import requests

    return requests.get(url, timeout=REQUEST_TIMEOUT).text


def _clean_name(name):
    """Title-case *name* and map known spelling variants to one canonical form."""
    titled = name.title()
    return name_clean.get(titled, titled)


def get_teams():
    """Read FieldHockeyCorner and return all NCAA Division 1 teams.

    Returns:
        A list of ``(team_code, team_name)`` tuples, in page order.
    """
    url = 'http://www.fieldhockeycorner.com/scores.php?div=1'
    return re.findall("tcode=(.*?)&div=1'>(.*?)<", _fetch_page(url))


def get_schedule(team):
    """Fetch the schedule (played and unplayed games) for one team.

    Args:
        team: A ``(team_code, team_name)`` pair as returned by get_teams().

    Returns:
        A list of game dicts, one per schedule row (see read_table_row()).
    """
    sleep(.5)  # pause between requests so the site is not hammered
    print('Gettings schedule for %s' % team[1])
    url = 'http://www.fieldhockeycorner.com/scores.php?action=schedule&tcode=%s&div=1' % team[0]
    rows = re.findall(
        r'(.*?)<\/td>(.*?)<\/td>(.*?)<\/td>(.*?)<\/td><\/tr>',
        _fetch_page(url),
    )
    return [read_table_row(row, team[1]) for row in rows]


def read_table_row(row, team):
    """Decode one row of a FieldHockeyCorner schedule table.

    Args:
        row: Four strings: date, opponent cell, result cell, notes.
        team: Name of the team whose schedule the row belongs to.

    Returns:
        A dict with keys ``team``, ``date``, ``opponent``, ``notes``,
        ``location``, ``own_score`` and ``other_score``. Both scores are
        digit strings for a played game and ``'.'`` when the result cell
        holds no score.
    """
    # Require digits on both sides of the hyphen so that an arbitrary
    # hyphenated word in the result cell is not mistaken for a score (it
    # would crash the int() conversions in the rating functions).
    match = _SCORE_RE.search(row[2])
    if match:
        own_score, other_score = match.groups()
    else:
        own_score = other_score = '.'

    # NOTE(review): the empty-string test below is always true, so every
    # non-neutral row is labelled 'Home' and the 'Away'/'Other' branches are
    # unreachable. The literal (and the two no-op replace('', '') calls
    # further down) looks like a markup marker that was lost -- confirm
    # against the page HTML before changing it.
    if 'vs. ' in row[1]:
        location = 'Neutral'
    elif '' in row[1]:
        location = 'Home'
    elif 'at' in row[1]:
        location = 'Away'
    else:
        location = 'Other'

    opponent = (row[1].replace('vs. ', '')
                      .replace('', '')
                      .replace('', '')
                      .replace('at ', ''))

    return {
        'team': team,
        'date': row[0],
        'opponent': opponent,
        'notes': row[3],
        'location': location,
        'own_score': own_score,
        'other_score': other_score,
    }


def generate_skill(games, n=50, hfa=HOME_FIELD_ADVANTAGE):
    """Rate teams by scoring margin, adjusted for opponent strength.

    Each pass compares every game's observed margin with the margin the
    current ratings predict and moves each team's rating by the average of
    its surprises. Ratings are re-centred on zero after every pass to
    prevent drift.

    Args:
        games: Played games (dicts from read_table_row() with numeric scores).
        n: Number of passes. The original author notes it seems to converge
            after 10 or so.
        hfa: Goals credited to the listed team when ``location`` is 'Home'.

    Returns:
        A dict mapping cleaned team name to rating; empty if *games* is empty.
    """
    skill = {}
    if not games:
        # Nothing to rate; also avoids taking the mean of an empty list.
        return skill

    for _ in range(n):
        surprises = defaultdict(list)
        for game in games:
            team = _clean_name(game['team'])
            opponent = _clean_name(game['opponent'])
            edge = hfa if game['location'] == 'Home' else 0

            # How unexpected was the margin of victory?
            expected_margin = skill.get(team, 0) - skill.get(opponent, 0) + edge
            observed_margin = int(game['own_score']) - int(game['other_score'])
            surprises[team].append(observed_margin - expected_margin)

        # New skill is old skill plus the average unexpected portion.
        skill = {team: np.mean(diffs) + skill.get(team, 0)
                 for team, diffs in surprises.items()}

        # Centre the skills to prevent drift.
        mean = np.mean(list(skill.values()))
        skill = {team: value - mean for team, value in skill.items()}
    return skill


def generate_total(games, n=20, base_total=BASELINE_TOTAL):
    """Rate teams by combined score (both teams' goals) rather than margin.

    Same iterative scheme as generate_skill(), without the re-centring step.
    TODO: the two functions should probably be combined.

    Args:
        games: Played games (dicts from read_table_row() with numeric scores).
        n: Number of passes.
        base_total: Constant added to the two teams' ratings to form the
            expected combined score.

    Returns:
        A dict mapping cleaned team name to its total-score rating.
    """
    total = {}
    for _ in range(n):
        surprises = defaultdict(list)
        for game in games:
            team = _clean_name(game['team'])
            opponent = _clean_name(game['opponent'])
            expected_total = total.get(team, 0) + total.get(opponent, 0) + base_total
            observed_total = int(game['own_score']) + int(game['other_score'])
            surprises[team].append(observed_total - expected_total)

        # New total is old total plus the new average surprise.
        total = {team: np.mean(diffs) + total.get(team, 0)
                 for team, diffs in surprises.items()}
    return total


def _date_sort_key(date):
    """Turn a "<Mon.> <day>" string into an integer that sorts by date."""
    day = int(date.split()[1])
    for month, offset in _MONTH_OFFSETS:
        if month in date:
            return day + offset
    return day


def make_prediction(games, skill, total,
                    hfa=HOME_FIELD_ADVANTAGE, base_total=BASELINE_TOTAL):
    """Predict margin and combined score for unplayed games.

    Rows whose ``location`` is 'Away' are skipped to avoid listing a game
    twice (once from each team's schedule).

    Args:
        games: Unplayed games (dicts from read_table_row()).
        skill: Ratings from generate_skill().
        total: Ratings from generate_total().
        hfa: Goals credited to the listed team when ``location`` is 'Home'.
        base_total: Constant added to the two teams' total ratings.

    Returns:
        A dict mapping a sortable day key (see _date_sort_key) to a list of
        rows ``[date, matchup, margin, total]``. ``margin`` and ``total`` are
        strings rounded to the nearest half; ``margin`` is the negated
        expected margin of the listed team.
    """
    predictions = {}
    for game in games:
        if game['location'] == 'Away':
            continue

        team = _clean_name(game['team'])
        opponent = _clean_name(game['opponent'])
        if game['location'] == 'Home':
            edge, modifier = hfa, "at"
        else:
            edge, modifier = 0, "vs"

        # Round both figures to the nearest half goal.
        expected_margin = skill.get(team, 0) - skill.get(opponent, 0) + edge
        expected_margin = round(expected_margin * 2, 0) / 2
        total_predict = base_total + total.get(team, 0) + total.get(opponent, 0)
        total_predict = round(total_predict * 2, 0) / 2

        # A margin cannot exceed the combined score in either direction.
        if expected_margin > total_predict:
            expected_margin = total_predict
        elif expected_margin < -total_predict:
            expected_margin = -total_predict

        # Negating 0.0 gives -0.0, which would print as "-0.0".
        shown_margin = -expected_margin
        if shown_margin == 0:
            shown_margin = 0.0

        date = game['date']
        row = [
            date,
            '%s %s %s' % (opponent, modifier, team),
            "%.1f" % shown_margin,
            "%.1f" % total_predict,
        ]
        predictions.setdefault(_date_sort_key(date), []).append(row)
    return predictions


def main():
    """Scrape all schedules, fit the ratings and print the predictions."""
    # Grab the teams and their schedules.
    teams = get_teams()
    schedules = [get_schedule(team) for team in teams]

    # Flatten so it is one list of games, then split into played/unplayed.
    games = [game for schedule in schedules for game in schedule]
    played_games = [game for game in games if game['own_score'] != '.']
    unplayed_games = [game for game in games if game['own_score'] == '.']

    # Take a look at the top teams.
    skills = generate_skill(played_games)
    for team in sorted(skills, key=skills.get, reverse=True)[:5]:
        print('%s %s' % (team, skills[team]))

    totals = generate_total(played_games)
    for team in sorted(totals, key=totals.get, reverse=True)[:10]:
        print('%s %s' % (team, totals[team]))

    # Print every predicted game in date order.
    preds = make_prediction(unplayed_games, skills, totals)
    for day in sorted(preds):
        for row in preds[day]:
            print(' '.join(row))


if __name__ == '__main__':
    main()