import requests
import re
import csv
import numpy as np
from time import sleep
def get_teams():
    """Scrape the NCAA Division 1 team list from FieldHockeyCorner.

    Returns the list produced by ``re.findall``: one ``(tcode, name)``
    tuple for every team link matched on the Division 1 scores page.
    """
    response = requests.get('http://www.fieldhockeycorner.com/scores.php?div=1')
    link_pattern = "tcode=(.*?)&div=1'>(.*?)<"
    return re.findall(link_pattern, response.text)
def get_schedule(team):
# Download and parse the schedule page for a single team.
# `team` is indexed as team[0] (the tcode substituted into the URL) and
# team[1] (the name printed in the progress message and handed to
# read_table_row).
# Returns a list holding one read_table_row() result per regex match.
# Per the original author's note, the page lists games already played as
# well as games still to be played.
# Pause half a second before every request.
sleep(.5)
print 'Gettings schedule for', team[1]
url = 'http://www.fieldhockeycorner.com/scores.php?action=schedule&tcode=%s&div=1' % team[0]
page = requests.get(url)
# NOTE(review): the pattern literal below is split across two lines and
# appears to have lost its opening HTML tags (only the escaped closing
# tags remain) -- restore it from the upstream page markup and confirm it
# yields four groups per row: date, opponent, result, notes.
table = re.findall('
(.*?)<\/td> | (.*?)<\/td> | (.*?)<\/td> | (.*?)<\/td><\/tr>',page.text)
# Turn every captured row into a game dict tagged with this team's name.
table = [ read_table_row(row, team[1]) for row in table]
return table
def read_table_row(row, team):
    """Decode one schedule-table row from FieldHockeyCorner into a game dict.

    Args:
        row: Sequence of four strings, read as row[0] = date, row[1] =
            opponent cell, row[2] = result cell, row[3] = notes.
        team: Name of the team whose schedule the row belongs to.

    Returns:
        A dict with the keys 'team', 'date', 'opponent', 'notes',
        'location', 'own_score' and 'other_score'. Both scores are digit
        strings when the result cell holds a score, and '.' otherwise.
    """
    # A played game carries a token such as "3-2" in the result cell. Only
    # digits-dash-digits counts as a score, so other dashed text (for
    # example "ppd-rain") marks the game as unplayed instead of producing
    # values that cannot be converted with int() later on.
    own_score = other_score = '.'
    for token in row[2].split():
        found = re.search(r'(\d+)-(\d+)', token)
        if found:
            own_score, other_score = found.groups()
            break

    opponent_cell = row[1]
    # NOTE(review): the 'Home' marker below is an empty string, which every
    # string contains, so any row without 'vs. ' is classified as 'Home' and
    # the 'Away' / 'Other' branches are never reached. The marker (and the
    # two empty replace() calls further down) look like markup that was
    # stripped from this file -- restore them from the upstream page source.
    if 'vs. ' in opponent_cell:
        location = 'Neutral'
    elif '' in opponent_cell:
        location = 'Home'
    elif 'at' in opponent_cell:
        location = 'Away'
    else:
        location = 'Other'

    # Strip the location decorations so only the opponent's name remains.
    opponent = (opponent_cell.replace('vs. ', '')
                .replace('', '')
                .replace('', '')
                .replace('at ', ''))

    return {
        'team': team,
        'date': row[0],
        'opponent': opponent,
        'notes': row[3],
        'location': location,
        'own_score': own_score,
        'other_score': other_score,
    }
# Download the team list, then the schedule of every team on it
teams = get_teams()
schedules = [get_schedule(team) for team in teams]

# Collapse the per-team schedules into one flat list of games
games = []
for sublist in schedules:
    games.extend(sublist)

# Partition the games: a '.' score marks a game that has not been played yet
played_games = []
unplayed_games = []
for game in games:
    if game['own_score'] == '.':
        unplayed_games.append(game)
    else:
        played_games.append(game)

# Some schools are listed under more than one spelling; map the title-cased
# variants onto a single canonical name
name_clean = {
    'Boston C.': 'Boston College',
    'Boston U.': 'Boston University',
    'Vcu': 'Virginia Commonwealth',
    'Appalachian St.': 'Appalachian State',
    'Uc Davis': 'UC Davis',
}
def generate_skill(games, n=50, home_advantage=.7):
    """Rate teams by average winning margin, adjusted for opponent strength.

    Every pass compares each game's observed margin with the margin the
    current ratings predict, then moves each team's rating by the mean of
    its unexpected margins. Ratings are re-centred on zero after every pass
    to prevent drift.

    Args:
        games: List of played-game dicts with the keys 'team', 'opponent',
            'location', 'own_score' and 'other_score' (scores must be
            convertible with int()).
        n: Number of passes. The original author notes the ratings seem to
            converge after roughly ten.
        home_advantage: Goals credited to the team when the game's location
            is 'Home'. The default .7 is the author's figure from 2012.

    Returns:
        Dict mapping cleaned team name to rating. Only names that occur as
        a game's 'team' receive a rating; an opponent without one is
        treated as 0. Returns an empty dict when there are no games.
    """
    if not games:
        # Nothing to rate; also avoids numpy's mean-of-empty warning.
        return {}

    # Name cleaning and score parsing are identical on every pass, so they
    # are done once up front.
    prepared = []
    for game in games:
        team = name_clean.get(game['team'].title(), game['team'].title())
        opponent = name_clean.get(game['opponent'].title(),
                                  game['opponent'].title())
        hfa = home_advantage if game['location'] == 'Home' else 0
        observed_margin = int(game['own_score']) - int(game['other_score'])
        prepared.append((team, opponent, hfa, observed_margin))

    skill = {}
    for _ in range(n):
        # Unexpected part of every margin, grouped by team
        surprises = {}
        for team, opponent, hfa, observed_margin in prepared:
            expected_margin = skill.get(team, 0) - skill.get(opponent, 0) + hfa
            surprises.setdefault(team, []).append(
                observed_margin - expected_margin)
        # New skill is old skill plus the average unexpected margin
        skill = {team: np.mean(surprises[team]) + skill.get(team, 0)
                 for team in surprises}
        # Centre the skills to prevent drift
        mean = np.mean([skill[team] for team in skill])
        skill = {team: skill[team] - mean for team in skill}
    return skill
skills = generate_skill(played_games)
# Print the five highest-rated teams, best first
for team in sorted(skills, key=lambda name: skills[name], reverse=True)[:5]:
    print('%s %s' % (team, skills[team]))
def generate_total(games, n=20, baseline=4.7):
    """Rate teams by how far their games' total goals deviate from a baseline.

    Same iterative scheme as the margin-based rating, applied to the sum of
    both teams' scores: each pass compares a game's observed total with the
    total the current ratings predict and moves the team's rating by the
    mean of its unexpected totals. No re-centring is applied.

    Args:
        games: List of played-game dicts with the keys 'team', 'opponent',
            'own_score' and 'other_score' (scores must be convertible with
            int()).
        n: Number of passes.
        baseline: Total goals expected when both ratings are 0.

    Returns:
        Dict mapping cleaned team name to its total-goals rating. Only
        names that occur as a game's 'team' receive a rating; an opponent
        without one is treated as 0. Empty when there are no games.
    """
    # Name cleaning and score parsing are identical on every pass, so they
    # are done once up front.
    prepared = []
    for game in games:
        team = name_clean.get(game['team'].title(), game['team'].title())
        opponent = name_clean.get(game['opponent'].title(),
                                  game['opponent'].title())
        observed_total = int(game['own_score']) + int(game['other_score'])
        prepared.append((team, opponent, observed_total))

    total = {}
    for _ in range(n):
        # Unexpected part of every game total, grouped by team
        surprises = {}
        for team, opponent, observed_total in prepared:
            expected_total = (total.get(team, 0) + total.get(opponent, 0)
                              + baseline)
            surprises.setdefault(team, []).append(
                observed_total - expected_total)
        # New total is old total plus the average unexpected total
        total = {team: np.mean(surprises[team]) + total.get(team, 0)
                 for team in surprises}
    return total
totals = generate_total(played_games)
# Print the ten teams with the highest total-goals rating, highest first
for team in sorted(totals, key=lambda name: totals[name], reverse=True)[:10]:
    print('%s %s' % (team, totals[team]))
def make_prediction(games, skill, total, home_advantage=.7, baseline=4.7):
    """Predict margin and total goals for games that have not been played.

    Args:
        games: List of unplayed-game dicts with the keys 'team',
            'opponent', 'location' and 'date'. The second whitespace-
            separated token of 'date' must be the day of the month.
        skill: Dict of team name -> margin rating; missing teams count as 0.
        total: Dict of team name -> total-goals rating; missing teams
            count as 0.
        home_advantage: Goals credited to the team when the game's location
            is 'Home'.
        baseline: Total goals expected when both total ratings are 0.

    Returns:
        Dict mapping a sortable day key (day of month plus 900/1000/1100/
        1200 for Sep./Oct./Nov./Dec.; other months get no offset) to a list
        of rows ``[date, matchup, margin, total]``. ``matchup`` reads
        "<opponent> at <team>" for home games and "<opponent> vs <team>"
        otherwise. ``margin`` and ``total`` are strings with one decimal,
        rounded to the nearest half goal; ``margin`` is the negated
        expected margin of ``team``.

    Raises:
        ValueError, IndexError: if a reported game's date has no integer
            day as its second token.
    """
    month_offsets = (('Sep. ', 900), ('Oct. ', 1000),
                     ('Nov. ', 1100), ('Dec. ', 1200))
    predictions = {}
    for game in games:
        # Every matchup appears on both teams' schedules; skipping the away
        # copy avoids reporting it twice. Skipped games are not parsed.
        if game['location'] == 'Away':
            continue

        team = name_clean.get(game['team'].title(), game['team'].title())
        opponent = name_clean.get(game['opponent'].title(),
                                  game['opponent'].title())
        if game['location'] == 'Home':
            hfa = home_advantage
            modifier = "at"
        else:
            hfa = 0
            modifier = "vs"

        # Quick hack to sort by date: month offset plus day of month
        date = game['date']
        day = int(date.split()[1])
        for marker, offset in month_offsets:
            if marker in date:
                day += offset
                break

        # Round both predictions to the nearest half goal
        expected_margin = skill.get(team, 0) - skill.get(opponent, 0) + hfa
        expected_margin = round(expected_margin * 2, 0) / 2
        total_predict = baseline + total.get(team, 0) + total.get(opponent, 0)
        total_predict = round(total_predict * 2, 0) / 2

        # A margin cannot exceed the total goals scored, in either direction
        if expected_margin > total_predict:
            expected_margin = total_predict
        elif total_predict >= 0 and expected_margin < -total_predict:
            expected_margin = -total_predict

        # Negating a zero margin yields -0.0, which formats as "-0.0"
        shown_margin = -expected_margin
        if shown_margin == 0:
            shown_margin = 0.0

        row = [date,
               '%s %s %s' % (opponent, modifier, team),
               "%.1f" % shown_margin,
               "%.1f" % total_predict]
        predictions.setdefault(day, []).append(row)
    return predictions
preds = make_prediction(unplayed_games, skills, totals)
# The day keys exist so predictions can be listed chronologically; print
# every game of each day, not just the first one
for day in sorted(preds):
    for row in preds[day]:
        print(' '.join(row))