Start by scraping the FantasyPros (FP) expert list, then use that list to scrape each expert's rankings.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
# Date stamp used in output filenames, e.g. "2017_08_28".
TODAYS_DATE = datetime.date.today().strftime("%Y_%m_%d")

# Scoring format to scrape: 'ppr' or 'standard'.
SCORING_TYPE = 'ppr'

# Pick the FantasyPros cheatsheet URL matching the chosen scoring format.
if SCORING_TYPE == 'ppr':
    fp_url = 'https://www.fantasypros.com/nfl/rankings/ppr-cheatsheets.php'
elif SCORING_TYPE == 'standard':
    fp_url = 'https://www.fantasypros.com/nfl/rankings/consensus-cheatsheets.php'
else:
    # Fail fast: without this, a typo'd SCORING_TYPE leaves fp_url undefined
    # and the script dies later with a confusing NameError.
    raise ValueError("SCORING_TYPE must be 'ppr' or 'standard', got {!r}".format(SCORING_TYPE))
# Fetch the cheatsheet page and parse the expert directory table.
r = requests.get(fp_url)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.text, 'lxml')

expert_table = soup.find('table', {'id': 'experts'})
if expert_table is None:
    # Page layout changed (or the request was blocked) -- stop with a clear
    # message rather than crashing below with AttributeError on None.
    raise RuntimeError('Could not find the experts table on ' + fp_url)

experts = []
rows = expert_table.findAll('tr')
for tr in rows:
    columns = tr.findAll('td')
    # Header rows use <th> cells, so they yield zero <td> columns -- skip them.
    if len(columns) > 0:
        expert = {}
        # The checkbox input in the first cell carries the expert's numeric id.
        expert['expert_id'] = columns[0].find('input').get('value')
        expert['name'] = columns[1].text.strip()
        expert['source'] = columns[2].text.strip()
        # Ranks render like "#104"; drop the leading '#' then any whitespace.
        expert['in_season_rank'] = columns[3].text.strip('#').strip()
        expert['draft_rank'] = columns[4].text.strip('#').strip()
        expert['date'] = columns[5].text.strip()
        experts.append(expert)

print(len(experts))
print(experts[:5])
87 [{'expert_id': '3', 'source': 'ESPN', 'draft_rank': '108', 'in_season_rank': '104', 'name': 'Eric Karabell', 'date': '2017-08-23 12:00:008/23'}, {'expert_id': '5', 'source': 'ESPN', 'draft_rank': '', 'in_season_rank': '', 'name': 'Staff Composite', 'date': '2017-08-23 12:00:008/23'}, {'expert_id': '7', 'source': 'Yahoo! Sports', 'draft_rank': '69', 'in_season_rank': '65', 'name': 'Andy Behrens', 'date': '2017-08-25 10:51:138/25'}, {'expert_id': '9', 'source': 'Yahoo! Sports', 'draft_rank': '28', 'in_season_rank': '66', 'name': 'Scott Pianowski', 'date': '2017-08-25 02:21:318/24'}, {'expert_id': '15', 'source': 'ScoutFantasy', 'draft_rank': '76', 'in_season_rank': '8', 'name': 'Staff Rankings', 'date': '2017-08-23 07:47:338/23'}]
# Convert the list of expert dicts into a DataFrame (one row per expert).
expert_df = pd.DataFrame(experts)
print(expert_df.head())
date draft_rank expert_id in_season_rank \ 0 2017-08-23 12:00:008/23 108 3 104 1 2017-08-23 12:00:008/23 5 2 2017-08-25 10:51:138/25 69 7 65 3 2017-08-25 02:21:318/24 28 9 66 4 2017-08-23 07:47:338/23 76 15 8 name source 0 Eric Karabell ESPN 1 Staff Composite ESPN 2 Andy Behrens Yahoo! Sports 3 Scott Pianowski Yahoo! Sports 4 Staff Rankings ScoutFantasy
The scraped date field is messy — the full timestamp and a short month/day string are fused together — so keep only the leading `YYYY-MM-DD` part.
# The raw value fuses a timestamp with a short date (e.g. "2017-08-23 12:00:008/23");
# a single split on the first space leaves just the "YYYY-MM-DD" prefix.
expert_df['date'] = expert_df['date'].str.split(' ', n=1).str.get(0)
print(expert_df.head())
date draft_rank expert_id in_season_rank name \ 0 2017-08-23 108 3 104 Eric Karabell 1 2017-08-23 5 Staff Composite 2 2017-08-25 69 7 65 Andy Behrens 3 2017-08-25 28 9 66 Scott Pianowski 4 2017-08-23 76 15 8 Staff Rankings source 0 ESPN 1 ESPN 2 Yahoo! Sports 3 Yahoo! Sports 4 ScoutFantasy
# Persist the cleaned expert directory as a TSV, stamped with the scoring
# type and today's date so repeated runs don't clobber each other.
expert_list_file = 'data/fp_experts_{}_{}.tsv'.format(SCORING_TYPE, TODAYS_DATE)
expert_df.to_csv(expert_list_file, sep='\t', index=False)

# Hold on to the ids -- they drive the per-expert ranking scrape below.
expert_ids = expert_df['expert_id'].values.tolist()
print(len(expert_ids))
87
Here's the function we'll use to scrape each expert's ranking.
def get_expert_rankings(expert_id, score_type, year="2017"):
    """Scrape a single expert's full player rankings from the FantasyPros widget.

    Parameters
    ----------
    expert_id : str
        FantasyPros expert id, as scraped from the experts table.
    score_type : str
        Scoring format passed through to the widget (e.g. PPR or STD).
    year : str, optional
        Season to request. Defaults to "2017", preserving the original
        hard-coded behavior while allowing reuse for other seasons.

    Returns
    -------
    list of dict
        One dict per ranked player with keys 'rank', 'player_name',
        'position', 'team', and 'expert_id'. Empty if no ranking table
        comes back (e.g. the expert has no published rankings).
    """
    return_rows = []
    payload = {"source": '2',
               "id": expert_id,
               "year": year,
               "position": "ALL",
               "scoring": score_type,  # PPR or STD
               "week": "0",  # week 0 == preseason/draft rankings
               "ajax": "true"}
    widget_url = "https://partners.fantasypros.com/external/widget/nfl-staff-rankings.php"
    # Timeout so a dead host can't hang the whole scrape indefinitely.
    r = requests.get(widget_url, params=payload, timeout=30)
    soup = BeautifulSoup(r.content, 'lxml')
    ranking_table = soup.find('table')
    if ranking_table:
        rows = ranking_table.findAll('tr')
        for tr in rows:
            columns = tr.findAll('td')
            # Header rows have no <td> cells -- skip them.
            if len(columns) > 0:
                return_row = {}
                return_row['rank'] = int(columns[0].text.strip())
                return_row['player_name'] = columns[1].find('a').text
                # The <small> tag holds "POS - TEAM, ..." for players;
                # defense rows lack that pattern entirely.
                rest = columns[1].find('small').text
                if "-" in rest:
                    rest_split = rest.split(',')
                    return_row['position'], return_row['team'] = rest_split[0].split(" - ")
                else:
                    # Defense/special teams: the "player" name IS the team name.
                    return_row['position'] = 'D/ST'
                    return_row['team'] = return_row['player_name']
                return_row['expert_id'] = expert_id
                return_rows.append(return_row)
    return return_rows
# Sanity check: scrape one known expert (id '7' = Andy Behrens, Yahoo!)
# and eyeball the top three rows before looping over all experts.
test_ranks = get_expert_rankings('7',SCORING_TYPE)
print(test_ranks[:3])
[{'player_name': 'David Johnson', 'expert_id': '7', 'position': 'RB', 'rank': 1, 'team': 'ARI'}, {'player_name': "Le'Veon Bell", 'expert_id': '7', 'position': 'RB', 'rank': 2, 'team': 'PIT'}, {'player_name': 'Antonio Brown', 'expert_id': '7', 'position': 'WR', 'rank': 3, 'team': 'PIT'}]
from tqdm import tqdm_notebook

# Pull every expert's rankings and flatten them into one observation list,
# with a notebook progress bar since this makes ~87 HTTP requests.
all_ranks = []
for eid in tqdm_notebook(expert_ids):
    all_ranks.extend(get_expert_rankings(eid, SCORING_TYPE))

rankings_df = pd.DataFrame(all_ranks)
print(rankings_df.head())
print(len(rankings_df))
expert_id player_name position rank team 0 3 David Johnson RB 1 ARI 1 3 Le'Veon Bell RB 2 PIT 2 3 Antonio Brown WR 3 PIT 3 3 Odell Beckham Jr. WR 4 NYG 4 3 Mike Evans WR 5 TB 21793
Across the 87 expert rankings, we've managed to put together a dataset of 21,793 player/expert/rank observations.
# Export the combined player/expert/rank dataset, stamped to match
# the expert-list file written earlier.
expert_rankings_file = 'data/fp_rankings_{}_{}.tsv'.format(SCORING_TYPE, TODAYS_DATE)
rankings_df.to_csv(path_or_buf=expert_rankings_file, sep='\t', index=False)