Start by scraping the FantasyPros (FP) expert list, then use that list to scrape each expert's rankings.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
# Date stamp used in output filenames, e.g. "2017_08_28".
TODAYS_DATE = datetime.date.today().strftime("%Y_%m_%d")

# Scoring format to scrape: 'ppr' or 'standard'.
SCORING_TYPE = 'ppr'

# Pick the FantasyPros cheatsheet URL matching the chosen scoring format.
if SCORING_TYPE == 'ppr':
    fp_url = 'https://www.fantasypros.com/nfl/rankings/ppr-cheatsheets.php'
elif SCORING_TYPE == 'standard':
    fp_url = 'https://www.fantasypros.com/nfl/rankings/consensus-cheatsheets.php'
else:
    # Fail fast: without this, a typo'd SCORING_TYPE leaves fp_url undefined
    # and the script dies later with a confusing NameError.
    raise ValueError("SCORING_TYPE must be 'ppr' or 'standard', got {!r}".format(SCORING_TYPE))
# Fetch the cheatsheet page and parse the expert directory table.
r = requests.get(fp_url)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.text, 'lxml')

expert_table = soup.find('table', {'id': 'experts'})
if expert_table is None:
    # Page layout changed (or the request was blocked) -- stop with a clear
    # message rather than crashing below with AttributeError on None.
    raise RuntimeError('Could not find the experts table on ' + fp_url)

experts = []
rows = expert_table.findAll('tr')
for tr in rows:
    columns = tr.findAll('td')
    # Header rows use <th> cells, so they yield zero <td> columns -- skip them.
    if len(columns) > 0:
        expert = {}
        # The checkbox input in the first cell carries the expert's numeric id.
        expert['expert_id'] = columns[0].find('input').get('value')
        expert['name'] = columns[1].text.strip()
        expert['source'] = columns[2].text.strip()
        # Ranks render like "#104"; drop the leading '#' then any whitespace.
        expert['in_season_rank'] = columns[3].text.strip('#').strip()
        expert['draft_rank'] = columns[4].text.strip('#').strip()
        expert['date'] = columns[5].text.strip()
        experts.append(expert)

print(len(experts))
print(experts[:5])
87 [{'expert_id': '3', 'source': 'ESPN', 'draft_rank': '108', 'in_season_rank': '104', 'name': 'Eric Karabell', 'date': '2017-08-23 12:00:008/23'}, {'expert_id': '5', 'source': 'ESPN', 'draft_rank': '', 'in_season_rank': '', 'name': 'Staff Composite', 'date': '2017-08-23 12:00:008/23'}, {'expert_id': '7', 'source': 'Yahoo! Sports', 'draft_rank': '69', 'in_season_rank': '65', 'name': 'Andy Behrens', 'date': '2017-08-25 10:51:138/25'}, {'expert_id': '9', 'source': 'Yahoo! Sports', 'draft_rank': '28', 'in_season_rank': '66', 'name': 'Scott Pianowski', 'date': '2017-08-25 02:21:318/24'}, {'expert_id': '15', 'source': 'ScoutFantasy', 'draft_rank': '76', 'in_season_rank': '8', 'name': 'Staff Rankings', 'date': '2017-08-23 07:47:338/23'}]
# Convert the list of expert dicts into a DataFrame (one row per expert).
expert_df = pd.DataFrame(experts)
print(expert_df.head())
date draft_rank expert_id in_season_rank \ 0 2017-08-23 12:00:008/23 108 3 104 1 2017-08-23 12:00:008/23 5 2 2017-08-25 10:51:138/25 69 7 65 3 2017-08-25 02:21:318/24 28 9 66 4 2017-08-23 07:47:338/23 76 15 8 name source 0 Eric Karabell ESPN 1 Staff Composite ESPN 2 Andy Behrens Yahoo! Sports 3 Scott Pianowski Yahoo! Sports 4 Staff Rankings ScoutFantasy
The scraped date field is messy — the full timestamp and a short month/day string are fused together — so keep only the leading `YYYY-MM-DD` part.
# The raw value fuses a timestamp with a short date (e.g. "2017-08-23 12:00:008/23");
# a single split on the first space leaves just the "YYYY-MM-DD" prefix.
expert_df['date'] = expert_df['date'].str.split(' ', n=1).str.get(0)
print(expert_df.head())
date draft_rank expert_id in_season_rank name \ 0 2017-08-23 108 3 104 Eric Karabell 1 2017-08-23 5 Staff Composite 2 2017-08-25 69 7 65 Andy Behrens 3 2017-08-25 28 9 66 Scott Pianowski 4 2017-08-23 76 15 8 Staff Rankings source 0 ESPN 1 ESPN 2 Yahoo! Sports 3 Yahoo! Sports 4 ScoutFantasy
# Persist the cleaned expert directory as a TSV, stamped with the scoring
# type and today's date so repeated runs don't clobber each other.
expert_list_file = 'data/fp_experts_{}_{}.tsv'.format(SCORING_TYPE, TODAYS_DATE)
expert_df.to_csv(expert_list_file, sep='\t', index=False)

# Hold on to the ids -- they drive the per-expert ranking scrape below.
expert_ids = expert_df['expert_id'].values.tolist()
print(len(expert_ids))
87
Here's the function we'll use to scrape each expert's ranking.
def get_expert_rankings(expert_id, score_type, year="2017"):
    """Scrape a single expert's full player rankings from the FantasyPros widget.

    Parameters
    ----------
    expert_id : str
        FantasyPros expert id, as scraped from the experts table.
    score_type : str
        Scoring format passed through to the widget (e.g. PPR or STD).
    year : str, optional
        Season to request. Defaults to "2017", preserving the original
        hard-coded behavior while allowing reuse for other seasons.

    Returns
    -------
    list of dict
        One dict per ranked player with keys 'rank', 'player_name',
        'position', 'team', and 'expert_id'. Empty if no ranking table
        comes back (e.g. the expert has no published rankings).
    """
    return_rows = []
    payload = {"source": '2',
               "id": expert_id,
               "year": year,
               "position": "ALL",
               "scoring": score_type,  # PPR or STD
               "week": "0",  # week 0 == preseason/draft rankings
               "ajax": "true"}
    widget_url = "https://partners.fantasypros.com/external/widget/nfl-staff-rankings.php"
    # Timeout so a dead host can't hang the whole scrape indefinitely.
    r = requests.get(widget_url, params=payload, timeout=30)
    soup = BeautifulSoup(r.content, 'lxml')
    ranking_table = soup.find('table')
    if ranking_table:
        rows = ranking_table.findAll('tr')
        for tr in rows:
            columns = tr.findAll('td')
            # Header rows have no <td> cells -- skip them.
            if len(columns) > 0:
                return_row = {}
                return_row['rank'] = int(columns[0].text.strip())
                return_row['player_name'] = columns[1].find('a').text
                # The <small> tag holds "POS - TEAM, ..." for players;
                # defense rows lack that pattern entirely.
                rest = columns[1].find('small').text
                if "-" in rest:
                    rest_split = rest.split(',')
                    return_row['position'], return_row['team'] = rest_split[0].split(" - ")
                else:
                    # Defense/special teams: the "player" name IS the team name.
                    return_row['position'] = 'D/ST'
                    return_row['team'] = return_row['player_name']
                return_row['expert_id'] = expert_id
                return_rows.append(return_row)
    return return_rows
# Sanity check: scrape one known expert (id '7' = Andy Behrens, Yahoo!)
# and eyeball the top three rows before looping over all experts.
test_ranks = get_expert_rankings('7',SCORING_TYPE)
print(test_ranks[:3])
[{'player_name': 'David Johnson', 'expert_id': '7', 'position': 'RB', 'rank': 1, 'team': 'ARI'}, {'player_name': "Le'Veon Bell", 'expert_id': '7', 'position': 'RB', 'rank': 2, 'team': 'PIT'}, {'player_name': 'Antonio Brown', 'expert_id': '7', 'position': 'WR', 'rank': 3, 'team': 'PIT'}]
from tqdm import tqdm_notebook

# Pull every expert's rankings and flatten them into one observation list,
# with a notebook progress bar since this makes ~87 HTTP requests.
all_ranks = []
for eid in tqdm_notebook(expert_ids):
    all_ranks.extend(get_expert_rankings(eid, SCORING_TYPE))

rankings_df = pd.DataFrame(all_ranks)
print(rankings_df.head())
print(len(rankings_df))
expert_id player_name position rank team 0 3 David Johnson RB 1 ARI 1 3 Le'Veon Bell RB 2 PIT 2 3 Antonio Brown WR 3 PIT 3 3 Odell Beckham Jr. WR 4 NYG 4 3 Mike Evans WR 5 TB 21793
Across the 87 expert rankings, we've managed to put together a dataset of 21,793 player/expert/rank observations.
# Export the combined player/expert/rank dataset, stamped to match
# the expert-list file written earlier.
expert_rankings_file = 'data/fp_rankings_{}_{}.tsv'.format(SCORING_TYPE, TODAYS_DATE)
rankings_df.to_csv(path_or_buf=expert_rankings_file, sep='\t', index=False)