#!/usr/bin/env python
# coding: utf-8

# In[13]:

url_base='http://www.wrc.com/service/sasCacheApi.php?route={stub}'


# In[14]:

itinerary_stub='rallies/{rallyId}/itinerary'
startlists_stub='rallies/{rallyId}/entries'
penalties_stub='rallies/{rallyId}/penalties'
retirements_stub='rallies/{rallyId}/retirements'
stagewinners_stub='rallies/{rallyId}/stagewinners'
overall_stub='stages/{stageId}/results?rallyId={rallyId}'
split_times_stub='stages/{stageId}/splittimes?rallyId={rallyId}'
stage_times_stage_stub='stages/{stageId}/stagetimes?rallyId={rallyId}'
stage_times_overall_stub='stages/{stageId}/results?rallyId={rallyId}'
championship_stub='championships/{championshipId}'
championship_results_stub='championships/{championshipId}/results'


# In[15]:

meta={'rallyId':30}


# ## Scrape page for rallyId
# 
# The `rallyId` used in the API seems to differ from the IDs in the data returned by the API, so we need to scrape the rally pages to get the actual ID.

# In[16]:

import requests
import re
from bs4 import BeautifulSoup


# In[17]:

results_main_url='http://www.wrc.com/en/wrc/results/wales/stage-times/page/416-238---.html#'


# In[18]:

html=requests.get(results_main_url)
soup=BeautifulSoup(html.content, "html5lib")

#BeautifulSoup has a routine - find_all() - that will find all the HTML tags of a particular sort
#Links are represented in HTML pages in the form <a href="url">link text</a>
#Grab all the <li class="flag"> elements; each contains an <a> (anchor) tag...
souplist=soup.find_all("li", {'class': 'flag'})


# In[19]:

items={}
for s in souplist:
    href=s.find('a')['href']
    if href:
        title=s.find('img')['title']
        title = 'Monaco' if title == 'Monte Carlo' else title
        items[title]=href
items


# Note that the names don't properly match any single column in the other tables: we need to match on either country or location in the championship table, or change "Monte Carlo" to "Monaco" (as done above).

# In[20]:

url='http://www.wrc.com/en/wrc/results/wales/stage-times/page/416-238---.html'
html=requests.get(url)


# In[21]:

m = re.search("var rallyId = '(.+?)'", html.text)


# In[22]:

if m:
    print(m.group(1))


# In[23]:

import pandas as pd
import requests


# In[24]:

rallyids={}
for item in items:
    html=requests.get(items[item])
    m = re.search("var rallyId = '(.+?)'", html.text)
    if m:
        rallyids[item] = m.group(1)
rallyids


# ## Itinerary

# In[25]:

itinerary_json=requests.get( url_base.format(stub=itinerary_stub.format(**meta)) ).json()
#itinerary_json


# If no data is returned, we get an empty list.
# 
# Need to check that we do get a response, eg `if itinerary_json:` - see the guarded fetch sketch below.

# In[61]:

itinerary_json


# In[26]:

from pandas.io.json import json_normalize

itinerary_event = json_normalize(itinerary_json).drop('itineraryLegs', axis=1)
itinerary_event


# In[27]:

itinerary_legs = json_normalize(itinerary_json, record_path='itineraryLegs', meta='eventId').drop('itinerarySections', axis=1)
#?don't need eventId?
itinerary_legs


# In[57]:

itinerary_sections = json_normalize(itinerary_json, ['itineraryLegs', 'itinerarySections'], meta='eventId').drop(['stages','controls'], axis=1)
#?don't need eventId?
itinerary_sections
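# As noted above, the API returns an empty list when there is no data for a request, so each fetch is worth guarding. A minimal sketch of such a helper (the `get_json` name and behaviour are our own, not part of the API):

# In[ ]:

#Minimal guarded fetch: return None rather than an empty list when the API has no data
def get_json(stub, meta):
    _json = requests.get( url_base.format(stub=stub.format(**meta)) ).json()
    return _json if _json else None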
# In[163]:

json_normalize(itinerary_json, ['itineraryLegs', 'itinerarySections', 'stages'],
               meta=[['itineraryLegs','itinerarySections','itinerarySectionId']])#.drop(['controls'],axis=1)


# In[167]:

itinerary_stages=json_normalize(itinerary_json['itineraryLegs'], ['itinerarySections','stages'],
                                meta=['itineraryLegId',['itinerarySections','itinerarySectionId']])
itinerary_stages.head(10)


# In[30]:

meta['stages']=itinerary_stages['stageId'].tolist()


# In[168]:

itinerary_controls=json_normalize(itinerary_json['itineraryLegs'], ['itinerarySections','controls'],
                                  meta=['itineraryLegId',['itinerarySections','itinerarySectionId']])
itinerary_controls['stageId'] = itinerary_controls['stageId'].fillna(-1).astype(int)
itinerary_controls.head()


# ## Startlists

# In[32]:

startlists_json=requests.get( url_base.format(stub=startlists_stub.format(**meta)) ).json()
#startlists_json


# In[33]:

startlists = json_normalize(startlists_json).drop('eventClasses', axis=1)
startlists.head()


# In[34]:

startlists.columns


# In[35]:

startlist_classes = json_normalize(startlists_json, ['eventClasses'], 'entryId')
startlist_classes.head()


# ## Penalties

# In[36]:

penalties_json=requests.get( url_base.format(stub=penalties_stub.format(**meta)) ).json()
#penalties_json


# In[37]:

penalties = json_normalize(penalties_json)
penalties.head()


# ## Retirements

# In[38]:

retirements_json=requests.get( url_base.format(stub=retirements_stub.format(**meta)) ).json()
#retirements_json


# In[39]:

retirements = json_normalize(retirements_json)
retirements.head()


# ## Stagewinners

# In[40]:

stagewinners_json=requests.get( url_base.format(stub=stagewinners_stub.format(**meta)) ).json()
#stagewinners_json


# In[41]:

stagewinners = json_normalize(stagewinners_json)
stagewinners.head()


# ## Stage Iterator
# 
# TO DO: for the following stage-based tables, create a generic stage iterator function. A first sketch follows.
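# The sketch below assumes each stage-level stub takes both `stageId` and `rallyId` (as the stubs defined above do) and skips stages that return no data. It is untested against the live API, so treat it as a starting point rather than a definitive implementation.

# In[ ]:

#Sketch of a generic stage iterator for the stage-level tables below
def fetch_stage_table(stub, meta):
    '''Fetch a stage-level table for every stage in meta['stages'] and concatenate the results.'''
    df = pd.DataFrame()
    _meta = meta.copy()  #work on a copy so the shared meta dict is not mutated
    for stageId in meta['stages']:
        _meta['stageId'] = stageId
        _json = requests.get( url_base.format(stub=stub.format(**_meta)) ).json()
        if not _json:
            continue  #the API returns an empty list if a stage has no data yet
        _df = json_normalize(_json)
        _df['stageId'] = stageId
        df = pd.concat([df, _df])
    return df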
# ## Overall

# In[43]:

meta2=meta.copy()  #take a copy, so setting stageId below doesn't also mutate meta itself

stage_overall = pd.DataFrame()
for stageId in meta['stages']:
    meta2['stageId']=stageId
    _stage_overall_json=requests.get( url_base.format(stub=overall_stub.format(**meta2)) ).json()
    _stage_overall = json_normalize(_stage_overall_json)
    _stage_overall['stageId'] = stageId
    stage_overall = pd.concat([stage_overall, _stage_overall])
stage_overall.head()


# ## Split Times

# In[44]:

split_times=pd.DataFrame()
for stageId in meta['stages']:
    meta2['stageId']=stageId
    _stage_split_times_json=requests.get( url_base.format(stub=split_times_stub.format(**meta2)) ).json()
    _stage_split_times = json_normalize(_stage_split_times_json)
    _stage_split_times['stageId'] = stageId
    split_times = pd.concat([split_times, _stage_split_times])
split_times.head()


# In[195]:

split_times[(split_times['entryId']==1487) & (split_times['stageId']==289)]


# ## Stage Times - Stage

# In[111]:

stage_times_stage=pd.DataFrame()
for stageId in meta['stages']:
    meta2['stageId']=stageId
    _stage_times_stage_json=requests.get( url_base.format(stub=stage_times_stage_stub.format(**meta2)) ).json()
    _stage_times_stage = json_normalize(_stage_times_stage_json)
    _stage_times_stage['stageId'] = stageId
    stage_times_stage = pd.concat([stage_times_stage, _stage_times_stage])
stage_times_stage.head()


# ## Stage Times - Overall

# In[112]:

stage_times_overall=pd.DataFrame()
for stageId in meta['stages']:
    meta2['stageId']=stageId
    _stage_times_overall_json=requests.get( url_base.format(stub=stage_times_overall_stub.format(**meta2)) ).json()
    _stage_times_overall = json_normalize(_stage_times_overall_json)
    _stage_times_overall['stageId'] = stageId
    stage_times_overall = pd.concat([stage_times_overall, _stage_times_overall])
stage_times_overall.head()
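# Assuming the `fetch_stage_table` sketch above works as intended, each of the four loops in this section could be collapsed to a single call; for example, rebuilding the overall results table:

# In[ ]:

#Hypothetical single-call replacement for the stage_overall loop above
stage_overall_alt = fetch_stage_table(overall_stub, meta)
stage_overall_alt.head()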
# ## Championships

# In[241]:

#http://www.wrc.com/en/wrc/results/championship-standings/page/4176----.html
championshipClasses = {
    'WRC': {
        'Driver': '6',
        'Co-Driver': '7',
        'Manufacturers': '8'
    },
    'WRC 2': {
        'Driver': '10',
        'Co-Driver': '11',
        'Manufacturers': '9'
    },
    'WRC 3': {
        'Driver': '13',
        'Co-Driver': '14',
        'Manufacturers': '12'
    },
    'JWRC': {
        'Driver': '15',
        'Co-Driver': '16'
    }
}


# Rather than hardcoding the dict, we can extract the javascript that declares the championship codes from the HTML page and convert it to a Python `dict`.

# In[46]:

import json
import re
import requests

url='http://www.wrc.com/en/wrc/results/championship-standings/page/4176----.html'
html2=requests.get(url).text


# In[47]:

m = re.search("var championshipClasses = (.*?);", html2, re.DOTALL)
mm=m.group(1).replace('\n','').replace("'",'"')
d=json.loads(mm)

#https://stackoverflow.com/a/35758583/454773
championshipClasses={k.replace(' ', ''): v for k, v in d.items()}
championshipClasses


# In[48]:

drivers = 6
codrivers = 7
manufacturer = 8

championships={}
champ_num = drivers


# In[49]:

meta2['championshipId']= champ_num
championship_json=requests.get( url_base.format(stub=championship_stub.format(**meta2)) ).json()
#championship_json


# In[50]:

championship = json_normalize(championship_json).drop(['championshipEntries','championshipRounds'], axis=1)
championship.head()


# In[51]:

championships={}
championship_dict = championship.to_dict()
championships[champ_num] = {c:championship_dict[c][0] for c in championship_dict}
championships


# In[52]:

renamer={c.replace('Description',''):championships[champ_num][c] for c in championships[champ_num] if c.startswith('field')}
renamer


# In[54]:

championship_entries = json_normalize(championship_json, ['championshipEntries'])
championship_entries = championship_entries.rename(columns=renamer)
championship_entries = championship_entries[[c for c in championship_entries.columns if c!='']]
championship_entries


# In[161]:

championship_rounds = json_normalize(championship_json, ['championshipRounds']).drop('event', axis=1)
championship_rounds


# In[162]:

_events_json = json_normalize(championship_json, ['championshipRounds'])['event']
championship_events = json_normalize(_events_json)
championship_events.head()


# ## Championship Results

# In[166]:

championship_results_json=requests.get( url_base.format(stub=championship_results_stub.format(**meta2)) ).json()
championship_results = json_normalize(championship_results_json)
championship_results.head()
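# The cells above only pull a single championship (the drivers' table). As a possible extension - a sketch only, assuming every ID declared in `championshipClasses` is valid for the same results stub - we could iterate over all the championship IDs:

# In[ ]:

#Sketch: fetch results for every championship ID declared in championshipClasses.
#Untested against the live API; assumes each numeric ID works with championship_results_stub.
all_championship_results = {}
for champClass, roles in championshipClasses.items():
    for role, champId in roles.items():
        _meta = dict(meta2, championshipId=champId)  #copy meta2 with the championshipId swapped in
        _json = requests.get( url_base.format(stub=championship_results_stub.format(**_meta)) ).json()
        if _json:  #the API returns an empty list if there is no data
            all_championship_results[(champClass, role)] = json_normalize(_json)

list(all_championship_results.keys())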