#!/usr/bin/env python
# coding: utf-8
# In[13]:
# Base URL for the WRC results API; `stub` selects the resource route.
url_base='http://www.wrc.com/service/sasCacheApi.php?route={stub}'
# In[14]:
# Rally-level route stubs (each formatted with a `rallyId`).
itinerary_stub='rallies/{rallyId}/itinerary'
startlists_stub='rallies/{rallyId}/entries'
penalties_stub='rallies/{rallyId}/penalties'
retirements_stub='rallies/{rallyId}/retirements'
stagewinners_stub='rallies/{rallyId}/stagewinners'
# Stage-level route stubs (formatted with both a `stageId` and a `rallyId`).
overall_stub='stages/{stageId}/results?rallyId={rallyId}'
split_times_stub ='stages/{stageId}/splittimes?rallyId={rallyId}'
stage_times_stage_stub='stages/{stageId}/stagetimes?rallyId={rallyId}'
stage_times_overall_stub='stages/{stageId}/results?rallyId={rallyId}'
# Championship-level route stubs (formatted with a `championshipId`).
championship_stub='championships/{championshipId}'
championship_results_stub='championships/{championshipId}/results'
# In[15]:
# Shared parameter dict used to fill in the stubs above; stage-level keys
# ('stages', 'stageId') are added further down once the itinerary is loaded.
# NOTE(review): rallyId 30 is hardcoded — presumably the rally being scraped;
# confirm against the `rallyids` mapping built below.
meta={'rallyId':30}
# ## Scrape page for rallyid
#
# The `rallyId` used in the API seems to differ from the IDs in the data returned by the API, so we need to scrape the rally pages to get the actual ID.
# In[16]:
import requests
import re
from bs4 import BeautifulSoup
# In[17]:
results_main_url='http://www.wrc.com/en/wrc/results/wales/stage-times/page/416-238---.html#'
# In[18]:
# Fetch the results landing page and parse it so we can pull out the
# per-rally links (one <li class="flag"> element per event).
html=requests.get(results_main_url)
soup=BeautifulSoup(html.content, "html5lib")
# Links appear as anchors (<a href="...">link text</a>) inside the flag
# list items; grab those list items here and unpack the anchors below.
# find_all() is the bs4 4.x spelling of the deprecated findAll alias.
souplist=soup.find_all("li",{'class':'flag'})
# In[19]:
# Build a {rally title: results-page URL} map from the flag list items.
# Guard against list items that lack an anchor or a flag image instead of
# crashing on a None subscript (the original indexed s.find('a')['href']
# before checking anything existed).
items={}
for s in souplist:
    anchor = s.find('a')
    flag_img = s.find('img')
    if anchor is None or flag_img is None:
        continue
    href = anchor.get('href')
    title = flag_img.get('title')
    if href and title:
        # The championship tables use "Monaco" where this page says
        # "Monte Carlo", so normalise the name here.
        title = 'Monaco' if title == 'Monte Carlo' else title
        items[title] = href
items
# Note that the scraped rally names don't exactly match any single column in the other tables — match on either the country or the location column of the championship table. The loop above already rewrites "Monte Carlo" to "Monaco" for this reason.
# In[20]:
# Sanity check: extract the rallyId that each results page embeds as a
# javascript variable, using a single known page first.
url='http://www.wrc.com/en/wrc/results/wales/stage-times/page/416-238---.html'
html=requests.get(url)
# In[21]:
rally_id_match = re.search("var rallyId = '(.+?)'", html.text)
# In[22]:
if rally_id_match is not None:
    print(rally_id_match.group(1))
# In[23]:
import pandas as pd
import requests
# In[24]:
# Visit each rally page collected above and scrape its javascript
# `rallyId` variable, building a {rally title: rallyId} lookup.
rallyids={}
for title, page_url in items.items():
    page = requests.get(page_url)
    id_match = re.search("var rallyId = '(.+?)'", page.text)
    if id_match:
        rallyids[title] = id_match.group(1)
rallyids
# ## Itinerary
# In[25]:
# Itinerary for the rally identified by meta['rallyId'].
itinerary_json=requests.get( url_base.format(stub=itinerary_stub.format(**meta) ) ).json()
#itinerary_json
# If no data is returned, we get an empty list.
#
# Need to check that we do get a response, eg `if itinerary_json:`
# In[61]:
itinerary_json
# In[26]:
# json_normalize moved to the top-level pandas namespace in pandas 1.0 and
# the old pandas.io.json location was removed in pandas 2.0; support both.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
# Event-level record; the nested leg list is unpacked separately below.
itinerary_event = json_normalize(itinerary_json).drop('itineraryLegs', axis=1)
itinerary_event
# In[27]:
# One row per itinerary leg, keeping eventId for joins back to the event
# table; the nested section list is dropped here and unpacked below.
itinerary_legs = json_normalize(itinerary_json, record_path='itineraryLegs',
meta='eventId').drop('itinerarySections', axis=1)
#?don't need eventId?
itinerary_legs
# In[57]:
# One row per itinerary section; the per-section stage and control lists
# are dropped here and normalised into their own tables below.
itinerary_sections = json_normalize(itinerary_json, ['itineraryLegs', 'itinerarySections'],
meta='eventId').drop(['stages','controls'],axis=1)
#?don't need eventId?
itinerary_sections
# In[163]:
# Stage rows keyed back to their section via the nested meta path.
json_normalize(itinerary_json, ['itineraryLegs', 'itinerarySections', 'stages'],
meta=[['itineraryLegs','itinerarySections','itinerarySectionId']]
)#.drop(['controls'],axis=1)
# In[167]:
# Same stage table, but built from the leg list so the itineraryLegId can
# be carried alongside the section id.
itinerary_stages=json_normalize(itinerary_json['itineraryLegs'],
['itinerarySections','stages'],
meta=['itineraryLegId',['itinerarySections','itinerarySectionId']])
itinerary_stages.head(10)
# In[30]:
# Cache the stage ids so the per-stage API calls below can iterate them.
meta['stages']=itinerary_stages['stageId'].tolist()
# In[168]:
# Time controls per section; controls that are not stage starts have no
# stageId, so fill with a -1 sentinel to keep the column integer-typed.
itinerary_controls=json_normalize(itinerary_json['itineraryLegs'],
['itinerarySections','controls'] ,
meta=['itineraryLegId',['itinerarySections','itinerarySectionId']])
itinerary_controls['stageId'] = itinerary_controls['stageId'].fillna(-1).astype(int)
itinerary_controls.head()
# ## Startlists
# In[32]:
# Entry list (startlist) for the rally.
startlists_url = url_base.format(stub=startlists_stub.format(**meta))
startlists_json = requests.get(startlists_url).json()
#startlists_json
# In[33]:
# Entry-level table; the per-entry class list is normalised separately.
startlists = json_normalize(startlists_json).drop('eventClasses', axis=1)
startlists.head()
# In[34]:
startlists.columns
# In[35]:
# One row per (entry, event class) pairing, keyed by entryId.
startlist_classes = json_normalize(startlists_json, ['eventClasses'], 'entryId')
startlist_classes.head()
# ## Penalties
# In[36]:
# Penalties awarded on this rally.
penalties_url = url_base.format(stub=penalties_stub.format(**meta))
penalties_json = requests.get(penalties_url).json()
#penalties_json
# In[37]:
penalties = json_normalize(penalties_json)
penalties.head()
# ## Retirements
# In[38]:
# Retirements recorded on this rally.
retirements_url = url_base.format(stub=retirements_stub.format(**meta))
retirements_json = requests.get(retirements_url).json()
#retirements_json
# In[39]:
retirements = json_normalize(retirements_json)
retirements.head()
# ## Stagewinners
# In[40]:
# Winner of each special stage.
stagewinners_url = url_base.format(stub=stagewinners_stub.format(**meta))
stagewinners_json = requests.get(stagewinners_url).json()
#stagewinners_json
# In[41]:
stagewinners = json_normalize(stagewinners_json)
stagewinners.head()
# ## Stage Iterator
#
# TODO: the following stage-based tables all repeat the same fetch-per-stage loop; factor it out into a generic stage-iterator function.
# ## Overall
# In[43]:
# Overall (cumulative) results after each stage.
# Copy `meta` so that adding the per-stage `stageId` key does not mutate
# the shared rally-level meta dict (the original `meta2 = meta` aliased it).
meta2 = dict(meta)
# Collect one frame per stage and concatenate once at the end — growing a
# DataFrame with pd.concat inside the loop is quadratic.
_overall_frames = []
for stageId in meta['stages']:
    meta2['stageId'] = stageId
    _stage_overall_json = requests.get(url_base.format(stub=overall_stub.format(**meta2))).json()
    _stage_overall = json_normalize(_stage_overall_json)
    _stage_overall['stageId'] = stageId
    _overall_frames.append(_stage_overall)
# Guard the empty-stage-list case: pd.concat([]) raises.
stage_overall = pd.concat(_overall_frames) if _overall_frames else pd.DataFrame()
stage_overall.head()
# ## Split Times
# In[44]:
# Split times recorded at each intermediate split point, per stage.
# Accumulate per-stage frames and concatenate once (avoids quadratic
# pd.concat inside the loop).
_split_frames = []
for stageId in meta['stages']:
    meta2['stageId'] = stageId
    _stage_split_times_json = requests.get(url_base.format(stub=split_times_stub.format(**meta2))).json()
    _stage_split_times = json_normalize(_stage_split_times_json)
    _stage_split_times['stageId'] = stageId
    _split_frames.append(_stage_split_times)
split_times = pd.concat(_split_frames) if _split_frames else pd.DataFrame()
split_times.head()
# In[195]:
# Example: all split records for one entry on one stage.
split_times[(split_times['entryId']==1487) & (split_times['stageId']==289)]
# ## Stage Times - Stage
# In[111]:
# Stage times (time on that stage alone), one API call per stage.
# Accumulate per-stage frames and concatenate once (avoids quadratic
# pd.concat inside the loop).
_stage_time_frames = []
for stageId in meta['stages']:
    meta2['stageId'] = stageId
    _stage_times_stage_json = requests.get(url_base.format(stub=stage_times_stage_stub.format(**meta2))).json()
    _stage_times_stage = json_normalize(_stage_times_stage_json)
    _stage_times_stage['stageId'] = stageId
    _stage_time_frames.append(_stage_times_stage)
stage_times_stage = pd.concat(_stage_time_frames) if _stage_time_frames else pd.DataFrame()
stage_times_stage.head()
# ## Stage Times - Overall
# In[112]:
# Overall classification after each stage; note this hits the same
# `results` route as the stage_overall table above.
# Accumulate per-stage frames and concatenate once (avoids quadratic
# pd.concat inside the loop).
_overall_time_frames = []
for stageId in meta['stages']:
    meta2['stageId'] = stageId
    _stage_times_overall_json = requests.get(url_base.format(stub=stage_times_overall_stub.format(**meta2))).json()
    _stage_times_overall = json_normalize(_stage_times_overall_json)
    _stage_times_overall['stageId'] = stageId
    _overall_time_frames.append(_stage_times_overall)
stage_times_overall = pd.concat(_overall_time_frames) if _overall_time_frames else pd.DataFrame()
stage_times_overall.head()
# ## Championships
# In[241]:
#http://www.wrc.com/en/wrc/results/championship-standings/page/4176----.html
# Hardcoded mapping of championship series -> role -> championshipId as
# used by the API. NOTE(review): this literal is overwritten below by the
# mapping scraped from the standings page, so it mainly documents the
# expected shape (and the JWRC series has no Manufacturers table).
championshipClasses = { 'WRC': {
'Driver': '6',
'Co-Driver': '7',
'Manufacturers': '8'
},
'WRC 2': {
'Driver': '10',
'Co-Driver': '11',
'Manufacturers': '9'
},
'WRC 3': {
'Driver': '13',
'Co-Driver': '14',
'Manufacturers': '12'
},
'JWRC': {
'Driver': '15',
'Co-Driver': '16'
}
}
# We can extract the javascript that declares the championship codes from the HTML page and convert it to a Python `dict`.
# In[46]:
import json
import re
import requests
url='http://www.wrc.com/en/wrc/results/championship-standings/page/4176----.html'
html2=requests.get(url).text
# In[47]:
# The standings page declares the class -> championshipId mapping as a
# javascript object literal; grab it, coerce it to JSON (strip newlines,
# single -> double quotes — assumes no apostrophes inside values) and parse.
class_match = re.search("var championshipClasses = (.*?);", html2, re.DOTALL)
raw_json = class_match.group(1).replace('\n','').replace("'",'"')
parsed = json.loads(raw_json)
#https://stackoverflow.com/a/35758583/454773
# Strip spaces from the series names ('WRC 2' -> 'WRC2') for easier keying.
championshipClasses={k.replace(' ', ''): v for k, v in parsed.items()}
championshipClasses
# In[48]:
# championshipId values for the top-flight WRC tables (cf. the
# championshipClasses mapping above); we pull the drivers' table here.
drivers = 6
codrivers = 7
manufacturer = 8
championships={}
champ_num = drivers
# In[49]:
# Fetch the championship record (metadata plus nested entry/round lists).
# NOTE(review): reuses the stage-loop `meta2` dict defined earlier.
meta2['championshipId']= champ_num
championship_json=requests.get( url_base.format(stub=championship_stub.format(**meta2) ) ).json()
#championship_json
# In[50]:
# Flat championship record, minus the nested entry/round lists.
championship = json_normalize(championship_json).drop(['championshipEntries','championshipRounds'], axis=1)
championship.head()
# In[51]:
championships={}
championship_dict = championship.to_dict()
# to_dict() gives {column: {0: value}} for the single-row frame;
# flatten that to {column: value}.
championships[champ_num] = {col: vals[0] for col, vals in championship_dict.items()}
championships
# In[52]:
# The field*/field*Description columns carry labels for the generic
# per-round columns; build a rename map from raw field name -> label.
renamer = {}
for col, label in championships[champ_num].items():
    if col.startswith('field'):
        renamer[col.replace('Description','')] = label
renamer
# In[54]:
# Per-entry championship standings with the generic field* columns renamed
# to their human-readable labels; drop any column whose label is empty.
championship_entries = (
    json_normalize(championship_json, ['championshipEntries'])
    .rename(columns=renamer)
)
keep_cols = [col for col in championship_entries.columns if col != '']
championship_entries = championship_entries[keep_cols]
championship_entries
# In[161]:
# One row per championship round; the nested event record is unpacked below.
championship_rounds = json_normalize(championship_json, ['championshipRounds']).drop('event', axis=1)
championship_rounds
# In[162]:
# Normalise the nested per-round event records into their own table.
_round_events = json_normalize(championship_json, ['championshipRounds'])['event']
championship_events = json_normalize(_round_events)
championship_events.head()
# ## Championship Results
# In[166]:
# Results table for the selected championship.
championship_results_url = url_base.format(stub=championship_results_stub.format(**meta2))
championship_results_json = requests.get(championship_results_url).json()
championship_results = json_normalize(championship_results_json)
championship_results.head()
# In[ ]: