#!/usr/bin/env python
# coding: utf-8

# # A map of World Cup stadia using Wikidata
# 
# [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) is an amazing project
# that aims to turn the unstructured text of Wikipedia into a database of facts
# and figures, letting you go beyond just presenting a page about something to
# actually using data about it.
# 
# I'd been wanting to try it out, along with SPARQL, the language used to query
# it, so I decided to create a map of every stadium that has hosted a game at the
# FIFA World Cup finals - a topical query, as the 2018 World Cup in Russia
# has just started.
# 
# **UPDATE**: I've updated this now that the 2022 World Cup is taking place.
# 
# ## Step 1: Querying the data
# 
# I used [query.wikidata.org](https://query.wikidata.org/) to come up with a query
# that returned the data I was looking for. Having never used SPARQL before, it took
# a bit of tweaking to get the query right - I found the interface helpful for
# finding the right entities, and the included examples useful for working out how
# to structure the query. Here's the query I came up with; I'll go through what each
# part does below.

# In[1]:

wc_sparql = """
SELECT ?FIFA_World_CupLabel ?location ?locationLabel ?coord ?countryLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?FIFA_World_Cup wdt:P3450 wd:Q19317.
  ?FIFA_World_Cup wdt:P276 ?location.
  ?location wdt:P625 ?coord.
  ?location wdt:P17 ?country
}
ORDER BY ?FIFA_World_CupLabel
"""

# The first part sets up the fields we want to return - the name of the World Cup,
# the location ID (a stadium), the name of the stadium, the coordinates (latitude
# and longitude) and the name of the country.
# 
# ```
# SELECT ?FIFA_World_CupLabel ?location ?locationLabel ?coord ?countryLabel WHERE {
# ```
# 
# The next part fetches labels for each of the items, which is more helpful than
# the URIs that would otherwise be returned.
# 
# ```
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
# ```
# 
# Then we start by adding a field called `FIFA_World_Cup`, by finding the items whose
# "sports season of league or competition" (`wdt:P3450`) is "FIFA World Cup" (`wd:Q19317`).
# 
# ```
# ?FIFA_World_Cup wdt:P3450 wd:Q19317.
# ```
# 
# Then we look for the locations (`wdt:P276`) attached to each of these competitions:
# 
# ```
# ?FIFA_World_Cup wdt:P276 ?location.
# ```
# 
# And for each location we want the coordinates (`wdt:P625`) and country (`wdt:P17`):
# 
# ```
# ?location wdt:P625 ?coord.
# ?location wdt:P17 ?country
# ```

# I then used a Python library called [SPARQLWrapper](https://rdflib.github.io/sparqlwrapper/)
# to send the query to the Wikidata SPARQL endpoint and get JSON data back.

# In[2]:

from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(wc_sparql)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Here's an example of what one of the results looks like - a stadium used in the
# very first World Cup, in Uruguay.

# In[3]:

results['results']['bindings'][0]

# ## Step 2: Processing the results
# 
# I then want to turn the results into nicely formatted data for plotting on a map.
# I'm looking for data that contains one record for each stadium, even if it has
# hosted games at more than one World Cup (e.g. Mexico in 1970 and 1986).
# 
# The coordinates for each location come in WKT format, so I use a library called
# [Shapely](https://pypi.org/project/Shapely/) to extract the latitude and longitude.
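# As a quick illustration of what that step does (the coordinate string below is a
# made-up example, not one of the actual query results), `shapely.wkt.loads` parses
# the WKT string and `.coords[0]` returns an `(x, y)` tuple - longitude first, then
# latitude:
# 
# ```
# import shapely.wkt
# 
# point = shapely.wkt.loads("Point(-56.15 -34.89)")  # Wikidata returns "Point(lon lat)"
# point.coords[0]  # (-56.15, -34.89) - longitude, then latitude
# ```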
# In[4]:

# for converting coordinates
import shapely.wkt

# Then I go through each of the results and add it to a Python dictionary. If the
# stadium is already in the dictionary I just add the extra World Cup year to its
# existing entry, rather than adding a new record.

# In[5]:

stadia = {}
for result in results["results"]["bindings"]:
    stadium_id = result["location"]["value"]
    worldcup = result["FIFA_World_CupLabel"]["value"].replace(" FIFA World Cup", "")
    if stadium_id in stadia:
        # stadium already seen - just record the extra World Cup year
        stadia[stadium_id]["worldcups"].append(worldcup)
    else:
        stadia[stadium_id] = {
            "lat_lng": shapely.wkt.loads(result["coord"]["value"]).coords[0],
            "worldcups": [worldcup],
            "stadium": result["locationLabel"]["value"],
            "country": result["countryLabel"]["value"],
        }

# Here's what an entry in the processed data looks like. I've used the Wikidata URI
# as an identifier for each stadium.

# In[6]:

stadia['http://www.wikidata.org/entity/Q498245']

# ## Step 3: Mapping the results
# 
# I really like [folium](http://python-visualization.github.io/folium/index.html) for
# easily producing Leaflet-based maps in Python. I'm also going to use the MarkerCluster
# plugin to group the markers, with one cluster per country, to make it easier to view
# all the stadia on one map.

# In[7]:

import folium
from folium.plugins import MarkerCluster
import html

# First I initialise the map, zoomed out so you can see the whole world.

# In[8]:

m = folium.Map(
    location=[20, 0],
    zoom_start=2,
    tiles='Stamen Toner',
    attr='''Map tiles by Stamen Design, under CC BY 3.0.
    Data by OpenStreetMap, under CC BY SA. | Locations powered by Wikidata.'''
)

# Then we go through the stadia and add each one to a cluster based on its country.
# Each marker gets a little popup which tells you the stadium's name and which World
# Cups it hosted games at, and a football icon for the pin.

# In[9]:

clusters = {}
for stadium_id in stadia:
    s = stadia[stadium_id]
    if s["country"] not in clusters:
        clusters[s["country"]] = MarkerCluster().add_to(m)
    folium.Marker(
        [s["lat_lng"][1], s["lat_lng"][0]],  # WKT gives (lon, lat); folium wants (lat, lon)
        popup='{}, {} - {}'.format(
            html.escape(s["stadium"]),
            html.escape(s["country"]),
            html.escape(", ".join(s["worldcups"]))
        ),
        icon=folium.Icon(icon='soccer-ball-o', prefix='fa')
    ).add_to(clusters[s["country"]])

# Finally we show the resulting map, which can be zoomed and panned to look at
# particular countries.

# In[10]:

m

# In[11]:

m.save("world-cup-stadia-map.html")

# As an extra, I wanted to convert the data into GeoJSON format so it's easy to use elsewhere.
# 
# - [world_cup_stadia.geojson](https://dkane.net/data/world_cup_stadia.geojson)

# In[21]:

from geojson import Feature, Point, FeatureCollection
import geojson

# In[22]:

wc_geojson = FeatureCollection(
    [Feature(geometry=Point(stadia[s]["lat_lng"]), properties=stadia[s]) for s in stadia]
)

# In[23]:

with open('world_cup_stadia.geojson', 'w') as a:
    geojson.dump(wc_geojson, a, indent=4)

# ## Step 4: Taking it further
# 
# This was just a quick exercise to try to get data out of Wikidata and then use it.
# There are a few things that could be done to take it further:
# 
# - add filters to the map to filter by country, World Cup, etc. (see the sketch below
#   for one simple approach).
# - see if Wikidata has data on the matches that took place at each location and the
#   teams that have played there, allowing you to filter by team or stage of the competition.
# - visualise the data by adding in details like the maximum attendance.
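# As a minimal sketch of that first idea, the `stadia` dictionary could be filtered
# down to a single tournament before building the map. The year "1930" is just an
# illustrative choice - any of the World Cup labels collected above would work:
# 
# ```
# # keep only stadia that hosted games at the 1930 tournament
# stadia_1930 = {
#     stadium_id: s for stadium_id, s in stadia.items()
#     if "1930" in s["worldcups"]
# }
# 
# m_1930 = folium.Map(location=[20, 0], zoom_start=2)
# for s in stadia_1930.values():
#     folium.Marker(
#         [s["lat_lng"][1], s["lat_lng"][0]],
#         popup=html.escape(s["stadium"])
#     ).add_to(m_1930)
# ```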
# ## Acknowledgements
# 
# - The data is from Wikidata and is used under the [CC0 Public Domain licence](https://creativecommons.org/about/cc0).
# - [SPARQLWrapper](https://rdflib.github.io/sparqlwrapper/) for querying the data.
# - [Folium](http://python-visualization.github.io/folium/index.html) mapping using [Leaflet](https://leafletjs.com/) and [FontAwesome](http://fontawesome.com/icons?d=gallery&c=sports&m=free) icons.
# - Map tiles by [Stamen Design](http://stamen.com), under [CC BY 3.0](http://creativecommons.org/licenses/by/3.0). Data by [OpenStreetMap](http://openstreetmap.org), under [CC BY SA](http://creativecommons.org/licenses/by-sa/3.0).
# - [Shapely](https://pypi.org/project/Shapely/) and [geojson](https://pypi.org/project/geojson/) Python libraries for manipulating the geodata.
# - [This blog post was written alongside the code in a Jupyter notebook](https://nbviewer.jupyter.org/urls/dkane.net/data/world_cup_stadia.ipynb).
# 
# Wikidata stamp