#Load in some libraries to handle the web page requests and the web page parsing...
import requests

#You may need to install BeautifulSoup
#!pip3 install beautifulsoup4
from bs4 import BeautifulSoup

#Note - I'm in Python3
from urllib.parse import urlparse, parse_qs

#The scraper will be limited to just the first results page...
def searchUNdata(q):
    ''' Run a search on the UN data website and scrape the results '''
    params={'q':q}
    url='http://data.un.org/Search.aspx'

    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')

    results={}

    #Get the list of results
    searchresults = soup.find_all('div', {'class':'Result'})

    #For each result, parse out the name of the dataset, the datamart ID and the data filter ID
    for result in searchresults:
        h2 = result.find('h2')
        #We can find everything we need in the <a> tag...
        a = h2.find('a')
        #Parse the dataset identifiers out of the link's query string
        p = parse_qs(urlparse(a.attrs['href']).query)
        results[a.text] = (p['d'][0], p['f'][0])

    return results

#A couple of helper functions to let us display the results
def printResults(results):
    ''' Nicely print the search results '''
    for result in results.keys():
        print(result)

def unDataSearch(q):
    ''' Simple function to take a search phrase, run the search on the UN data site,
        and print and return the results. '''
    results = searchUNdata(q)
    printResults(results)
    return results

#Run an example search and display what comes back
results = searchUNdata('carbon dioxide')
printResults(results)

#q='carbon dioxide'
#unDataSearch(q)

#Just in case - a helper routine for working with the search results data
def search(d, substr):
    ''' Partial string match search within dict key names '''
    #via http://stackoverflow.com/a/10796050/454773
    result = []
    for key in d:
        if substr.lower() in key.lower():
            result.append((key, d[key]))
    return result

search(results, 'per capita')

#Note - I'm in Python3
from io import BytesIO
import zipfile
import pandas as pd

def getUNdata(undataSearchResults, dataset):
    ''' Download a named dataset from the UN Data website and load it into a pandas dataframe '''
    datamartID, seriesRowID = undataSearchResults[dataset]

    url = 'http://data.un.org/Handlers/DownloadHandler.ashx?DataFilter=' + seriesRowID + '&DataMartId=' + datamartID + '&Format=csv'

    r = requests.get(url)
    s = BytesIO(r.content)
    z = zipfile.ZipFile(s)

    #Show the files in the zip file
    #z.namelist()

    #Let's assume we just get one file per zip...
    #Drop any all blank columns
    df = pd.read_csv(BytesIO(z.read(z.namelist()[0]))).dropna(axis=1, how='all')

    return df

results = unDataSearch('carbon dioxide')
dd = getUNdata(results, 'Carbon dioxide emissions (CO2), metric tons of CO2 per capita (UNFCCC)')

#Preview the last few rows
dd[-5:]

#One thing to note is that footnotes may appear at the bottom of a dataframe
#We can spot the first all empty row and drop the rows from that point on
#We can also drop the footnote related columns
def dropFootnotes(df):
    ''' Drop the footnote rows and the footnote columns from a downloaded dataframe '''
    #The first completely empty row marks where the footnotes start
    emptyrows = pd.isnull(df).all(axis=1).values.nonzero()[0]
    if len(emptyrows):
        df = df[:emptyrows[0]]
    return df.drop(columns=['Value Footnotes', 'Value Footnotes.1'])

dropFootnotes(dd)[-5:]

#Create a function that automatically drops the footnotes and any empty rows
def getUNdata2(undataSearchResults, dataset, footnotes=False):
    ''' Download a dataset and, by default, strip the footnotes from it '''
    df = getUNdata(undataSearchResults, dataset)
    if footnotes:
        return df
    return dropFootnotes(df)

getUNdata2(results, 'Carbon dioxide emissions (CO2), metric tons of CO2 per capita (UNFCCC)')[-5:]

getUNdata2(results, 'Carbon dioxide emissions (CO2), metric tons of CO2 per capita (UNFCCC)', footnotes=True)[-5:]
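
#The cleaned up dataframe comes back in long format, one row per country/year observation.
#As a possible next step, the sketch below pivots it into a year x country table that is easy to plot.
#NOTE: this is just a sketch - the 'Country or Area', 'Year' and 'Value' column names are assumptions
#about what the UN Data CSV export contains; check the dataframe's .columns attribute for the real names.
def pivotByCountry(df, country_col='Country or Area', year_col='Year', value_col='Value'):
    ''' Pivot a long format UN Data dataframe into a year x country table '''
    return df.pivot_table(index=year_col, columns=country_col, values=value_col)

#Example usage (commented out - column and country names are assumed, so inspect dd.columns first):
#co2 = getUNdata2(results, 'Carbon dioxide emissions (CO2), metric tons of CO2 per capita (UNFCCC)')
#pivotByCountry(co2)[['United Kingdom', 'France']].plot()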