#If you do not have BeautifulSoup installed already, you will need to install it
#  pip3 install beautifulsoup4
#Routines for scraping and downloading data from
#  http://esa.un.org/unpd/wpp/unpp/panel_indicators.htm
import requests
from bs4 import BeautifulSoup

#I'm in a Python3 environment
from io import StringIO
import pandas as pd


def getFormSelectVals(soup, name):
    ''' Parse out options from an HTML select list.

        Returns a dict mapping each option's display text (stripped)
        to its form submission value.
    '''
    tmp = {}
    items = soup.find('select', {'name': name})
    for o in items.findAll('option'):
        tmp[o.text.strip()] = o.attrs['value']
    return tmp


def getUNPopDivIndicators():
    ''' Grab the UN Population Division indicators page and parse out form values.

        Returns a dict with keys 'vars', 'countries' and 'variants', each mapping
        human-readable labels to the code values the site's form expects.
    '''
    url = 'http://esa.un.org/unpd/wpp/unpp/panel_indicators.htm'
    page = requests.get(url)
    #Declare the parser explicitly - a bare BeautifulSoup(page.content) emits a
    #warning and may pick a different parser on different machines
    soup = BeautifulSoup(page.content, 'html.parser')
    unpop = {}
    unpop['vars'] = getFormSelectVals(soup, 'Variable')
    unpop['countries'] = getFormSelectVals(soup, 'Location')
    #NB 'Varient' (sic) is the actual field name used by the UN site's form
    unpop['variants'] = getFormSelectVals(soup, 'Varient')
    return unpop


def getUNPopDivData(dfi, country, indicator, variant='Medium variant', startyear=1950, endyear=2010):
    ''' Download selected data from the UN Population Division indicators page
        and return it as a pandas dataframe.

        dfi: the lookup dict returned by getUNPopDivIndicators()
        country, indicator: a single name or a list of names; the form only
            accepts five of each, so only the first five are submitted
    '''
    if not isinstance(country, list):
        country = [country]
    if not isinstance(indicator, list):
        indicator = [indicator]
    #Limited to five options - use the first 5
    #For each item, look up the name from the scraped form elements and find what code number to use.
    locations = [dfi['countries'][x] for x in country[:5]]
    variables = [dfi['vars'][x] for x in indicator[:5]]
    varient = dfi['variants'][variant]
    data = [('Panel', 2), ('Varient', varient), ('StartYear', startyear),
            ('EndYear', endyear), ('DoWhat', 'Download as .CSV File')]
    #If we request multiple locations or variables, the form repeats the 'Location' or 'Variable' attribute name
    #This means we can't use a Python dict to represent the arguments - instead we create a list of tuples
    for l in locations:
        data.append(('Location', l))
    for v in variables:
        data.append(('Variable', v))
    url = 'http://esa.un.org/unpd/wpp/unpp/p2k0data_script.asp'
    response = requests.post(url, data=data)
    #The data is returned as a string - we use StringIO to make it look like a file stream so pd.read_csv() can read it
    #The last two lines of the returned data are an empty line and a metadata line - so let's ignore them.
    #skipfooter currently only works with the python engine - so let's declare that to prevent a warning
    df = pd.read_csv(StringIO(response.text), skipfooter=2, engine='python')
    return df


#Helper routines for working with the data
def search(d, substr):
    ''' Partial, case-insensitive string match search within dict key names.

        Returns a list of (key, value) tuples for every key containing substr.
    '''
    #via http://stackoverflow.com/a/10796050/454773
    result = []
    for key in d:
        if substr.lower() in key.lower():
            result.append((key, d[key]))
    return result


#Processors for pyramid chart
def processWorldPopData(dfi, fname, countries=None, start=1950, end=2010):
    ''' Download five-year age group population data for the given countries
        and write it to fname as a CSV shaped for a population pyramid chart
        (columns: year, age, sex, people; sex coded 1=Male, 2=Female).
    '''
    #Avoid a mutable default argument - a [] default would be shared across calls
    countries = [] if countries is None else countries
    df = getUNPopDivData(dfi, countries,
                         ['Population by five-year age group and sex'],
                         startyear=start, endyear=end)
    #Filter and select in one .loc, then take an explicit copy so the column
    #edits below don't raise pandas' SettingWithCopyWarning on a view of df
    dfp = df.loc[df['Sex'] != 'Total', ['Year', 'Age', 'Sex', 'Value']].copy()
    #Reduce age bands like '0-4' or '80+' to their lower bound
    dfp['Age'] = dfp['Age'].apply(lambda x: x.replace('+', '-').split('-')[0])
    dfp['Sex'] = dfp['Sex'].map({'Male': 1, 'Female': 2})
    dfp.columns = ['year', 'age', 'sex', 'people']
    dfp.to_csv(fname, index=False)


#Get indicators
dfi = getUNPopDivIndicators()

#Preview indicators (bare expressions only display in a notebook, so print them)
print(dfi['vars'])

#Example search within indicators
print(search(dfi['vars'], 'pop'))

#Show variants of predictions
print(dfi['variants'])

#Example search within country names
print(search(dfi['countries'], 'united'))

#Let's run a test query
ukpop = getUNPopDivData(dfi,
                        ['United Kingdom', 'Bangladesh', 'India', 'France', 'Germany', 'Italy'],
                        ['Population 80+', 'Population sex ratio'],
                        startyear=2000, endyear=2000)

#Show unique countries - only data for the first five should have been requested
print(ukpop['Country'].unique())

#Preview the dataframe
print(ukpop)

processWorldPopData(dfi, 'delme', ['Bahrain'])

#Preview the first few lines of the generated file
#(was the IPython shell escape `!head delme`, which is not valid plain Python)
with open('delme') as f:
    for _ in range(10):
        line = f.readline()
        if not line:
            break
        print(line, end='')