#If you do not have BeautifulSoup installed already, you will need to install it
#  pip3 install beautifulsoup4
#Routines for scraping and downloading data from
#  http://esa.un.org/unpd/wpp/unpp/panel_indicators.htm
import requests
from bs4 import BeautifulSoup

#I'm in a Python3 environment
from io import StringIO
import pandas as pd


def getFormSelectVals(soup, name):
    ''' Parse out options from an HTML select list.

        Returns a dict mapping each option's display text (stripped)
        to its form submission value.
    '''
    tmp = {}
    items = soup.find('select', {'name': name})
    for o in items.findAll('option'):
        tmp[o.text.strip()] = o.attrs['value']
    return tmp


def getUNPopDivIndicators():
    ''' Grab the UN Population Division indicators page and parse out form values.

        Returns a dict with keys 'vars', 'countries' and 'variants', each mapping
        human-readable labels to the code values the site's form expects.
    '''
    url = 'http://esa.un.org/unpd/wpp/unpp/panel_indicators.htm'
    page = requests.get(url)
    #Declare the parser explicitly - a bare BeautifulSoup(page.content) emits a
    #warning and may pick a different parser on different machines
    soup = BeautifulSoup(page.content, 'html.parser')
    unpop = {}
    unpop['vars'] = getFormSelectVals(soup, 'Variable')
    unpop['countries'] = getFormSelectVals(soup, 'Location')
    #NB 'Varient' (sic) is the actual field name used by the UN site's form
    unpop['variants'] = getFormSelectVals(soup, 'Varient')
    return unpop


def getUNPopDivData(dfi, country, indicator, variant='Medium variant', startyear=1950, endyear=2010):
    ''' Download selected data from the UN Population Division indicators page
        and return it as a pandas dataframe.

        dfi: the lookup dict returned by getUNPopDivIndicators()
        country, indicator: a single name or a list of names; the form only
            accepts five of each, so only the first five are submitted
    '''
    if not isinstance(country, list):
        country = [country]
    if not isinstance(indicator, list):
        indicator = [indicator]
    #Limited to five options - use the first 5
    #For each item, look up the name from the scraped form elements and find what code number to use.
    locations = [dfi['countries'][x] for x in country[:5]]
    variables = [dfi['vars'][x] for x in indicator[:5]]
    varient = dfi['variants'][variant]
    data = [('Panel', 2), ('Varient', varient), ('StartYear', startyear),
            ('EndYear', endyear), ('DoWhat', 'Download as .CSV File')]
    #If we request multiple locations or variables, the form repeats the 'Location' or 'Variable' attribute name
    #This means we can't use a Python dict to represent the arguments - instead we create a list of tuples
    for l in locations:
        data.append(('Location', l))
    for v in variables:
        data.append(('Variable', v))
    url = 'http://esa.un.org/unpd/wpp/unpp/p2k0data_script.asp'
    response = requests.post(url, data=data)
    #The data is returned as a string - we use StringIO to make it look like a file stream so pd.read_csv() can read it
    #The last two lines of the returned data are an empty line and a metadata line - so let's ignore them.
    #skipfooter currently only works with the python engine - so let's declare that to prevent a warning
    df = pd.read_csv(StringIO(response.text), skipfooter=2, engine='python')
    return df


#Helper routines for working with the data
def search(d, substr):
    ''' Partial, case-insensitive string match search within dict key names.

        Returns a list of (key, value) tuples for every key containing substr.
    '''
    #via http://stackoverflow.com/a/10796050/454773
    result = []
    for key in d:
        if substr.lower() in key.lower():
            result.append((key, d[key]))
    return result


#Processors for pyramid chart
def processWorldPopData(dfi, fname, countries=None, start=1950, end=2010):
    ''' Download five-year age group population data for the given countries
        and write it to fname as a CSV shaped for a population pyramid chart
        (columns: year, age, sex, people; sex coded 1=Male, 2=Female).
    '''
    #Avoid a mutable default argument - a [] default would be shared across calls
    countries = [] if countries is None else countries
    df = getUNPopDivData(dfi, countries,
                         ['Population by five-year age group and sex'],
                         startyear=start, endyear=end)
    #Filter and select in one .loc, then take an explicit copy so the column
    #edits below don't raise pandas' SettingWithCopyWarning on a view of df
    dfp = df.loc[df['Sex'] != 'Total', ['Year', 'Age', 'Sex', 'Value']].copy()
    #Reduce age bands like '0-4' or '80+' to their lower bound
    dfp['Age'] = dfp['Age'].apply(lambda x: x.replace('+', '-').split('-')[0])
    dfp['Sex'] = dfp['Sex'].map({'Male': 1, 'Female': 2})
    dfp.columns = ['year', 'age', 'sex', 'people']
    dfp.to_csv(fname, index=False)


#Get indicators
dfi = getUNPopDivIndicators()

#Preview indicators (bare expressions only display in a notebook, so print them)
print(dfi['vars'])

#Example search within indicators
print(search(dfi['vars'], 'pop'))

#Show variants of predictions
print(dfi['variants'])

#Example search within country names
print(search(dfi['countries'], 'united'))

#Let's run a test query
ukpop = getUNPopDivData(dfi,
                        ['United Kingdom', 'Bangladesh', 'India', 'France', 'Germany', 'Italy'],
                        ['Population 80+', 'Population sex ratio'],
                        startyear=2000, endyear=2000)

#Show unique countries - only data for the first five should have been requested
print(ukpop['Country'].unique())

#Preview the dataframe
print(ukpop)

processWorldPopData(dfi, 'delme', ['Bahrain'])

#Preview the first few lines of the generated file
#(was the IPython shell escape `!head delme`, which is not valid plain Python)
with open('delme') as f:
    for _ in range(10):
        line = f.readline()
        if not line:
            break
        print(line, end='')