# http://lxml.de/lxmlhtml.html
import requests
from lxml.html import fromstring, parse
from itertools import islice

# http://stackoverflow.com/a/1779324/7782
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

url = "https://en.wikipedia.org/w/index.php?title=List_of_countries_by_population_(United_Nations)&oldid=590438477"
page = requests.get(url).content.decode("UTF-8")
doc = fromstring(page)

def parse_rank(col):
    # rank cells hold a bare integer; rows without one yield None
    try:
        return int(col.text)
    except (TypeError, ValueError):
        return None

def parse_name(col):
    # join the text of all anchors whose href is of the form "/wiki/..."
    try:
        return "; ".join([a.text for a in col.findall(".//a")
                          if a.attrib["href"].startswith("/wiki/")])
    except (KeyError, TypeError):
        return None

def parse_pop(col):
    # locale.atoi handles the comma thousands separators
    return locale.atoi(col.text)

def country_by_pop():
    # skip the two header rows of the population table
    for row in islice(doc.xpath("""//*[@id="mw-content-text"]/table[1]/tr"""), 2, None):
        cols = row.findall(".//td")
        yield (parse_rank(cols[0]), parse_name(cols[1]), parse_pop(cols[2]))

for (i, row) in enumerate(country_by_pop()):
    print i,
    for col in row:
        if isinstance(col, unicode):
            print col.encode("UTF-8"),
        else:
            print col,
    print

import json
s = json.dumps([row for row in country_by_pop()], ensure_ascii=True)
type(s)
print s

# https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json

# read the population list back in from the gist
import json
import requests

pop_json_url = "https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json"
pop_list = requests.get(pop_json_url).json()
pop_list

world_pop = sum([r[2] for r in pop_list])
world_pop

# http://stackoverflow.com/a/15889203/7782
def cumsum(lis):
    # generator of running totals
    total = 0
    for x in lis:
        total += x
        yield total

cum_pop = list(cumsum(r[2] for r in pop_list))
cum_pop

import bisect
import random

# http://docs.python.org/2/library/bisect.html
# index of the entity that contains the "median person"
bisect.bisect_left(cum_pop, world_pop/2)

# cumulative share of world population held by the first six rows
float(cum_pop[5])/world_pop

len(cum_pop)

pop_list[0][1]

from collections import Counter

def random_country_weighted_by_pop():
    # map a uniform random integer in [1, world_pop] to an entity, so each
    # entity is drawn in proportion to its population
    while True:
        yield pop_list[bisect.bisect_left(cum_pop, random.randint(1, world_pop))][1]

Counter(islice(random_country_weighted_by_pop(), 5))
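# A minimal, self-contained sketch (an aside, not part of the analysis above)
# of the technique just used: bisect.bisect_left over a cumulative-sum list
# turns a uniform random integer into a weighted choice. The toy items and
# weights below are invented for illustration.
import bisect
import random
from collections import Counter

toy_items = ["a", "b", "c"]
toy_weights = [1, 2, 7]     # "c" should be drawn about 70% of the time
toy_cum = []
total = 0
for w in toy_weights:
    total += w
    toy_cum.append(total)   # -> [1, 3, 10]

def toy_weighted_choice():
    # randint(1, total) falls in the interval belonging to exactly one item;
    # bisect_left finds that item's index in O(log n)
    return toy_items[bisect.bisect_left(toy_cum, random.randint(1, total))]

print Counter(toy_weighted_choice() for _ in xrange(10000))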
import requests
import locale
import json

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

cia_url = "https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt"
content = requests.get(cia_url).content
# rows are separated by carriage returns, fields by tabs: (rank, name, population)
cia_pop_list = [(int(x[0]), x[1], locale.atoi(x[2]))
                for x in [r.split("\t") for r in content.strip().split("\r")]]
cia_pop_list

print json.dumps(cia_pop_list)

# https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json
# https://gist.github.com/rdhyee/8530164
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt

import json
import requests

cia_json_url = "https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json"
cia_list = requests.get(cia_json_url).json()
cia_list

# exclude the European Union, whose member states are already listed individually
cia_world_pop = sum([r[2] for r in cia_list if r[1] != 'European Union'])
cia_world_pop

# compare the two world totals
cia_world_pop, world_pop, cia_world_pop/float(world_pop)

# set of entities in the Wikipedia list
wk_entities = set([r[1] for r in pop_list])
wk_entities

# set of entities in the CIA list
cia_entities = set([r[1] for r in cia_list])
len(wk_entities), len(cia_entities)

# http://docs.python.org/2/library/stdtypes.html#set
# intersection: entities named in both lists
len(wk_entities & cia_entities)

# symmetric difference: entities named in exactly one of the two lists
wk_entities ^ cia_entities

# entities only in the Wikipedia list
wk_entities - cia_entities

# entities only in the CIA list
cia_entities - wk_entities

# union of the two sets
len(wk_entities | cia_entities)
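# Aside (not part of the original analysis): entities left in the symmetric
# difference are often the same place under two naming conventions. A
# hypothetical normalize() helper like the sketch below could be applied to
# both sets before comparing; the sample names here are invented for
# illustration.
def normalize(name):
    return name.strip().lower().replace(",", "")

names_a = set(["Bahamas, The", "Korea, South"])
names_b = set(["bahamas the", "korea south"])
print set([normalize(n) for n in names_a]) ^ names_b   # empty after normalizing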