# Scrape the UN population table from a pinned Wikipedia revision.
# http://lxml.de/lxmlhtml.html
import requests
from lxml.html import fromstring, parse
from itertools import islice
# http://stackoverflow.com/a/1779324/7782
import locale
# en_US locale so locale.atoi() can parse comma-grouped numbers like "1,234,567".
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )
# oldid pins a specific revision so the table layout and numbers don't drift.
url = "https://en.wikipedia.org/w/index.php?title=List_of_countries_by_population_(United_Nations)&oldid=590438477"
page = requests.get(url).content.decode("UTF-8")
doc = fromstring(page)
def parse_rank(col):
    """Return the integer rank in table cell `col`, or None when the cell
    does not hold a number (e.g. header or aggregate rows).

    The original used a bare `except:`, which also hid programming errors;
    only the two failure modes int() actually raises are caught now.
    """
    try:
        return int(col.text)
    except (TypeError, ValueError):
        # TypeError: col.text is None; ValueError: text is not an integer.
        return None
def parse_name(col):
    """Return the country name(s) in table cell `col`.

    Collects the text of every anchor whose href is an internal wiki link
    ("/wiki/..."), joined with "; ".  Returns None when an anchor has no
    text (a.text is None makes the join fail), matching the original's
    fallback.  Anchors without an href are now simply skipped instead of
    aborting the whole cell (the original's a.attrib["href"] raised
    KeyError into a bare `except:`).
    """
    try:
        names = [a.text for a in col.findall(".//a")
                 if a.attrib.get("href", "").startswith("/wiki/")]
        return "; ".join(names)
    except TypeError:
        # An anchor without text: "; ".join() refuses None.
        return None
def parse_pop(col):
    """Return the population figure in table cell `col` as an int.

    Accepts en_US-style thousands separators ("1,234,567").  The original
    used locale.atoi(), which silently depends on locale.setlocale() having
    been called with 'en_US.UTF-8' at module import; stripping the commas
    directly removes that hidden global dependency while giving identical
    results for this table.  Raises ValueError on non-numeric text, as
    locale.atoi() did.
    """
    return int(col.text.replace(",", ""))
def country_by_pop():
    """Yield (rank, name, population) for each data row of the first table
    in the article body, skipping the two header rows."""
    all_rows = doc.xpath("""//*[@id="mw-content-text"]/table[1]/tr""")
    for table_row in islice(all_rows, 2, None):
        cells = table_row.findall(".//td")
        yield (parse_rank(cells[0]), parse_name(cells[1]), parse_pop(cells[2]))
for (i, row) in enumerate(islice(country_by_pop(), None)):
print i,
for col in row:
if type(col) == 'unicode':
print col.encode("UTF-8"),
else:
print col,
print
import json
# Serialize the scraped rows; ensure_ascii=True keeps the dump 7-bit safe
# (non-ASCII country names are \u-escaped).
s = json.dumps([row for row in country_by_pop()], ensure_ascii=True)
type(s)  # notebook-style inspection; has no effect as a script statement
print s
# https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json
# read population in
import json
import requests
# Gist snapshot of the scraped table, so reruns don't have to re-scrape Wikipedia.
pop_json_url = "https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json"
pop_list= requests.get(pop_json_url).json()
pop_list
# Each row is (rank, name, population); sum column 2 for the world total.
world_pop = sum([r[2] for r in pop_list])
world_pop
# http://stackoverflow.com/a/15889203/7782
def cumsum(lis):
    """Generate the running (cumulative) sums of the iterable `lis`."""
    running_total = 0
    for item in lis:
        running_total = running_total + item
        yield running_total
# Running totals of population, index-aligned with pop_list rows.
cum_pop = list(cumsum((r[2] for r in pop_list)))
cum_pop
import bisect
import random
# http://docs.python.org/2/library/bisect.html
# Index of the row where the cumulative total first reaches half the world
# total, i.e. the country containing the population "median person".
bisect.bisect_left(cum_pop,world_pop/2)
# Fraction of world population living in the six most populous entities.
float(cum_pop[5])/world_pop
len(cum_pop)
pop_list[0][1]
from itertools import repeat
from collections import Counter
def random_country_weighted_by_pop():
    """Endless generator of country names, each drawn with probability
    proportional to its population (inverse-CDF sampling over cum_pop)."""
    while True:
        draw = random.randint(1, world_pop)
        yield pop_list[bisect.bisect_left(cum_pop, draw)][1]
# Tally a small sample of weighted draws.
Counter(islice(random_country_weighted_by_pop(),5))
import requests
import locale
import json
# en_US locale so locale.atoi() below can parse comma-grouped figures.
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )
# CIA World Factbook population rank order, tab-separated raw text.
cia_url = "https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt"
content = requests.get(cia_url).content
# Records are "\r"-separated, fields tab-separated: (rank, name, population).
# NOTE(review): if the file uses "\r\n" endings, splitting on "\r" leaves a
# leading "\n" on each rank field; int() tolerates it because it strips
# whitespace -- confirm against the raw payload.
cia_pop_list = [(int(x[0]), x[1], locale.atoi(x[2])) for x in [r.split("\t") for r in content.strip().split("\r")]]
cia_pop_list
print json.dumps(cia_pop_list)
# https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json
# https://gist.github.com/rdhyee/8530164
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
import json
import requests
# Gist snapshot of the CIA ranking, parsed above, saved as JSON.
cia_json_url = "https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json"
cia_list= requests.get(cia_json_url).json()
cia_list
# Exclude the European Union row: it aggregates member states counted already.
cia_world_pop = sum([r[2] for r in cia_list if r[1] != 'European Union'])
cia_world_pop
# Compare CIA total vs the Wikipedia/UN total, and their ratio.
cia_world_pop, world_pop, cia_world_pop/float(world_pop)
# set of entities for Wikipedia
wk_entities = set([r[1] for r in pop_list])
wk_entities
# Set of entities per the CIA World Factbook ranking.
cia_entities = set([r[1] for r in cia_list])
len(wk_entities), len(cia_entities)
# http://docs.python.org/2/library/stdtypes.html#set
# intersection
len(wk_entities & cia_entities)
# symmetric diff
wk_entities ^ cia_entities
# Entities only in one source or the other.
wk_entities - cia_entities
cia_entities - wk_entities
# BUG FIX: the original wrote `len(wk_entities or cia_entities)`, but `or`
# is boolean short-circuit -- it returns wk_entities (non-empty, hence
# truthy), so the expression was just len(wk_entities).  Set union `|` is
# the operation the surrounding comparisons call for.
len(wk_entities | cia_entities)