# http://lxml.de/lxmlhtml.html
import requests
from lxml.html import fromstring, parse
from itertools import islice

# http://stackoverflow.com/a/1779324/7782
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

url = "https://en.wikipedia.org/w/index.php?title=List_of_countries_by_population_(United_Nations)&oldid=590438477"
page = requests.get(url).content.decode("UTF-8")
doc = fromstring(page)

def parse_rank(col):
    # rank cells hold a bare integer; rows without one yield None
    try:
        return int(col.text)
    except (TypeError, ValueError):
        return None

def parse_name(col):
    # join the text of all anchors whose href is of the form "/wiki/..."
    try:
        return "; ".join([a.text for a in col.findall(".//a")
                          if a.attrib["href"].startswith("/wiki/")])
    except (KeyError, TypeError):
        return None

def parse_pop(col):
    # locale.atoi handles the comma thousands separators
    return locale.atoi(col.text)

def country_by_pop():
    # skip the two header rows of the population table
    for row in islice(doc.xpath("""//*[@id="mw-content-text"]/table[1]/tr"""), 2, None):
        cols = row.findall(".//td")
        yield (parse_rank(cols[0]), parse_name(cols[1]), parse_pop(cols[2]))

for (i, row) in enumerate(country_by_pop()):
    print i,
    for col in row:
        if isinstance(col, unicode):
            print col.encode("UTF-8"),
        else:
            print col,
    print

import json
s = json.dumps([row for row in country_by_pop()], ensure_ascii=True)
type(s)
print s

# https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json

# read the population list back in from the gist
import json
import requests

pop_json_url = "https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json"
pop_list = requests.get(pop_json_url).json()
pop_list

world_pop = sum([r[2] for r in pop_list])
world_pop

# http://stackoverflow.com/a/15889203/7782
def cumsum(lis):
    # generator of running totals
    total = 0
    for x in lis:
        total += x
        yield total

cum_pop = list(cumsum(r[2] for r in pop_list))
cum_pop

import bisect
import random

# http://docs.python.org/2/library/bisect.html
# index of the entity that contains the "median person"
bisect.bisect_left(cum_pop, world_pop/2)

# cumulative share of world population held by the first six rows
float(cum_pop[5])/world_pop

len(cum_pop)

pop_list[0][1]

from collections import Counter

def random_country_weighted_by_pop():
    # map a uniform random integer in [1, world_pop] to an entity, so each
    # entity is drawn in proportion to its population
    while True:
        yield pop_list[bisect.bisect_left(cum_pop, random.randint(1, world_pop))][1]

Counter(islice(random_country_weighted_by_pop(), 5))
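# A minimal, self-contained sketch (an aside, not part of the analysis above)
# of the technique just used: bisect.bisect_left over a cumulative-sum list
# turns a uniform random integer into a weighted choice. The toy items and
# weights below are invented for illustration.
import bisect
import random
from collections import Counter

toy_items = ["a", "b", "c"]
toy_weights = [1, 2, 7]     # "c" should be drawn about 70% of the time
toy_cum = []
total = 0
for w in toy_weights:
    total += w
    toy_cum.append(total)   # -> [1, 3, 10]

def toy_weighted_choice():
    # randint(1, total) falls in the interval belonging to exactly one item;
    # bisect_left finds that item's index in O(log n)
    return toy_items[bisect.bisect_left(toy_cum, random.randint(1, total))]

print Counter(toy_weighted_choice() for _ in xrange(10000))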
import requests
import locale
import json

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

cia_url = "https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt"
content = requests.get(cia_url).content
# rows are separated by carriage returns, fields by tabs: (rank, name, population)
cia_pop_list = [(int(x[0]), x[1], locale.atoi(x[2]))
                for x in [r.split("\t") for r in content.strip().split("\r")]]
cia_pop_list

print json.dumps(cia_pop_list)

# https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json
# https://gist.github.com/rdhyee/8530164
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt

import json
import requests

cia_json_url = "https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json"
cia_list = requests.get(cia_json_url).json()
cia_list

# exclude the European Union, whose member states are already listed individually
cia_world_pop = sum([r[2] for r in cia_list if r[1] != 'European Union'])
cia_world_pop

# compare the two world totals
cia_world_pop, world_pop, cia_world_pop/float(world_pop)

# set of entities in the Wikipedia list
wk_entities = set([r[1] for r in pop_list])
wk_entities

# set of entities in the CIA list
cia_entities = set([r[1] for r in cia_list])
len(wk_entities), len(cia_entities)

# http://docs.python.org/2/library/stdtypes.html#set
# intersection: entities named in both lists
len(wk_entities & cia_entities)

# symmetric difference: entities named in exactly one of the two lists
wk_entities ^ cia_entities

# entities only in the Wikipedia list
wk_entities - cia_entities

# entities only in the CIA list
cia_entities - wk_entities

# union of the two sets
len(wk_entities | cia_entities)
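# Aside (not part of the original analysis): entities left in the symmetric
# difference are often the same place under two naming conventions. A
# hypothetical normalize() helper like the sketch below could be applied to
# both sets before comparing; the sample names here are invented for
# illustration.
def normalize(name):
    return name.strip().lower().replace(",", "")

names_a = set(["Bahamas, The", "Korea, South"])
names_b = set(["bahamas the", "korea south"])
print set([normalize(n) for n in names_a]) ^ names_b   # empty after normalizing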