import re
import urllib2

import nltk

# Fetch the printable version of the Wikipedia article on George Washington.
# Wikipedia refuses urllib2's default User-Agent, so we present a browser one.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
infile = opener.open('http://en.wikipedia.org/w/index.php?title=George_Washington&printable=yes')
page = infile.read().decode("utf-8")
page[:400]

# Strip the HTML tags, then collapse runs of blank lines into single newlines.
page = nltk.util.clean_html(page)
page = re.sub(r'\s*\n+', '\n', page)
print page[:400]

# Split the text into sentences, then each sentence into word tokens.
sents = nltk.sent_tokenize(page)
sents = [nltk.word_tokenize(s) for s in sents]
print sents[17]

# POS-tag a sample sentence and run the named-entity chunker over the tags.
# Entity chunks come back as subtrees; ordinary words stay (word, tag) pairs.
print nltk.pos_tag(sents[17])
chunked = nltk.ne_chunk(nltk.pos_tag(sents[17]))
for node in chunked:
    print node

# Entity types that nltk.ne_chunk can label (examples from the NLTK book):
#
#   ORGANIZATION  Georgia-Pacific Corp., WHO
#   PERSON        Eddy Bonte, President Obama
#   LOCATION      Murray River, Mount Everest
#   DATE          June, 2008-06-29
#   TIME          two fifty a m, 1:30 p.m.
#   MONEY         175 million Canadian Dollars, GBP 10.40
#   PERCENT       twenty pct, 18.75 %
#   FACILITY      Washington Monument, Stonehenge
#   GPE           South East Asia, Midlothian

def nextract(tokens, types=["GPE", "PERSON"]):
    """Return the named-entity subtrees of the requested types."""
    chunked = nltk.ne_chunk(nltk.pos_tag(tokens))
    # Entity subtrees carry a .node label; plain (word, tag) tuples do not,
    # so hasattr() filters the non-entities out.
    return [c for c in chunked if hasattr(c, "node") and c.node in types]

nes = nextract(sents[17])
nes
nes[0].leaves()

def nextract_text(tokens, types=["GPE", "PERSON"]):
    """Like nextract, but join each entity's tokens back into a string."""
    nodes = nextract(tokens, types)
    return [" ".join(c[0] for c in chunk.leaves()) for chunk in nodes]

nextract_text(sents[17])

# Extract every PERSON entity in the article and count the most frequent.
nes = [nextract_text(s, ["PERSON"]) for s in sents]

from collections import Counter
Counter([x for l in nes for x in l]).most_common(10)

# Co-occurrence counts: every unordered pair of people named in the same
# sentence. Sorting each pair makes (A, B) and (B, A) count as one; the
# final filter drops pairs whose two members are the same name.
from itertools import combinations
pairs = Counter([tuple(sorted(p)) for s in nes for p in combinations(s, 2)])
[p for p in pairs.most_common(20) if p[0][0] != p[0][1]]
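# A quick self-contained check of the pair-counting logic on toy data; the
# names below are made up for illustration, not results from the article.
toy = [["Washington", "Adams"], ["Adams", "Washington", "Jefferson"]]
Counter(tuple(sorted(p)) for s in toy for p in combinations(s, 2))
# -> Counter({('Adams', 'Washington'): 2, ('Adams', 'Jefferson'): 1,
#             ('Jefferson', 'Washington'): 1})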
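# The code above targets Python 2 and NLTK 2.x. Below is a minimal sketch of
# the same pipeline on Python 3 / NLTK 3, under two assumptions: BeautifulSoup
# stands in for nltk.util.clean_html (removed in NLTK 3), and entity subtrees
# are identified with isinstance()/.label() instead of the old .node
# attribute. Kept commented out so this file still runs as Python 2.
#
# import re
# import urllib.request
# import nltk
# from bs4 import BeautifulSoup  # assumption: bs4 replaces clean_html
#
# opener = urllib.request.build_opener()
# opener.addheaders = [('User-agent', 'Mozilla/5.0')]
# html = opener.open('http://en.wikipedia.org/w/index.php?'
#                    'title=George_Washington&printable=yes').read().decode('utf-8')
# page = re.sub(r'\s*\n+', '\n', BeautifulSoup(html, 'html.parser').get_text())
# sents = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(page)]
#
# def nextract_text(tokens, types=("GPE", "PERSON")):
#     chunked = nltk.ne_chunk(nltk.pos_tag(tokens))
#     return [" ".join(w for w, t in c.leaves())
#             for c in chunked
#             if isinstance(c, nltk.Tree) and c.label() in types]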