import re
import urllib2

import nltk

# Fetch the printable version of the Wikipedia article on George Washington.
# Wikipedia refuses urllib2's default User-Agent, so we present a browser one.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
infile = opener.open('http://en.wikipedia.org/w/index.php?title=George_Washington&printable=yes')
page = infile.read().decode("utf-8")
page[:400]

# Strip the HTML tags, then collapse runs of blank lines into single newlines.
page = nltk.util.clean_html(page)
page = re.sub(r'\s*\n+', '\n', page)
print page[:400]

# Split the text into sentences, then each sentence into word tokens.
sents = nltk.sent_tokenize(page)
sents = [nltk.word_tokenize(s) for s in sents]
print sents[17]

# POS-tag a sample sentence and run the named-entity chunker over the tags.
# Entity chunks come back as subtrees; ordinary words stay (word, tag) pairs.
print nltk.pos_tag(sents[17])
chunked = nltk.ne_chunk(nltk.pos_tag(sents[17]))
for node in chunked:
    print node

# Entity types that nltk.ne_chunk can label (examples from the NLTK book):
#
#   ORGANIZATION  Georgia-Pacific Corp., WHO
#   PERSON        Eddy Bonte, President Obama
#   LOCATION      Murray River, Mount Everest
#   DATE          June, 2008-06-29
#   TIME          two fifty a m, 1:30 p.m.
#   MONEY         175 million Canadian Dollars, GBP 10.40
#   PERCENT       twenty pct, 18.75 %
#   FACILITY      Washington Monument, Stonehenge
#   GPE           South East Asia, Midlothian

def nextract(tokens, types=["GPE", "PERSON"]):
    """Return the named-entity subtrees of the requested types."""
    chunked = nltk.ne_chunk(nltk.pos_tag(tokens))
    # Entity subtrees carry a .node label; plain (word, tag) tuples do not,
    # so hasattr() filters the non-entities out.
    return [c for c in chunked if hasattr(c, "node") and c.node in types]

nes = nextract(sents[17])
nes
nes[0].leaves()

def nextract_text(tokens, types=["GPE", "PERSON"]):
    """Like nextract, but join each entity's tokens back into a string."""
    nodes = nextract(tokens, types)
    return [" ".join(c[0] for c in chunk.leaves()) for chunk in nodes]

nextract_text(sents[17])

# Extract every PERSON entity in the article and count the most frequent.
nes = [nextract_text(s, ["PERSON"]) for s in sents]

from collections import Counter
Counter([x for l in nes for x in l]).most_common(10)

# Co-occurrence counts: every unordered pair of people named in the same
# sentence. Sorting each pair makes (A, B) and (B, A) count as one; the
# final filter drops pairs whose two members are the same name.
from itertools import combinations
pairs = Counter([tuple(sorted(p)) for s in nes for p in combinations(s, 2)])
[p for p in pairs.most_common(20) if p[0][0] != p[0][1]]
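# A quick self-contained check of the pair-counting logic on toy data; the
# names below are made up for illustration, not results from the article.
toy = [["Washington", "Adams"], ["Adams", "Washington", "Jefferson"]]
Counter(tuple(sorted(p)) for s in toy for p in combinations(s, 2))
# -> Counter({('Adams', 'Washington'): 2, ('Adams', 'Jefferson'): 1,
#             ('Jefferson', 'Washington'): 1})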
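# The code above targets Python 2 and NLTK 2.x. Below is a minimal sketch of
# the same pipeline on Python 3 / NLTK 3, under two assumptions: BeautifulSoup
# stands in for nltk.util.clean_html (removed in NLTK 3), and entity subtrees
# are identified with isinstance()/.label() instead of the old .node
# attribute. Kept commented out so this file still runs as Python 2.
#
# import re
# import urllib.request
# import nltk
# from bs4 import BeautifulSoup  # assumption: bs4 replaces clean_html
#
# opener = urllib.request.build_opener()
# opener.addheaders = [('User-agent', 'Mozilla/5.0')]
# html = opener.open('http://en.wikipedia.org/w/index.php?'
#                    'title=George_Washington&printable=yes').read().decode('utf-8')
# page = re.sub(r'\s*\n+', '\n', BeautifulSoup(html, 'html.parser').get_text())
# sents = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(page)]
#
# def nextract_text(tokens, types=("GPE", "PERSON")):
#     chunked = nltk.ne_chunk(nltk.pos_tag(tokens))
#     return [" ".join(w for w, t in c.leaves())
#             for c in chunked
#             if isinstance(c, nltk.Tree) and c.label() in types]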