#!/usr/bin/env python
# coding: utf-8

# # hyperlinks network

# In[1]:

# `libraries.ipynb` is expected to provide codecs, json and networkx (as nx)
# among others — TODO confirm against the companion notebook.
get_ipython().run_line_magic('run', '"libraries.ipynb"')

from IPython.display import display, HTML
from bs4 import BeautifulSoup

# ## loading the original namespace
#
# In order to control our namespace, we import
# [a list of wikipedia pages](https://github.com/WeKeyPedia/notebooks/blob/master/geometry/data/pagenames.txt)
# extracted from the
# [List of geometry topics](http://en.wikipedia.org/wiki/list_of_geometry_topics).

# In[2]:

# Materialize as a real list: `pages` is iterated several times below
# (inside the per-page loop), so a lazy map object would be exhausted
# after the first pass.
with codecs.open("data/pagenames.txt", "r", "utf-8-sig") as f:
    pages = [line.strip() for line in f.readlines()]

# ## finding wikipedia links in the page content

# In[3]:

hyperlinks_graph = nx.DiGraph()


def get_content(page):
    """Return the raw revision content of `page` from its cached API dump.

    Reads data/pages/<page>.json (a MediaWiki API response) and extracts
    the "*" field of the first revision.
    """
    with codecs.open("data/pages/%s.json" % (page), "r", "utf-8-sig") as f:
        j = json.load(f)

    # The API nests the payload under a single, unknown page id; take it
    # without indexing the keys view (dict.keys()[0] is Python-2-only).
    page_id = next(iter(j["query"]["pages"]))
    content = j["query"]["pages"][page_id]
    content = content["revisions"][0]["*"]

    return content


def find_hyperlinks(page):
    """Return every <a> tag found in the cached content of `page`."""
    soup = BeautifulSoup(get_content(page), 'html.parser')
    return soup.find_all('a')


for p in pages:
    occurences_link = {}
    occurences_named_entity = {}

    hyperlinks = find_hyperlinks(p)
    content = u"" + get_content(p)

    # keep only the title of each link; some hyperlinks have no title
    hyperlinks = [x.get("title") for x in hyperlinks]
    hyperlinks = [x for x in hyperlinks if x is not None]

    # sort hyperlink titles by decreasing length in order to get a more
    # precise n-graming: otherwise terms like "triangle" are over-evaluated
    # because of longer terms like "equilateral triangle"
    hyperlinks = sorted(hyperlinks, key=lambda k: -len(k))

    # `gruyere` is the page text with already-counted terms punched out of
    # it, so shorter terms embedded in longer ones are not counted twice
    gruyere = content

    for k in hyperlinks:
        # count occurrences of the title among the page's hyperlinks
        occurences_link.setdefault(k, 0)
        occurences_link[k] += 1

        # count occurrences of the term in the remaining text, then remove it
        occurences_named_entity.setdefault(k, 0)
        occurences_named_entity[k] = gruyere.count(k)
        gruyere = gruyere.replace(k, "")

    # print("coverage: %s/%s (%s%%)" % (len(gruyere), len(content),
    #                                   float(len(gruyere)) * 100 / float(len(content))))

    # reduce to a list of unique items
    hyperlinks = list(set(hyperlinks))

    # keep only linked pages that are inside the initial domain
    intradomain_pages = set(hyperlinks) & set(pages)
    extradomain_pages = set(hyperlinks) - set(pages)

    # print(len(hyperlinks))
    # print(len(intradomain_pages))
    # print(len(extradomain_pages))

    for target in intradomain_pages:
        edge_info = {"link occurence": occurences_link[target],
                     "term occurence": occurences_named_entity[target]}
        # networkx >= 2.0 removed the `attr_dict` keyword; attributes are
        # passed as keyword arguments instead
        hyperlinks_graph.add_edge(p, target, **edge_info)

print("nodes: %s" % len(hyperlinks_graph.nodes()))
print("edges: %s" % len(hyperlinks_graph.edges()))

# ## storing the result graph

# In[4]:

nx.write_gexf(hyperlinks_graph, "data/hyperlinks.gexf")

# ## community detection (louvain)

# In[5]:

import community

partitions = community.best_partition(hyperlinks_graph.to_undirected())

# In[6]:

def print_groups(communities):
    """Display an HTML table with one row per detected community.

    `communities` maps page name -> community id (the shape returned by
    community.best_partition).

    NOTE(review): the original body was garbled by the notebook export
    (the HTML tag markup was stripped, leaving only "group %s | ..."
    residue); this reconstruction follows the surviving fragments —
    confirm the exact markup against the notebook source.
    """
    # invert the page -> group mapping into group -> [pages]
    groups = {}
    for page, group in communities.items():
        groups.setdefault(group, [])
        groups[group].append(page)

    html = "<table>"
    for c, ps in sorted(groups.items()):
        html += "<tr><td>group %s</td><td>" % (c)
        html += ", ".join(map(lambda x: u"{0}".format(x), ps))
        html += "</td></tr>"
    html += "</table>"

    display(HTML(html))