#!/usr/bin/env python
# coding: utf-8

# This notebook is the data munging part of the visualization of the interconnectedness of my top 30\* most edited articles on Wikipedia (I go by [Resident Mario](https://en.wikipedia.org/wiki/User:Resident_Mario) on the encyclopedia), as reported by IBM Watson's [Concept Insights](http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/concept-insights.html) API service. The data is scraped from the [Supercount Wikimedia Lab tool](https://tools.wmflabs.org/supercount/) with `requests` and `beautifulsoup`, interwoven using `watsongraph`, and visualized using `d3.js`.
#
# The techniques here could easily be applied to any editor! A [widget](https://github.com/jdfreder/ipython-d3networkx/blob/master/examples/demo%20simple.ipynb) for visualizing any editor's top articles is forthcoming once `watsongraph` makes it to the `0.3.0` release.
#
# \* The cutoff is due to a [technical limitation](https://github.com/ResidentMario/watsongraph/issues/8).

# In[1]:

from watsongraph.conceptmodel import ConceptModel
from watsongraph.node import conceptualize
import json
import requests
import bs4


def get_top_thirty_articles(username):
    """
    Performs a raw call to the Supercount edit counter, and parses the response to get at a list of
    links on that page. Output looks like this:
    [<li>Hawaii hotspot — 603</li>,
     <li>Mauna Kea — 543</li>,
     ...]
    """
    u = username.replace(" ", "+")
    # Currently limited to 30 articles because of a batch limitation.
    # cf. https://github.com/ResidentMario/watsongraph/issues/8
    url = "https://tools.wmflabs.org/supercount/index.php?user={}&project=en.wikipedia.org&toplimit=30".format(u)
    r = requests.get(url)
    # Surprisingly, requests guesses the wrong encoding and tries 'ISO-8859-1' by default.
    # This blows up e.g. "Lōʻihi_Seamount", which becomes a garbled mess.
    # Easy to fix by swapping the encoding, but finding this programmatic misstep took me some time.
    # cf. http://docs.python-requests.org/en/latest/user/quickstart/#response-content
    r.encoding = 'utf-8'
    raw_data = r.text
    raw_links = list(bs4.BeautifulSoup(raw_data, "html.parser").find_all('li'))
    return raw_links
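
# A minimal offline sketch of what the scraping step sees (added for illustration; the markup
# below is made up and only shaped like the Supercount list, it is not the real response, and it
# does not reproduce the raw link markup that parse_articles() filters on).
_example_html = "<ol><li>Hawaii hotspot — 603</li><li>Mauna Kea — 543</li></ol>"
_example_links = bs4.BeautifulSoup(_example_html, "html.parser").find_all('li')
# Each element is a bs4 Tag; .get_text() flattens it to e.g. 'Hawaii hotspot — 603', which is the
# string shape that the parsing step below slices apart.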


def parse_articles(list_of_links):
    """
    After running get_top_thirty_articles() we get a list of links. It looks like this:
    [<li>Hawaii hotspot — 603</li>,
     <li>Mauna Kea — 543</li>,
     ...]
    This method takes that info and returns a list of dicts of article names and edit counts that
    looks like this:
    [{'article': 'Types of volcanic eruptions', 'edits': 236}, ...]
    """
    ret = []
    for link in list_of_links:
        if "/:" in str(link):
            text = link.get_text()
            # At this point we are parsing e.g. 'Hawaii hotspot — 603'.
            article = text[:text.index("—") - 1]
            edits = text[text.index("—") + 2:]
            ret.append({'article': article, 'edits': int(edits)})
    return ret
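
# A quick worked example of the slicing above (added for clarity): with exactly one space on
# either side of the dash, index("—") lands on the dash itself, so the article name ends one
# character earlier and the edit count starts two characters later.
_sample = 'Hawaii hotspot — 603'
assert _sample[:_sample.index("—") - 1] == 'Hawaii hotspot'
assert int(_sample[_sample.index("—") + 2:]) == 603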


def clean_and_model(data_dict):
    """
    After running get_top_thirty_articles() and parse_articles() we are left with the information
    we need, encoded in the following format:
    [{'article': 'Types of volcanic eruptions', 'edits': 236}, ...]
    Now we build the model for the whole thing.
    """
    # At this step we ought to clean the input using conceptualize() to back-trace, but unicode
    # breaks: conceptualize() does not support unicode.
    # cf. https://github.com/ResidentMario/watsongraph/issues/11
    # So this one time I cleaned the data by hand (fixed one problem point).
    # Once I rewrite the access methods and push this library out to version 0.3.0 this should be fixed.
    # cf.
    contributions = ConceptModel([dat['article'] for dat in data_dict])
    contributions.explode_edges(prune=True)
    for entry in data_dict:
        contributions.set_property(entry['article'], "edits", entry['edits'])
        # print(contributions.get_node(entry['article']).properties['edits'])
    return contributions
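
# A small sanity-check helper, sketched in after the fact (it is not called anywhere in this
# notebook). It reuses the get_node(...).properties access pattern from the commented-out line
# above and assumes every seeded article survives explode_edges(prune=True).
def check_edit_counts(model, data_dict):
    for entry in data_dict:
        assert model.get_node(entry['article']).properties['edits'] == entry['edits']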


def save_model(model):
    """
    The final step: saves the model!
    """
    with open('contributions.json', 'w') as file:
        file.write(json.dumps(model.to_json(), indent=4))


# In[2]:

dataset = parse_articles(get_top_thirty_articles("Resident Mario"))
# Manually heal a problem point.
# cf.
dataset[23]['article'] = 'Ferdinandea'


# In[3]:

contributions = clean_and_model(dataset)


# In[6]:

save_model(contributions)
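

# In[ ]:

# Optional round-trip check, sketched in after the fact: reload the file we just wrote and make
# sure it parses as JSON. The exact shape of watsongraph's to_json() output isn't documented
# here, so this only confirms the file is well-formed.
with open('contributions.json', 'r') as f:
    print(type(json.load(f)))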