#!/usr/bin/env python
# coding: utf-8
# This notebook is the data munging part of the visualization of the interconnectedness of my top 30\* most edited articles on Wikipedia (I go by [Resident Mario](https://en.wikipedia.org/wiki/User:Resident_Mario) on the encyclopedia), as reported by IBM Watson's [Concept Insight](http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/concept-insights.html) API service. The data is scraped from the [Supercount Wikimedia Lab tool](https://tools.wmflabs.org/supercount/) with `requests` and `beautifulsoup`, interwoven using `watsongraph`, and visualized using `d3.js`.
#
# The techniques here could eventually be easily applied to any editor! A [widget](https://github.com/jdfreder/ipython-d3networkx/blob/master/examples/demo%20simple.ipynb) for visualizing any editor's top articles is forthcoming once `watsongraph` makes it to the `0.3.0` release.
#
# \* The cutoff is due to a [technical limitation](https://github.com/ResidentMario/watsongraph/issues/8).
# In[1]:
from watsongraph.conceptmodel import ConceptModel
from watsongraph.node import conceptualize
import json
import requests
import bs4
def get_top_thirty_articles(username, toplimit=30):
    """
    Query the Supercount edit counter for a user's most-edited articles.

    Performs a raw call to the Supercount edit counter and parses the
    response to get at the list of <li> link tags on that page. Each
    link's text looks like:

        Hawaii hotspot — 603
        Mauna Kea — 543
        ...

    :param username: The Wikipedia username to look up.
    :param toplimit: Number of top articles to request. Defaults to 30
        because of a watsongraph batch limitation.
        cf. https://github.com/ResidentMario/watsongraph/issues/8
    :return: A list of bs4 <li> tags, for parse_articles() to process.
    """
    url = "https://tools.wmflabs.org/supercount/index.php"
    # Let requests URL-encode the query string (spaces in usernames, etc.)
    # instead of hand-building the URL.
    params = {
        "user": username,
        "project": "en.wikipedia.org",
        "toplimit": toplimit,
    }
    r = requests.get(url, params=params)
    # Surprisingly, requests guesses the wrong encoding and tries 'ISO-8859-1'
    # by default. This blows up e.g. "Lōʻihi_Seamount", which becomes a
    # garbled mess; force UTF-8 before reading the body.
    # cf. http://docs.python-requests.org/en/latest/user/quickstart/#response-content
    r.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(r.text, "html.parser")
    return list(soup.find_all('li'))
def parse_articles(list_of_links):
    """
    Parse the raw <li> link tags returned by get_top_thirty_articles().

    Each relevant link's text looks like:

        Hawaii hotspot — 603
        Mauna Kea — 543
        ...

    :param list_of_links: A list of link tags (anything with a get_text()
        method and a str() form, e.g. bs4 tags).
    :return: A list of dicts of article names and integer edit counts:
        [{'article': 'Types of volcanic eruptions', 'edits': 236}, ...]
    """
    ret = []
    for link in list_of_links:
        # Only article links contain "/:" in their markup; skip everything
        # else on the page (headers, totals, and so on).
        if "/:" not in str(link):
            continue
        text = link.get_text()
        # At this point we are parsing e.g. 'Hawaii hotspot — 603'.
        article, sep, edits = text.partition(" — ")
        if not sep:
            # Malformed entry (no em-dash separator): skip it rather than
            # blow up the whole run with a ValueError.
            continue
        ret.append({'article': article, 'edits': int(edits)})
    return ret
def clean_and_model(data_dict):
    """
    Build the watsongraph ConceptModel from the parsed contribution data.

    Expects the format produced by parse_articles():

        [{'article': 'Types of volcanic eruptions', 'edits': 236}, ...]

    :return: A ConceptModel whose nodes carry an "edits" property.
    """
    # Input ought to be cleaned via conceptualize() to back-trace article
    # names, but conceptualize() does not support unicode:
    # cf. https://github.com/ResidentMario/watsongraph/issues/11
    # For this run the single problematic data point was fixed by hand;
    # a real fix lands once the access methods are rewritten for the
    # library's 0.3.0 release.
    article_names = [record['article'] for record in data_dict]
    model = ConceptModel(article_names)
    model.explode_edges(prune=True)
    # Attach each article's edit count to its node as an "edits" property.
    for record in data_dict:
        model.set_property(record['article'], "edits", record['edits'])
    return model
def save_model(model, path='contributions.json'):
    """
    The final step: saves the model!

    Serializes the model to pretty-printed JSON on disk.

    :param model: The model to save (anything exposing a to_json() method).
    :param path: Destination file. Defaults to 'contributions.json' for
        backwards compatibility with the original script.
    """
    # `f`, not `file` -- the latter shadows a builtin.
    with open(path, 'w') as f:
        json.dump(model.to_json(), f, indent=4)
# In[2]:
raw_links = get_top_thirty_articles("Resident Mario")
dataset = parse_articles(raw_links)
# Manually heal one known problem point by hand -- presumably the unicode
# article title that conceptualize() cannot round-trip (see the note in
# clean_and_model(); cf. https://github.com/ResidentMario/watsongraph/issues/11).
dataset[23]['article'] = 'Ferdinandea'
# In[3]:
contributions = clean_and_model(dataset)
# In[6]:
save_model(contributions)