#!/usr/bin/env python
# coding: utf-8

# # Signpost Article Linked Views
#
# Sometimes *Signpost* articles get linked to by mainstream media (usually tech media). This companion notebook studies what happens when this happens.

# In[1]:

from pageviews import PageviewsClient
import urllib
import mwapi
import arrow
import datetime
import json


def viewcounts(article_name, start=None, end=None):
    """Fetch daily pageview counts for an article from the Wikimedia pageviews API.

    Parameters
    ----------
    article_name : str
        Article title; spaces are converted to underscores and the result is
        percent-encoded before querying.
    start, end : str, optional
        Timestamps in ``YYYYMMDDHH`` format, passed through to the API client.

    Returns
    -------
    list of int
        Daily view counts in chronological order.
    """
    article_name = article_name.replace(' ', '_')
    # quote() leaves '/' alone by default, but the API treats it as a path
    # separator, so escape it explicitly.
    parsed_article_name = urllib.parse.quote(article_name).replace('/', '%2F')
    p = PageviewsClient().article_views("en.wikipedia",
                                        [parsed_article_name],
                                        access="all-access",
                                        granularity="daily",
                                        start=start,
                                        end=end)
    # Results come back keyed by date; sort the keys so counts are chronological.
    # (A previously-built intermediate dict was dead code and has been removed.)
    return [p[key][article_name] for key in sorted(p.keys())]

# TODO: Fix by pre-padding with 0s if output is below 15 length.


# In[2]:

from bokeh.plotting import figure, output_notebook, show
from bokeh.models.tools import HoverTool
import pandas as pd


# In[3]:

example_viewcounts = viewcounts("Wikipedia:Wikipedia Signpost/2015-12-09/Op-ed",
                                start="2015121000", end="2015122900")

output_notebook(hide_banner=True)
p = figure(plot_width=900, plot_height=400,
           title='"Wikidata: Knowledge from different points of view"',
           title_text_font_size="18px",
           x_axis_type="datetime")

# add a line renderer; x-axis starts at the data's start date (2015-12-10)
p.line(
    [(arrow.get("2015-12-10-00") + datetime.timedelta(days=n)).datetime for n in range(0, 20)],
    example_viewcounts,
    line_width=2,
)

# NOTE(review): this HoverTool is never attached to the figure (no
# p.add_tools(hover)), so it currently has no effect on the plot.
hover = HoverTool()
hover.tooltips = []

show(p)

# This is the viewership curve for a typical *Signpost* article, the fairly popular op-ed "[Wikidata: Knowledge from different points of view](https://en.wikipedia.org/wiki/Wikipedia:Wikipedia_Signpost/2015-12-09/Op-ed)", from pre-publication to publication to post-publication.
# It's a pattern that's representative of what most *Signpost* viewership curves look like.

# In[4]:

example_viewcounts = viewcounts("Wikipedia:Wikipedia Signpost/2016-01-13/News and notes",
                                start="2016011500", end="2016013000")

output_notebook(hide_banner=True)
p = figure(plot_width=900, plot_height=400,
           title='"Community objections to new Board trustee"',
           title_text_font_size="18px",
           x_axis_type="datetime")

# add a line renderer.
# BUG FIX: the x-axis previously started at arrow.get("2014-06-28-00")
# (copy-pasted from the In[6] cell below), but the data fetched above starts
# on 2016-01-15 — the 16 daily counts now line up with the correct dates.
p.line(
    [(arrow.get("2016-01-15-00") + datetime.timedelta(days=n)).datetime for n in range(0, 16)],
    example_viewcounts,
    line_width=2,
)

hover = HoverTool()
hover.tooltips = []

show(p)

# ^ What happened to "[Community objects to board trustee](https://en.wikipedia.org/wiki/Wikipedia:Wikipedia_Signpost/2016-01-13/News_and_notes)" here? Actually this graph is a tale of two curves: the original peak is the Wikipedia community (e.g. the *Signpost's* usual readers) picking up on the story immediately after publication; the much larger peak is the mainstream media picking up on the news. What was already an extremely popular article by *Signpost* standards absolutely exploded when the story was picked up and recirculated by several outlets linking back to the *Signpost*: [ZDNet](http://www.zdnet.fr/actualites/un-membre-du-ca-de-la-fondation-wikimedia-mis-en-cause-pour-son-passe-chez-google-39831606.htm), [The Register](http://www.theregister.co.uk/2016/01/27/trust_me_pleads_wikipedia_former_google_man/), [The Register](http://www.theregister.co.uk/2016/01/28/wmf_geshuri_steps_down/?mt=1454029117421), [Fortune](http://fortune.com/2016/01/26/wikipedia-board-geshuri/), and [Ars Technica](http://arstechnica.com/tech-policy/2016/01/editors-demand-ouster-of-wikimedia-board-member-involved-in-no-poach-deal/).
#
# Let's look at a few more like this: the ones that we're aware of, at least.
# It's helpful for our purposes to compare how we were linked and from where to how much we got out of it.
#
# Since the pageview API data doesn't go back very far we also first need to define a similar API caller against the older, venerable [stats.grok.se](http://stats.grok.se/) API.
#
# The list of articles used for these queries is taken partly from the data from [this Quarry](http://quarry.wmflabs.org/query/7131) and partly from our own memory of our "greatest hits".

# In[5]:

import requests


def grok_viewcounts(article_name, start, end):
    """Retrieve daily article views between two dates from the stats.grok.se API.

    The span may cover at most two calendar months. Note that the ``start``/
    ``end`` format here ("YYYY-MM-DD-HH", parsed by arrow) is NOT the same as
    the ``YYYYMMDDHH`` query format used by ``viewcounts`` above.

    Parameters
    ----------
    article_name : str
        Article title; spaces are converted to underscores.
    start, end : str
        Date strings parseable by ``arrow.get``.

    Returns
    -------
    list of int
        Daily view counts starting at ``start``'s day. When the span crosses
        into a second month, counts stop before ``end``'s day-of-month.

    NOTE(review): when ``start`` and ``end`` fall in the *same* month the
    function returns counts through the end of that month and ignores
    ``end.day`` — confirm whether callers rely on this.
    """
    article_name = article_name.replace(" ", "_")
    start = arrow.get(start)
    end = arrow.get(end)

    query_string = 'http://stats.grok.se/json/en/{0}/{1}'.format(start.format("YYYYMM"), article_name)
    dat = json.loads(requests.get(query_string).text)['daily_views']
    first_month_views = [dat[key] for key in sorted(dat.keys())]
    first_month_views = first_month_views[start.day - 1:]
    # A weird correction that seems to be necessary: drop the trailing entry.
    # NOTE(review): presumably the API pads every month out to a fixed number
    # of day-keys, leaving junk at the tail — TODO confirm against the API.
    first_month_views = first_month_views[:len(first_month_views) - 1]

    if start.month == end.month:
        return first_month_views

    # Span crosses a month boundary: fetch the second month and keep the
    # days strictly before end.day.
    query_string = 'http://stats.grok.se/json/en/{0}/{1}'.format(end.format("YYYYMM"), article_name)
    dat = json.loads(requests.get(query_string).text)['daily_views']
    second_month_views = [dat[key] for key in sorted(dat.keys())]
    second_month_views = second_month_views[:end.day - 1]
    return first_month_views + second_month_views


# ## Further examples

# In[6]:

example_viewcounts = grok_viewcounts("Wikipedia:Wikipedia_Signpost/2014-06-25/News_and_notes",
                                     start="2014-06-28-00", end="2014-07-28-00")

output_notebook(hide_banner=True)
p = figure(plot_width=900, plot_height=400,
           title='"US National Archives enshrines Wikipedia in Open Government Plan, plans to upload all holdings to Commons"',
           title_text_font_size="18px",
           x_axis_type="datetime")

# add a line renderer
p.line(
    [(arrow.get("2014-06-28-00") + datetime.timedelta(days=n)).datetime for n in range(0, 30)],
    example_viewcounts,
    line_width=2,
)

hover = HoverTool()
hover.tooltips = []

show(p)

# ^ [This story](https://en.wikipedia.org/wiki/Wikipedia:Wikipedia_Signpost/2014-06-25/News_and_notes) was both [slashdotted](http://yro.slashdot.org/story/14/06/29/2216240/us-national-archives-will-upload-all-its-holdings-to-wikipedia) and posted to [TechCrunch](http://techcrunch.com/2014/06/30/us-national-archives-to-upload-all-holdings-to-wikimedia-commons/). This happened immediately post-publication, so there's only one bump. The cumulative impact was probably >5000 viewers.
# In[7]:

example_viewcounts = grok_viewcounts("Wikipedia:Wikipedia_Signpost/2014-07-16/Special_report",
                                     start="2014-07-16-00", end="2014-08-06-00")

output_notebook(hide_banner=True)
p = figure(plot_width=900, plot_height=400,
           title='"$10 million lawsuit against Wikipedia editors withdrawn, but plaintiff intends to refile"',
           title_text_font_size="18px",
           x_axis_type="datetime")

# add a line renderer
p.line(
    [(arrow.get("2014-07-16-00") + datetime.timedelta(days=n)).datetime for n in range(0, 20)],
    example_viewcounts,
    line_width=2,
)

# CONSISTENCY: construct a fresh HoverTool here as the earlier cells do,
# instead of mutating the one left over from a previous cell. (It is never
# attached to the figure, so the plot output is unchanged.)
hover = HoverTool()
hover.tooltips = []

show(p)

# ^ Another front-and-center [slashdotting](http://news.slashdot.org/story/14/07/18/2223202/10-million-lawsuit-against-wikipedia-editors-stragetically-withdrawn).

# In[8]:

example_viewcounts = grok_viewcounts("Wikipedia:Wikipedia Signpost/2014-08-13/News and notes",
                                     start="2014-08-16-00", end="2014-09-16-00")

output_notebook(hide_banner=True)
p = figure(plot_width=900, plot_height=400,
           title='"Media Viewer controversy spreads to German Wikipedia"',
           title_text_font_size="18px",
           x_axis_type="datetime")

# add a line renderer
p.line(
    [(arrow.get("2014-08-16-00") + datetime.timedelta(days=n)).datetime for n in range(0, 30)],
    example_viewcounts,
    line_width=2,
)

hover = HoverTool()
hover.tooltips = []

show(p)

# ^ This got a [link](http://www.theregister.co.uk/2014/08/18/class_war_wikipedias_workers_revolt_after_bourgeois_papershufflers_suspend_democracy/) from *The Register* (contributing probably something on the order of 700 or so viewers) on August 18th, and then a [change.org
# petition](https://www.change.org/p/lila-tretikov-remove-new-superprotect-status-and-permit-wikipedia-communities-to-enact-current-software-decisions-uninhibited) on August 22nd (probably a similar amount). I am unsure about where the small bump on August 31st came from, however.

# In[9]:

example_viewcounts = grok_viewcounts("Wikipedia:Wikipedia Signpost/2013-02-04/Special report",
                                     start="2013-02-04-00", end="2013-03-24-00")

output_notebook(hide_banner=True)

# Fifty days of x-axis dates, starting at publication day (2013-02-04).
publication = arrow.get("2013-02-04-00")
plot_dates = [(publication + datetime.timedelta(days=day)).datetime
              for day in range(0, 50)]

p = figure(plot_width=900, plot_height=400,
           title='"Examining the popularity of Wikipedia articles: catalysts, trends, and applications"',
           title_text_font_size="18px",
           x_axis_type="datetime")
p.line(plot_dates, example_viewcounts, line_width=2)

# Reuses the hover tool object from the previous cell (it is not attached
# to this figure either).
hover.tooltips = []

show(p)

# ^ This story got an incredible amount of traction: this is the most pageviews of any story in the *Signpost* I've yet examined (we don't have complete data, unfortunately). It is almost certainly the most viewed *Signpost* article of all time.
#
# The first bump on February 8th comes from [Tested.com](http://www.tested.com/tech/web/453360-wikipedia-signpost-report-peers-pop-culture-trends-drive-big-traffic/) and [Atlantic](http://www.theatlantic.com/technology/archive/2013/02/if-you-want-your-wikipedia-page-to-get-a-ton-of-traffic-die-while-performing-at-the-super-bowl-half-time-show/272919/) links.
#
# The second bump comes from [Gizmodo](http://gizmodo.com/5983295/the-biggest-wikipedia-traffic-spikes-since-2010-prove-were-all-morbid) and [TheVerge](http://www.theverge.com/2013/2/11/3975570/wikipedia-traffic-trends-super-bowl-show-celebrity-deaths) links.
#
# We have no idea about that third one, we're investigating.