%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, requests, re, itertools, urllib2, urlparse
import wikipedia_scraping as ws
import seaborn as sns
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup, element
from collections import Counter
from IPython.display import Image
from operator import itemgetter
from scipy import stats
from matplotlib.lines import Line2D
_start = pd.datetime(2001,1,1)
_end = pd.datetime(2015,1,1)
_filedir = u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'
We begin by defining the article titles for the news events we want to collect data on. We'll also decode these strings into valid unicode strings to account for any wacky characters; a minimal sketch of that decoding step follows the lists below.
# http://yearinreview.fb.com/
facebook = ['World Cup','Ebola Outbreak','Elections in Brazil','Robin Williams','Ice Bucket Challenge','Conflict in Gaza','Malaysia Airlines disasters','Super Bowl','Ferguson','Sochi']
# http://www.google.com/trends/topcharts?hl=en#date=2014
google = ['Robin Williams','World Cup','Ebola','Malaysia Airlines','Flappy Bird','ALS Ice Bucket Challenge','ISIS','Ferguson','Frozen','Ukraine']
# https://2014.twitter.com/moments
twitter = ['Philip Seymour Hoffman','State of the Union','Carnaval','Malaysia Airlines','Bring Back Our Girls','India Election','Spanish Abdication','Maya Angelou','Ferguson','Robin Williams','Ice Bucket Challenge','Scottish referendum','Ebola','He for She','Hong Kong protests','Mars Orbiter','Malala Yousafzi','US elections','Berlin Wall','Philae']
# Editorial judgment, https://en.wikipedia.org/wiki/2014
wikipedia1 = ['2014 Winter Olympics','Ebola virus epidemic in West Africa','2014 Crimean crisis','Malaysia Airlines Flight 370','Chibok schoolgirl kidnapping','Sinking of the MV Sewol','Islamic State in Iraq and the Levant','2014 FIFA World Cup','Felipe VI','2014 Israel–Gaza conflict','Malaysia Airlines Flight 17','Rosetta spacecraft','Cuba-United States relations']
# Number of contributors, http://stats.wikimedia.org/EN/TablesWikipediaEN.htm#zeitgeist
# Excluding repeats like "Deaths in 2014"
wikipedia2 = ['2013–14 North American cold wave',]
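The decoding step mentioned above isn't shown in the original cells; a minimal sketch of what it could look like (assuming Python 2 byte strings and a UTF-8 source encoding) is below. decode_titles is a hypothetical helper, not part of the original notebook.
# Hypothetical helper: coerce a list of titles to unicode so later API calls
# and DataFrame operations handle non-ASCII characters cleanly.
def decode_titles(titles, encoding='utf8'):
    return [t if isinstance(t, unicode) else t.decode(encoding) for t in titles]
facebook = decode_titles(facebook)
google = decode_titles(google)
twitter = decode_titles(twitter)
wikipedia1 = decode_titles(wikipedia1)
wikipedia2 = decode_titles(wikipedia2)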
This is irredeemably hacky HTML parsing. I apologize. But this generates your top 25 list, which is published on this gist.
def top_articles(lang):
# Read the HTML from the web and convert to soup
# Broken URLS here: soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/{0}/TablesWikipedia{0}.htm'.format(lang.upper())).read())
soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/EN/TablesWikipedia{0}.htm'.format(lang.upper())).read())
# Look for all the paragraphs with 2014
_p = soup.findAll('b',text=re.compile('2014'))
# Select only those paragraph parents that have exactly 152 fields, corresponding to the top-25 lists
_p2014 = [t.parent for t in _p if len(t.parent) == 152]
# Get the text out of the children tags as a list of lists
parsed = [[t.text for t in list(p.children) if type(t) != element.NavigableString] for p in _p2014]
# Convert to a dictionary keyed by month abbreviation with values as the list of text fields
parsed = {month[0].split(u'\xa0')[0]:month[1:] for month in parsed}
# Do some crazy dictionary and list comprehensions with zips to convert the values in the list
parsed = {k:[{'rank':int(a),'editors':int(b),'article':c} for a,b,c in zip(v[0::3],v[1::3],v[2::3])] for k,v in parsed.items()}
# Convert each month into a DataFrame with month information in the index
# and then concat all the dfs together, sorting on those with the most editors
ranked = pd.concat([pd.DataFrame(parsed[i],index=[i]*len(parsed[i])) for i in parsed.keys()]).sort('editors',ascending=False).reset_index()
# rename the reset index to something meaningful
ranked.rename(columns={'index':'month'},inplace=True)
# Group the articles by name, compute aggregate statistics
# Rank on the total number editors and months in the top 25
top_articles = ranked.groupby('article').agg({'month':len,'editors':np.sum,'rank':np.min})
top_articles['editor-month'] = top_articles['month'] * top_articles['editors']
top_articles.sort(['editor-month'],ascending=False,inplace=True)
return top_articles
country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portugese','nl':'Dutch','tr':'Turkish',
'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
'uk':'Ukranian'}
top_articles_by_country = {}
for country in country_codes.keys():
try:
top_articles_by_country[country] = top_articles(country)
except urllib2.HTTPError:
print "The '{0}' language does not have a stats page ".format(country)
pass
for _country,_df in top_articles_by_country.items():
    _df.to_csv(_filedir + '/Data/{0}.csv'.format(_country),encoding='utf8') # write alongside where it's read back below
def langlink_translater(source_lang,target_lang,article_titles):
chunks = ws.chunk_maker(article_titles,40)
translation_dict = dict()
for chunk in chunks:
result = ws.wikipedia_query({'action':'query',
'prop': 'langlinks',
'lllang': source_lang,
'titles': '|'.join(chunk),
'lllimit': '500'},target_lang)
if result and 'pages' in result.keys():
translation_dict.update({_d['title'] : _d['langlinks'][0]['*'] for _d in result['pages'].values() if 'langlinks' in _d.keys()})
return translation_dict
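The wikipedia_scraping helpers used above (chunk_maker, wikipedia_query) aren't reproduced in this notebook. As a rough, hypothetical sketch of what chunk_maker presumably does (the real implementation may differ): split a list of titles into groups of at most n so each MediaWiki API request stays under the title limit.
# Hypothetical stand-in for ws.chunk_maker: break an iterable of titles into
# lists of at most n items (n=40 above) for batched API queries.
def chunk_maker(titles, n):
    titles = list(titles)
    return [titles[i:i+n] for i in range(0, len(titles), n)]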
# This step takes a few minutes
translater_dict = {source_lang:{target_lang:langlink_translater(source_lang,target_lang,df.index) for target_lang,df in top_articles_by_country.items()} for source_lang in top_articles_by_country.keys()}
# Save the file
with open('translater_dict.json','wb') as f:
json.dump(translater_dict,f)
Load up the data crawled above.
_filedir
u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'
country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portugese','nl':'Dutch','tr':'Turkish',
'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
'uk':'Ukranian'}
top_articles_by_country = dict()
for country in country_codes.keys():
top_articles_by_country[country] = pd.read_csv(_filedir + '/Data/{0}.csv'.format(country),encoding='utf8',index_col=0)
with open('translater_dict.json','rb') as f:
translater_dict = json.load(f)
lang_link_exists_dict = dict()
top_articles_df = pd.DataFrame()
for source_lang,target_dictionary in translater_dict.iteritems():
langlink_exists_df = pd.DataFrame()
for target_lang,d in target_dictionary.iteritems():
top_articles_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index)
langlink_exists_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index).isin(translater_dict[source_lang][target_lang].keys())
if source_lang == target_lang:
langlink_exists_df[target_lang] = [1]*len(langlink_exists_df[target_lang])
langlink_exists_df = langlink_exists_df.reindex_axis(sorted(langlink_exists_df.columns), axis=1)
lang_link_exists_dict[source_lang] = langlink_exists_df
_df = top_articles_df.ix[:2].T
_df.index = [country_codes[i] for i in _df.index]
_df.columns = range(1,4)
_df.sort()
1 | 2 | 3 | |
---|---|---|---|
Arabic | كريستيانو رونالدو | ريال مدريد | السعودية |
Chinese | 世間情 | 太陽花學運 | 马来西亚航空370号班机空难 |
Czech | Válka na východní Ukrajině | Euromajdan | Minecraft |
Dutch | Lijst van personen overleden in 2014 | Malaysia Airlines-vlucht 17 | Eurovisiesongfestival 2014 |
English | Deaths in 2014 | Malaysia Airlines Flight 370 | Islamic State of Iraq and the Levant |
Farsi | دولت اسلامی عراق و شام | ایل ملکشاهی | مهران مدیری |
French | État islamique (organisation) | Manuel Valls | Dieudonné |
German | Krise in der Ukraine 2014 | Alternative für Deutschland | Fußball-Weltmeisterschaft 2014 |
Indonesian | JKT48 | NET. | Joko Widodo |
Italian | Juventus Football Club | Campionato mondiale di calcio 2014 | Serie A 2013-2014 |
Japanese | 仮面ライダー鎧武/ガイム | 烈車戦隊トッキュウジャー | ハピネスチャージプリキュア! |
Korean | 대한민국 | 일베저장소 | 세월호 침몰 사고 |
Polish | Robert Lewandowski | 2014 | Euromajdan |
Portugese | Em Família (telenovela) | Copa do Mundo FIFA de 2014 | Campeonato Brasileiro de Futebol de 2014 - Sér... |
Russian | Список умерших в 2014 году | Вооружённый конфликт на востоке Украины (2014) | Донецкая Народная Республика |
Spanish | Copa Mundial de Fútbol de 2014 | Podemos (partido político) | Copa Sudamericana 2014 |
Swedish | Sverigedemokraterna | Avlidna 2014 | Feministiskt initiativ |
Turkish | Türkiye | Recep Tayyip Erdoğan | Mustafa Kemal Atatürk |
Ukranian | Війна на сході України | Небесна сотня | Ленінопад |
_df.ix['id'].sum(axis=1)
55.0
_lang = 'en'
f, ax = plt.subplots(figsize=(10,5))
_df = lang_link_exists_dict[_lang].ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]
_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=1)
ax.set_frame_on(False)
ax.set_xticks(np.arange(0.5,_x+.5,10),minor=False)
ax.set_yticks(np.arange(_y)+.5,minor=False)
ax.invert_yaxis()
ax.set_xticklabels(_df.columns[::10],minor=False,fontsize=12)
ax.set_yticklabels([country_codes[x] for x in _df.index],minor=False,fontsize=12)
ax.tick_params(axis='x',direction='in',pad=-10)
ax.set_xlabel('Article rank',fontsize=15)
#f.subplots_adjust(right=0.8)
#cbar_ax = f.add_axes([.95, 0.15, 0.025, .75])
#f.colorbar(_ax, cax=cbar_ax)
f.tight_layout();
f.savefig('en_lang_link_exists.png',dpi=150)
sum_lang_link = pd.DataFrame(np.zeros(lang_link_exists_dict['en'].shape),columns=lang_link_exists_dict['en'].columns)
for lang,_df in lang_link_exists_dict.iteritems():
sum_lang_link = sum_lang_link + _df.values.astype(float)
#frac_sum_lang_link = sum_lang_link.apply(lambda x:x/19)
sum_lang_link.columns = [country_codes[i] for i in sum_lang_link.columns]
f, ax = plt.subplots(figsize=(10,5))
_df = sum_lang_link.ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]
_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=19)
ax.set_frame_on(False)
ax.set_xticks(np.arange(0,_x,10),minor=False)
ax.set_xticklabels(np.arange(0,_x,10),fontsize=12)
ax.set_xlabel('Article Rank',fontsize=15)
ax.set_title('Number of Languages with Article on Topic',fontsize=20)
ax.tick_params(axis='x',direction='in',pad=-10)
ax.set_yticks(np.arange(_y)+.5,minor=False)
ax.set_yticklabels(_df.index,minor=False)
ax.invert_yaxis()
#f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([.875, 0.15, 0.025, .75])
f.colorbar(_ax, cax=cbar_ax)
f.tight_layout()
f.savefig('sum_lang_link.png',dpi=200)
_s = sum_lang_link.ix[:100].apply(np.average,axis=1)
ax = plt.scatter(_s.index,_s.values,s=50,cmap='rainbow')
ax.axes.set_title('Coverage for Top 100 Stories',fontsize=20)
ax.axes.set_xlabel('Article Rank',fontsize=16)
ax.axes.set_ylabel('Number of Languages Covered',fontsize=16)
ax.axes.set_xlim((-1,101))
plt.tight_layout()
plt.savefig('top100_coverage.png',dpi=200)
But all of this analysis about article coverage is inaccurate because we've made the assumption that the interlanguage links between articles are reliable. That is, if a "Eurovision 2014" article exists in all 19 languages, each language's article links to the other 18 languages' versions of the article. The analysis below will show this assumption is flawed.
Make a network of the interlanguage links to reveal the clusters and missing links between languages.
article_language_graph = nx.DiGraph()
article_language_mapper = dict() # This will be helpful later
for source_lang,d in translater_dict.iteritems():
for target_lang,mapping in d.iteritems():
for target_lang_article,source_lang_article in mapping.iteritems():
article_language_graph.add_edge(target_lang_article,source_lang_article)
article_language_graph.add_node(source_lang_article,lang=source_lang)
article_language_graph.add_node(target_lang_article,lang=target_lang)
# Populate the article_language_mapper
if source_lang_article in article_language_mapper.keys():
article_language_mapper[source_lang_article].append(source_lang)
else:
article_language_mapper[source_lang_article] = [source_lang]
if target_lang_article in article_language_mapper.keys():
article_language_mapper[target_lang_article].append(target_lang)
else:
article_language_mapper[target_lang_article] = [target_lang]
nx.write_gexf(article_language_graph,'article_language_graph.gexf')
article_language_mapper = {k:list(set(v)) for k,v in article_language_mapper.iteritems()}
with open('article_language_mapper.json','wb') as f:
json.dump(article_language_mapper,f)
Image('article_language_links.png')
There are at least two kinds of problems in these topical subgraphs. The first problem is that these subgraphs are missing many links within topics; articles about the same topic are linked in some languages, but not others. This is marked in the figure below by the blue observations falling well below the red Ideal line.
The second problem is that different topics are sometimes linked together. Because there are 19 languages we're looking at, there should be a maximum of 19 articles in a cluster. However, there are three clusters that have more than 19 articles in them. This is an artifact of imprecise topical linking. For example, some languages link to the article on Ebolavirus while others link to Ebola virus disease. Entities like Taiwan and China refer to complex and overlapping concepts like political entities (PRC vs. ROC), geographies ("Formosa" vs. Mainland), and cultures. Of course, this disambiguation problem is almost certainly present in the topical subgraphs below size 19 as well, but I'm simply going to ignore it.
topic_subgraphs = list(nx.components.connected_component_subgraphs(article_language_graph.to_undirected()))
subgraph_properties = [{'edges':_subgraph.number_of_edges(),'nodes':_subgraph.number_of_nodes(),'density':nx.density(_subgraph)} for _subgraph in topic_subgraphs]
# Uncomment to see what's in these subgraphs
#for _subgraph in topic_subgraphs:
# if _subgraph.number_of_nodes() > 19:
# print _subgraph.nodes()
subgraph_df = pd.DataFrame(subgraph_properties)
subgraph_df = subgraph_df[subgraph_df['nodes'] > 2]
f,ax = plt.subplots(1,1)
_ax = subgraph_df.plot(x='nodes',y='edges',kind='scatter',label='Observed Topic',ax=ax)
ax.plot([i*(i-1) for i in range(20)],label='Ideal Topic',lw=3,c='r',alpha=.5)
ax.axvline(x=19.5,ls='--',lw=3,c='g',alpha=.5,label='Max Topics')
ax.set_xlim((0,40))
ax.set_ylim((-1,400))
ax.legend(fontsize=12)
ax.set_xlabel('Number of Nodes in Topic',fontsize=18)
ax.set_ylabel('Number of Edges in Topic',fontsize=18)
ax.set_title('Diagnosing Problems in Topic Subgraphs',fontsize=24)
# Based on the results from commented part above, I'm applying three labels to the three outliers
_outliers = zip(['China','Taiwan','Ebola'],subgraph_df[subgraph_df['nodes'] > 20][['nodes','edges']].values)
for label,(x,y) in _outliers:
ax.annotate(label,xy=(x, y),fontsize=12,
xytext=(x+2, y+75),
arrowprops=dict(arrowstyle="fancy", #linestyle="dashed",
color="0.5",shrinkB=8,connectionstyle="arc3,rad=0.3"))
plt.tight_layout();
We need to create these missing links. We'll use a helpful answer from StackOverflow involving itertools to generate an edgelist of every permutation of nodes in the subgraph. We can use the resulting edgelist to create a complete subgraph where every node in the subgraph is linked to every other.
def complete_subgraph_maker(node_list):
return itertools.permutations(node_list,2)
complete_topic_graph = nx.DiGraph()
for _subgraph in topic_subgraphs:
if _subgraph.number_of_nodes() < 20:
_edgelist = complete_subgraph_maker(_subgraph.nodes())
complete_topic_graph.add_edges_from(_edgelist)
# Add the language labels back in as node attributes so we can hopefully translate back
for node in complete_topic_graph.nodes():
complete_topic_graph.add_node(node,lang=article_language_mapper[node])
Now I'm going to impose a very Anglo-centric constraint by creating a topic_graph that contains only articles that are linked in the English Wikipedia. This unfortunately has the effect of removing a few of the topical clusters, but still leaves us with over 1800 articles in English to explore in subsequent steps.
complete_topic_subgraphs = list(nx.components.connected_component_subgraphs(complete_topic_graph.to_undirected()))
english_label_subgraphs = [_subgraph for _subgraph in complete_topic_subgraphs for node,data in _subgraph.nodes_iter(data=True) if 'en' in data['lang']]
print "Out of the initial {0} topical clusters, there are {1} subgraphs in the complete approach. {2} of these have an English label".format(len(topic_subgraphs), len(complete_topic_subgraphs), len(english_label_subgraphs))
english_topic_graph = nx.DiGraph()
topic_translation_dict = dict()
for _subgraph in english_label_subgraphs:
english_topic_graph.add_edges_from(_subgraph.edges(data=True))
english_topic_graph.add_nodes_from(_subgraph.nodes(data=True)) #1
_english_nodes = [_node for _node,_data in _subgraph.nodes_iter(data=True) if 'en' in _data['lang']]
if len(_english_nodes) == 1:
topic_translation_dict.update({_node:_english_nodes[0] for _node in _subgraph.nodes_iter()})
else:
# I really hope this is never the case, but just to be sure
print _english_nodes
# Graphs with lists for attributes can't be serialized into GEXF
# Comment out #1 to make #2 work, or leave #1 uncommented and #2 commented
#nx.write_gexf(english_topic_graph,'english_topic_graph.gexf') #2
Out of the initial 2124 topical clusters, there are 1968 subgraphs in the complete approach. 1885 of these have an English label
Now having done all this work to create topical sub-graphs containing English labels, we can go back to the original lists of top stories and meaningfully compare what the top stories (in English) were across languages.
translated_articles_by_country = pd.DataFrame()
for country in country_codes.keys():
translated_articles_by_country[country] = pd.Series([topic_translation_dict.get(article,np.nan) for article in top_articles_by_country[country].index])
translated_articles_by_country.columns = [country_codes[i] for i in translated_articles_by_country.columns]
translated_articles_by_country.index = range(1,len(translated_articles_by_country)+1)
translated_articles_by_country.sort(axis=1).head(3).T
1 | 2 | 3 | |
---|---|---|---|
Arabic | Cristiano Ronaldo | Real Madrid C.F. | Saudi Arabia |
Chinese | NaN | Sunflower Student Movement | Malaysia Airlines Flight 370 |
Czech | War in Donbass | Euromaidan | Minecraft |
Dutch | Deaths in 2014 | Malaysia Airlines Flight 17 | Eurovision Song Contest 2014 |
English | Deaths in 2014 | Malaysia Airlines Flight 370 | Islamic State of Iraq and the Levant |
Farsi | Islamic State of Iraq and the Levant | NaN | Mehran Modiri |
French | Islamic State of Iraq and the Levant | Manuel Valls | Dieudonné M'bala M'bala |
German | War in Donbass | Alternative for Germany | 2014 FIFA World Cup |
Indonesian | NaN | NaN | Joko Widodo |
Italian | Juventus F.C. | 2014 FIFA World Cup | 2013–14 Serie A |
Japanese | Kamen Rider Gaim | Ressha Sentai ToQger | HappinessCharge PreCure! |
Korean | South Korea | Ilbe Storehouse | Sinking of the MV Sewol |
Polish | Robert Lewandowski | 2014 | Euromaidan |
Portugese | Em Família (telenovela) | 2014 FIFA World Cup | 2014 Campeonato Brasileiro Série A |
Russian | Deaths in 2014 | War in Donbass | Donetsk People's Republic |
Spanish | 2014 FIFA World Cup | Podemos (Spanish political party) | 2014 Copa Sudamericana |
Swedish | Sweden Democrats | Deaths in 2014 | Feminist Initiative (Sweden) |
Turkish | Turkey | Recep Tayyip Erdoğan | Mustafa Kemal Atatürk |
Ukranian | War in Donbass | List of people killed during Euromaidan | NaN |
These are articles that went missing from all that "clean up." Hooray. But at least we have a nice translated version of everything.
pd.Series(top_articles_by_country['en'].index).ix[list(np.array(translated_articles_by_country['English'][translated_articles_by_country['English'].isnull()].index) - 1)]
12     Ebola virus disease
25     Big Brother 16 (U.S.)
32     Nash Grier
41     Ebola virus cases in the United States
75     Republic of Crimea (country)
141    WWE 2K15
143    Siege of Kobanê
148    Gopinath Munde
Name: article, dtype: object
Comparing the extent to which every cell has a valid article that can be translated into English, it looks like the cleanup (bottom) hurt more than it helped compared to the original (top).
f,(ax1,ax2) = plt.subplots(2,1,sharex=True,figsize=(10,5))
# Plot on ax1
_df1 = lang_link_exists_dict['en'].T.astype(float)
_df1 = _df1.ix[_df1.sum(axis=1).sort(inplace=False,ascending=False).index]
_df1.index = [country_codes[i] for i in _df1.index]
_y1,_x1 = _df1.shape
_ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=1)
ax1.set_frame_on(False)
#ax1.set_xticks(np.arange(0,_x1,10),minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_yticklabels(_df1.index,minor=False,fontsize=8)
ax1.set_title('Original',fontsize=18)
# Plot on ax2
_df2 = translated_articles_by_country.T.notnull().astype(float)
_df2 = _df2.ix[_df1.index] # Use the _df1 index
_y2,_x2 = _df2.shape
_ax2 = ax2.pcolor(_df2.values.astype(float),cmap='rainbow',vmin=0,vmax=1)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(0,_x2,10),minor=False)
ax2.set_yticks(np.arange(_y2)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_yticklabels(_df2.index,minor=False,fontsize=8)
ax2.tick_params(axis='x',direction='in',pad=-4)
ax2.set_title('Cleaned',fontsize=18)
f.subplots_adjust(right=0.8)
#cbar_ax = f.add_axes([.95, 0.15, 0.025, .75])
#f.colorbar(_ax, cax=cbar_ax)
f.suptitle('Comparing results, English',fontsize=24)
#f.subplots_adjust(top=0.5)
f.tight_layout(rect=[0,0,1,.9])
top_stories_across_languages = pd.Series(Counter([_val for _array in translated_articles_by_country.values for _val in _array]))
top_stories_across_languages = top_stories_across_languages.ix[1:]
top_stories_across_languages_top5 = top_stories_across_languages[top_stories_across_languages >= 5].sort(inplace=False,ascending=True)
f,ax = plt.subplots(1,1,figsize=(8,10))
_ax = top_stories_across_languages_top5.plot(kind='barh',ax=ax)
ax.axes.set_title('Articles With Widest Coverage\n',fontsize=24)
ax.axes.set_xlabel('Number of Languages',fontsize=18)
f.tight_layout()
f.savefig('widest_coverage.png',dpi=200)
combined_top_articles_df = pd.concat(top_articles_by_country.values(),keys=top_articles_by_country.keys(),axis=0).reset_index()
combined_top_articles_df.rename(columns={'level_0':'lang'},inplace=True)
combined_top_articles_df['article'] = combined_top_articles_df['article'].apply(lambda x:topic_translation_dict.get(x,np.nan))
combined_top_articles_agg_article = combined_top_articles_df.groupby('article').agg({'editors':np.sum,'month':np.average,'lang':len})
combined_top_articles_agg_article['editors per month'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['month']
combined_top_articles_agg_article['editors per lang'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['lang']
combined_top_articles_agg_article['editors-lang-month'] = combined_top_articles_agg_article['editors']*combined_top_articles_agg_article['lang']*combined_top_articles_agg_article['month']
combined_top_articles_agg_article.sort('editors-lang-month',ascending=True,inplace=True)
f,ax = plt.subplots(1,1,figsize=(8,10))
_ax = combined_top_articles_agg_article['editors-lang-month'].ix[-50:].plot(kind='barh',ax=ax)
ax.axes.set_xscale('log')
ax.axes.set_xlabel('Editor-language-month score',fontsize=18)
ax.axes.set_ylabel('')
ax.axes.set_title('Articles with Highest Activity\n',fontsize=24)
f.tight_layout();
f.savefig('highest_activity_ranking.png',dpi=200)
_melted = pd.melt(translated_articles_by_country.reset_index(),id_vars=['index'])
_pivoted = pd.pivot_table(data=_melted,index='value',columns='variable',values='index')
top_by_language_pivoted = _pivoted.ix[top_stories_across_languages_top5.index].fillna(0)
top_by_combined_pivoted = _pivoted.ix[combined_top_articles_agg_article.index[-50:]].fillna(0)
language_cosine = dict()
combined_cosine = dict()
for _lang1 in country_codes.values():
language_cosine[_lang1] = dict()
combined_cosine[_lang1] = dict()
for _lang2 in country_codes.values():
if _lang1 != _lang2:
language_cosine[_lang1][_lang2] = cosine_similarity(top_by_language_pivoted[_lang1],top_by_language_pivoted[_lang2])[0][0]
combined_cosine[_lang1][_lang2] = cosine_similarity(top_by_combined_pivoted[_lang1],top_by_combined_pivoted[_lang2])[0][0]
f,(ax1,ax2) = plt.subplots(1,2,figsize=(9,5),sharey=True)
_df1 = pd.DataFrame(language_cosine)
_order1 = _df1.mean(axis=1).sort(inplace=False,ascending=True).index
_df1 = _df1[_order1].ix[_order1]
_y1,_x1 = _df1.shape
_ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=.75)
ax1.set_title('Language coverage',fontsize=18)
ax1.set_frame_on(False)
ax1.set_xticks(np.arange(_y1)+.5,minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90)
ax1.set_yticklabels(_df1.index,minor=False,fontsize=10)
_df2 = pd.DataFrame(combined_cosine)
_order2 = _df2.mean(axis=1).sort(inplace=False,ascending=True).index
_df2 = _df2[_order1].ix[_order1] # Order the same way as _df1
_y2,_x2 = _df2.shape
_ax2 = ax2.pcolor(_df2,cmap='rainbow',vmin=0,vmax=.75)
ax2.set_title('Editor-language-month score',fontsize=18)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(_y1)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90)
f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([1, 0.15, 0.05, .7])
cb = f.colorbar(_ax2, cax=cbar_ax, label='Cosine similarity')
cb.ax.yaxis.label.set_fontsize(15)
#f.suptitle('Cosine similarity of rankings across languages',fontsize=24)
#f.subplots_adjust(top=0.5)
f.tight_layout()#rect=[0,0,1,.9])
f.savefig('cosine_similarity.png',dpi=200)
_df = pd.DataFrame(data=np.triu(_df2),index=_df2.index,columns=_df2.columns).replace({0:np.nan})
#del _df['id']
_df.reset_index(inplace=True)
coverage = pd.melt(_df,id_vars=['index']).dropna(subset=['value'])
coverage.columns = ['Language 1','Language 2','Cosine Similarity']
#coverage['Language 1'] = coverage['Language 1'].apply(lambda x:country_codes.get(x))
#coverage['Language 2'] = coverage['Language 2'].apply(lambda x:country_codes.get(x))
_highest = coverage.sort('Cosine Similarity',inplace=False,ascending=False).reset_index(drop=True).ix[:9]
_lowest = coverage[coverage['Language 1'] != 'Indonesian'].sort('Cosine Similarity',inplace=False,ascending=True).reset_index(drop=True).ix[:9]
pd.concat([_highest,_lowest],axis=1,keys=['Highest similarities','Lowest similarities'])
Highest similarities | Lowest similarities | |||||
---|---|---|---|---|---|---|
Language 1 | Language 2 | Cosine Similarity | Language 1 | Language 2 | Cosine Similarity | |
0 | Farsi | Dutch | 0.747684 | Ukranian | Swedish | 0.094924 |
1 | French | English | 0.639927 | Japanese | Farsi | 0.115249 |
2 | Czech | Polish | 0.609603 | Japanese | English | 0.136084 |
3 | Polish | English | 0.593232 | Chinese | Czech | 0.160647 |
4 | German | English | 0.592578 | Ukranian | Dutch | 0.169325 |
5 | Farsi | Russian | 0.587311 | Turkish | Arabic | 0.174961 |
6 | Portugese | Farsi | 0.583782 | Japanese | Spanish | 0.175202 |
7 | Chinese | Spanish | 0.580538 | Turkish | Ukranian | 0.177230 |
8 | Farsi | Italian | 0.570912 | Ukranian | Farsi | 0.181022 |
9 | French | Polish | 0.569966 | Turkish | German | 0.186484 |
top_news_articles = [u'2014 FIFA World Cup', u'Malaysia Airlines Flight 370', u'Malaysia Airlines Flight 17',
u'2014 Winter Olympics', u'2014 Crimean crisis', u'Felipe VI of Spain',
u'Islamic State of Iraq and the Levant',u'Ebola virus epidemic in West Africa',u'Eurovision Song Contest 2014',
u'Ice Bucket Challenge', u'2014 Israel\u2013Gaza conflict', u'Minecraft',
u'Scottish independence referendum, 2014',u'2014 Hong Kong protests', u'United States elections, 2014',
u'Soma mine disaster', u'Indian general election, 2014', u'Gamergate controversy',
u'2014 Ferguson unrest',u'Rosetta spacecraft', u'Cuba\u2013United States relations',
u'Chibok schoolgirl kidnapping', u'Sinking of the MV Sewol']
Define an empty revision_dict to hold the data, loop through the list of articles and use get_page_revisions for each of these articles between 2001-01-01 and 2015-01-01 (the _start and _end defined above) in English, and save this data into revision_dict. We'll also write each article's revision data to disk as a CSV file.
revision_dict = dict()
for article in top_news_articles:
print article
revision_dict[article] = ws.get_page_revisions(article,_start,_end,'en')
revision_dict[article].to_csv(_filedir + u'Data/{0}.csv'.format(article),encoding='utf8')
2014 FIFA World Cup
Malaysia Airlines Flight 370
Malaysia Airlines Flight 17
2014 Winter Olympics
2014 Crimean crisis
Felipe VI of Spain
Islamic State of Iraq and the Levant
Ebola virus epidemic in West Africa
Eurovision Song Contest 2014
Ice Bucket Challenge
2014 Israel–Gaza conflict
Minecraft
Scottish independence referendum, 2014
2014 Hong Kong protests
United States elections, 2014
Soma mine disaster
Indian general election, 2014
Gamergate controversy
2014 Ferguson unrest
Rosetta spacecraft
Cuba–United States relations
Chibok schoolgirl kidnapping
Sinking of the MV Sewol
Compute cumulative Gini coefficients for each revision of an article. This captures how centralized the distribution of revisions per editor has become over time.
# http://planspace.org/2013/06/21/how-to-calculate-gini-coefficient-from-raw-data-in-python/
def gini(list_of_values):
if len(list_of_values) > 1:
sorted_list = sorted(list_of_values)
height, area = 0, 0
for value in sorted_list:
height += value
area += height - value / 2.
fair_area = height * len(list_of_values) / 2
gini_value = (fair_area - area) / fair_area
else:
gini_value = np.nan
return gini_value
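A quick sanity check on the function above (these inputs are illustrative, not from the data): evenly distributed edit counts should score 0 and highly concentrated ones should approach 1.
# Sanity check for gini(): even contributions -> 0, concentrated -> close to 1
print gini([5, 5, 5, 5])    # 0.0: four editors with equal edit counts
print gini([1, 1, 1, 97])   # 0.72: one editor makes nearly all the edits
print gini([10])            # nan: undefined for a single editor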
for _df in revision_dict.values():
_df['gini'] = [gini(Counter(_df.ix[:i,'user']).values()) for i in iter(_df.index)]
Do some more data cleanup and write out two large CSV files corresponding to all the data (revisions.csv) and revisions during 2014 (revisions_2014.csv).
rev_df = pd.concat(revision_dict.values(),keys=revision_dict.keys(),axis=0)
rev_df.reset_index(inplace=True,level=0)
rev_df.rename(columns={'level_0':'title'},inplace=True)
rev_df.reset_index(inplace=True,drop=True)
rev_df['anon'] = rev_df['anon'].notnull()
rev_df['userhidden'] = rev_df['userhidden'].notnull()
rev_df['commenthidden'] = rev_df['commenthidden'].notnull()
rev_df.to_csv('revisions.csv',encoding='utf8')
revs2014_df = rev_df[rev_df['timestamp'] >= pd.datetime(2014,1,1,0,0,0)]
revs2014_df.reset_index(drop=True,inplace=True)
revs2014_df.to_csv('revisions_2014.csv',encoding='utf8')
Read the revision histories back in from revisions.csv and revisions_2014.csv as pandas DataFrames and look at everything that's happened already!
rev_df = pd.read_csv('revisions.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp'])
revs2014_df = pd.read_csv('revisions_2014.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp'])
revs2014_df.tail()
title | anon | comment | commenthidden | date | diff | gini | latency | parentid | revid | revision | size | timestamp | unique_users | user | userhidden | userid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7168 | 2014 FIFA World Cup | True | NaN | False | 2014-12-01 | 9 | 0.606035 | 5137 | 636094531 | 636105483 | 7168 | 112203 | 2014-12-01 01:24:41 | 2307 | Moka Mo | False | 22227837 |
7169 | 2014 FIFA World Cup | True | /* Qualification */ | False | 2014-12-06 | -1 | 0.606039 | 461669 | 636105483 | 636867858 | 7169 | 112202 | 2014-12-06 09:39:10 | 2307 | Edgars2007 | False | 8973808 |
7170 | 2014 FIFA World Cup | True | /* External links */ | False | 2014-12-09 | 70 | 0.606093 | 300989 | 636867858 | 637376139 | 7170 | 112272 | 2014-12-09 21:15:39 | 2307 | Chanheigeorge | False | 376297 |
7171 | 2014 FIFA World Cup | True | /* Group stage */ Rearrange | False | 2014-12-12 | 24437 | 0.606147 | 232125 | 637376139 | 637769495 | 7171 | 136709 | 2014-12-12 13:44:24 | 2307 | Soerfm | False | 15268407 |
7172 | 2014 FIFA World Cup | True | /* Match summary */ link | False | 2014-12-12 | 29 | 0.606202 | 1330 | 637769495 | 637771163 | 7172 | 136738 | 2014-12-12 14:06:34 | 2307 | Soerfm | False | 15268407 |
_agg_function = {'revision':np.max,'unique_users':np.max}
revs2014_agg_article = revs2014_df.groupby('title').agg(_agg_function)
f,ax = plt.subplots(1,1,figsize=(8,10))
revs2014_agg_article.sort('revision',inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_ylabel('')
ax.set_xlabel('Count',fontsize=12)
f.tight_layout()
f.savefig('en_19_activity.png',dpi=200)
Compute daily activity numbers for each article by grouping first on article title, then on the date. We use an aggregate function to return the number of unique users, revisions, size of change, and latency between edits for each day. Perform some other data manipulation and cleanup and store the results for 2014-01-01 onward in activity_2014. This dataframe will be used as the data source for subsequent timeseries visualizations of content production.
daily_activity = revs2014_df.groupby(['title','date']).aggregate({'unique_users':max,
'revid':len,
'diff':np.sum,
'latency':np.mean,
'size':np.mean,
'gini':np.mean})
daily_activity = daily_activity.unstack(level=0)
daily_activity.index = pd.to_datetime(daily_activity.index)
daily_activity['unique_users'] = daily_activity['unique_users'].fillna(method='ffill').fillna(0)
daily_activity['revid'] = daily_activity['revid'].fillna(method='ffill').fillna(0)
daily_activity['gini'] = daily_activity['gini'].fillna(method='ffill').fillna(0)
#daily_activity['link_count'] = daily_activity['link_count'].fillna(method='ffill').fillna(0)
daily_activity['size'] = daily_activity['size'].fillna(method='ffill').fillna(0)
daily_activity['diff'] = daily_activity['diff'].fillna(0)
daily_activity['latency'] = daily_activity['latency'].fillna(0)
#daily_activity = daily_activity.fillna(method='ffill').fillna(0)
activity_2014 = daily_activity.ix['2014-1-1':]
activity_2014.tail()
latency | ... | size | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | 2014 Crimean crisis | 2014 FIFA World Cup | 2014 Ferguson unrest | 2014 Hong Kong protests | 2014 Israel–Gaza conflict | 2014 Winter Olympics | Chibok schoolgirl kidnapping | Cuba–United States relations | Ebola virus epidemic in West Africa | Eurovision Song Contest 2014 | ... | Indian general election, 2014 | Islamic State of Iraq and the Levant | Malaysia Airlines Flight 17 | Malaysia Airlines Flight 370 | Minecraft | Rosetta spacecraft | Scottish independence referendum, 2014 | Sinking of the MV Sewol | Soma mine disaster | United States elections, 2014 |
date | |||||||||||||||||||||
2014-12-19 | 30306.4 | 0 | 44294 | 8175.800000 | 0.000000 | 0 | 0 | 7911.545455 | 2413.060606 | 0.0 | ... | 189399 | 230636.621622 | 140466.0 | 220862.500000 | 111471.75 | 79146.000000 | 231326 | 150991 | 29070 | 29644.666667 |
2014-12-20 | 0.0 | 0 | 0 | 16080.666667 | 0.000000 | 1211054 | 0 | 14762.800000 | 7225.071429 | 107732.5 | ... | 189399 | 231294.473684 | 140850.8 | 220783.666667 | 111471.75 | 79146.000000 | 231326 | 150991 | 29070 | 29644.666667 |
2014-12-21 | 0.0 | 0 | 55853 | 0.000000 | 126267.500000 | 0 | 0 | 8865.300000 | 6827.363636 | 0.0 | ... | 189399 | 232124.173913 | 140850.8 | 224719.333333 | 111471.75 | 79146.000000 | 231303 | 150992 | 29070 | 29644.666667 |
2014-12-22 | 65912.5 | 0 | 0 | 13750.076923 | 0.000000 | 0 | 0 | 0.000000 | 9229.800000 | 0.0 | ... | 189399 | 232567.500000 | 142643.0 | 224719.333333 | 111471.75 | 80400.714286 | 231303 | 150992 | 29070 | 29644.666667 |
2014-12-23 | 87836.0 | 0 | 0 | 0.000000 | 21207.666667 | 0 | 0 | 36031.800000 | 16523.200000 | 0.0 | ... | 189959 | 237956.000000 | 143447.0 | 225217.666667 | 111471.75 | 81914.000000 | 231303 | 150980 | 29070 | 29644.666667 |
5 rows × 138 columns
The first thing to plot is the number of unique users over time. Some articles like the World Cup or Winter Olympics had articles before the event actually occurred while other articles are about un-anticipated events like the Crimean crisis or Israel-Gaza conflict. To make these more comparable, the number of unique users across articles is normalized to 0 on January 1. The result is we see which articles add the most new users over the course of the year.
MA 370 has the most unique users, with 2095 new unique users in 2014, followed by the rapidly rising "Ebola virus epidemic" article with 1110 new unique users and the long-simmering "2014 Crimean crisis".
normalized_unique_users = activity_2014['unique_users'] - activity_2014.ix['2014-1-1','unique_users']
ax = normalized_unique_users.plot(colormap='spectral')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('New unique users since Jan. 1',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
users_rank_s = normalized_unique_users.ix['2014-12-21'].order(ascending=False)
users_rank_s
title
2014 FIFA World Cup                       2307
Malaysia Airlines Flight 370              2145
Ebola virus epidemic in West Africa       1374
Minecraft                                 1353
Islamic State of Iraq and the Levant      1127
2014 Crimean crisis                        933
2014 Israel–Gaza conflict                  857
Malaysia Airlines Flight 17                796
Felipe VI of Spain                         750
Cuba–United States relations               573
Scottish independence referendum, 2014     550
Ice Bucket Challenge                       539
Indian general election, 2014              482
Sinking of the MV Sewol                    420
2014 Hong Kong protests                    417
2014 Ferguson unrest                       380
2014 Winter Olympics                       337
Eurovision Song Contest 2014               277
Gamergate controversy                      233
Rosetta spacecraft                         218
Chibok schoolgirl kidnapping               202
Soma mine disaster                         159
United States elections, 2014              156
Name: 2014-12-21 00:00:00, dtype: float64
Plot the number of revisions made per day. There are major peaks for acute events like the Crimean crisis and MA370 disappearance in March, the death of Robin Williams in August, and the Scottish vote in September.
Looking at the cumulative number of revisions made during the year-to-date, MA370 is still the big story with over 10k changes made, followed by the Israel-Gaza conflict, Ebola, and ISIL articles.
f,ax = plt.subplots(1,1,figsize=(10,6))
_ax = activity_2014['revid'].plot(colormap='spectral',lw=3,ax=ax)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Revisions',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5],ncol=1)
ax.set_title('Revisions over time',fontsize=18)
f.tight_layout()
f.savefig('revisions.png',dpi=200,bbox_inches='tight')
revisions_rank_s = (activity_2014['revid'].cumsum().ix['2014-12-21'] - activity_2014['revid'].cumsum().ix['2014-1-1']).order(ascending=False)
revisions_rank_s
title
Malaysia Airlines Flight 370              10336
Ebola virus epidemic in West Africa        7818
Islamic State of Iraq and the Levant       7545
2014 Israel–Gaza conflict                  6550
Malaysia Airlines Flight 17                5202
2014 Crimean crisis                        4271
2014 Hong Kong protests                    3852
Gamergate controversy                      3304
2014 FIFA World Cup                        3107
Indian general election, 2014              2923
Scottish independence referendum, 2014     2374
Eurovision Song Contest 2014               1963
2014 Winter Olympics                       1793
Sinking of the MV Sewol                    1692
Ice Bucket Challenge                       1647
2014 Ferguson unrest                       1483
Felipe VI of Spain                         1231
Rosetta spacecraft                         1152
Minecraft                                   900
Chibok schoolgirl kidnapping                865
United States elections, 2014               840
Soma mine disaster                          634
Cuba–United States relations                579
dtype: float64
Information about the number of contributing editors and the number of revisions obscures how this work is distributed. Using the Gini coefficient to measure the dispersion in the data, we can capture how inequality of contributions to articles has changed over time. For example, the work might be evenly distributed with all editors making equal contributions (Gini = 0) or the work might be highly concentrated with one editor making almost all of the contributions (Gini = 1).
The plot shows a general trend towards increasingly concentrated editing activity across many articles. The articles about the Hong Kong protests and Israel-Gaza conflict show very high levels of centralized editing activity while the articles about the Olympics and Robin Williams show more evenly-distributed activity.
ax = activity_2014['gini'].plot(colormap='spectral',lw=3)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Gini coefficient',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
gini_rank_s = activity_2014['gini'].ix['2014-12-21'].order(ascending=False)
gini_rank_s
title
2014 Hong Kong protests                   0.833029
Gamergate controversy                     0.819048
Islamic State of Iraq and the Levant      0.800438
2014 Israel–Gaza conflict                 0.786471
Ebola virus epidemic in West Africa       0.767065
Malaysia Airlines Flight 370              0.723217
Indian general election, 2014             0.715372
Malaysia Airlines Flight 17               0.713623
Eurovision Song Contest 2014              0.696886
2014 Crimean crisis                       0.689587
Scottish independence referendum, 2014    0.653276
Sinking of the MV Sewol                   0.629018
2014 Ferguson unrest                      0.621660
2014 FIFA World Cup                       0.606175
Minecraft                                 0.594117
2014 Winter Olympics                      0.575925
Ice Bucket Challenge                      0.552910
Soma mine disaster                        0.552441
Rosetta spacecraft                        0.496439
Chibok schoolgirl kidnapping              0.488796
Felipe VI of Spain                        0.475323
Cuba–United States relations              0.438778
United States elections, 2014             0.437851
Name: 2014-12-21 00:00:00, dtype: float64
Plotting the size of the article's markup (in kilobytes), there is some variability in the size of articles over time. Some sharp valleys in articles like MA370 suggest large sections being removed and quickly restored while other articles like Heartbleed and Robin Williams have more sudden growth followed by stabilization. MA370 and the Scottish vote make up the two largest articles in the corpus currently.
ax = (activity_2014['size']/1000.).plot(colormap='spectral',lw=3)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Article size (kB)',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
size_rank_s = (activity_2014['size']/1000.).ix['2014-12-21'].order(ascending=False)
size_rank_s
title
2014 Israel–Gaza conflict                 247.040500
Islamic State of Iraq and the Levant      232.124174
Scottish independence referendum, 2014    231.303000
Malaysia Airlines Flight 370              224.719333
Ebola virus epidemic in West Africa       213.271364
2014 Crimean crisis                       204.954000
Indian general election, 2014             189.399000
2014 Hong Kong protests                   179.214333
2014 Ferguson unrest                      170.762000
Eurovision Song Contest 2014              163.456000
Sinking of the MV Sewol                   150.992000
Malaysia Airlines Flight 17               140.850800
2014 FIFA World Cup                       136.723500
Minecraft                                 111.471750
Gamergate controversy                     106.945526
2014 Winter Olympics                       96.313000
Rosetta spacecraft                         79.146000
Cuba–United States relations               58.256900
Ice Bucket Challenge                       50.676000
Chibok schoolgirl kidnapping               45.199000
Felipe VI of Spain                         34.538000
United States elections, 2014              29.644667
Soma mine disaster                         29.070000
Name: 2014-12-21 00:00:00, dtype: float64
f,(ax1,ax2,ax3,ax4) = plt.subplots(4,1,figsize=(10,10),sharex=True)
_ax1 = activity_2014['unique_users'].ix['1-7-2014':].diff().plot(colormap='spectral',lw=2,ax=ax1,legend=None)
ax1.set_xlabel('')
ax1.set_ylabel('Users')
ax1.set_title('New users',fontsize=15)
_ax2 = activity_2014['revid'].plot(colormap='spectral',lw=2,ax=ax2,legend=None)
ax2.set_xlabel('')
ax2.set_ylabel('Revisions')
ax2.set_title('Revisions made',fontsize=15)
_ax3 = activity_2014['gini'].diff().plot(colormap='spectral',lw=2,ax=ax3,legend=None)
ax3.set_xlabel('')
ax3.set_ylabel('Gini delta')
ax3.set_title('Change in centralization',fontsize=15)
_ax4 = (activity_2014['diff']/1000.).diff().plot(colormap='spectral',lw=2,ax=ax4)
ax4.set_xlabel('')
ax4.set_ylabel('Kilobytes (kB) delta')
ax4.set_title('Change in article size',fontsize=15)
ax4.set_ylim((-100,100))
#ax4.set_yscale('symlog')
_colors = dict(zip(sorted(revs2014_df['title'].unique()),sns.color_palette('spectral', len(revs2014_df['title'].unique()))))
handles, labels = _ax4.get_legend_handles_labels()
ax4.legend_.remove()
new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(revs2014_df['title'].unique())]
f.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=15)
f.tight_layout()
f.savefig('article_changes.png',dpi=200,bbox_inches='tight')
_table = pd.concat([users_rank_s.round(2),revisions_rank_s.round(2),gini_rank_s.round(2),size_rank_s.round(2)],
axis=1,keys=['Users','Revisions','Gini','Length'])
_table[['Revisions','Users','Gini','Length']].sort(['Revisions','Users','Gini','Length'],ascending=False)
Revisions | Users | Gini | Length | |
---|---|---|---|---|
Malaysia Airlines Flight 370 | 10336 | 2145 | 0.72 | 224.72 |
Ebola virus epidemic in West Africa | 7818 | 1374 | 0.77 | 213.27 |
Islamic State of Iraq and the Levant | 7545 | 1127 | 0.80 | 232.12 |
2014 Israel–Gaza conflict | 6550 | 857 | 0.79 | 247.04 |
Malaysia Airlines Flight 17 | 5202 | 796 | 0.71 | 140.85 |
2014 Crimean crisis | 4271 | 933 | 0.69 | 204.95 |
2014 Hong Kong protests | 3852 | 417 | 0.83 | 179.21 |
Gamergate controversy | 3304 | 233 | 0.82 | 106.95 |
2014 FIFA World Cup | 3107 | 2307 | 0.61 | 136.72 |
Indian general election, 2014 | 2923 | 482 | 0.72 | 189.40 |
Scottish independence referendum, 2014 | 2374 | 550 | 0.65 | 231.30 |
Eurovision Song Contest 2014 | 1963 | 277 | 0.70 | 163.46 |
2014 Winter Olympics | 1793 | 337 | 0.58 | 96.31 |
Sinking of the MV Sewol | 1692 | 420 | 0.63 | 150.99 |
Ice Bucket Challenge | 1647 | 539 | 0.55 | 50.68 |
2014 Ferguson unrest | 1483 | 380 | 0.62 | 170.76 |
Felipe VI of Spain | 1231 | 750 | 0.48 | 34.54 |
Rosetta spacecraft | 1152 | 218 | 0.50 | 79.15 |
Minecraft | 900 | 1353 | 0.59 | 111.47 |
Chibok schoolgirl kidnapping | 865 | 202 | 0.49 | 45.20 |
United States elections, 2014 | 840 | 156 | 0.44 | 29.64 |
Soma mine disaster | 634 | 159 | 0.55 | 29.07 |
Cuba–United States relations | 579 | 573 | 0.44 | 58.26 |
Wikipedia articles link to other Wikipedia articles to provide further details. The number of links in an article thus provides a coarse measure of how many other topics the article is related to. The ISIL, Crimean, Scottish, and Olympics articles have over 400 unique links while other articles like Heartbleed and the Hong Kong protests have fewer than 200.
ax = activity_2014['link_count'].plot(colormap='gist_rainbow')
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Links in article',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
activity_2014['link_count'].ix['2014-10-16'].order(ascending=False)
title
Islamic State of Iraq and the Levant      476.000000
2014 Crimean crisis                       448.000000
Scottish independence referendum, 2014    426.500000
2014 Winter Olympics                      410.000000
2014 FIFA World Cup                       382.000000
Robin Williams                            288.000000
Malaysia Airlines Flight 370              279.000000
2014 Israel–Gaza conflict                 238.000000
Ebola virus epidemic in West Africa       192.819149
Heartbleed                                151.000000
2014 Hong Kong protests                   149.630769
Name: 2014-10-16 00:00:00, dtype: float64
The amount of content (number of bytes) and the number of links in an article can be combined to form a ratio of links per byte (LPB). Articles with a high LPB suggest the article tends to send people away to other articles for information. Articles with a low LPB suggest there are few other articles that contain more relevant information than the current article. More imprecisely, articles with high LPB should be lower quality articles with less substantive content and articles with lower LPB should be higher quality articles with more thorough discussion.
There's a tendency for breaking news articles to get fewer LPB over time, suggesting an increase in quality. The MA370 and Ebola articles have the lowest LPB as they have detailed discussions of the events, context, and people while the articles for the World Cup and Olympics have the highest LPB as they mostly link to other sub-pages about teams and events.
links_per_byte = (activity_2014['link_count']/activity_2014['size'])
ax = links_per_byte.plot(colormap='gist_rainbow')
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Links per byte',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
links_per_byte.ix['2014-10-16'].order(ascending=True)
title
Malaysia Airlines Flight 370              0.001008
Ebola virus epidemic in West Africa       0.001084
2014 Israel–Gaza conflict                 0.001165
2014 Hong Kong protests                   0.001251
Scottish independence referendum, 2014    0.001871
Heartbleed                                0.002020
2014 Crimean crisis                       0.002238
Islamic State of Iraq and the Levant      0.002445
Robin Williams                            0.003192
2014 FIFA World Cup                       0.003409
2014 Winter Olympics                      0.003612
Name: 2014-10-16 00:00:00, dtype: float64
Next measure how long a revision to an article "lives" before another revision is made. Breaking news articles where lots of editors are trying to make changes in response to new information may mean that an edit only lives for a few seconds or minutes before being changed. As the article stabilizes, the latency between edits should increase, reflecting that changes become less frequent. Articles for ISIL, Robin Williams, the World Cup, and the Scottish referendum were written in advance of the events themselves, but still had average latencies of only a few hours at the start of the year. New articles about breaking news events start with very short latencies between edits that lengthen over time.
#ax = daily_activity.ix['2014-1-1':,'latency'].plot(colormap='gist_rainbow')
ax = pd.rolling_mean(daily_activity.ix['2013-11-1':,'latency'],28).ix['2014-1-1':].plot(colormap='spectral')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Edit latency (seconds)',fontsize=15)
ax.set_yscale('symlog')
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
daily_activity.ix['2014-1-1':,'latency'].mean().order(ascending=True)
title
2014 Hong Kong protests                    1118.243382
Gamergate controversy                      5386.759761
2014 Ferguson unrest                       7044.298083
Malaysia Airlines Flight 17                8742.343510
Ebola virus epidemic in West Africa       14984.461360
Ice Bucket Challenge                      15739.002636
Malaysia Airlines Flight 370              19438.206178
Islamic State of Iraq and the Levant      21140.547770
Scottish independence referendum, 2014    35588.366615
2014 Crimean crisis                       36745.956090
2014 FIFA World Cup                       37520.299262
Indian general election, 2014             39796.101925
Rosetta (spacecraft)                      41674.952889
2014 Winter Olympics                      44095.782572
Soma mine disaster                        44172.268543
Felipe VI of Spain                        51472.925482
Malala Yousafzai                          51515.912970
Minecraft                                 52667.175757
Conchita Wurst                            53032.325408
Cuba–United States relations              62546.472660
dtype: float64
In addition to the information production statistics analyzed above, the number of article pageviews captures a measure of information consumption. The make_pageview_df function takes a list of article titles and returns a DataFrame indexed by day, with columns corresponding to articles and values giving the number of pageviews for that article on that day. If you've already done the step above, you can read in the CSV file pageviews.csv.
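The make_pageview_df function itself isn't included in this notebook; a rough sketch of what it might have looked like is below, assuming the old stats.grok.se JSON endpoint that was available at the time. The URL format and the 'daily_views' field are assumptions, not the notebook's actual implementation.
import urllib
def make_pageview_df(article_titles, lang='en', year=2014):
    # Hypothetical: fetch daily pageviews per article and month from stats.grok.se
    months = ['{0}{1:02d}'.format(year, m) for m in range(1, 13)]
    series = {}
    for title in article_titles:
        daily = {}
        for month in months:
            url = 'http://stats.grok.se/json/{0}/{1}/{2}'.format(lang, month, urllib.quote(title.encode('utf8')))
            data = json.loads(urllib2.urlopen(url).read())
            daily.update(data.get('daily_views', {}))
        series[title] = pd.Series(daily)
    # One column per article, indexed by day
    pv = pd.DataFrame(series)
    pv.index = pd.to_datetime(pv.index)
    return pv.sort_index()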
pv_df = pd.read_csv('pageviews_Dec.csv',encoding='utf8',index_col=0,parse_dates=[0])
del pv_df['Heartbleed']
pv_2014 = pv_df.ix['1-1-2014':]
pv_df.tail()
2014 Crimean crisis | 2014 FIFA World Cup | 2014 Ferguson unrest | 2014 Hong Kong protests | 2014 Israel–Gaza conflict | 2014 Winter Olympics | Chibok schoolgirl kidnapping | Cuba–United States relations | Ebola virus epidemic in West Africa | Eurovision Song Contest 2014 | ... | Indian general election, 2014 | Islamic State of Iraq and the Levant | Malaysia Airlines Flight 17 | Malaysia Airlines Flight 370 | Minecraft | Rosetta spacecraft | Scottish independence referendum, 2014 | Sinking of the MV Sewol | Soma mine disaster | United States elections, 2014 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2014-12-17 | 2663 | 5779 | 4979 | 2294 | 1573 | 2964 | 736 | 13112 | 9673 | 747 | ... | 1691 | 26288 | 3131 | 9257 | 7495 | 2506 | 1557 | 714 | 124 | 940 |
2014-12-18 | 2390 | 5552 | 19184 | 2038 | 1489 | 2237 | 830 | 28682 | 10765 | 682 | ... | 1852 | 29494 | 2850 | 7919 | 8742 | 1835 | 1257 | 601 | 115 | 911 |
2014-12-19 | 2044 | 4953 | 2418 | 1463 | 1248 | 1946 | 683 | 13209 | 7873 | 766 | ... | 1690 | 20771 | 2502 | 6784 | 21895 | 1645 | 978 | 611 | 102 | 826 |
2014-12-20 | 1666 | 5039 | 1944 | 1005 | 981 | 1517 | 381 | 7491 | 6175 | 932 | ... | 1537 | 15989 | 1959 | 4823 | 12914 | 1086 | 827 | 485 | 66 | 482 |
2014-12-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 23 columns
Pageviews exhibit a strong weekly cycle across articles: less content is consumed on weekends than weekdays. But there are "bursts" in pageview attention to articles in the immediate aftermath of an event. The death of Robin Williams had the highest peak of pageview activity, but the World Cup and Heartbleed also showed high levels of attention. Much of this attention falls exponentially back to a few thousand pageviews about 2 months after the precipitating event.
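As a quick check on the weekly cycle described above (not part of the original analysis), average the total daily pageviews by day of week using the pv_2014 frame loaded earlier.
# Mean total daily pageviews by weekday (0 = Monday ... 6 = Sunday)
total_views = pv_2014.sum(axis=1)
print total_views.groupby(lambda d: d.dayofweek).mean()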
f,ax = plt.subplots(1,1,figsize=(10,5))
_ax = pv_df.ix['2014-1-1':].plot(colormap='spectral',lw=3,ax=ax)
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Pageviews',fontsize=15)
ax.legend(loc='center right',bbox_to_anchor=[1.35,.5],fontsize=9,ncol=1)
f.tight_layout()
f.savefig('pageviews.png',dpi=200,bbox_inches='tight')
pv_df.ix['2014-1-1':].sum().sort(ascending=False,inplace=False)
2014 FIFA World Cup                       15114251
Islamic State of Iraq and the Levant       7958521
Malaysia Airlines Flight 370               7104838
Ice Bucket Challenge                       5393702
2014 Winter Olympics                       4994186
Minecraft                                  3536207
Indian general election, 2014              3308296
Scottish independence referendum, 2014     2453512
Ebola virus epidemic in West Africa        2229304
Malaysia Airlines Flight 17                2173782
Eurovision Song Contest 2014               2045654
2014 Crimean crisis                        1955729
Gamergate controversy                      1255098
2014 Israel–Gaza conflict                   920625
Rosetta spacecraft                          865312
2014 Ferguson unrest                        787856
United States elections, 2014               672488
2014 Hong Kong protests                     496953
Sinking of the MV Sewol                     436051
Felipe VI of Spain                          408235
Cuba–United States relations                190825
Soma mine disaster                          139668
Chibok schoolgirl kidnapping                108700
dtype: float64
ax = (pv_df.ix['2014-1-1':]/pv_df.max(axis=0)).plot(colormap='spectral',lw=3)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Pageviews',fontsize=15)
#ax.set_ylim((10**2,10**7))
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
f,ax = plt.subplots(1,1,figsize=(8,8))
pv_df.sum(axis=0).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_title('Cumulative pageviews',fontsize=18)
ax.set_xlabel('Total page views',fontsize=15)
ax.set_xscale('symlog')
f.tight_layout()
f.savefig('cumulative_pageviews.png',dpi=200)
Now use the idxmax function (based on this StackOverflow question) to identify the index corresponding to the dates for each article's pageview peak.
pv_melted = pd.melt(pv_df.reset_index(),id_vars=['index'])
pv_gb_page = pv_melted.groupby('variable')
_idx = pv_melted.groupby('variable')['value'].agg(lambda col: col.idxmax())
pv_max = pv_melted.ix[_idx]
pv_max.columns = ['date','article','pageviews']
pv_max = pv_max.set_index('article')
pv_max
date | pageviews | |
---|---|---|
article | ||
2014 Crimean crisis | 2014-03-03 | 114744 |
2014 FIFA World Cup | 2014-06-12 | 494388 |
2014 Ferguson unrest | 2014-11-25 | 133152 |
2014 Hong Kong protests | 2014-10-02 | 41168 |
2014 Israel–Gaza conflict | 2014-07-31 | 35057 |
2014 Winter Olympics | 2014-02-07 | 290268 |
Chibok schoolgirl kidnapping | 2014-09-03 | 1680 |
Cuba–United States relations | 2014-12-18 | 28682 |
Ebola virus epidemic in West Africa | 2014-10-15 | 70894 |
Eurovision Song Contest 2014 | 2014-05-11 | 240290 |
Felipe VI of Spain | 2014-06-21 | 35789 |
Gamergate controversy | 2014-10-23 | 89858 |
Ice Bucket Challenge | 2014-08-21 | 559582 |
Indian general election, 2014 | 2014-05-16 | 287240 |
Islamic State of Iraq and the Levant | 2014-09-03 | 297580 |
Malaysia Airlines Flight 17 | 2014-07-18 | 425012 |
Malaysia Airlines Flight 370 | 2014-03-10 | 288597 |
Minecraft | 2014-09-16 | 46521 |
Rosetta spacecraft | 2014-11-12 | 119264 |
Scottish independence referendum, 2014 | 2014-09-19 | 289786 |
Sinking of the MV Sewol | 2014-04-27 | 17393 |
Soma mine disaster | 2014-05-14 | 32152 |
United States elections, 2014 | 2014-11-05 | 63299 |
pv_gb_page.groups.keys()
f,ax = plt.subplots(1,1,figsize=(8,8))
pv_df.max().sort(inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_title('Most Pageviews in a Day',fontsize=18)
ax.set_xlabel('Pageviews',fontsize=15)
ax.set_xscale('symlog')
f.tight_layout()
f.savefig('max_pageviews.png',dpi=200)
f,ax = plt.subplots(1,1,figsize=(10,5))
_cmap = 'spectral'
_topics = ['Malaysia Airlines Flight 17','2014 FIFA World Cup','Islamic State of Iraq and the Levant','Minecraft']
_data = pv_df[_topics].ix['1-1-2014':]
#_ax = _data.plot(lw=3,ax=ax,cmap=_cmap)
#ax.set_yscale('symlog')
#ax.set_ylim((1e2,1e7))
ax.legend(fontsize=12,loc='upper left')
_colors = dict(zip(pv_df.columns,sns.color_palette(_cmap, len(pv_df.columns))))
for d in _topics:
    ax.plot(pv_df.ix['1-1-2014':,d].index,pv_df.ix['1-1-2014':,d].values,c=_colors[d],lw=2,label=d)
    #ax.fill_between(_data[d].index, _data[d].values, _data['Minecraft'].values, facecolor=_colors[d], alpha=0.33)
ax.set_xticklabels(['Jan 2014','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax.legend(loc='center right',bbox_to_anchor=[1.4,.5],fontsize=12)
f.tight_layout()
f.savefig('pageviews_shapes.png',dpi=200,bbox_inches='tight')
f,ax = plt.subplots(1,1,figsize=(8,8))
(pv_df.max()/pv_df.sum()).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_title('Fraction of Total Pageviews in Peak',fontsize=18)
ax.set_xlabel('Fraction of Pageviews',fontsize=15)
#ax.set_xscale('log')
f.tight_layout()
f.savefig('peak_fraction.png',dpi=200)
The ratio of information production and consumption reveals whether a peer-produced encyclopedia like Wikipedia is generating enough content relative to the demand for information. More specifically, measuring the ratio of pageviews (information demand) to revisions (information production) reveals during what times and on what topics these are matched or off-balance. Call this the "information conduced" ratio: for example, an article receiving 30,000 pageviews and 9 revisions in a day has a ratio of 30,000/(9+1) = 3,000 readers per change made (the +1 in the denominator below guards against days with no revisions).
The recent "Heartbleed 2.0" bug produced a major spike in attention without a corresponding spike of similar magnitude in editing behavior, creating the highest conduced ratio, but other major media events like the Olympics and World Cup also have high ratios. This suggests that Wikipedians were not generating changes to these articles to reflect the demand for information. In contrast, other breaking news topics like MA370, the Ebola outbreak, and others were more evenly matched between information production and consumption.
pv_2014.columns
Index([u'2014 Crimean crisis', u'2014 FIFA World Cup', u'2014 Ferguson unrest', u'2014 Hong Kong protests', u'2014 Israel–Gaza conflict', u'2014 Winter Olympics', u'Chibok schoolgirl kidnapping', u'Cuba–United States relations', u'Ebola virus epidemic in West Africa', u'Eurovision Song Contest 2014', u'Felipe VI of Spain', u'Gamergate controversy', u'Ice Bucket Challenge', u'Indian general election, 2014', u'Islamic State of Iraq and the Levant', u'Malaysia Airlines Flight 17', u'Malaysia Airlines Flight 370', u'Minecraft', u'Rosetta spacecraft', u'Scottish independence referendum, 2014', u'Sinking of the MV Sewol', u'Soma mine disaster', u'United States elections, 2014'], dtype='object')
information_conduced_df = pv_2014/(activity_2014['revid']+1)
ax = pd.rolling_mean(information_conduced_df,7).ix['2014-1-1':].plot(colormap='spectral')
ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Consumption/Production ratio',fontsize=15)
#ax.set_ylim((0,20000))
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
<matplotlib.legend.Legend at 0x5d9feda0>
_s = pd.melt(information_conduced_df.reset_index(),id_vars='index').replace({np.inf:np.nan,-np.inf:np.nan}).dropna()
_s.columns = ['date','article','ratio']
_top = _s.sort('ratio',inplace=False,ascending=False).reset_index(drop=True).ix[:10]
# Exclude uninteresting edge cases. Sorry Your Excellence, you're a boring fellow.
_bottom = _s[(_s['ratio'] > 0) & (_s['article'] != 'Felipe VI of Spain')]
_bottom = _bottom.sort('ratio',inplace=False,ascending=True).reset_index(drop=True).ix[:10]
pd.concat([_top,_bottom],keys=['Greater consumption per production','Lesser consumption per production'],axis=1)
Greater consumption per production | Lesser consumption per production | |||||
---|---|---|---|---|---|---|
date | article | ratio | date | article | ratio | |
0 | 2014-10-23 | Gamergate controversy | 44929.000000 | 2014-08-02 | Ice Bucket Challenge | 1.000000 |
1 | 2014-10-24 | Gamergate controversy | 32957.000000 | 2014-05-13 | Soma mine disaster | 1.840909 |
2 | 2014-02-14 | 2014 Winter Olympics | 31753.000000 | 2014-08-12 | Ice Bucket Challenge | 2.000000 |
3 | 2014-08-31 | Ice Bucket Challenge | 30047.333333 | 2014-08-11 | Ice Bucket Challenge | 2.000000 |
4 | 2014-08-27 | Ice Bucket Challenge | 29725.666667 | 2014-08-10 | Ice Bucket Challenge | 2.000000 |
5 | 2014-10-15 | Gamergate controversy | 28503.500000 | 2014-01-05 | Scottish independence referendum, 2014 | 17.333333 |
6 | 2014-06-11 | 2014 FIFA World Cup | 25593.000000 | 2014-09-21 | Gamergate controversy | 20.725490 |
7 | 2014-02-15 | 2014 Winter Olympics | 24110.600000 | 2014-08-13 | Ice Bucket Challenge | 22.888889 |
8 | 2014-09-03 | Ice Bucket Challenge | 23886.000000 | 2014-09-06 | Soma mine disaster | 23.500000 |
9 | 2014-10-16 | Gamergate controversy | 23419.500000 | 2014-09-28 | 2014 Hong Kong protests | 24.964286 |
10 | 2014-05-15 | 2014 FIFA World Cup | 22044.000000 | 2014-08-26 | Soma mine disaster | 26.000000 |
_p = pd.melt(activity_2014['revid'].reset_index(),id_vars='date')
_c = pd.melt(pv_2014.reset_index(),id_vars='index')
_j = pd.merge(_p,_c,left_on=['date','title'],right_on=['index','variable'],copy=False)
_j = _j[['date','title','value_x','value_y']]
_j.columns = ['date','article','production','consumption']
_j_gb = _j.groupby('article')
f,ax = plt.subplots(1,1,figsize=(10,10))
_colors = dict(zip(sorted(_j_gb.groups.keys()),sns.color_palette('spectral', len(_j_gb.groups.keys()))))
for article in sorted(_j_gb.groups.keys()):
    _data = _j_gb.get_group(article)[['production','consumption']]
    _data['production_z'] = (_data['production'] - _data['production'].mean())/_data['production'].std()
    _data['consumption_z'] = (_data['consumption'] - _data['consumption'].mean())/_data['consumption'].std()
    _data['ratio'] = np.abs(_data['consumption_z'] + _data['production_z'])
    #_data = _data[(_data['production'] > 0) & (_data['consumption'] > 0)]
    sns.regplot(_data['production_z'],_data['consumption_z'],
                ci=None,color=_colors[article],label=article,ax=ax,lowess=True,
                scatter_kws={'s':50*np.abs(_data['ratio']),'alpha':.33},line_kws={'lw':5})
    #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=50,lw=0,alpha=.5,label=article)
    #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=250*np.log(_data['ratio']),lw=0,alpha=.5,label=article)
ax.set_xlabel('Revisions (Z-score)',fontsize=15)
ax.set_ylabel('Pageviews (Z-score)',fontsize=15)
ax.set_yscale('symlog')
ax.set_ylim((-1,20))
ax.set_xscale('symlog')
ax.set_xlim((-1,20))
ax.plot((-1,20),(-1,20),'--',lw=3,c='k')
handles, labels = ax.get_legend_handles_labels()
new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(_j_gb.groups.keys())]
ax.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=12)
f.savefig('pv_vs_revision.png',dpi=200,bbox_inches='tight')
handles[0]
<matplotlib.collections.PathCollection at 0x3c7b6160>
Define a dictionary aggregator to use for generating attribute data for edges, articles, and editors from the revision history. This will be used throughout several of the following steps. Create a new DataFrame by grouping the revisions together by article title and user username, and apply the aggregator function to this groupby object to generate relevant statistics and attributes. Other data cleanup simplifies the data structure and converts timestamps from Timestamp objects into better-behaved floats.
agg_function = {'revid':{'weight':len},
'timestamp':{'ts_min':np.min,'ts_max':np.max},
'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max,'total_changes':np.sum},
'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max},
#'link_count':{'link_count_min':np.min,'link_count_median':np.median,'link_count_max':np.max}
}
revs_gb_edge = revs.groupby(['title','user'])
revs_edgelist = revs_gb_edge.agg(agg_function)
revs_edgelist.columns = revs_edgelist.columns.droplevel(0)
# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
revs_edgelist['ts_min'] = (revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_edgelist['ts_max'] = (revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_edgelist.head()
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | user | |||||||||||||
2014 Crimean crisis | 1.178.88.30 | 4808.280891 | 4808.280891 | 634 | 634 | 634.0 | 1 | 748 | 748 | 748 | 1 | 1 | 1 | 1 |
10.4.0.34 | 4811.231632 | 4811.231632 | 1342 | 1342 | 1342.0 | 1 | 2267 | 2267 | 2267 | 48 | 48 | 48 | 48 | |
101.98.175.68 | 4870.264688 | 4870.265058 | 3644 | 3645 | 3644.5 | 2 | 32 | 6020 | 12008 | 72 | 143 | 1 | 144 | |
106.68.144.182 | 4995.320382 | 4995.320382 | 3896 | 3896 | 3896.0 | 1 | 384782 | 384782 | 384782 | 1 | 1 | 1 | 1 | |
107.15.237.75 | 4808.931308 | 4808.931308 | 852 | 852 | 852.0 | 1 | 33 | 33 | 33 | -40 | -40 | -40 | -40 |
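A quick illustration (toy data, not from the corpus) of why the droplevel(0) step above is needed: the dict-of-dicts aggregation used here produces a two-level column index in this version of pandas, and dropping the top level leaves just the custom names.
toy = pd.DataFrame({'title':['A','A','B'],'user':['x','y','y'],'revid':[1,2,3]})
toy_agg = toy.groupby(['title','user']).agg({'revid':{'weight':len}})
print toy_agg.columns    # MultiIndex: [(u'revid', u'weight')]
toy_agg.columns = toy_agg.columns.droplevel(0)
print toy_agg.columns    # Index: [u'weight']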
Check to see if there are any articles and editors with the same names, because this will definitely break stuff.
revs_edgelist.ix[[i for i in revs_edgelist.index if i[0] == i[1]]]
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | user |
revs_gb_page = revs.groupby('title')
revs_pagenodelist = revs_gb_page.agg(agg_function)
revs_pagenodelist.columns = revs_pagenodelist.columns.droplevel(0)
# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
revs_pagenodelist['ts_min'] = (revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_pagenodelist['ts_max'] = (revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_pagenodelist['article'] = [1]*len(revs_pagenodelist)
revs_pagenodelist.head()
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | article | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | ||||||||||||||
2014 Crimean crisis | 4804.710139 | 5104.708588 | 0 | 4046 | 2023.0 | 4047 | 2 | 306 | 791108 | 10 | 201155 | -201155 | 204422 | 1 |
2014 FIFA World Cup | 1230.827986 | 5093.587894 | 0 | 7172 | 3586.0 | 7173 | 1 | 963 | 10345392 | 1 | 111902 | -111902 | 136141 | 1 |
2014 Ferguson unrest | 4978.230613 | 5102.265486 | 0 | 1348 | 674.0 | 1349 | 5 | 609 | 473810 | 5 | 168645 | -168645 | 81098 | 1 |
2014 Hong Kong protests | 5018.674051 | 5103.346389 | 0 | 3861 | 1930.5 | 3862 | 4 | 339 | 165680 | 4 | 122763 | -122763 | 172817 | 1 |
2014 Israel–Gaza conflict | 4936.250602 | 5104.259549 | 0 | 6497 | 3248.5 | 6498 | 6 | 353 | 306131 | 4 | 188426 | -188426 | 245650 | 1 |
revs_gb_user = revs.groupby('user')
revs_usernodelist = revs_gb_user.agg(agg_function)
revs_usernodelist.columns = revs_usernodelist.columns.droplevel(0)
# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
revs_usernodelist['ts_min'] = (revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_usernodelist['ts_max'] = (revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_usernodelist['article'] = [0]*len(revs_usernodelist)
revs_usernodelist.head()
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | article | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user | ||||||||||||||
!dea4u | 3571.221806 | 3571.221806 | 1902 | 1902 | 1902.0 | 1 | 12 | 12.0 | 12 | -18.0 | -18 | -18 | -18 | 0 |
$oliton | 4898.752361 | 4898.752361 | 4993 | 4993 | 4993.0 | 1 | 6009 | 6009.0 | 6009 | 0.0 | 0 | 0 | 0 | 0 |
( | 764.846713 | 795.962593 | 0 | 2 | 1.0 | 3 | 98 | 1344206.0 | 2688314 | 113.5 | 215 | 12 | 227 | 0 |
-Jafar277- | 4996.878519 | 5062.872720 | 4405 | 6542 | 5473.5 | 2 | 704 | 3770.5 | 6837 | 22.5 | 40 | 5 | 45 | 0 |
-MARSHMELLOWxPUPPIES- | 4863.816030 | 4863.816713 | 53 | 54 | 53.5 | 2 | 59 | 11165.0 | 22271 | 4.0 | 4 | 4 | 8 | 0 |
Create an empty NetworkX DiGraph object coauthorship_g that will be filled with the edges, nodes, and attribute data generated from the edge and node lists above. Loop through these edge and node lists, adding the nodes/edges and their attributes to the coauthorship_g object. The dictionary comprehension with k:float(v) converts the numpy.float64 values to more primitive float types because NetworkX graph serializers aren't compatible with float64. Print out the size of the graph and an example of one edge to verify the process worked.
coauthorship_g = nx.DiGraph()
# Add the edges and edge attributes
for (article,editor) in iter(revs_edgelist.index.values):
    edge_attributes = {k:float(v) for k,v in dict(revs_edgelist.ix[(article,editor)]).items()}
    if article != editor:
        coauthorship_g.add_edge(article,editor,edge_attributes)
# Add the user nodes and attributes
for node in iter(revs_usernodelist.index):
    node_attributes = {k:float(v) for k,v in dict(revs_usernodelist.ix[node]).items()}
    coauthorship_g.add_node(node,node_attributes,type='user')
# Add the page nodes and attributes
for node in iter(revs_pagenodelist.index):
    node_attributes = {k:float(v) for k,v in dict(revs_pagenodelist.ix[node]).items()}
    coauthorship_g.add_node(node,node_attributes,type='page')
print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g.number_of_nodes(),coauthorship_g.number_of_edges())
coauthorship_g.edges(data=True)[:1]
There are 15782 nodes and 18964 edges in the network.
[(u'Sinking of the MV Sewol', u'24.177.247.129', {'diff_max': 23.0, 'diff_median': 23.0, 'diff_min': 23.0, 'latency_max': 1513.0, 'latency_median': 1513.0, 'latency_min': 1513.0, 'revision_max': 771.0, 'revision_median': 771.0, 'revision_min': 771.0, 'total_changes': 23.0, 'ts_max': 4864.7124652777775, 'ts_min': 4864.7124652777775, 'weight': 1.0})]
Write the graph object to disk.
nx.write_gexf(coauthorship_g,'coauthorship_g.gexf')
Subset the data to only include users who make more than a single contribution: create a new graph object coauthorship_g_gt1 and remove the edges and nodes for users making only a single contribution.
edges_wt1 = [(i,j) for (i,j,k) in coauthorship_g.edges_iter(data=True) if k['weight'] == 1]
coauthorship_g_gt1 = coauthorship_g.copy()
coauthorship_g_gt1.remove_edges_from(edges_wt1)
isolates = nx.isolates(coauthorship_g_gt1)
coauthorship_g_gt1.remove_nodes_from(isolates)
nx.write_gexf(coauthorship_g_gt1,'coauthorship_g_gt1.gexf')
print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g_gt1.number_of_nodes(),coauthorship_g_gt1.number_of_edges())
There are 6171 nodes and 7420 edges in the network.
Visualize the sparsified network in Gephi using ForceAtlas 2 with the "Prevent Overlap", Scaling = 500, and "Stronger gravity" options. Nodes are colored by ts_min to reflect the timestamp of the first edit made, mapped to a rainbow spectrum where bluer colors are older nodes (earlier dates) and redder colors are younger nodes (more recent dates). Nodes have also been sized by degree (in + out).
Image('coauthorship_g_gt1.png')
Look at the edge weights for users who edited pages most intensively.
revs_edgelist.sort('weight',inplace=False,ascending=False)['weight'].reset_index().head(10)
title | user | weight | |
---|---|---|---|
0 | Islamic State of Iraq and the Levant | P-123 | 2385 |
1 | Ebola virus epidemic in West Africa | BrianGroen | 1374 |
2 | 2014 Hong Kong protests | Signedzzz | 893 |
3 | Malaysia Airlines Flight 370 | Ohconfucius | 773 |
4 | Ebola virus epidemic in West Africa | Gandydancer | 709 |
5 | 2014 Hong Kong protests | Ohconfucius | 668 |
6 | Gamergate controversy | Ryulong | 565 |
7 | Indian general election, 2014 | Lihaas | 505 |
8 | Gamergate controversy | NorthBySouthBaranof | 496 |
9 | Scottish independence referendum, 2014 | Jmorrison230582 | 473 |
revs_edgelist[(revs_edgelist['weight'] > 10)]['total_changes'].abs().sort(inplace=False,ascending=False).head(10)
title                                  user
Minecraft                              ClueBot NG       994919
Malaysia Airlines Flight 370           ClueBot NG       816064
2014 Crimean crisis                    ClueBot NG       666962
Minecraft                              Kevin12345671    482144
Islamic State of Iraq and the Levant   Teaksmitty       386686
Ice Bucket Challenge                   Mr. Granger      298161
Indian general election, 2014          Lihaas           182078
Malaysia Airlines Flight 370           Ohconfucius      141311
2014 Winter Olympics                   ClueBot NG       129650
Islamic State of Iraq and the Levant   ClueBot NG       124130
Name: total_changes, dtype: float64
Computing degree centrality identifies the editors who contributed to the most articles (since edges point from articles to editors, an editor's in-degree counts the distinct articles they edited). The k:int(v*_n) dictionary comprehension de-normalizes the values back to counts for the actual number of articles edited.
The bots "AnomieBOT" and "ClueBot NG" contributed to 23 and 22 of the articles respectively, but these aren't human editors, they're automated scripts. Among human editors, "Tpbradbury" contributed to 12 of the articles we analyzed, and "Ohconfucius" and "Lihaas" to 11 each.
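Equivalent raw counts can be read straight off the graph (a sketch, assuming coauthorship_g_gt1 as built above): an editor node's in-degree is the number of distinct articles they edited.
raw_idc = dict(coauthorship_g_gt1.in_degree())    # node -> degree counts
pd.Series(raw_idc).sort(inplace=False,ascending=False).head()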
_n = len(coauthorship_g_gt1) - 1
idc = {k:int(v*_n) for k,v in nx.in_degree_centrality(coauthorship_g_gt1).iteritems()}
odc = {k:int(v*_n) for k,v in nx.out_degree_centrality(coauthorship_g_gt1).iteritems()}
pd.Series(idc).sort(inplace=False,ascending=False).ix[:20]
AnomieBOT                 23
ClueBot NG                22
Yobot                     19
BG19bot                   15
Tpbradbury                12
Ohconfucius               11
Lihaas                    11
Nickst                    10
Another Believer          10
Rothorpe                  10
Mogism                    10
BattyBot                   9
Brandmeister               9
Jprg1966                   9
Cydebot                    9
Soffredo                   9
Jonesey95                  9
Illegitimate Barrister     7
SmackBot                   7
Factsearch                 7
dtype: int64
Computing clustering tells us the extent to which editors of a particular article edited other articles. The Malaysia Airlines Flight 17, ISIL, and 2014 Crimean crisis articles had more of their editors contributing to other articles in the set, while the contributors to the Gamergate controversy, Cuba–United States relations, and Ice Bucket Challenge articles tended not to contribute to other articles.
bp_g_gt1 = coauthorship_g_gt1.to_undirected()
pages = list(revs_pagenodelist.index)
users = list(set(coauthorship_g_gt1.nodes()) - set(pages))
clustering = nx.bipartite.clustering(bp_g_gt1,pages)
pd.Series(clustering).sort(inplace=False,ascending=False)
Malaysia Airlines Flight 17               0.030860
Islamic State of Iraq and the Levant      0.029015
2014 Crimean crisis                       0.027932
2014 Israel–Gaza conflict                 0.025939
2014 Ferguson unrest                      0.024146
Ebola virus epidemic in West Africa       0.023239
Malaysia Airlines Flight 370              0.022589
Scottish independence referendum, 2014    0.022386
2014 Winter Olympics                      0.021959
2014 Hong Kong protests                   0.021083
Sinking of the MV Sewol                   0.020832
Indian general election, 2014             0.020414
Rosetta spacecraft                        0.018343
2014 FIFA World Cup                       0.017378
Chibok schoolgirl kidnapping              0.016841
Eurovision Song Contest 2014              0.014686
Felipe VI of Spain                        0.014023
Soma mine disaster                        0.013941
Minecraft                                 0.013861
United States elections, 2014             0.012435
Ice Bucket Challenge                      0.012004
Cuba–United States relations              0.011649
Gamergate controversy                     0.009740
dtype: float64
Now let's look at the coauthorship patterns in the 72-hour window surrounding each article's peak pageview activity. Specifically, for each article's revision history look at the revisions 24 hours before the peak and 48 hours after the peak.
revs2014_gb_article = revs2014_df.groupby('title')
aftermath_df_list = list()
for _article in pv_max.index:
    _df = revs2014_gb_article.get_group(_article)
    _before = pv_max.ix[_article,'date'] - np.timedelta64(1,'D')
    _after = pv_max.ix[_article,'date'] + np.timedelta64(2,'D')
    _aftermath = _df[(_df['timestamp'] > _before) & (_df['timestamp'] < _after)]
    aftermath_df_list.append(_aftermath)
aftermath_revs = pd.concat(aftermath_df_list)
aftermath_revs_gb_edge = aftermath_revs.groupby(['title','user'])
aftermath_revs_edgelist = aftermath_revs_gb_edge.agg(agg_function)
aftermath_revs_edgelist.columns = aftermath_revs_edgelist.columns.droplevel(0)
aftermath_revs_edgelist['ts_min'] = (aftermath_revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_edgelist['ts_max'] = (aftermath_revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_gb_page = aftermath_revs.groupby('title')
aftermath_revs_pagenodelist = aftermath_revs_gb_page.agg(agg_function)
aftermath_revs_pagenodelist.columns = aftermath_revs_pagenodelist.columns.droplevel(0)
aftermath_revs_pagenodelist['ts_min'] = (aftermath_revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_pagenodelist['ts_max'] = (aftermath_revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_pagenodelist['article'] = [1]*len(aftermath_revs_pagenodelist)
aftermath_revs_gb_user = aftermath_revs.groupby('user')
aftermath_revs_usernodelist = aftermath_revs_gb_user.agg(agg_function)
aftermath_revs_usernodelist.columns = aftermath_revs_usernodelist.columns.droplevel(0)
aftermath_revs_usernodelist['ts_min'] = (aftermath_revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_usernodelist['ts_max'] = (aftermath_revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_usernodelist['article'] = [0]*len(aftermath_revs_usernodelist)
aftermath_coauthorship_g = nx.DiGraph()
# Add the edges and edge attributes
for (article,editor) in iter(aftermath_revs_edgelist.index.values):
    edge_attributes = {k:float(v) for k,v in dict(aftermath_revs_edgelist.ix[(article,editor)]).items()}
    if article != editor:
        aftermath_coauthorship_g.add_edge(article,editor,edge_attributes)
# Add the user nodes and attributes
for node in iter(aftermath_revs_usernodelist.index):
    node_attributes = {k:float(v) for k,v in dict(aftermath_revs_usernodelist.ix[node]).items()}
    aftermath_coauthorship_g.add_node(node,node_attributes,type='user')
# Add the page nodes and attributes
for node in iter(aftermath_revs_pagenodelist.index):
    node_attributes = {k:float(v) for k,v in dict(aftermath_revs_pagenodelist.ix[node]).items()}
    aftermath_coauthorship_g.add_node(node,node_attributes,type='page')
print "There are {0} nodes and {1} edges in the network.".format(aftermath_coauthorship_g.number_of_nodes(),aftermath_coauthorship_g.number_of_edges())
nx.write_gexf(aftermath_coauthorship_g,'aftermath_coauthorship_g.gexf')
There are 2158 nodes and 2508 edges in the network.
Apparently no one edited the "Gamergate controversy" article on the peak date --- it was likely protected for the duration.
_df = revs2014_gb_article.get_group('Gamergate controversy')
_df[(_df['date'] < pd.datetime(2014,10,25)) & (_df['date'] > pd.datetime(2014,10,22))]
title | anon | comment | commenthidden | date | diff | gini | latency | parentid | revid | revision | size | timestamp | unique_users | user | userhidden | userid |
---|
As a result, it's not added to the graph.
_pages1 = revs2014_gb_article.groups.keys()
_pages2 = [_n for _n,_d in aftermath_coauthorship_g.nodes_iter(data=True) if 'page' in _d.values()]
_pages3 = list(aftermath_revs['title'].unique())
set(_pages1) - set(_pages3)
{u'Gamergate controversy'}
Image('aftermath_coauthorship_g.png')
How does the editing behavior of users overlap across the different articles?
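The measure computed below is asymmetric: for a pair of articles it is the fraction of the first article's editors who also edited the second. A minimal sketch for a single pair (chosen here just for illustration), assuming coauthorship_g as built above:
_a, _b = 'Malaysia Airlines Flight 370', 'Malaysia Airlines Flight 17'
_editors_a = set(coauthorship_g.neighbors(_a))   # article nodes point to their editors
_editors_b = set(coauthorship_g.neighbors(_b))
len(_editors_a & _editors_b)/float(len(_editors_a))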
_article_overlaps = dict()
_aftermath_overlaps = dict()
for _article1 in pages:
    _article_overlaps[_article1] = dict()
    _aftermath_overlaps[_article1] = dict()
    for _article2 in pages:
        if _article1 != _article2:
            try:
                _article_overlaps[_article1][_article2] = len(set(coauthorship_g.neighbors(_article1)) & set(coauthorship_g.neighbors(_article2)))/float(len(set(coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _article_overlaps[_article1][_article2] = np.nan
            try: # Some articles have no editing activity in the aftermath window
                _aftermath_overlaps[_article1][_article2] = len(set(aftermath_coauthorship_g.neighbors(_article1)) & set(aftermath_coauthorship_g.neighbors(_article2)))/float(len(set(aftermath_coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _aftermath_overlaps[_article1][_article2] = np.nan
_article_overlaps_df = pd.DataFrame(_article_overlaps)
_order = _article_overlaps_df.mean(axis=1).sort(inplace=False,ascending=True).index
_article_overlaps_df = _article_overlaps_df[_order].ix[_order]
_x1,_y1 = _article_overlaps_df.shape
_aftermath_overlaps_df = pd.DataFrame(_aftermath_overlaps)
_aftermath_overlaps_df = _aftermath_overlaps_df[_order].ix[_order]
_x2,_y2 = _aftermath_overlaps_df.shape
f,(ax1,ax2) = plt.subplots(1,2,figsize=(12,8),sharey=True)
_ax1 = ax1.pcolor(_article_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax1.set_frame_on(False)
ax1.set_xticks(np.arange(0.5,_x1+.5),minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
ax1.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax1.tick_params(axis='x',direction='in',pad=3)
ax1.set_title('Complete coauthorship',fontsize=15)
_ax2 = ax2.pcolor(_aftermath_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(0.5,_x2+.5),minor=False)
#ax2.set_yticks(np.arange(_y)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
#ax2.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax2.tick_params(axis='x',direction='in',pad=3)
ax2.set_title('Aftermath coauthorship',fontsize=15)
#ax.set_xlabel('Article rank',fontsize=15)
f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([1, 0.25, 0.05, 0.7])
f.colorbar(_ax1, cax=cbar_ax,label='Editor overlap')
f.tight_layout()
f.savefig('editor_overlap.png',dpi=200,bbox_inches='tight')
Create a DataFrame max_daily_revid that is indexed by dates in 2014 and has columns for each article. Crucially, the values in each cell correspond to the maximum (final) revision of that article on that date. Use the complete rev_df DataFrame to populate revids from before Jan 1, 2014 forward, and then use fillna's "ffill" method to carry revids from previous dates forward to dates when no revision happened.
_idx = rev_df.groupby(['title','date']).agg({'revid':lambda x:x.idxmax()})
max_daily_revid = rev_df[['title','date','revid']].ix[_idx['revid']]
max_daily_revid = pd.pivot_table(data=max_daily_revid,columns='title',index='date',values='revid')
max_daily_revid.fillna(method='ffill',inplace=True)
max_daily_revid = max_daily_revid.ix[pd.date_range(start='1-1-2014',end='12-22-2014')]
# It turns out a bunch of revisions were deleted on the ISIS article
max_daily_revid.ix[pd.to_datetime('2014-09-03').date(),'Islamic State of Iraq and the Levant'] = np.nan #624150417
Now we have daily revisions for each article that should give us a way to retrieve article versions at a daily level of resolution to track changes in content over time. In particular, we can use the MediaWiki API to parse out the external links for each version of an article as a way of looking at how citation practices have changed over time. There are over 3,000 revisions in total that need to be parsed across all the articles this year --- and likely some cleanup needed in the event that the chosen revisions happen to be outliers or vandalism.
len([_revid for _article in max_daily_revid.columns for _revid in max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()])
3409
Into the parsed_revid_data dictionary we're going to place the payloads from the MediaWiki parsing results. Specifically, we'll be asking for the revision IDs, categories, language links, external links, internal wikilinks, templates, and images. Having each of these properties at the revision level will let us track the "evolution" of the content on these articles over time.
Because this requires making over 3,000 API calls, it will take a while and we'll only want to run it once. The cell block below has been converted to Raw rather than Code to prevent inadvertent execution.
#parsed_revid_data = dict()
for _article in max_daily_revid.columns[14:15]:
    print _article
    parsed_revid_data[_article] = dict()
    _unique_revids = max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()
    for _revid in _unique_revids:
        try:
            parsed_revid_data[_article][_revid] = ws.wikipedia_query({'action':'parse',
                                                                      'oldid': _revid,
                                                                      'redirects': True,
                                                                      'prop': 'revid|langlinks|categories|externallinks|iwlinks|templates|images'},'en')
        except:
            print "Revision {0} has an error".format(str(_revid))
            parsed_revid_data[_article][_revid] = np.nan
            pass
simlified_parsed_revid_data = {_rev:_payload for _article,_revs in parsed_revid_data.items() for _rev,_payload in _revs.items()}
with open('parsed_revid_data.json','wb') as f:
json.dump(parsed_revid_data,f)
with open('simlified_parsed_revid_data.json','wb') as f:
json.dump(simlified_parsed_revid_data,f)
Islamic State of Iraq and the Levant Revision 624083851.0 has an error
parsed_revid_data['Islamic State of Iraq and the Levant'][624083851]
nan
with open('parsed_revid_data.json','rb') as f:
parsed_revid_data = json.load(f)
Look at the most-referenced domain names.
_final_revs = dict(max_daily_revid.ix[pd.to_datetime('2014-12-21').date()])
urls = list()
for _article, _rev in _final_revs.items():
    if 'externallinks' in parsed_revid_data[_article][_rev].keys():
        for _url in parsed_revid_data[_article][_rev]['externallinks']:
            urls.append(urlparse.urlparse(_url)[1])
_s = pd.Series(Counter(urls)).sort(ascending=False,inplace=False)
_s.head(10)
www.bbc.co.uk             218
www.theguardian.com       175
www.nytimes.com           135
www.bbc.com               126
www.reuters.com           106
www.telegraph.co.uk        99
web.archive.org            94
www.washingtonpost.com     92
www.fifa.com               72
www.scmp.com               65
dtype: int64
Define a list of Western news sources in westerners and create a function western_link_fraction that will compute the fraction of domain names in a given revid that come from this list.
westerners = ['bbc','guardian','ft','telegraph','independent',
'nytimes','reuters','washingtonpost','cnn','wsj','abc','nbc','cbs','yahoo','bloomberg']
def western_link_fraction(_revid,_revdict):
    try:
        _urls = _revdict[_revid]['externallinks']
        _domains = [urlparse.urlparse(_url)[1] for _url in _urls]
        _western = [any(_w in _d for _w in westerners) for _d in _domains]
        if len(_western) > 0:
            return float(sum(_western))/len(_western)
        else:
            return 0
    except KeyError:
        return np.nan
western_link_df = pd.DataFrame(index=pd.date_range('1-1-2014','12-21-2014'))
for article in max_daily_revid.columns:
    western_link_df[article] = max_daily_revid[article].apply(lambda x:western_link_fraction(x,simlified_parsed_revid_data))
2014 Crimean crisis | 2014 FIFA World Cup | 2014 Ferguson unrest | 2014 Hong Kong protests | 2014 Israel–Gaza conflict | 2014 Winter Olympics | Chibok schoolgirl kidnapping | Cuba–United States relations | Ebola virus epidemic in West Africa | Eurovision Song Contest 2014 | ... | Indian general election, 2014 | Islamic State of Iraq and the Levant | Malaysia Airlines Flight 17 | Malaysia Airlines Flight 370 | Minecraft | Rosetta spacecraft | Scottish independence referendum, 2014 | Sinking of the MV Sewol | Soma mine disaster | United States elections, 2014 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2014-12-17 | 0.318735 | 0.219653 | 0.34413 | 0.362573 | 0.196277 | 0.156425 | 0.304348 | 0.252747 | 0.275064 | 0.043062 | ... | 0.074176 | 0.327586 | 0.351064 | 0.349206 | 0.096070 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-18 | 0.318735 | 0.219653 | 0.34413 | 0.363372 | 0.195616 | 0.156425 | 0.304348 | 0.227723 | 0.275253 | 0.043062 | ... | 0.074176 | 0.327586 | 0.351064 | 0.351438 | 0.095652 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-19 | 0.315663 | 0.219653 | 0.34413 | 0.359195 | 0.195616 | 0.156425 | 0.304348 | 0.235294 | 0.276382 | 0.043062 | ... | 0.074176 | 0.325431 | 0.351064 | 0.351438 | 0.094828 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-20 | 0.315663 | 0.219653 | 0.34413 | 0.359195 | 0.195616 | 0.156425 | 0.304348 | 0.235294 | 0.276543 | 0.043062 | ... | 0.074176 | 0.324034 | 0.348592 | 0.351438 | 0.094828 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-21 | 0.315663 | 0.219653 | 0.34413 | 0.359195 | 0.195616 | 0.156425 | 0.304348 | 0.235294 | 0.276543 | 0.043062 | ... | 0.074176 | 0.324034 | 0.348592 | 0.351438 | 0.094828 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
5 rows × 23 columns
f,ax = plt.subplots(1,1,figsize=(10,6))
_ax = western_link_df.plot(colormap='spectral',ax=ax)
_ax.legend(loc='center left',bbox_to_anchor=[1,.5])
f.tight_layout()
f.savefig('western_links.png',dpi=200,bbox_inches='tight')
def chunk_maker(a_list,size):
    # Break a_list into consecutive chunks of at most `size` elements each
    chunk_num = len(a_list)/size
    chunks = list()
    for c in range(chunk_num + 1):
        start = c * size
        end = (c + 1) * size
        elements = list(itertools.islice(a_list,start,end))
        if len(elements) > 0:
            chunks.append(elements)
    return chunks
# http://stackoverflow.com/a/319291/1574687
def valid_ip(address):
    try:
        parts = address.split(".")
        if len(parts) != 4:
            return False
        for item in parts:
            # Each octet must be at most 3 characters and in the range 0-255
            if len(item) > 3 or not 0 <= int(item) <= 255:
                return False
        return True
    except ValueError:
        return False
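Quick sanity checks on the two helpers above (hypothetical inputs):
print chunk_maker(range(7),3)                      # [[0, 1, 2], [3, 4, 5], [6]]
print valid_ip('192.168.0.1'), valid_ip('banana')  # True False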
The code block below is used to get the user properties and save them to user_properties.json. First filter out the users corresponding to IP addresses, as they won't have any valid information. Then use chunk_maker to make a list of lists containing 50 usernames in each list. For each chunk, convert the list of 50 usernames into a giant string with names joined by pipes (u'|') and pass this "list" of usernames to the get_user_properties function. Add each element of the returned list back to the user_properties list, then save it to disk. A minimal sketch of this step follows.
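A minimal sketch of the omitted raw cell, assuming ws.get_user_properties takes a pipe-joined string of usernames and a language code (a hypothetical signature inferred from the description above):
named_users = [u for u in revs_usernodelist.index if not valid_ip(u)]  # drop IP editors
user_properties = list()
for chunk in chunk_maker(named_users,50):
    name_string = u'|'.join(chunk)                  # 50 usernames joined by pipes
    user_properties += ws.get_user_properties(name_string,'en')
with open('user_properties.json','wb') as f:
    json.dump(user_properties,f)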
If you've completed the above step or already have user_properties.json in your directory, you can load it up and proceed from this step.
with open('user_properties.json','rb') as f:
user_properties2 = json.load(f)
user_props_df = pd.DataFrame(user_properties2).set_index('name')
user_props_df = user_props_df[user_props_df['userid'].notnull()]
user_props_df['registration'] = pd.to_datetime(user_props_df['registration'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['blockedtimestamp'] = pd.to_datetime(user_props_df['blockedtimestamp'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['account_age'] = (pd.datetime.today().date() - user_props_df['registration'])/np.timedelta64(1,'D')
user_props_df['blocked'] = user_props_df['blockexpiry'].notnull()
user_props_df['blocked_account_age'] = (user_props_df['blockedtimestamp'] - user_props_df['registration'])/np.timedelta64(1,'D')
user_props_df['editcount'] = user_props_df['editcount'].map(float)
user_props_df['permissions'] = user_props_df['groups'].apply(len) - 2
user_props_df.drop(['invalid','blockedbyid','blockid','userid','blockedby','blockexpiry','blockreason'],inplace=True,axis=1)
user_props_df.head()
blockedtimestamp | editcount | gender | groups | registration | account_age | blocked | blocked_account_age | permissions | |
---|---|---|---|---|---|---|---|---|---|
name | |||||||||
-sche | NaT | 2593 | unknown | [reviewer, *, user, autoconfirmed] | 2010-12-14 22:02:32 | 1406.081574 | False | NaN | 2 |
0Aliuk | NaT | 47 | unknown | [*, user, autoconfirmed] | 2014-02-18 09:54:43 | 244.587002 | False | NaN | 1 |
1007D | 2011-07-09 22:51:54 | 1734 | male | [*, user, autoconfirmed] | 2010-10-06 00:24:04 | 1475.983287 | True | 276.935995 | 1 |
18abruce | NaT | 5101 | unknown | [*, user, autoconfirmed] | 2010-09-07 13:09:43 | 1504.451586 | False | NaN | 1 |
1980fast | NaT | 885 | unknown | [*, user, autoconfirmed] | 2010-04-13 04:32:12 | 1651.810972 | False | NaN | 1 |
There are very few registered users in the corpus who identify as women (43) versus men (802). However, the vast majority of registered editors do not identify their gender at all (2006).
gender_count = user_props_df.groupby('gender').agg({'editcount':len})
print gender_count
sns.barplot(gender_count.index,gender_count.values,palette='muted')
plt.yscale('log')
plt.ylabel('Number of users',fontsize=15)
plt.xlabel('')
         editcount
gender
female          43
male           802
unknown       2006
<matplotlib.text.Text at 0x2d1e6278>
The editcount variable is across all edits to the English Wikipedia, not just the articles in the current 2014 news corpus. Plotting the distributions across genders, it appears women make more edits than men.
sns.boxplot(user_props_df['editcount'],groupby=user_props_df['gender'],color='muted')
plt.yscale('log')
plt.ylabel('Total revisions',fontsize=15)
plt.xlabel('')
<matplotlib.text.Text at 0x2c19ec88>
Running a Mann-Whitney rank test of the significance of the difference between men and women, it is not significant (one-tailed $p = .132$). In other words, we cannot reject the hypothesis that the difference in median values between men and women is due to random chance.
female_editcounts = user_props_df[user_props_df['gender'] == 'female']['editcount'].values
male_editcounts = user_props_df[user_props_df['gender'] == 'male']['editcount'].values
stats.mannwhitneyu(female_editcounts,male_editcounts)
(15505.0, 0.13257152370225006)
sns.boxplot(user_props_df['account_age'],groupby=user_props_df['gender'],color='muted')
plt.yscale('log')
plt.ylabel('Account age',fontsize=15)
plt.xlabel('')
<matplotlib.text.Text at 0x2c5562e8>
Running a Mann-Whitney rank test of the significance of the difference between men and women, it is not significant (one-tailed $p = .156$). In other words, we cannot reject the hypothesis that the difference in median values between men and women is due to random chance.
female_account_ages = user_props_df[user_props_df['gender'] == 'female']['account_age'].values
male_account_ages = user_props_df[user_props_df['gender'] == 'male']['account_age'].values
stats.mannwhitneyu(female_account_ages,male_account_ages)
(15668.0, 0.15630039119059119)
Among the registered editors of these articles, women get blocked ~4.7% of the time and men ~2.5% of the time, but there aren't enough observations to make any claims about these differences being significant.
blocked = user_props_df[user_props_df['blocked']]
print blocked.groupby('gender').agg({'editcount':len})
print '\n'
print blocked.groupby('gender').agg({'editcount':len})/gender_count
         editcount
gender
female           2
male            20
unknown        137

         editcount
gender
female    0.046512
male      0.024938
unknown   0.068295
Subset the usernodelist and edgelists to only include the nodes in the gt1 graph.
bp_g_gt1_usernodelist.head()
latency_min | latency_median | latency_max | ts_min | ts_max | weight | diff_median | diff_max | diff_min | link_count_median | ... | blockedtimestamp | editcount | gender | groups | registration | account_age | blocked | blocked_account_age | permissions | degree | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
username | |||||||||||||||||||||
-sche | 31 | 341 | 81251 | 4937.999317 | 5017.180903 | 358 | -1.0 | 2851 | -3266 | 229 | ... | NaT | 2593 | unknown | [reviewer, *, user, autoconfirmed] | 2010-12-14 22:02:32 | 1406.081574 | False | NaN | 2 | 2 |
0Aliuk | 111 | 536 | 31981 | 4787.828588 | 4818.636806 | 4 | 12.5 | 35 | -1 | 366 | ... | NaT | 47 | unknown | [*, user, autoconfirmed] | 2014-02-18 09:54:43 | 244.587002 | False | NaN | 1 | 1 |
1.123.194.170 | 41 | 226 | 411 | 4801.286829 | 4801.287292 | 2 | 174.5 | 322 | 27 | 127 | ... | NaT | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 1 |
1.36.102.163 | 71 | 906 | 1741 | 5007.774919 | 5007.775752 | 2 | 26.5 | 51 | 2 | 132 | ... | NaT | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 1 |
1.36.209.129 | 91 | 336 | 19731 | 5008.152789 | 5021.642234 | 16 | 41.5 | 150 | -576 | 133 | ... | NaT | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 1 |
5 rows × 26 columns
bp_g_gt1_usernodelist = revs_usernodelist[revs_usernodelist.index.isin(users)]
bp_g_gt1_usernodelist = bp_g_gt1_usernodelist.join(user_props_df,how='left')
bp_g_gt1_usernodelist['degree'] = pd.Series({k:v for k,v in idc.iteritems() if k in bp_g_gt1_usernodelist.index})
bp_g_gt1_edgelist = revs_edgelist[revs_edgelist.index.isin(coauthorship_g_gt1.edges())]
bp_g_gt1_edgelist['article_degree'] = pd.Series({i:odc[i[0]] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['editor_degree'] = pd.Series({i:idc[i[1]] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['article_age'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'ts_min'] - revs_pagenodelist.ix[i[0],'ts_min'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['editor_age'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'account_age'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['gender'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'gender'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['permissions'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'permissions'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['editcount'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'editcount'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['persistence'] = bp_g_gt1_edgelist['revision_max'] - bp_g_gt1_edgelist['revision_min']
bp_g_gt1_edgelist['revision_min_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_min']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['revision_max_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_max']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)})
Compare the number of revisions made per article to the number of articles edited. There's an upward trend suggesting that editors who contribute to more articles also edit articles more intensively.
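As a quick check of that trend (a sketch, not part of the original analysis), the Spearman rank correlation between articles edited and revisions per article can be computed directly:
_trend = bp_g_gt1_usernodelist[['weight','degree']].dropna()
print stats.spearmanr(_trend['degree'],_trend['weight']/_trend['degree'])  # (rho, p-value)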
ax = sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Revisions per article',fontsize=15)
<matplotlib.text.Text at 0x1bb6ce48>
ax = sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Revisions per article',fontsize=15)
<matplotlib.text.Text at 0x3416ada0>
Edit intensity can also be measured using the median latency of an editor's contributions. Editors making changes in rapid succession have lower latency and editors who take a lot of time between edits have higher latency. There's no apparent relationship between latency and number of articles edited.
ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Latency (s)',fontsize=15)
<matplotlib.text.Text at 0x2da69518>
ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Latency (s)',fontsize=15)
<matplotlib.text.Text at 0x33c5eb38>
bp_g_gt1_edgelist.head()
latency_min | latency_median | latency_max | ts_min | ts_max | weight | diff_median | diff_max | diff_min | link_count_median | ... | article_degree | editor_degree | article_age | editor_age | editor_gender | permissions | editcount | persistence | revision_min_frac | revision_max_frac | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | username | |||||||||||||||||||||
2014 Crimean crisis | 101.98.175.68 | 31 | 6021 | 12011 | 4855.264688 | 4855.265058 | 2 | 72 | 143 | 1 | 410.0 | ... | 342 | 1 | 65.554549 | NaN | NaN | NaN | NaN | 1 | 0.922298 | 0.922551 |
109.148.57.245 | 81 | 101 | 131 | 4795.964560 | 4795.970093 | 3 | 11 | 27 | 7 | 300.0 | ... | 342 | 1 | 6.254421 | NaN | NaN | NaN | NaN | 4 | 0.331309 | 0.332321 | |
109.255.139.0 | 31 | 81 | 3021 | 4797.811412 | 4813.016528 | 3 | -1 | 17 | -2 | 346.0 | ... | 342 | 1 | 8.101273 | NaN | NaN | NaN | NaN | 1233 | 0.434827 | 0.746900 | |
109.64.20.198 | 771 | 811 | 1241 | 4795.843796 | 4817.730625 | 3 | 29 | 112 | 18 | 353.0 | ... | 342 | 1 | 6.133657 | NaN | NaN | NaN | NaN | 1827 | 0.323209 | 0.785624 | |
109.78.144.177 | 731 | 4306 | 7881 | 4815.682222 | 4816.079375 | 2 | 0 | 0 | 0 | 415.5 | ... | 342 | 1 | 25.972083 | NaN | NaN | NaN | NaN | 25 | 0.768413 | 0.774741 |
5 rows × 25 columns
plt.scatter(bp_g_gt1_edgelist['weight'],bp_g_gt1_edgelist['persistence'],alpha=.5)
plt.plot((0,10**4),(0,10**4),color='k',linestyle='--',linewidth=2)
plt.xscale('symlog')
plt.yscale('symlog')
plt.xlim((0,10**4))
plt.ylim((0,10**4))
plt.xlabel('Revisions made',fontsize=15)
plt.ylabel('Persistence (revisions)',fontsize=15)
<matplotlib.text.Text at 0x29cec9b0>
plt.scatter(bp_g_gt1_edgelist['weight']*np.random.uniform(.9,1.1,size=len(bp_g_gt1_edgelist)),bp_g_gt1_edgelist['revision_min_frac'],alpha=.5)
plt.xscale('symlog')
plt.xlim((2,10**4))
plt.ylim((0,1))
plt.xlabel('Revisions made',fontsize=15)
plt.ylabel('First edit (fraction of revision history)',fontsize=15)
<matplotlib.text.Text at 0x2986df98>
ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Revisions',fontsize=15)
<matplotlib.text.Text at 0x370f8550>
ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['permissions'],color='gist_rainbow')
ax.set_yscale('symlog')
ax.set_xlabel('Permissions',fontsize=15)
ax.set_ylabel('Revisions',fontsize=15)
<matplotlib.text.Text at 0x32477470>
ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Editor age (days)',fontsize=15)
<matplotlib.text.Text at 0x3465ce10>
ax = sns.boxplot(bp_g_gt1_edgelist['article_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Time since first edit (days)',fontsize=15)
<matplotlib.text.Text at 0x36036da0>
ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Editor age (days)',fontsize=15)
<matplotlib.text.Text at 0x381c1048>
ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel')
#ax.set_yscale('symlog')
ax.set_xlabel('Permissions',fontsize=15)
ax.set_ylabel('Editor age',fontsize=15)
<matplotlib.text.Text at 0x3889ce10>
ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm')
#ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('First edit to article',fontsize=15)
<matplotlib.text.Text at 0x3515cfd0>
ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['gender'],color='muted')
#ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('First edit to article',fontsize=15)
<matplotlib.text.Text at 0x344f22e8>
ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel')
#ax.set_yscale('symlog')
ax.set_xlabel('Permissions',fontsize=15)
ax.set_ylabel('First edit to article',fontsize=15)
<matplotlib.text.Text at 0x379b0cc0>