%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, requests, re, itertools, urllib2, urlparse
import wikipedia_scraping as ws
import seaborn as sns
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup, element
from collections import Counter
from IPython.display import Image
from operator import itemgetter
from scipy import stats
from matplotlib.lines import Line2D
_start = pd.datetime(2001,1,1)
_end = pd.datetime(2015,1,1)
_filedir = u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'
We begin by defining the article titles for the news events we want to collect data on. We'll also decode these strings into valid unicode strings to account for any wacky characters; a minimal sketch of that decoding step follows the lists below.
# http://yearinreview.fb.com/
facebook = ['World Cup','Ebola Outbreak','Elections in Brazil','Robin Williams','Ice Bucket Challenge','Conflict in Gaza','Malaysia Airlines disasters','Super Bowl','Ferguson','Sochi']
# http://www.google.com/trends/topcharts?hl=en#date=2014
google = ['Robin Williams','World Cup','Ebola','Malaysia Airlines','Flappy Bird','ALS Ice Bucket Challenge','ISIS','Ferguson','Frozen','Ukraine']
# https://2014.twitter.com/moments
twitter = ['Philip Seymour Hoffman','State of the Union','Carnaval','Malaysia Airlines','Bring Back Our Girls','India Election','Spanish Abdication','Maya Angelou','Ferguson','Robin Williams','Ice Bucket Challenge','Scottish referendum','Ebola','He for She','Hong Kong protests','Mars Orbiter','Malala Yousafzi','US elections','Berlin Wall','Philae']
# Editorial judgment, https://en.wikipedia.org/wiki/2014
wikipedia1 = ['2014 Winter Olympics','Ebola virus epidemic in West Africa','2014 Crimean crisis','Malaysia Airlines Flight 370','Chibok schoolgirl kidnapping','Sinking of the MV Sewol','Islamic State in Iraq and the Levant','2014 FIFA World Cup','Felipe VI','2014 Israel–Gaza conflict','Malaysia Airlines Flight 17','Rosetta spacecraft','Cuba-United States relations']
# Number of contributors, http://stats.wikimedia.org/EN/TablesWikipediaEN.htm#zeitgeist
# Excluding repeats like "Deaths in 2014"
wikipedia2 = ['2013–14 North American cold wave',]
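The decoding step mentioned above isn't shown in the original cells; a minimal sketch of what it could look like (assuming Python 2 byte strings and a UTF-8 source encoding) is below. decode_titles is a hypothetical helper, not part of the original notebook.
# Hypothetical helper: coerce a list of titles to unicode so later API calls
# and DataFrame operations handle non-ASCII characters cleanly.
def decode_titles(titles, encoding='utf8'):
    return [t if isinstance(t, unicode) else t.decode(encoding) for t in titles]
facebook = decode_titles(facebook)
google = decode_titles(google)
twitter = decode_titles(twitter)
wikipedia1 = decode_titles(wikipedia1)
wikipedia2 = decode_titles(wikipedia2)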
This is irredeemably hacky HTML parsing. I apologize. But this generates your top 25 list, which is published on this gist.
def top_articles(lang):
# Read the HTML from the web and convert to soup
# Broken URLS here: soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/{0}/TablesWikipedia{0}.htm'.format(lang.upper())).read())
soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/EN/TablesWikipedia{0}.htm'.format(lang.upper())).read())
# Look for all the paragraphs with 2014
_p = soup.findAll('b',text=re.compile('2014'))
# Select only those paragraph parents that have exactly 152 fields, corresponding to the top-25 lists
_p2014 = [t.parent for t in _p if len(t.parent) == 152]
# Get the text out of the children tags as a list of lists
parsed = [[t.text for t in list(p.children) if type(t) != element.NavigableString] for p in _p2014]
# Convert to a dictionary keyed by month abbreviation with values as the list of text fields
parsed = {month[0].split(u'\xa0')[0]:month[1:] for month in parsed}
# Do some crazy dictionary and list comprehensions with zips to convert the values in the list
parsed = {k:[{'rank':int(a),'editors':int(b),'article':c} for a,b,c in zip(v[0::3],v[1::3],v[2::3])] for k,v in parsed.items()}
# Convert each month into a DataFrame with month information in the index
# and then concat all the dfs together, sorting on those with the most editors
ranked = pd.concat([pd.DataFrame(parsed[i],index=[i]*len(parsed[i])) for i in parsed.keys()]).sort('editors',ascending=False).reset_index()
# rename the reset index to something meaningful
ranked.rename(columns={'index':'month'},inplace=True)
# Group the articles by name, compute aggregate statistics
# Rank on the total number editors and months in the top 25
top_articles = ranked.groupby('article').agg({'month':len,'editors':np.sum,'rank':np.min})
top_articles['editor-month'] = top_articles['month'] * top_articles['editors']
top_articles.sort(['editor-month'],ascending=False,inplace=True)
return top_articles
country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portugese','nl':'Dutch','tr':'Turkish',
'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
'uk':'Ukranian'}
top_articles_by_country = {}
for country in country_codes.keys():
try:
top_articles_by_country[country] = top_articles(country)
except urllib2.HTTPError:
print "The '{0}' language does not have a stats page ".format(country)
pass
for _country,_df in top_articles_by_country.items():
    _df.to_csv(_filedir + '/Data/{0}.csv'.format(_country),encoding='utf8') # write alongside where it's read back below
def langlink_translater(source_lang,target_lang,article_titles):
chunks = ws.chunk_maker(article_titles,40)
translation_dict = dict()
for chunk in chunks:
result = ws.wikipedia_query({'action':'query',
'prop': 'langlinks',
'lllang': source_lang,
'titles': '|'.join(chunk),
'lllimit': '500'},target_lang)
if result and 'pages' in result.keys():
translation_dict.update({_d['title'] : _d['langlinks'][0]['*'] for _d in result['pages'].values() if 'langlinks' in _d.keys()})
return translation_dict
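The wikipedia_scraping helpers used above (chunk_maker, wikipedia_query) aren't reproduced in this notebook. As a rough, hypothetical sketch of what chunk_maker presumably does (the real implementation may differ): split a list of titles into groups of at most n so each MediaWiki API request stays under the title limit.
# Hypothetical stand-in for ws.chunk_maker: break an iterable of titles into
# lists of at most n items (n=40 above) for batched API queries.
def chunk_maker(titles, n):
    titles = list(titles)
    return [titles[i:i+n] for i in range(0, len(titles), n)]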
# This step takes a few minutes
translater_dict = {source_lang:{target_lang:langlink_translater(source_lang,target_lang,df.index) for target_lang,df in top_articles_by_country.items()} for source_lang in top_articles_by_country.keys()}
# Save the file
with open('translater_dict.json','wb') as f:
json.dump(translater_dict,f)
Load up the data crawled above.
_filedir
u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'
country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portugese','nl':'Dutch','tr':'Turkish',
'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
'uk':'Ukranian'}
top_articles_by_country = dict()
for country in country_codes.keys():
top_articles_by_country[country] = pd.read_csv(_filedir + '/Data/{0}.csv'.format(country),encoding='utf8',index_col=0)
with open('translater_dict.json','rb') as f:
translater_dict = json.load(f)
lang_link_exists_dict = dict()
top_articles_df = pd.DataFrame()
for source_lang,target_dictionary in translater_dict.iteritems():
langlink_exists_df = pd.DataFrame()
for target_lang,d in target_dictionary.iteritems():
top_articles_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index)
langlink_exists_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index).isin(translater_dict[source_lang][target_lang].keys())
if source_lang == target_lang:
langlink_exists_df[target_lang] = [1]*len(langlink_exists_df[target_lang])
langlink_exists_df = langlink_exists_df.reindex_axis(sorted(langlink_exists_df.columns), axis=1)
lang_link_exists_dict[source_lang] = langlink_exists_df
_df = top_articles_df.ix[:2].T
_df.index = [country_codes[i] for i in _df.index]
_df.columns = range(1,4)
_df.sort()
1 | 2 | 3 | |
---|---|---|---|
Arabic | كريستيانو رونالدو | ريال مدريد | السعودية |
Chinese | 世間情 | 太陽花學運 | 马来西亚航空370号班机空难 |
Czech | Válka na východní Ukrajině | Euromajdan | Minecraft |
Dutch | Lijst van personen overleden in 2014 | Malaysia Airlines-vlucht 17 | Eurovisiesongfestival 2014 |
English | Deaths in 2014 | Malaysia Airlines Flight 370 | Islamic State of Iraq and the Levant |
Farsi | دولت اسلامی عراق و شام | ایل ملکشاهی | مهران مدیری |
French | État islamique (organisation) | Manuel Valls | Dieudonné |
German | Krise in der Ukraine 2014 | Alternative für Deutschland | Fußball-Weltmeisterschaft 2014 |
Indonesian | JKT48 | NET. | Joko Widodo |
Italian | Juventus Football Club | Campionato mondiale di calcio 2014 | Serie A 2013-2014 |
Japanese | 仮面ライダー鎧武/ガイム | 烈車戦隊トッキュウジャー | ハピネスチャージプリキュア! |
Korean | 대한민국 | 일베저장소 | 세월호 침몰 사고 |
Polish | Robert Lewandowski | 2014 | Euromajdan |
Portugese | Em Família (telenovela) | Copa do Mundo FIFA de 2014 | Campeonato Brasileiro de Futebol de 2014 - Sér... |
Russian | Список умерших в 2014 году | Вооружённый конфликт на востоке Украины (2014) | Донецкая Народная Республика |
Spanish | Copa Mundial de Fútbol de 2014 | Podemos (partido político) | Copa Sudamericana 2014 |
Swedish | Sverigedemokraterna | Avlidna 2014 | Feministiskt initiativ |
Turkish | Türkiye | Recep Tayyip Erdoğan | Mustafa Kemal Atatürk |
Ukranian | Війна на сході України | Небесна сотня | Ленінопад |
_df.ix['id'].sum(axis=1)
55.0
_lang = 'en'
f, ax = plt.subplots(figsize=(10,5))
_df = lang_link_exists_dict[_lang].ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]
_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=1)
ax.set_frame_on(False)
ax.set_xticks(np.arange(0.5,_x+.5,10),minor=False)
ax.set_yticks(np.arange(_y)+.5,minor=False)
ax.invert_yaxis()
ax.set_xticklabels(_df.columns[::10],minor=False,fontsize=12)
ax.set_yticklabels([country_codes[x] for x in _df.index],minor=False,fontsize=12)
ax.tick_params(axis='x',direction='in',pad=-10)
ax.set_xlabel('Article rank',fontsize=15)
#f.subplots_adjust(right=0.8)
#cbar_ax = f.add_axes([.95, 0.15, 0.025, .75])
#f.colorbar(_ax, cax=cbar_ax)
f.tight_layout();
f.savefig('en_lang_link_exists.png',dpi=150)
sum_lang_link = pd.DataFrame(np.zeros(lang_link_exists_dict['en'].shape),columns=lang_link_exists_dict['en'].columns)
for lang,_df in lang_link_exists_dict.iteritems():
sum_lang_link = sum_lang_link + _df.values.astype(float)
#frac_sum_lang_link = sum_lang_link.apply(lambda x:x/19)
sum_lang_link.columns = [country_codes[i] for i in sum_lang_link.columns]
f, ax = plt.subplots(figsize=(10,5))
_df = sum_lang_link.ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]
_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=19)
ax.set_frame_on(False)
ax.set_xticks(np.arange(0,_x,10),minor=False)
ax.set_xticklabels(np.arange(0,_x,10),fontsize=12)
ax.set_xlabel('Article Rank',fontsize=15)
ax.set_title('Number of Languages with Article on Topic',fontsize=20)
ax.tick_params(axis='x',direction='in',pad=-10)
ax.set_yticks(np.arange(_y)+.5,minor=False)
ax.set_yticklabels(_df.index,minor=False)
ax.invert_yaxis()
#f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([.875, 0.15, 0.025, .75])
f.colorbar(_ax, cax=cbar_ax)
f.tight_layout()
f.savefig('sum_lang_link.png',dpi=200)
_s = sum_lang_link.ix[:100].apply(np.average,axis=1)
ax = plt.scatter(_s.index,_s.values,s=50,cmap='rainbow')
ax.axes.set_title('Coverage for Top 100 Stories',fontsize=20)
ax.axes.set_xlabel('Article Rank',fontsize=16)
ax.axes.set_ylabel('Number of Languages Covered',fontsize=16)
ax.axes.set_xlim((-1,101))
plt.tight_layout()
plt.savefig('top100_coverage.png',dpi=200)
But all of this analysis about article coverage is inaccurate because we've made the assumption that the interlanguage links between articles are reliable. That is, if a "Eurovision 2014" article exists in all 19 languages, each language's article links to the other 18 languages' versions of the article. The analysis below will show this assumption is flawed.
Make a network of the interlanguage links to reveal the clusters and missing links between languages.
article_language_graph = nx.DiGraph()
article_language_mapper = dict() # This will be helpful later
for source_lang,d in translater_dict.iteritems():
for target_lang,mapping in d.iteritems():
for target_lang_article,source_lang_article in mapping.iteritems():
article_language_graph.add_edge(target_lang_article,source_lang_article)
article_language_graph.add_node(source_lang_article,lang=source_lang)
article_language_graph.add_node(target_lang_article,lang=target_lang)
# Populate the article_language_mapper
if source_lang_article in article_language_mapper.keys():
article_language_mapper[source_lang_article].append(source_lang)
else:
article_language_mapper[source_lang_article] = [source_lang]
if target_lang_article in article_language_mapper.keys():
article_language_mapper[target_lang_article].append(target_lang)
else:
article_language_mapper[target_lang_article] = [target_lang]
nx.write_gexf(article_language_graph,'article_language_graph.gexf')
article_language_mapper = {k:list(set(v)) for k,v in article_language_mapper.iteritems()}
with open('article_language_mapper.json','wb') as f:
json.dump(article_language_mapper,f)
Image('article_language_links.png')
There are at least two kinds of problems in these topical subgraphs. The first problem is that these subgraphs are missing many links within topics; articles about the same topic are linked in some languages, but not others. This is marked in the figure below by the blue observations falling well below the red Ideal line.
The second problem is that different topics are sometimes linked together. Because there are 19 languages we're looking at, there should be a maximum of 19 articles in a cluster. However, there are three clusters that have more than 19 articles in them. This is an artifact of imprecise topical linking. For example, some languages link to the article on Ebolavirus while others link to Ebola virus disease. Entities like Taiwan and China refer to complex and overlapping concepts like political entities (PRC vs. ROC), geographies ("Formosa" vs. Mainland), and cultures. Of course, this disambiguation problem is almost certainly present in the topical subgraphs below size 19 as well, but I'm simply going to ignore it.
topic_subgraphs = list(nx.components.connected_component_subgraphs(article_language_graph.to_undirected()))
subgraph_properties = [{'edges':_subgraph.number_of_edges(),'nodes':_subgraph.number_of_nodes(),'density':nx.density(_subgraph)} for _subgraph in topic_subgraphs]
# Uncomment to see what's in these subgraphs
#for _subgraph in topic_subgraphs:
# if _subgraph.number_of_nodes() > 19:
# print _subgraph.nodes()
subgraph_df = pd.DataFrame(subgraph_properties)
subgraph_df = subgraph_df[subgraph_df['nodes'] > 2]
f,ax = plt.subplots(1,1)
_ax = subgraph_df.plot(x='nodes',y='edges',kind='scatter',label='Observed Topic',ax=ax)
ax.plot([i*(i-1) for i in range(20)],label='Ideal Topic',lw=3,c='r',alpha=.5)
ax.axvline(x=19.5,ls='--',lw=3,c='g',alpha=.5,label='Max Topics')
ax.set_xlim((0,40))
ax.set_ylim((-1,400))
ax.legend(fontsize=12)
ax.set_xlabel('Number of Nodes in Topic',fontsize=18)
ax.set_ylabel('Number of Edges in Topic',fontsize=18)
ax.set_title('Diagnosing Problems in Topic Subgraphs',fontsize=24)
# Based on the results from commented part above, I'm applying three labels to the three outliers
_outliers = zip(['China','Taiwan','Ebola'],subgraph_df[subgraph_df['nodes'] > 20][['nodes','edges']].values)
for label,(x,y) in _outliers:
ax.annotate(label,xy=(x, y),fontsize=12,
xytext=(x+2, y+75),
arrowprops=dict(arrowstyle="fancy", #linestyle="dashed",
color="0.5",shrinkB=8,connectionstyle="arc3,rad=0.3"))
plt.tight_layout();
We need to create these missing links. We'll use a helpful answer from StackOverflow involving itertools to generate an edgelist of every permutation of nodes in the subgraph. We can use the resulting edgelist to create a complete subgraph where every node in the subgraph is linked to every other.
def complete_subgraph_maker(node_list):
return itertools.permutations(node_list,2)
complete_topic_graph = nx.DiGraph()
for _subgraph in topic_subgraphs:
if _subgraph.number_of_nodes() < 20:
_edgelist = complete_subgraph_maker(_subgraph.nodes())
complete_topic_graph.add_edges_from(_edgelist)
# Add the language labels back in as node attributes so we can hopefully translate back
for node in complete_topic_graph.nodes():
complete_topic_graph.add_node(node,lang=article_language_mapper[node])
Now I'm going to impose a very Anglo-centric constraint by creating a topic_graph that contains only articles that are linked in the English Wikipedia. This unfortunately has the effect of removing a few of the topical clusters, but still leaves us with over 1800 articles in English to explore in subsequent steps.
complete_topic_subgraphs = list(nx.components.connected_component_subgraphs(complete_topic_graph.to_undirected()))
english_label_subgraphs = [_subgraph for _subgraph in complete_topic_subgraphs for node,data in _subgraph.nodes_iter(data=True) if 'en' in data['lang']]
print "Out of the initial {0} topical clusters, there are {1} subgraphs in the complete approach. {2} of these have an English label".format(len(topic_subgraphs), len(complete_topic_subgraphs), len(english_label_subgraphs))
english_topic_graph = nx.DiGraph()
topic_translation_dict = dict()
for _subgraph in english_label_subgraphs:
english_topic_graph.add_edges_from(_subgraph.edges(data=True))
english_topic_graph.add_nodes_from(_subgraph.nodes(data=True)) #1
_english_nodes = [_node for _node,_data in _subgraph.nodes_iter(data=True) if 'en' in _data['lang']]
if len(_english_nodes) == 1:
topic_translation_dict.update({_node:_english_nodes[0] for _node in _subgraph.nodes_iter()})
else:
# I really hope this is never the case, but just to be sure
print _english_nodes
# Graphs with lists for attributes can't be serialized into GEXF
# Comment out #1 to make #2 work, or leave #1 uncommented and #2 commented
#nx.write_gexf(english_topic_graph,'english_topic_graph.gexf') #2
Out of the initial 2124 topical clusters, there are 1968 subgraphs in the complete approach. 1885 of these have an English label
Now having done all this work to create topical sub-graphs containing English labels, we can go back to the original lists of top stories and meaningfully compare what the top stories (in English) were across languages.
translated_articles_by_country = pd.DataFrame()
for country in country_codes.keys():
translated_articles_by_country[country] = pd.Series([topic_translation_dict.get(article,np.nan) for article in top_articles_by_country[country].index])
translated_articles_by_country.columns = [country_codes[i] for i in translated_articles_by_country.columns]
translated_articles_by_country.index = range(1,len(translated_articles_by_country)+1)
translated_articles_by_country.sort(axis=1).head(3).T
1 | 2 | 3 | |
---|---|---|---|
Arabic | Cristiano Ronaldo | Real Madrid C.F. | Saudi Arabia |
Chinese | NaN | Sunflower Student Movement | Malaysia Airlines Flight 370 |
Czech | War in Donbass | Euromaidan | Minecraft |
Dutch | Deaths in 2014 | Malaysia Airlines Flight 17 | Eurovision Song Contest 2014 |
English | Deaths in 2014 | Malaysia Airlines Flight 370 | Islamic State of Iraq and the Levant |
Farsi | Islamic State of Iraq and the Levant | NaN | Mehran Modiri |
French | Islamic State of Iraq and the Levant | Manuel Valls | Dieudonné M'bala M'bala |
German | War in Donbass | Alternative for Germany | 2014 FIFA World Cup |
Indonesian | NaN | NaN | Joko Widodo |
Italian | Juventus F.C. | 2014 FIFA World Cup | 2013–14 Serie A |
Japanese | Kamen Rider Gaim | Ressha Sentai ToQger | HappinessCharge PreCure! |
Korean | South Korea | Ilbe Storehouse | Sinking of the MV Sewol |
Polish | Robert Lewandowski | 2014 | Euromaidan |
Portugese | Em Família (telenovela) | 2014 FIFA World Cup | 2014 Campeonato Brasileiro Série A |
Russian | Deaths in 2014 | War in Donbass | Donetsk People's Republic |
Spanish | 2014 FIFA World Cup | Podemos (Spanish political party) | 2014 Copa Sudamericana |
Swedish | Sweden Democrats | Deaths in 2014 | Feminist Initiative (Sweden) |
Turkish | Turkey | Recep Tayyip Erdoğan | Mustafa Kemal Atatürk |
Ukranian | War in Donbass | List of people killed during Euromaidan | NaN |
These are articles that went missing from all that "clean up." Hooray. But at least we have a nice translated version of everything.
pd.Series(top_articles_by_country['en'].index).ix[list(np.array(translated_articles_by_country['English'][translated_articles_by_country['English'].isnull()].index) - 1)]
12     Ebola virus disease
25     Big Brother 16 (U.S.)
32     Nash Grier
41     Ebola virus cases in the United States
75     Republic of Crimea (country)
141    WWE 2K15
143    Siege of Kobanê
148    Gopinath Munde
Name: article, dtype: object
Comparing the extent to which every cell has a valid article that can be translated into English, it looks like the cleanup (bottom) hurt more than it helped compared to the original (top).
f,(ax1,ax2) = plt.subplots(2,1,sharex=True,figsize=(10,5))
# Plot on ax1
_df1 = lang_link_exists_dict['en'].T.astype(float)
_df1 = _df1.ix[_df1.sum(axis=1).sort(inplace=False,ascending=False).index]
_df1.index = [country_codes[i] for i in _df1.index]
_y1,_x1 = _df1.shape
_ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=1)
ax1.set_frame_on(False)
#ax1.set_xticks(np.arange(0,_x1,10),minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_yticklabels(_df1.index,minor=False,fontsize=8)
ax1.set_title('Original',fontsize=18)
# Plot on ax2
_df2 = translated_articles_by_country.T.notnull().astype(float)
_df2 = _df2.ix[_df1.index] # Use the _df1 index
_y2,_x2 = _df2.shape
_ax2 = ax2.pcolor(_df2.values.astype(float),cmap='rainbow',vmin=0,vmax=1)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(0,_x2,10),minor=False)
ax2.set_yticks(np.arange(_y2)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_yticklabels(_df2.index,minor=False,fontsize=8)
ax2.tick_params(axis='x',direction='in',pad=-4)
ax2.set_title('Cleaned',fontsize=18)
f.subplots_adjust(right=0.8)
#cbar_ax = f.add_axes([.95, 0.15, 0.025, .75])
#f.colorbar(_ax, cax=cbar_ax)
f.suptitle('Comparing results, English',fontsize=24)
#f.subplots_adjust(top=0.5)
f.tight_layout(rect=[0,0,1,.9])
top_stories_across_languages = pd.Series(Counter([_val for _array in translated_articles_by_country.values for _val in _array]))
top_stories_across_languages = top_stories_across_languages.ix[1:]
top_stories_across_languages_top5 = top_stories_across_languages[top_stories_across_languages >= 5].sort(inplace=False,ascending=True)
f,ax = plt.subplots(1,1,figsize=(8,10))
_ax = top_stories_across_languages_top5.plot(kind='barh',ax=ax)
ax.axes.set_title('Articles With Widest Coverage\n',fontsize=24)
ax.axes.set_xlabel('Number of Languages',fontsize=18)
f.tight_layout()
f.savefig('widest_coverage.png',dpi=200)
combined_top_articles_df = pd.concat(top_articles_by_country.values(),keys=top_articles_by_country.keys(),axis=0).reset_index()
combined_top_articles_df.rename(columns={'level_0':'lang'},inplace=True)
combined_top_articles_df['article'] = combined_top_articles_df['article'].apply(lambda x:topic_translation_dict.get(x,np.nan))
combined_top_articles_agg_article = combined_top_articles_df.groupby('article').agg({'editors':np.sum,'month':np.average,'lang':len})
combined_top_articles_agg_article['editors per month'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['month']
combined_top_articles_agg_article['editors per lang'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['lang']
combined_top_articles_agg_article['editors-lang-month'] = combined_top_articles_agg_article['editors']*combined_top_articles_agg_article['lang']*combined_top_articles_agg_article['month']
combined_top_articles_agg_article.sort('editors-lang-month',ascending=True,inplace=True)
f,ax = plt.subplots(1,1,figsize=(8,10))
_ax = combined_top_articles_agg_article['editors-lang-month'].ix[-50:].plot(kind='barh',ax=ax)
ax.axes.set_xscale('log')
ax.axes.set_xlabel('Editor-language-month score',fontsize=18)
ax.axes.set_ylabel('')
ax.axes.set_title('Articles with Highest Activity\n',fontsize=24)
f.tight_layout();
f.savefig('highest_activity_ranking.png',dpi=200)
_melted = pd.melt(translated_articles_by_country.reset_index(),id_vars=['index'])
_pivoted = pd.pivot_table(data=_melted,index='value',columns='variable',values='index')
top_by_language_pivoted = _pivoted.ix[top_stories_across_languages_top5.index].fillna(0)
top_by_combined_pivoted = _pivoted.ix[combined_top_articles_agg_article.index[-50:]].fillna(0)
language_cosine = dict()
combined_cosine = dict()
for _lang1 in country_codes.values():
language_cosine[_lang1] = dict()
combined_cosine[_lang1] = dict()
for _lang2 in country_codes.values():
if _lang1 != _lang2:
language_cosine[_lang1][_lang2] = cosine_similarity(top_by_language_pivoted[_lang1],top_by_language_pivoted[_lang2])[0][0]
combined_cosine[_lang1][_lang2] = cosine_similarity(top_by_combined_pivoted[_lang1],top_by_combined_pivoted[_lang2])[0][0]
f,(ax1,ax2) = plt.subplots(1,2,figsize=(9,5),sharey=True)
_df1 = pd.DataFrame(language_cosine)
_order1 = _df1.mean(axis=1).sort(inplace=False,ascending=True).index
_df1 = _df1[_order1].ix[_order1]
_y1,_x1 = _df1.shape
_ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=.75)
ax1.set_title('Language coverage',fontsize=18)
ax1.set_frame_on(False)
ax1.set_xticks(np.arange(_y1)+.5,minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90)
ax1.set_yticklabels(_df1.index,minor=False,fontsize=10)
_df2 = pd.DataFrame(combined_cosine)
_order2 = _df2.mean(axis=1).sort(inplace=False,ascending=True).index
_df2 = _df2[_order1].ix[_order1] # Order the same way as _df1
_y2,_x2 = _df2.shape
_ax2 = ax2.pcolor(_df2,cmap='rainbow',vmin=0,vmax=.75)
ax2.set_title('Editor-language-month score',fontsize=18)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(_y1)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90)
f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([1, 0.15, 0.05, .7])
cb = f.colorbar(_ax2, cax=cbar_ax, label='Cosine similarity')
cb.ax.yaxis.label.set_fontsize(15)
#f.suptitle('Cosine similarity of rankings across languages',fontsize=24)
#f.subplots_adjust(top=0.5)
f.tight_layout()#rect=[0,0,1,.9])
f.savefig('cosine_similarity.png',dpi=200)
_df = pd.DataFrame(data=np.triu(_df2),index=_df2.index,columns=_df2.columns).replace({0:np.nan})
#del _df['id']
_df.reset_index(inplace=True)
coverage = pd.melt(_df,id_vars=['index']).dropna(subset=['value'])
coverage.columns = ['Language 1','Language 2','Cosine Similarity']
#coverage['Language 1'] = coverage['Language 1'].apply(lambda x:country_codes.get(x))
#coverage['Language 2'] = coverage['Language 2'].apply(lambda x:country_codes.get(x))
_highest = coverage.sort('Cosine Similarity',inplace=False,ascending=False).reset_index(drop=True).ix[:9]
_lowest = coverage[coverage['Language 1'] != 'Indonesian'].sort('Cosine Similarity',inplace=False,ascending=True).reset_index(drop=True).ix[:9]
pd.concat([_highest,_lowest],axis=1,keys=['Highest similarities','Lowest similarities'])
Highest similarities | Lowest similarities | |||||
---|---|---|---|---|---|---|
Language 1 | Language 2 | Cosine Similarity | Language 1 | Language 2 | Cosine Similarity | |
0 | Farsi | Dutch | 0.747684 | Ukranian | Swedish | 0.094924 |
1 | French | English | 0.639927 | Japanese | Farsi | 0.115249 |
2 | Czech | Polish | 0.609603 | Japanese | English | 0.136084 |
3 | Polish | English | 0.593232 | Chinese | Czech | 0.160647 |
4 | German | English | 0.592578 | Ukranian | Dutch | 0.169325 |
5 | Farsi | Russian | 0.587311 | Turkish | Arabic | 0.174961 |
6 | Portugese | Farsi | 0.583782 | Japanese | Spanish | 0.175202 |
7 | Chinese | Spanish | 0.580538 | Turkish | Ukranian | 0.177230 |
8 | Farsi | Italian | 0.570912 | Ukranian | Farsi | 0.181022 |
9 | French | Polish | 0.569966 | Turkish | German | 0.186484 |
top_news_articles = [u'2014 FIFA World Cup', u'Malaysia Airlines Flight 370', u'Malaysia Airlines Flight 17',
u'2014 Winter Olympics', u'2014 Crimean crisis', u'Felipe VI of Spain',
u'Islamic State of Iraq and the Levant',u'Ebola virus epidemic in West Africa',u'Eurovision Song Contest 2014',
u'Ice Bucket Challenge', u'2014 Israel\u2013Gaza conflict', u'Minecraft',
u'Scottish independence referendum, 2014',u'2014 Hong Kong protests', u'United States elections, 2014',
u'Soma mine disaster', u'Indian general election, 2014', u'Gamergate controversy',
u'2014 Ferguson unrest',u'Rosetta spacecraft', u'Cuba\u2013United States relations',
u'Chibok schoolgirl kidnapping', u'Sinking of the MV Sewol']
Define an empty revision_dict to hold the data, loop through the list of articles and use get_page_revisions for each of these articles between 2001-01-01 and 2015-01-01 (the _start and _end defined above) in English, and save this data into revision_dict. We'll also write each article's revision data to disk as a CSV file.
revision_dict = dict()
for article in top_news_articles:
print article
revision_dict[article] = ws.get_page_revisions(article,_start,_end,'en')
revision_dict[article].to_csv(_filedir + u'Data/{0}.csv'.format(article),encoding='utf8')
2014 FIFA World Cup
Malaysia Airlines Flight 370
Malaysia Airlines Flight 17
2014 Winter Olympics
2014 Crimean crisis
Felipe VI of Spain
Islamic State of Iraq and the Levant
Ebola virus epidemic in West Africa
Eurovision Song Contest 2014
Ice Bucket Challenge
2014 Israel–Gaza conflict
Minecraft
Scottish independence referendum, 2014
2014 Hong Kong protests
United States elections, 2014
Soma mine disaster
Indian general election, 2014
Gamergate controversy
2014 Ferguson unrest
Rosetta spacecraft
Cuba–United States relations
Chibok schoolgirl kidnapping
Sinking of the MV Sewol
Compute cumulative Gini coefficients for each revision of an article. This captures how centralized the distribution of revisions per editor has become over time.
# http://planspace.org/2013/06/21/how-to-calculate-gini-coefficient-from-raw-data-in-python/
def gini(list_of_values):
if len(list_of_values) > 1:
sorted_list = sorted(list_of_values)
height, area = 0, 0
for value in sorted_list:
height += value
area += height - value / 2.
fair_area = height * len(list_of_values) / 2
gini_value = (fair_area - area) / fair_area
else:
gini_value = np.nan
return gini_value
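A quick sanity check on the function above (these inputs are illustrative, not from the data): evenly distributed edit counts should score 0 and highly concentrated ones should approach 1.
# Sanity check for gini(): even contributions -> 0, concentrated -> close to 1
print gini([5, 5, 5, 5])    # 0.0: four editors with equal edit counts
print gini([1, 1, 1, 97])   # 0.72: one editor makes nearly all the edits
print gini([10])            # nan: undefined for a single editor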
for _df in revision_dict.values():
_df['gini'] = [gini(Counter(_df.ix[:i,'user']).values()) for i in iter(_df.index)]
Do some more data cleanup and write out two large CSV files corresponding to all the data (revisions.csv) and revisions during 2014 (revisions_2014.csv).
rev_df = pd.concat(revision_dict.values(),keys=revision_dict.keys(),axis=0)
rev_df.reset_index(inplace=True,level=0)
rev_df.rename(columns={'level_0':'title'},inplace=True)
rev_df.reset_index(inplace=True,drop=True)
rev_df['anon'] = rev_df['anon'].notnull()
rev_df['userhidden'] = rev_df['userhidden'].notnull()
rev_df['commenthidden'] = rev_df['commenthidden'].notnull()
rev_df.to_csv('revisions.csv',encoding='utf8')
revs2014_df = rev_df[rev_df['timestamp'] >= pd.datetime(2014,1,1,0,0,0)]
revs2014_df.reset_index(drop=True,inplace=True)
revs2014_df.to_csv('revisions_2014.csv',encoding='utf8')
Read the revision histories back in from revisions.csv and revisions_2014.csv as pandas DataFrames and look at everything that's happened already!
rev_df = pd.read_csv('revisions.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp'])
revs2014_df = pd.read_csv('revisions_2014.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp'])
revs2014_df.tail()
title | anon | comment | commenthidden | date | diff | gini | latency | parentid | revid | revision | size | timestamp | unique_users | user | userhidden | userid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7168 | 2014 FIFA World Cup | True | NaN | False | 2014-12-01 | 9 | 0.606035 | 5137 | 636094531 | 636105483 | 7168 | 112203 | 2014-12-01 01:24:41 | 2307 | Moka Mo | False | 22227837 |
7169 | 2014 FIFA World Cup | True | /* Qualification */ | False | 2014-12-06 | -1 | 0.606039 | 461669 | 636105483 | 636867858 | 7169 | 112202 | 2014-12-06 09:39:10 | 2307 | Edgars2007 | False | 8973808 |
7170 | 2014 FIFA World Cup | True | /* External links */ | False | 2014-12-09 | 70 | 0.606093 | 300989 | 636867858 | 637376139 | 7170 | 112272 | 2014-12-09 21:15:39 | 2307 | Chanheigeorge | False | 376297 |
7171 | 2014 FIFA World Cup | True | /* Group stage */ Rearrange | False | 2014-12-12 | 24437 | 0.606147 | 232125 | 637376139 | 637769495 | 7171 | 136709 | 2014-12-12 13:44:24 | 2307 | Soerfm | False | 15268407 |
7172 | 2014 FIFA World Cup | True | /* Match summary */ link | False | 2014-12-12 | 29 | 0.606202 | 1330 | 637769495 | 637771163 | 7172 | 136738 | 2014-12-12 14:06:34 | 2307 | Soerfm | False | 15268407 |
_agg_function = {'revision':np.max,'unique_users':np.max}
revs2014_agg_article = revs2014_df.groupby('title').agg(_agg_function)
f,ax = plt.subplots(1,1,figsize=(8,10))
revs2014_agg_article.sort('revision',inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_ylabel('')
ax.set_xlabel('Count',fontsize=12)
f.tight_layout()
f.savefig('en_19_activity.png',dpi=200)
Compute daily activity numbers for each article by grouping first on article title, then on the date. We use an aggregate function to return the number of unique users, revisions, size of change, and latency between edits for each day. Perform some other data manipulation and cleanup and store the results for 2014-01-01 onward in activity_2014. This dataframe will be used as the data source for subsequent timeseries visualizations of content production.
daily_activity = revs2014_df.groupby(['title','date']).aggregate({'unique_users':max,
'revid':len,
'diff':np.sum,
'latency':np.mean,
'size':np.mean,
'gini':np.mean})
daily_activity = daily_activity.unstack(level=0)
daily_activity.index = pd.to_datetime(daily_activity.index)
daily_activity['unique_users'] = daily_activity['unique_users'].fillna(method='ffill').fillna(0)
daily_activity['revid'] = daily_activity['revid'].fillna(method='ffill').fillna(0)
daily_activity['gini'] = daily_activity['gini'].fillna(method='ffill').fillna(0)
#daily_activity['link_count'] = daily_activity['link_count'].fillna(method='ffill').fillna(0)
daily_activity['size'] = daily_activity['size'].fillna(method='ffill').fillna(0)
daily_activity['diff'] = daily_activity['diff'].fillna(0)
daily_activity['latency'] = daily_activity['latency'].fillna(0)
#daily_activity = daily_activity.fillna(method='ffill').fillna(0)
activity_2014 = daily_activity.ix['2014-1-1':]
activity_2014.tail()
latency | ... | size | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | 2014 Crimean crisis | 2014 FIFA World Cup | 2014 Ferguson unrest | 2014 Hong Kong protests | 2014 Israel–Gaza conflict | 2014 Winter Olympics | Chibok schoolgirl kidnapping | Cuba–United States relations | Ebola virus epidemic in West Africa | Eurovision Song Contest 2014 | ... | Indian general election, 2014 | Islamic State of Iraq and the Levant | Malaysia Airlines Flight 17 | Malaysia Airlines Flight 370 | Minecraft | Rosetta spacecraft | Scottish independence referendum, 2014 | Sinking of the MV Sewol | Soma mine disaster | United States elections, 2014 |
date | |||||||||||||||||||||
2014-12-19 | 30306.4 | 0 | 44294 | 8175.800000 | 0.000000 | 0 | 0 | 7911.545455 | 2413.060606 | 0.0 | ... | 189399 | 230636.621622 | 140466.0 | 220862.500000 | 111471.75 | 79146.000000 | 231326 | 150991 | 29070 | 29644.666667 |
2014-12-20 | 0.0 | 0 | 0 | 16080.666667 | 0.000000 | 1211054 | 0 | 14762.800000 | 7225.071429 | 107732.5 | ... | 189399 | 231294.473684 | 140850.8 | 220783.666667 | 111471.75 | 79146.000000 | 231326 | 150991 | 29070 | 29644.666667 |
2014-12-21 | 0.0 | 0 | 55853 | 0.000000 | 126267.500000 | 0 | 0 | 8865.300000 | 6827.363636 | 0.0 | ... | 189399 | 232124.173913 | 140850.8 | 224719.333333 | 111471.75 | 79146.000000 | 231303 | 150992 | 29070 | 29644.666667 |
2014-12-22 | 65912.5 | 0 | 0 | 13750.076923 | 0.000000 | 0 | 0 | 0.000000 | 9229.800000 | 0.0 | ... | 189399 | 232567.500000 | 142643.0 | 224719.333333 | 111471.75 | 80400.714286 | 231303 | 150992 | 29070 | 29644.666667 |
2014-12-23 | 87836.0 | 0 | 0 | 0.000000 | 21207.666667 | 0 | 0 | 36031.800000 | 16523.200000 | 0.0 | ... | 189959 | 237956.000000 | 143447.0 | 225217.666667 | 111471.75 | 81914.000000 | 231303 | 150980 | 29070 | 29644.666667 |
5 rows × 138 columns
The first thing to plot is the number of unique users over time. Some articles like the World Cup or Winter Olympics had articles before the event actually occurred while other articles are about un-anticipated events like the Crimean crisis or Israel-Gaza conflict. To make these more comparable, the number of unique users across articles is normalized to 0 on January 1. The result is we see which articles add the most new users over the course of the year.
MA 370 has the most unique users, with 2095 new unique users in 2014, followed by the rapidly rising "Ebola virus epidemic" article with 1110 new unique users and the long-simmering "2014 Crimean crisis".
normalized_unique_users = activity_2014['unique_users'] - activity_2014.ix['2014-1-1','unique_users']
ax = normalized_unique_users.plot(colormap='spectral')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('New unique users since Jan. 1',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
users_rank_s = normalized_unique_users.ix['2014-12-21'].order(ascending=False)
users_rank_s
title
2014 FIFA World Cup                       2307
Malaysia Airlines Flight 370              2145
Ebola virus epidemic in West Africa       1374
Minecraft                                 1353
Islamic State of Iraq and the Levant      1127
2014 Crimean crisis                        933
2014 Israel–Gaza conflict                  857
Malaysia Airlines Flight 17                796
Felipe VI of Spain                         750
Cuba–United States relations               573
Scottish independence referendum, 2014     550
Ice Bucket Challenge                       539
Indian general election, 2014              482
Sinking of the MV Sewol                    420
2014 Hong Kong protests                    417
2014 Ferguson unrest                       380
2014 Winter Olympics                       337
Eurovision Song Contest 2014               277
Gamergate controversy                      233
Rosetta spacecraft                         218
Chibok schoolgirl kidnapping               202
Soma mine disaster                         159
United States elections, 2014              156
Name: 2014-12-21 00:00:00, dtype: float64
Plot the number of revisions made per day. There are major peaks for acute events like the Crimean crisis and MA370 disappearance in March, the death of Robin Williams in August, and the Scottish vote in September.
Looking at the cumulative number of revisions made during the year-to-date, MA370 is still the big story with over 10k changes made, followed by the Israel-Gaza conflict, Ebola, and ISIL articles.
f,ax = plt.subplots(1,1,figsize=(10,6))
_ax = activity_2014['revid'].plot(colormap='spectral',lw=3,ax=ax)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Revisions',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5],ncol=1)
ax.set_title('Revisions over time',fontsize=18)
f.tight_layout()
f.savefig('revisions.png',dpi=200,bbox_inches='tight')
revisions_rank_s = (activity_2014['revid'].cumsum().ix['2014-12-21'] - activity_2014['revid'].cumsum().ix['2014-1-1']).order(ascending=False)
revisions_rank_s
title
Malaysia Airlines Flight 370              10336
Ebola virus epidemic in West Africa        7818
Islamic State of Iraq and the Levant       7545
2014 Israel–Gaza conflict                  6550
Malaysia Airlines Flight 17                5202
2014 Crimean crisis                        4271
2014 Hong Kong protests                    3852
Gamergate controversy                      3304
2014 FIFA World Cup                        3107
Indian general election, 2014              2923
Scottish independence referendum, 2014     2374
Eurovision Song Contest 2014               1963
2014 Winter Olympics                       1793
Sinking of the MV Sewol                    1692
Ice Bucket Challenge                       1647
2014 Ferguson unrest                       1483
Felipe VI of Spain                         1231
Rosetta spacecraft                         1152
Minecraft                                   900
Chibok schoolgirl kidnapping                865
United States elections, 2014               840
Soma mine disaster                          634
Cuba–United States relations                579
dtype: float64
Information about the number of contributing editors and the number of revisions obscures how this work is distributed. Using the Gini coefficient to measure the dispersion in the data, we can capture how inequality of contributions to articles has changed over time. For example, the work might be evenly distributed with all editors making equal contributions (Gini = 0) or the work might be highly concentrated with one editor making almost all of the contributions (Gini = 1).
The plot shows a general trend towards increasingly concentrated editing activity across many articles. The articles about the Hong Kong protests and Israel-Gaza conflict show very high levels of centralized editing activity while the articles about the Olympics and Robin Williams show more evenly-distributed activity.
ax = activity_2014['gini'].plot(colormap='spectral',lw=3)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Gini coefficient',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
gini_rank_s = activity_2014['gini'].ix['2014-12-21'].order(ascending=False)
gini_rank_s
title
2014 Hong Kong protests                   0.833029
Gamergate controversy                     0.819048
Islamic State of Iraq and the Levant      0.800438
2014 Israel–Gaza conflict                 0.786471
Ebola virus epidemic in West Africa       0.767065
Malaysia Airlines Flight 370              0.723217
Indian general election, 2014             0.715372
Malaysia Airlines Flight 17               0.713623
Eurovision Song Contest 2014              0.696886
2014 Crimean crisis                       0.689587
Scottish independence referendum, 2014    0.653276
Sinking of the MV Sewol                   0.629018
2014 Ferguson unrest                      0.621660
2014 FIFA World Cup                       0.606175
Minecraft                                 0.594117
2014 Winter Olympics                      0.575925
Ice Bucket Challenge                      0.552910
Soma mine disaster                        0.552441
Rosetta spacecraft                        0.496439
Chibok schoolgirl kidnapping              0.488796
Felipe VI of Spain                        0.475323
Cuba–United States relations              0.438778
United States elections, 2014             0.437851
Name: 2014-12-21 00:00:00, dtype: float64
Plotting the size of the article's markup (in kilobytes), there is some variability in the size of articles over time. Some sharp valleys in articles like MA370 suggest large sections being removed and quickly restored while other articles like Heartbleed and Robin Williams have more sudden growth followed by stabilization. MA370 and the Scottish vote make up the two largest articles in the corpus currently.
ax = (activity_2014['size']/1000.).plot(colormap='spectral',lw=3)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Article size (kB)',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
size_rank_s = (activity_2014['size']/1000.).ix['2014-12-21'].order(ascending=False)
size_rank_s
title
2014 Israel–Gaza conflict                 247.040500
Islamic State of Iraq and the Levant      232.124174
Scottish independence referendum, 2014    231.303000
Malaysia Airlines Flight 370              224.719333
Ebola virus epidemic in West Africa       213.271364
2014 Crimean crisis                       204.954000
Indian general election, 2014             189.399000
2014 Hong Kong protests                   179.214333
2014 Ferguson unrest                      170.762000
Eurovision Song Contest 2014              163.456000
Sinking of the MV Sewol                   150.992000
Malaysia Airlines Flight 17               140.850800
2014 FIFA World Cup                       136.723500
Minecraft                                 111.471750
Gamergate controversy                     106.945526
2014 Winter Olympics                       96.313000
Rosetta spacecraft                         79.146000
Cuba–United States relations               58.256900
Ice Bucket Challenge                       50.676000
Chibok schoolgirl kidnapping               45.199000
Felipe VI of Spain                         34.538000
United States elections, 2014              29.644667
Soma mine disaster                         29.070000
Name: 2014-12-21 00:00:00, dtype: float64
f,(ax1,ax2,ax3,ax4) = plt.subplots(4,1,figsize=(10,10),sharex=True)
_ax1 = activity_2014['unique_users'].ix['1-7-2014':].diff().plot(colormap='spectral',lw=2,ax=ax1,legend=None)
ax1.set_xlabel('')
ax1.set_ylabel('Users')
ax1.set_title('New users',fontsize=15)
_ax2 = activity_2014['revid'].plot(colormap='spectral',lw=2,ax=ax2,legend=None)
ax2.set_xlabel('')
ax2.set_ylabel('Revisions')
ax2.set_title('Revisions made',fontsize=15)
_ax3 = activity_2014['gini'].diff().plot(colormap='spectral',lw=2,ax=ax3,legend=None)
ax3.set_xlabel('')
ax3.set_ylabel('Gini delta')
ax3.set_title('Change in centralization',fontsize=15)
_ax4 = (activity_2014['diff']/1000.).diff().plot(colormap='spectral',lw=2,ax=ax4)
ax4.set_xlabel('')
ax4.set_ylabel('Kilobytes (kB) delta')
ax4.set_title('Change in article size',fontsize=15)
ax4.set_ylim((-100,100))
#ax4.set_yscale('symlog')
_colors = dict(zip(sorted(revs2014_df['title'].unique()),sns.color_palette('spectral', len(revs2014_df['title'].unique()))))
handles, labels = _ax4.get_legend_handles_labels()
ax4.legend_.remove()
new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(revs2014_df['title'].unique())]
f.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=15)
f.tight_layout()
f.savefig('article_changes.png',dpi=200,bbox_inches='tight')
_table = pd.concat([users_rank_s.round(2),revisions_rank_s.round(2),gini_rank_s.round(2),size_rank_s.round(2)],
axis=1,keys=['Users','Revisions','Gini','Length'])
_table[['Revisions','Users','Gini','Length']].sort(['Revisions','Users','Gini','Length'],ascending=False)
Revisions | Users | Gini | Length | |
---|---|---|---|---|
Malaysia Airlines Flight 370 | 10336 | 2145 | 0.72 | 224.72 |
Ebola virus epidemic in West Africa | 7818 | 1374 | 0.77 | 213.27 |
Islamic State of Iraq and the Levant | 7545 | 1127 | 0.80 | 232.12 |
2014 Israel–Gaza conflict | 6550 | 857 | 0.79 | 247.04 |
Malaysia Airlines Flight 17 | 5202 | 796 | 0.71 | 140.85 |
2014 Crimean crisis | 4271 | 933 | 0.69 | 204.95 |
2014 Hong Kong protests | 3852 | 417 | 0.83 | 179.21 |
Gamergate controversy | 3304 | 233 | 0.82 | 106.95 |
2014 FIFA World Cup | 3107 | 2307 | 0.61 | 136.72 |
Indian general election, 2014 | 2923 | 482 | 0.72 | 189.40 |
Scottish independence referendum, 2014 | 2374 | 550 | 0.65 | 231.30 |
Eurovision Song Contest 2014 | 1963 | 277 | 0.70 | 163.46 |
2014 Winter Olympics | 1793 | 337 | 0.58 | 96.31 |
Sinking of the MV Sewol | 1692 | 420 | 0.63 | 150.99 |
Ice Bucket Challenge | 1647 | 539 | 0.55 | 50.68 |
2014 Ferguson unrest | 1483 | 380 | 0.62 | 170.76 |
Felipe VI of Spain | 1231 | 750 | 0.48 | 34.54 |
Rosetta spacecraft | 1152 | 218 | 0.50 | 79.15 |
Minecraft | 900 | 1353 | 0.59 | 111.47 |
Chibok schoolgirl kidnapping | 865 | 202 | 0.49 | 45.20 |
United States elections, 2014 | 840 | 156 | 0.44 | 29.64 |
Soma mine disaster | 634 | 159 | 0.55 | 29.07 |
Cuba–United States relations | 579 | 573 | 0.44 | 58.26 |
Wikipedia articles link to other Wikipedia articles to provide further details. The number of links in an article thus provides a coarse measure of how many other topics the article is related to. The ISIL, Crimean, Scottish, and Olympics articles have over 400 unique links while other articles like Heartbleed and the Hong Kong protests have fewer than 200.
ax = activity_2014['link_count'].plot(colormap='gist_rainbow')
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Links in article',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
activity_2014['link_count'].ix['2014-10-16'].order(ascending=False)
title
Islamic State of Iraq and the Levant      476.000000
2014 Crimean crisis                       448.000000
Scottish independence referendum, 2014    426.500000
2014 Winter Olympics                      410.000000
2014 FIFA World Cup                       382.000000
Robin Williams                            288.000000
Malaysia Airlines Flight 370              279.000000
2014 Israel–Gaza conflict                 238.000000
Ebola virus epidemic in West Africa       192.819149
Heartbleed                                151.000000
2014 Hong Kong protests                   149.630769
Name: 2014-10-16 00:00:00, dtype: float64
The amount of content (number of bytes) and the number of links in an article can be combined to form a ratio of links per byte (LPB). Articles with a high LPB suggest the article tends to send people away to other articles for information. Articles with a low LPB suggest there are few other articles that contain more relevant information than the current article. More imprecisely, articles with high LPB should be lower quality articles with less substantive content and articles with lower LPB should be higher quality articles with more thorough discussion.
There's a tendency for breaking news articles to get fewer LPB over time, suggesting an increase in quality. The MA370 and Ebola articles have the lowest LPB as they have detailed discussions of the events, context, and people while the articles for the World Cup and Olympics have the highest LPB as they mostly link to other sub-pages about teams and events.
links_per_byte = (activity_2014['link_count']/activity_2014['size'])
ax = links_per_byte.plot(colormap='gist_rainbow')
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Links per byte',fontsize=15)
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
links_per_byte.ix['2014-10-16'].order(ascending=True)
title
Malaysia Airlines Flight 370              0.001008
Ebola virus epidemic in West Africa       0.001084
2014 Israel–Gaza conflict                 0.001165
2014 Hong Kong protests                   0.001251
Scottish independence referendum, 2014    0.001871
Heartbleed                                0.002020
2014 Crimean crisis                       0.002238
Islamic State of Iraq and the Levant      0.002445
Robin Williams                            0.003192
2014 FIFA World Cup                       0.003409
2014 Winter Olympics                      0.003612
Name: 2014-10-16 00:00:00, dtype: float64
Next measure how long a revision to an article "lives" before another revision is made. Breaking news articles where lots of editors are trying to make changes in response to new information may mean that an edit only lives for a few seconds or minutes before being changed. As the article stabilizes, the latency between edits should increase, reflecting that changes become less frequent. Articles for ISIL, Robin Williams, the World Cup, and the Scottish referendum were written in advance of the events themselves, but still had average latencies of only a few hours at the start of the year. New articles about breaking news events start with very short latencies between edits that lengthen over time.
#ax = daily_activity.ix['2014-1-1':,'latency'].plot(colormap='gist_rainbow')
ax = pd.rolling_mean(daily_activity.ix['2013-11-1':,'latency'],28).ix['2014-1-1':].plot(colormap='spectral')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Edit latency (seconds)',fontsize=15)
ax.set_yscale('symlog')
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
daily_activity.ix['2014-1-1':,'latency'].mean().order(ascending=True)
title
2014 Hong Kong protests                    1118.243382
Gamergate controversy                      5386.759761
2014 Ferguson unrest                       7044.298083
Malaysia Airlines Flight 17                8742.343510
Ebola virus epidemic in West Africa       14984.461360
Ice Bucket Challenge                      15739.002636
Malaysia Airlines Flight 370              19438.206178
Islamic State of Iraq and the Levant      21140.547770
Scottish independence referendum, 2014    35588.366615
2014 Crimean crisis                       36745.956090
2014 FIFA World Cup                       37520.299262
Indian general election, 2014             39796.101925
Rosetta (spacecraft)                      41674.952889
2014 Winter Olympics                      44095.782572
Soma mine disaster                        44172.268543
Felipe VI of Spain                        51472.925482
Malala Yousafzai                          51515.912970
Minecraft                                 52667.175757
Conchita Wurst                            53032.325408
Cuba–United States relations              62546.472660
dtype: float64
In addition to the information production statistics analyzed above, the number of article pageviews captures a measure of information consumption. The make_pageview_df function takes a list of article titles and returns a DataFrame indexed by day, with columns corresponding to articles and values giving the number of pageviews for that article on that day. If you've already done the step above, you can read in the CSV file pageviews.csv.
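The make_pageview_df function itself isn't included in this notebook; a rough sketch of what it might have looked like is below, assuming the old stats.grok.se JSON endpoint that was available at the time. The URL format and the 'daily_views' field are assumptions, not the notebook's actual implementation.
import urllib
def make_pageview_df(article_titles, lang='en', year=2014):
    # Hypothetical: fetch daily pageviews per article and month from stats.grok.se
    months = ['{0}{1:02d}'.format(year, m) for m in range(1, 13)]
    series = {}
    for title in article_titles:
        daily = {}
        for month in months:
            url = 'http://stats.grok.se/json/{0}/{1}/{2}'.format(lang, month, urllib.quote(title.encode('utf8')))
            data = json.loads(urllib2.urlopen(url).read())
            daily.update(data.get('daily_views', {}))
        series[title] = pd.Series(daily)
    # One column per article, indexed by day
    pv = pd.DataFrame(series)
    pv.index = pd.to_datetime(pv.index)
    return pv.sort_index()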
pv_df = pd.read_csv('pageviews_Dec.csv',encoding='utf8',index_col=0,parse_dates=[0])
del pv_df['Heartbleed']
pv_2014 = pv_df.ix['1-1-2014':]
pv_df.tail()
2014 Crimean crisis | 2014 FIFA World Cup | 2014 Ferguson unrest | 2014 Hong Kong protests | 2014 Israel–Gaza conflict | 2014 Winter Olympics | Chibok schoolgirl kidnapping | Cuba–United States relations | Ebola virus epidemic in West Africa | Eurovision Song Contest 2014 | ... | Indian general election, 2014 | Islamic State of Iraq and the Levant | Malaysia Airlines Flight 17 | Malaysia Airlines Flight 370 | Minecraft | Rosetta spacecraft | Scottish independence referendum, 2014 | Sinking of the MV Sewol | Soma mine disaster | United States elections, 2014 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2014-12-17 | 2663 | 5779 | 4979 | 2294 | 1573 | 2964 | 736 | 13112 | 9673 | 747 | ... | 1691 | 26288 | 3131 | 9257 | 7495 | 2506 | 1557 | 714 | 124 | 940 |
2014-12-18 | 2390 | 5552 | 19184 | 2038 | 1489 | 2237 | 830 | 28682 | 10765 | 682 | ... | 1852 | 29494 | 2850 | 7919 | 8742 | 1835 | 1257 | 601 | 115 | 911 |
2014-12-19 | 2044 | 4953 | 2418 | 1463 | 1248 | 1946 | 683 | 13209 | 7873 | 766 | ... | 1690 | 20771 | 2502 | 6784 | 21895 | 1645 | 978 | 611 | 102 | 826 |
2014-12-20 | 1666 | 5039 | 1944 | 1005 | 981 | 1517 | 381 | 7491 | 6175 | 932 | ... | 1537 | 15989 | 1959 | 4823 | 12914 | 1086 | 827 | 485 | 66 | 482 |
2014-12-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 23 columns
Pageviews exhibit a strong weekly cycle across articles: less content is consumed on weekends than weekdays. But there are "bursts" in pageview attention to articles in the immediate aftermath of an event. The death of Robin Williams had the highest peak of pageview activity, but the World Cup and Heartbleed also showed high levels of attention. Much of this attention falls exponentially back to a few thousand pageviews about 2 months after the precipitating event.
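As a quick check on the weekly cycle described above (not part of the original analysis), average the total daily pageviews by day of week using the pv_2014 frame loaded earlier.
# Mean total daily pageviews by weekday (0 = Monday ... 6 = Sunday)
total_views = pv_2014.sum(axis=1)
print total_views.groupby(lambda d: d.dayofweek).mean()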
f,ax = plt.subplots(1,1,figsize=(10,5))
_ax = pv_df.ix['2014-1-1':].plot(colormap='spectral',lw=3,ax=ax)
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Pageviews',fontsize=15)
ax.legend(loc='center right',bbox_to_anchor=[1.35,.5],fontsize=9,ncol=1)
f.tight_layout()
f.savefig('pageviews.png',dpi=200,bbox_inches='tight')
pv_df.ix['2014-1-1':].sum().sort(ascending=False,inplace=False)
2014 FIFA World Cup                       15114251
Islamic State of Iraq and the Levant       7958521
Malaysia Airlines Flight 370               7104838
Ice Bucket Challenge                       5393702
2014 Winter Olympics                       4994186
Minecraft                                  3536207
Indian general election, 2014              3308296
Scottish independence referendum, 2014     2453512
Ebola virus epidemic in West Africa        2229304
Malaysia Airlines Flight 17                2173782
Eurovision Song Contest 2014               2045654
2014 Crimean crisis                        1955729
Gamergate controversy                      1255098
2014 Israel–Gaza conflict                   920625
Rosetta spacecraft                          865312
2014 Ferguson unrest                        787856
United States elections, 2014               672488
2014 Hong Kong protests                     496953
Sinking of the MV Sewol                     436051
Felipe VI of Spain                          408235
Cuba–United States relations                190825
Soma mine disaster                          139668
Chibok schoolgirl kidnapping                108700
dtype: float64
ax = (pv_df.ix['2014-1-1':]/pv_df.max(axis=0)).plot(colormap='spectral',lw=3)
#ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Pageviews',fontsize=15)
#ax.set_ylim((10**2,10**7))
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
f,ax = plt.subplots(1,1,figsize=(8,8))
pv_df.sum(axis=0).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_title('Cumulative pageviews',fontsize=18)
ax.set_xlabel('Total page views',fontsize=15)
ax.set_xscale('symlog')
f.tight_layout()
f.savefig('cumulative_pageviews.png',dpi=200)
Now use the idxmax function (based on this StackOverflow question) to identify the index corresponding to the dates for each article's pageview peak.
pv_melted = pd.melt(pv_df.reset_index(),id_vars=['index'])
pv_gb_page = pv_melted.groupby('variable')
_idx = pv_melted.groupby('variable')['value'].agg(lambda col: col.idxmax())
pv_max = pv_melted.ix[_idx]
pv_max.columns = ['date','article','pageviews']
pv_max = pv_max.set_index('article')
pv_max
date | pageviews | |
---|---|---|
article | ||
2014 Crimean crisis | 2014-03-03 | 114744 |
2014 FIFA World Cup | 2014-06-12 | 494388 |
2014 Ferguson unrest | 2014-11-25 | 133152 |
2014 Hong Kong protests | 2014-10-02 | 41168 |
2014 Israel–Gaza conflict | 2014-07-31 | 35057 |
2014 Winter Olympics | 2014-02-07 | 290268 |
Chibok schoolgirl kidnapping | 2014-09-03 | 1680 |
Cuba–United States relations | 2014-12-18 | 28682 |
Ebola virus epidemic in West Africa | 2014-10-15 | 70894 |
Eurovision Song Contest 2014 | 2014-05-11 | 240290 |
Felipe VI of Spain | 2014-06-21 | 35789 |
Gamergate controversy | 2014-10-23 | 89858 |
Ice Bucket Challenge | 2014-08-21 | 559582 |
Indian general election, 2014 | 2014-05-16 | 287240 |
Islamic State of Iraq and the Levant | 2014-09-03 | 297580 |
Malaysia Airlines Flight 17 | 2014-07-18 | 425012 |
Malaysia Airlines Flight 370 | 2014-03-10 | 288597 |
Minecraft | 2014-09-16 | 46521 |
Rosetta spacecraft | 2014-11-12 | 119264 |
Scottish independence referendum, 2014 | 2014-09-19 | 289786 |
Sinking of the MV Sewol | 2014-04-27 | 17393 |
Soma mine disaster | 2014-05-14 | 32152 |
United States elections, 2014 | 2014-11-05 | 63299 |
pv_gb_page.groups.keys()
f,ax = plt.subplots(1,1,figsize=(8,8))
pv_df.max().sort(inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_title('Most Pageviews in a Day',fontsize=18)
ax.set_xlabel('Pageviews',fontsize=15)
ax.set_xscale('symlog')
f.tight_layout()
f.savefig('max_pageviews.png',dpi=200)
f,ax = plt.subplots(1,1,figsize=(10,5))
_cmap = 'spectral'
_topics = ['Malaysia Airlines Flight 17','2014 FIFA World Cup','Islamic State of Iraq and the Levant','Minecraft']
_data = pv_df[_topics].ix['1-1-2014':]
#_ax = _data.plot(lw=3,ax=ax,cmap=_cmap)
#ax.set_yscale('symlog')
#ax.set_ylim((1e2,1e7))
ax.legend(fontsize=12,loc='upper left')
_colors = dict(zip(pv_df.columns,sns.color_palette(_cmap, len(pv_df.columns))))
for d in _topics:
    ax.plot(pv_df.ix['1-1-2014':,d].index,pv_df.ix['1-1-2014':,d].values,c=_colors[d],lw=2,label=d)
    #ax.fill_between(_data[d].index, _data[d].values, _data['Minecraft'].values, facecolor=_colors[d], alpha=0.33)
ax.set_xticklabels(['Jan 2014','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax.legend(loc='center right',bbox_to_anchor=[1.4,.5],fontsize=12)
f.tight_layout()
f.savefig('pageviews_shapes.png',dpi=200,bbox_inches='tight')
f,ax = plt.subplots(1,1,figsize=(8,8))
(pv_df.max()/pv_df.sum()).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax)
ax.set_title('Fraction of Total Pageviews in Peak',fontsize=18)
ax.set_xlabel('Fraction of Pageviews',fontsize=15)
#ax.set_xscale('log')
f.tight_layout()
f.savefig('peak_fraction.png',dpi=200)
The ratio of information production and consumption reveals whether a peer-produced encyclopedia like Wikipedia is generating enough content relative to the demand for information. More specifically, measuring the ratio of pageviews (information demand) to revisions (information production) reveals during what times and on what topics these are matched or off-balance. Call this the "information conduced" ratio: for example, an article receiving 30,000 pageviews and 9 revisions in a day has a ratio of 30,000/(9+1) = 3,000 readers per change made (the +1 in the denominator below guards against days with no revisions).
The recent "Heartbleed 2.0" bug produced a major spike in attention without a corresponding spike of similar magnitude in editing behavior, creating the highest conduced ratio, but other major media events like the Olympics and World Cup also have high ratios. This suggests that Wikipedians were not generating changes to these articles to reflect the demand for information. In contrast, other breaking news topics like MA370, the Ebola outbreak, and others were more evenly matched between information production and consumption.
pv_2014.columns
Index([u'2014 Crimean crisis', u'2014 FIFA World Cup', u'2014 Ferguson unrest', u'2014 Hong Kong protests', u'2014 Israel–Gaza conflict', u'2014 Winter Olympics', u'Chibok schoolgirl kidnapping', u'Cuba–United States relations', u'Ebola virus epidemic in West Africa', u'Eurovision Song Contest 2014', u'Felipe VI of Spain', u'Gamergate controversy', u'Ice Bucket Challenge', u'Indian general election, 2014', u'Islamic State of Iraq and the Levant', u'Malaysia Airlines Flight 17', u'Malaysia Airlines Flight 370', u'Minecraft', u'Rosetta spacecraft', u'Scottish independence referendum, 2014', u'Sinking of the MV Sewol', u'Soma mine disaster', u'United States elections, 2014'], dtype='object')
information_conduced_df = pv_2014/(activity_2014['revid']+1)
ax = pd.rolling_mean(information_conduced_df,7).ix['2014-1-1':].plot(colormap='spectral')
ax.set_yscale('symlog')
ax.set_xlabel('Time',fontsize=15)
ax.set_ylabel('Consumption/Production ratio',fontsize=15)
#ax.set_ylim((0,20000))
ax.legend(loc='center left',bbox_to_anchor=[1,.5])
<matplotlib.legend.Legend at 0x5d9feda0>
_s = pd.melt(information_conduced_df.reset_index(),id_vars='index').replace({np.inf:np.nan,-np.inf:np.nan}).dropna()
_s.columns = ['date','article','ratio']
_top = _s.sort('ratio',inplace=False,ascending=False).reset_index(drop=True).ix[:10]
# Exclude uninteresting edge cases. Sorry Your Excellence, you're a boring fellow.
_bottom = _s[(_s['ratio'] > 0) & (_s['article'] != 'Felipe VI of Spain')]
_bottom = _bottom.sort('ratio',inplace=False,ascending=True).reset_index(drop=True).ix[:10]
pd.concat([_top,_bottom],keys=['Greater consumption per production','Lesser consumption per production'],axis=1)
Greater consumption per production | Lesser consumption per production | |||||
---|---|---|---|---|---|---|
date | article | ratio | date | article | ratio | |
0 | 2014-10-23 | Gamergate controversy | 44929.000000 | 2014-08-02 | Ice Bucket Challenge | 1.000000 |
1 | 2014-10-24 | Gamergate controversy | 32957.000000 | 2014-05-13 | Soma mine disaster | 1.840909 |
2 | 2014-02-14 | 2014 Winter Olympics | 31753.000000 | 2014-08-12 | Ice Bucket Challenge | 2.000000 |
3 | 2014-08-31 | Ice Bucket Challenge | 30047.333333 | 2014-08-11 | Ice Bucket Challenge | 2.000000 |
4 | 2014-08-27 | Ice Bucket Challenge | 29725.666667 | 2014-08-10 | Ice Bucket Challenge | 2.000000 |
5 | 2014-10-15 | Gamergate controversy | 28503.500000 | 2014-01-05 | Scottish independence referendum, 2014 | 17.333333 |
6 | 2014-06-11 | 2014 FIFA World Cup | 25593.000000 | 2014-09-21 | Gamergate controversy | 20.725490 |
7 | 2014-02-15 | 2014 Winter Olympics | 24110.600000 | 2014-08-13 | Ice Bucket Challenge | 22.888889 |
8 | 2014-09-03 | Ice Bucket Challenge | 23886.000000 | 2014-09-06 | Soma mine disaster | 23.500000 |
9 | 2014-10-16 | Gamergate controversy | 23419.500000 | 2014-09-28 | 2014 Hong Kong protests | 24.964286 |
10 | 2014-05-15 | 2014 FIFA World Cup | 22044.000000 | 2014-08-26 | Soma mine disaster | 26.000000 |
_p = pd.melt(activity_2014['revid'].reset_index(),id_vars='date')
_c = pd.melt(pv_2014.reset_index(),id_vars='index')
_j = pd.merge(_p,_c,left_on=['date','title'],right_on=['index','variable'],copy=False)
_j = _j[['date','title','value_x','value_y']]
_j.columns = ['date','article','production','consumption']
_j_gb = _j.groupby('article')
f,ax = plt.subplots(1,1,figsize=(10,10))
_colors = dict(zip(sorted(_j_gb.groups.keys()),sns.color_palette('spectral', len(_j_gb.groups.keys()))))
for article in sorted(_j_gb.groups.keys()):
    _data = _j_gb.get_group(article)[['production','consumption']]
    _data['production_z'] = (_data['production'] - _data['production'].mean())/_data['production'].std()
    _data['consumption_z'] = (_data['consumption'] - _data['consumption'].mean())/_data['consumption'].std()
    _data['ratio'] = np.abs(_data['consumption_z'] + _data['production_z'])
    #_data = _data[(_data['production'] > 0) & (_data['consumption'] > 0)]
    sns.regplot(_data['production_z'],_data['consumption_z'],
                ci=None,color=_colors[article],label=article,ax=ax,lowess=True,
                scatter_kws={'s':50*np.abs(_data['ratio']),'alpha':.33},line_kws={'lw':5})
    #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=50,lw=0,alpha=.5,label=article)
    #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=250*np.log(_data['ratio']),lw=0,alpha=.5,label=article)
ax.set_xlabel('Revisions (Z-score)',fontsize=15)
ax.set_ylabel('Pageviews (Z-score)',fontsize=15)
ax.set_yscale('symlog')
ax.set_ylim((-1,20))
ax.set_xscale('symlog')
ax.set_xlim((-1,20))
ax.plot((-1,20),(-1,20),'--',lw=3,c='k')
handles, labels = ax.get_legend_handles_labels()
new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(_j_gb.groups.keys())]
ax.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=12)
f.savefig('pv_vs_revision.png',dpi=200,bbox_inches='tight')
handles[0]
<matplotlib.collections.PathCollection at 0x3c7b6160>
Define a dictionary aggregator to use for generating attribute data for edges, articles, and editors from the revision history. This will be used throughout several of the following steps. Create a new DataFrame by grouping the revisions together by article title and user username, and apply the aggregator function to this groupby object to generate relevant statistics and attributes. Other data cleanup simplifies the data structure and converts timestamps from Timestamp objects into better-behaved floats.
agg_function = {'revid':{'weight':len},
'timestamp':{'ts_min':np.min,'ts_max':np.max},
'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max,'total_changes':np.sum},
'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max},
#'link_count':{'link_count_min':np.min,'link_count_median':np.median,'link_count_max':np.max}
}
revs_gb_edge = revs.groupby(['title','user'])
revs_edgelist = revs_gb_edge.agg(agg_function)
revs_edgelist.columns = revs_edgelist.columns.droplevel(0)
# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
revs_edgelist['ts_min'] = (revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_edgelist['ts_max'] = (revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_edgelist.head()
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | user | |||||||||||||
2014 Crimean crisis | 1.178.88.30 | 4808.280891 | 4808.280891 | 634 | 634 | 634.0 | 1 | 748 | 748 | 748 | 1 | 1 | 1 | 1 |
10.4.0.34 | 4811.231632 | 4811.231632 | 1342 | 1342 | 1342.0 | 1 | 2267 | 2267 | 2267 | 48 | 48 | 48 | 48 | |
101.98.175.68 | 4870.264688 | 4870.265058 | 3644 | 3645 | 3644.5 | 2 | 32 | 6020 | 12008 | 72 | 143 | 1 | 144 | |
106.68.144.182 | 4995.320382 | 4995.320382 | 3896 | 3896 | 3896.0 | 1 | 384782 | 384782 | 384782 | 1 | 1 | 1 | 1 | |
107.15.237.75 | 4808.931308 | 4808.931308 | 852 | 852 | 852.0 | 1 | 33 | 33 | 33 | -40 | -40 | -40 | -40 |
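A quick illustration (toy data, not from the corpus) of why the droplevel(0) step above is needed: the dict-of-dicts aggregation used here produces a two-level column index in this version of pandas, and dropping the top level leaves just the custom names.
toy = pd.DataFrame({'title':['A','A','B'],'user':['x','y','y'],'revid':[1,2,3]})
toy_agg = toy.groupby(['title','user']).agg({'revid':{'weight':len}})
print toy_agg.columns    # MultiIndex: [(u'revid', u'weight')]
toy_agg.columns = toy_agg.columns.droplevel(0)
print toy_agg.columns    # Index: [u'weight']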
Check to see if there are any articles and editors with the same names, because this will definitely break stuff.
revs_edgelist.ix[[i for i in revs_edgelist.index if i[0] == i[1]]]
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | user |
revs_gb_page = revs.groupby('title')
revs_pagenodelist = revs_gb_page.agg(agg_function)
revs_pagenodelist.columns = revs_pagenodelist.columns.droplevel(0)
# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
revs_pagenodelist['ts_min'] = (revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_pagenodelist['ts_max'] = (revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_pagenodelist['article'] = [1]*len(revs_pagenodelist)
revs_pagenodelist.head()
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | article | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | ||||||||||||||
2014 Crimean crisis | 4804.710139 | 5104.708588 | 0 | 4046 | 2023.0 | 4047 | 2 | 306 | 791108 | 10 | 201155 | -201155 | 204422 | 1 |
2014 FIFA World Cup | 1230.827986 | 5093.587894 | 0 | 7172 | 3586.0 | 7173 | 1 | 963 | 10345392 | 1 | 111902 | -111902 | 136141 | 1 |
2014 Ferguson unrest | 4978.230613 | 5102.265486 | 0 | 1348 | 674.0 | 1349 | 5 | 609 | 473810 | 5 | 168645 | -168645 | 81098 | 1 |
2014 Hong Kong protests | 5018.674051 | 5103.346389 | 0 | 3861 | 1930.5 | 3862 | 4 | 339 | 165680 | 4 | 122763 | -122763 | 172817 | 1 |
2014 Israel–Gaza conflict | 4936.250602 | 5104.259549 | 0 | 6497 | 3248.5 | 6498 | 6 | 353 | 306131 | 4 | 188426 | -188426 | 245650 | 1 |
revs_gb_user = revs.groupby('user')
revs_usernodelist = revs_gb_user.agg(agg_function)
revs_usernodelist.columns = revs_usernodelist.columns.droplevel(0)
# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
revs_usernodelist['ts_min'] = (revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_usernodelist['ts_max'] = (revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
revs_usernodelist['article'] = [0]*len(revs_usernodelist)
revs_usernodelist.head()
ts_min | ts_max | revision_min | revision_max | revision_median | weight | latency_min | latency_median | latency_max | diff_median | diff_max | diff_min | total_changes | article | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user | ||||||||||||||
!dea4u | 3571.221806 | 3571.221806 | 1902 | 1902 | 1902.0 | 1 | 12 | 12.0 | 12 | -18.0 | -18 | -18 | -18 | 0 |
$oliton | 4898.752361 | 4898.752361 | 4993 | 4993 | 4993.0 | 1 | 6009 | 6009.0 | 6009 | 0.0 | 0 | 0 | 0 | 0 |
( | 764.846713 | 795.962593 | 0 | 2 | 1.0 | 3 | 98 | 1344206.0 | 2688314 | 113.5 | 215 | 12 | 227 | 0 |
-Jafar277- | 4996.878519 | 5062.872720 | 4405 | 6542 | 5473.5 | 2 | 704 | 3770.5 | 6837 | 22.5 | 40 | 5 | 45 | 0 |
-MARSHMELLOWxPUPPIES- | 4863.816030 | 4863.816713 | 53 | 54 | 53.5 | 2 | 59 | 11165.0 | 22271 | 4.0 | 4 | 4 | 8 | 0 |
Create an empty NetworkX DiGraph object coauthorship_g that will be filled with the edges, nodes, and attribute data generated from the edge and node lists above. Loop through these edge and node lists, adding the nodes/edges and their attributes to the coauthorship_g object. The dictionary comprehension with k:float(v) converts the numpy.float64 values to more primitive float types because NetworkX graph serializers aren't compatible with float64. Print out the size of the graph and an example of one edge to verify the process worked.
coauthorship_g = nx.DiGraph()
# Add the edges and edge attributes
for (article,editor) in iter(revs_edgelist.index.values):
    edge_attributes = {k:float(v) for k,v in dict(revs_edgelist.ix[(article,editor)]).items()}
    if article != editor:
        coauthorship_g.add_edge(article,editor,edge_attributes)
# Add the user nodes and attributes
for node in iter(revs_usernodelist.index):
    node_attributes = {k:float(v) for k,v in dict(revs_usernodelist.ix[node]).items()}
    coauthorship_g.add_node(node,node_attributes,type='user')
# Add the page nodes and attributes
for node in iter(revs_pagenodelist.index):
    node_attributes = {k:float(v) for k,v in dict(revs_pagenodelist.ix[node]).items()}
    coauthorship_g.add_node(node,node_attributes,type='page')
print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g.number_of_nodes(),coauthorship_g.number_of_edges())
coauthorship_g.edges(data=True)[:1]
There are 15782 nodes and 18964 edges in the network.
[(u'Sinking of the MV Sewol', u'24.177.247.129', {'diff_max': 23.0, 'diff_median': 23.0, 'diff_min': 23.0, 'latency_max': 1513.0, 'latency_median': 1513.0, 'latency_min': 1513.0, 'revision_max': 771.0, 'revision_median': 771.0, 'revision_min': 771.0, 'total_changes': 23.0, 'ts_max': 4864.7124652777775, 'ts_min': 4864.7124652777775, 'weight': 1.0})]
Write the graph object to disk.
nx.write_gexf(coauthorship_g,'coauthorship_g.gexf')
Subset the data to only include users who make more than a single contribution: create a new graph object coauthorship_g_gt1 and remove the edges and nodes for users making only a single contribution.
edges_wt1 = [(i,j) for (i,j,k) in coauthorship_g.edges_iter(data=True) if k['weight'] == 1]
coauthorship_g_gt1 = coauthorship_g.copy()
coauthorship_g_gt1.remove_edges_from(edges_wt1)
isolates = nx.isolates(coauthorship_g_gt1)
coauthorship_g_gt1.remove_nodes_from(isolates)
nx.write_gexf(coauthorship_g_gt1,'coauthorship_g_gt1.gexf')
print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g_gt1.number_of_nodes(),coauthorship_g_gt1.number_of_edges())
There are 6171 nodes and 7420 edges in the network.
Visualize the sparsified network in Gephi using ForceAtlas 2 with the "Prevent Overlap", Scaling = 500, and "Stronger gravity" options. Nodes are colored by ts_min to reflect the timestamp of the first edit made, mapped to a rainbow spectrum where bluer colors are older nodes (earlier dates) and redder colors are younger nodes (more recent dates). Nodes have also been sized by degree (in + out).
Image('coauthorship_g_gt1.png')
Look at the edge weights for users who edited pages most intensively.
revs_edgelist.sort('weight',inplace=False,ascending=False)['weight'].reset_index().head(10)
title | user | weight | |
---|---|---|---|
0 | Islamic State of Iraq and the Levant | P-123 | 2385 |
1 | Ebola virus epidemic in West Africa | BrianGroen | 1374 |
2 | 2014 Hong Kong protests | Signedzzz | 893 |
3 | Malaysia Airlines Flight 370 | Ohconfucius | 773 |
4 | Ebola virus epidemic in West Africa | Gandydancer | 709 |
5 | 2014 Hong Kong protests | Ohconfucius | 668 |
6 | Gamergate controversy | Ryulong | 565 |
7 | Indian general election, 2014 | Lihaas | 505 |
8 | Gamergate controversy | NorthBySouthBaranof | 496 |
9 | Scottish independence referendum, 2014 | Jmorrison230582 | 473 |
revs_edgelist[(revs_edgelist['weight'] > 10)]['total_changes'].abs().sort(inplace=False,ascending=False).head(10)
title                                  user
Minecraft                              ClueBot NG       994919
Malaysia Airlines Flight 370           ClueBot NG       816064
2014 Crimean crisis                    ClueBot NG       666962
Minecraft                              Kevin12345671    482144
Islamic State of Iraq and the Levant   Teaksmitty       386686
Ice Bucket Challenge                   Mr. Granger      298161
Indian general election, 2014          Lihaas           182078
Malaysia Airlines Flight 370           Ohconfucius      141311
2014 Winter Olympics                   ClueBot NG       129650
Islamic State of Iraq and the Levant   ClueBot NG       124130
Name: total_changes, dtype: float64
Computing degree centrality identifies the editors who contributed to the most articles (since edges point from articles to editors, an editor's in-degree counts the distinct articles they edited). The k:int(v*_n) dictionary comprehension de-normalizes the values back to counts for the actual number of articles edited.
The bots "AnomieBOT" and "ClueBot NG" contributed to 23 and 22 of the articles respectively, but these aren't human editors, they're automated scripts. Among human editors, "Tpbradbury" contributed to 12 of the articles we analyzed, and "Ohconfucius" and "Lihaas" to 11 each.
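Equivalent raw counts can be read straight off the graph (a sketch, assuming coauthorship_g_gt1 as built above): an editor node's in-degree is the number of distinct articles they edited.
raw_idc = dict(coauthorship_g_gt1.in_degree())    # node -> degree counts
pd.Series(raw_idc).sort(inplace=False,ascending=False).head()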
_n = len(coauthorship_g_gt1) - 1
idc = {k:int(v*_n) for k,v in nx.in_degree_centrality(coauthorship_g_gt1).iteritems()}
odc = {k:int(v*_n) for k,v in nx.out_degree_centrality(coauthorship_g_gt1).iteritems()}
pd.Series(idc).sort(inplace=False,ascending=False).ix[:20]
AnomieBOT                 23
ClueBot NG                22
Yobot                     19
BG19bot                   15
Tpbradbury                12
Ohconfucius               11
Lihaas                    11
Nickst                    10
Another Believer          10
Rothorpe                  10
Mogism                    10
BattyBot                   9
Brandmeister               9
Jprg1966                   9
Cydebot                    9
Soffredo                   9
Jonesey95                  9
Illegitimate Barrister     7
SmackBot                   7
Factsearch                 7
dtype: int64
Computing clustering tells us the extent to which editors of a particular article edited other articles. The Malaysia Airlines Flight 17, ISIL, and 2014 Crimean crisis articles had more of their editors contributing to other articles in the set, while the contributors to the Gamergate controversy, Cuba–United States relations, and Ice Bucket Challenge articles tended not to contribute to other articles.
bp_g_gt1 = coauthorship_g_gt1.to_undirected()
pages = list(revs_pagenodelist.index)
users = list(set(coauthorship_g_gt1.nodes()) - set(pages))
clustering = nx.bipartite.clustering(bp_g_gt1,pages)
pd.Series(clustering).sort(inplace=False,ascending=False)
Malaysia Airlines Flight 17               0.030860
Islamic State of Iraq and the Levant      0.029015
2014 Crimean crisis                       0.027932
2014 Israel–Gaza conflict                 0.025939
2014 Ferguson unrest                      0.024146
Ebola virus epidemic in West Africa       0.023239
Malaysia Airlines Flight 370              0.022589
Scottish independence referendum, 2014    0.022386
2014 Winter Olympics                      0.021959
2014 Hong Kong protests                   0.021083
Sinking of the MV Sewol                   0.020832
Indian general election, 2014             0.020414
Rosetta spacecraft                        0.018343
2014 FIFA World Cup                       0.017378
Chibok schoolgirl kidnapping              0.016841
Eurovision Song Contest 2014              0.014686
Felipe VI of Spain                        0.014023
Soma mine disaster                        0.013941
Minecraft                                 0.013861
United States elections, 2014             0.012435
Ice Bucket Challenge                      0.012004
Cuba–United States relations              0.011649
Gamergate controversy                     0.009740
dtype: float64
Now let's look at the coauthorship patterns in the 72-hour window surrounding each article's peak pageview activity. Specifically, for each article's revision history look at the revisions 24 hours before the peak and 48 hours after the peak.
revs2014_gb_article = revs2014_df.groupby('title')
aftermath_df_list = list()
for _article in pv_max.index:
    _df = revs2014_gb_article.get_group(_article)
    _before = pv_max.ix[_article,'date'] - np.timedelta64(1,'D')
    _after = pv_max.ix[_article,'date'] + np.timedelta64(2,'D')
    _aftermath = _df[(_df['timestamp'] > _before) & (_df['timestamp'] < _after)]
    aftermath_df_list.append(_aftermath)
aftermath_revs = pd.concat(aftermath_df_list)
aftermath_revs_gb_edge = aftermath_revs.groupby(['title','user'])
aftermath_revs_edgelist = aftermath_revs_gb_edge.agg(agg_function)
aftermath_revs_edgelist.columns = aftermath_revs_edgelist.columns.droplevel(0)
aftermath_revs_edgelist['ts_min'] = (aftermath_revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_edgelist['ts_max'] = (aftermath_revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_gb_page = aftermath_revs.groupby('title')
aftermath_revs_pagenodelist = aftermath_revs_gb_page.agg(agg_function)
aftermath_revs_pagenodelist.columns = aftermath_revs_pagenodelist.columns.droplevel(0)
aftermath_revs_pagenodelist['ts_min'] = (aftermath_revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_pagenodelist['ts_max'] = (aftermath_revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_pagenodelist['article'] = [1]*len(aftermath_revs_pagenodelist)
aftermath_revs_gb_user = aftermath_revs.groupby('user')
aftermath_revs_usernodelist = aftermath_revs_gb_user.agg(agg_function)
aftermath_revs_usernodelist.columns = aftermath_revs_usernodelist.columns.droplevel(0)
aftermath_revs_usernodelist['ts_min'] = (aftermath_revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_usernodelist['ts_max'] = (aftermath_revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D')
aftermath_revs_usernodelist['article'] = [0]*len(aftermath_revs_usernodelist)
aftermath_coauthorship_g = nx.DiGraph()
# Add the edges and edge attributes
for (article,editor) in iter(aftermath_revs_edgelist.index.values):
    edge_attributes = {k:float(v) for k,v in dict(aftermath_revs_edgelist.ix[(article,editor)]).items()}
    if article != editor:
        aftermath_coauthorship_g.add_edge(article,editor,edge_attributes)
# Add the user nodes and attributes
for node in iter(aftermath_revs_usernodelist.index):
    node_attributes = {k:float(v) for k,v in dict(aftermath_revs_usernodelist.ix[node]).items()}
    aftermath_coauthorship_g.add_node(node,node_attributes,type='user')
# Add the page nodes and attributes
for node in iter(aftermath_revs_pagenodelist.index):
    node_attributes = {k:float(v) for k,v in dict(aftermath_revs_pagenodelist.ix[node]).items()}
    aftermath_coauthorship_g.add_node(node,node_attributes,type='page')
print "There are {0} nodes and {1} edges in the network.".format(aftermath_coauthorship_g.number_of_nodes(),aftermath_coauthorship_g.number_of_edges())
nx.write_gexf(aftermath_coauthorship_g,'aftermath_coauthorship_g.gexf')
There are 2158 nodes and 2508 edges in the network.
Apparently no one edited the "Gamergate controversy" article on the peak date --- it was likely protected for the duration.
_df = revs2014_gb_article.get_group('Gamergate controversy')
_df[(_df['date'] < pd.datetime(2014,10,25)) & (_df['date'] > pd.datetime(2014,10,22))]
title | anon | comment | commenthidden | date | diff | gini | latency | parentid | revid | revision | size | timestamp | unique_users | user | userhidden | userid |
---|
As a result, it's not added to the graph.
_pages1 = revs2014_gb_article.groups.keys()
_pages2 = [_n for _n,_d in aftermath_coauthorship_g.nodes_iter(data=True) if 'page' in _d.values()]
_pages3 = list(aftermath_revs['title'].unique())
set(_pages1) - set(_pages3)
{u'Gamergate controversy'}
Image('aftermath_coauthorship_g.png')
How does the editing behavior of users overlap across the different articles?
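The measure computed below is asymmetric: for a pair of articles it is the fraction of the first article's editors who also edited the second. A minimal sketch for a single pair (chosen here just for illustration), assuming coauthorship_g as built above:
_a, _b = 'Malaysia Airlines Flight 370', 'Malaysia Airlines Flight 17'
_editors_a = set(coauthorship_g.neighbors(_a))   # article nodes point to their editors
_editors_b = set(coauthorship_g.neighbors(_b))
len(_editors_a & _editors_b)/float(len(_editors_a))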
_article_overlaps = dict()
_aftermath_overlaps = dict()
for _article1 in pages:
    _article_overlaps[_article1] = dict()
    _aftermath_overlaps[_article1] = dict()
    for _article2 in pages:
        if _article1 != _article2:
            try:
                _article_overlaps[_article1][_article2] = len(set(coauthorship_g.neighbors(_article1)) & set(coauthorship_g.neighbors(_article2)))/float(len(set(coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _article_overlaps[_article1][_article2] = np.nan
            try: # Some articles have no editing activity in the aftermath window
                _aftermath_overlaps[_article1][_article2] = len(set(aftermath_coauthorship_g.neighbors(_article1)) & set(aftermath_coauthorship_g.neighbors(_article2)))/float(len(set(aftermath_coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _aftermath_overlaps[_article1][_article2] = np.nan
_article_overlaps_df = pd.DataFrame(_article_overlaps)
_order = _article_overlaps_df.mean(axis=1).sort(inplace=False,ascending=True).index
_article_overlaps_df = _article_overlaps_df[_order].ix[_order]
_x1,_y1 = _article_overlaps_df.shape
_aftermath_overlaps_df = pd.DataFrame(_aftermath_overlaps)
_aftermath_overlaps_df = _aftermath_overlaps_df[_order].ix[_order]
_x2,_y2 = _aftermath_overlaps_df.shape
f,(ax1,ax2) = plt.subplots(1,2,figsize=(12,8),sharey=True)
_ax1 = ax1.pcolor(_article_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax1.set_frame_on(False)
ax1.set_xticks(np.arange(0.5,_x1+.5),minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
ax1.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax1.tick_params(axis='x',direction='in',pad=3)
ax1.set_title('Complete coauthorship',fontsize=15)
_ax2 = ax2.pcolor(_aftermath_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(0.5,_x2+.5),minor=False)
#ax2.set_yticks(np.arange(_y)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
#ax2.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax2.tick_params(axis='x',direction='in',pad=3)
ax2.set_title('Aftermath coauthorship',fontsize=15)
#ax.set_xlabel('Article rank',fontsize=15)
f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([1, 0.25, 0.05, 0.7])
f.colorbar(_ax1, cax=cbar_ax,label='Editor overlap')
f.tight_layout()
f.savefig('editor_overlap.png',dpi=200,bbox_inches='tight')
Create a DataFrame max_daily_revid that is indexed by dates in 2014 and has columns for each article. Crucially, the values in each cell correspond to the maximum (final) revision of that article on that date. Use the complete rev_df DataFrame to populate revids from before Jan 1, 2014 forward, and then use fillna's "ffill" method to carry revids from previous dates forward to dates when no revision happened.
_idx = rev_df.groupby(['title','date']).agg({'revid':lambda x:x.idxmax()})
max_daily_revid = rev_df[['title','date','revid']].ix[_idx['revid']]
max_daily_revid = pd.pivot_table(data=max_daily_revid,columns='title',index='date',values='revid')
max_daily_revid.fillna(method='ffill',inplace=True)
max_daily_revid = max_daily_revid.ix[pd.date_range(start='1-1-2014',end='12-22-2014')]
# It turns out a bunch of revisions were deleted on the ISIS article
max_daily_revid.ix[pd.to_datetime('2014-09-03').date(),'Islamic State of Iraq and the Levant'] = np.nan #624150417
Now we have daily revisions for each article that should give us a way to retrieve article versions at a daily level of resolution to track changes in content over time. In particular, we can use the MediaWiki API to parse out the external links for each version of an article as a way of looking at how citation practices have changed over time. There are over 3,000 revisions in total that need to be parsed across all the articles this year --- and likely some cleanup needed in the event that the chosen revisions happen to be outliers or vandalism.
len([_revid for _article in max_daily_revid.columns for _revid in max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()])
3409
Into the parsed_revid_data dictionary we're going to place the payloads from the MediaWiki parsing results. Specifically, we'll be asking for the revision IDs, categories, language links, external links, internal wikilinks, templates, and images. Having each of these properties at the revision level will let us track the "evolution" of the content on these articles over time.
Because this requires making over 3,000 API calls, it will take a while and we'll only want to run it once. The cell block below has been converted to Raw rather than Code to prevent inadvertent execution.
#parsed_revid_data = dict()
for _article in max_daily_revid.columns[14:15]:
    print _article
    parsed_revid_data[_article] = dict()
    _unique_revids = max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()
    for _revid in _unique_revids:
        try:
            parsed_revid_data[_article][_revid] = ws.wikipedia_query({'action':'parse',
                                                                      'oldid': _revid,
                                                                      'redirects': True,
                                                                      'prop': 'revid|langlinks|categories|externallinks|iwlinks|templates|images'},'en')
        except:
            print "Revision {0} has an error".format(str(_revid))
            parsed_revid_data[_article][_revid] = np.nan
            pass
simlified_parsed_revid_data = {_rev:_payload for _article,_revs in parsed_revid_data.items() for _rev,_payload in _revs.items()}
with open('parsed_revid_data.json','wb') as f:
json.dump(parsed_revid_data,f)
with open('simlified_parsed_revid_data.json','wb') as f:
json.dump(simlified_parsed_revid_data,f)
Islamic State of Iraq and the Levant Revision 624083851.0 has an error
parsed_revid_data['Islamic State of Iraq and the Levant'][624083851]
nan
with open('parsed_revid_data.json','rb') as f:
parsed_revid_data = json.load(f)
Look at the most-referenced domain names.
_final_revs = dict(max_daily_revid.ix[pd.to_datetime('2014-12-21').date()])
urls = list()
for _article, _rev in _final_revs.items():
    if 'externallinks' in parsed_revid_data[_article][_rev].keys():
        for _url in parsed_revid_data[_article][_rev]['externallinks']:
            urls.append(urlparse.urlparse(_url)[1])
_s = pd.Series(Counter(urls)).sort(ascending=False,inplace=False)
_s.head(10)
www.bbc.co.uk             218
www.theguardian.com       175
www.nytimes.com           135
www.bbc.com               126
www.reuters.com           106
www.telegraph.co.uk        99
web.archive.org            94
www.washingtonpost.com     92
www.fifa.com               72
www.scmp.com               65
dtype: int64
Define a list of Western news sources in westerners and create a function western_link_fraction that will compute the fraction of domain names in a given revid that come from this list.
westerners = ['bbc','guardian','ft','telegraph','independent',
'nytimes','reuters','washingtonpost','cnn','wsj','abc','nbc','cbs','yahoo','bloomberg']
def western_link_fraction(_revid,_revdict):
    try:
        _urls = _revdict[_revid]['externallinks']
        _domains = [urlparse.urlparse(_url)[1] for _url in _urls]
        _western = [any(_w in _d for _w in westerners) for _d in _domains]
        if len(_western) > 0:
            return float(sum(_western))/len(_western)
        else:
            return 0
    except KeyError:
        return np.nan
western_link_df = pd.DataFrame(index=pd.date_range('1-1-2014','12-21-2014'))
for article in max_daily_revid.columns:
    western_link_df[article] = max_daily_revid[article].apply(lambda x:western_link_fraction(x,simlified_parsed_revid_data))
2014 Crimean crisis | 2014 FIFA World Cup | 2014 Ferguson unrest | 2014 Hong Kong protests | 2014 Israel–Gaza conflict | 2014 Winter Olympics | Chibok schoolgirl kidnapping | Cuba–United States relations | Ebola virus epidemic in West Africa | Eurovision Song Contest 2014 | ... | Indian general election, 2014 | Islamic State of Iraq and the Levant | Malaysia Airlines Flight 17 | Malaysia Airlines Flight 370 | Minecraft | Rosetta spacecraft | Scottish independence referendum, 2014 | Sinking of the MV Sewol | Soma mine disaster | United States elections, 2014 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2014-12-17 | 0.318735 | 0.219653 | 0.34413 | 0.362573 | 0.196277 | 0.156425 | 0.304348 | 0.252747 | 0.275064 | 0.043062 | ... | 0.074176 | 0.327586 | 0.351064 | 0.349206 | 0.096070 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-18 | 0.318735 | 0.219653 | 0.34413 | 0.363372 | 0.195616 | 0.156425 | 0.304348 | 0.227723 | 0.275253 | 0.043062 | ... | 0.074176 | 0.327586 | 0.351064 | 0.351438 | 0.095652 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-19 | 0.315663 | 0.219653 | 0.34413 | 0.359195 | 0.195616 | 0.156425 | 0.304348 | 0.235294 | 0.276382 | 0.043062 | ... | 0.074176 | 0.325431 | 0.351064 | 0.351438 | 0.094828 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-20 | 0.315663 | 0.219653 | 0.34413 | 0.359195 | 0.195616 | 0.156425 | 0.304348 | 0.235294 | 0.276543 | 0.043062 | ... | 0.074176 | 0.324034 | 0.348592 | 0.351438 | 0.094828 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
2014-12-21 | 0.315663 | 0.219653 | 0.34413 | 0.359195 | 0.195616 | 0.156425 | 0.304348 | 0.235294 | 0.276543 | 0.043062 | ... | 0.074176 | 0.324034 | 0.348592 | 0.351438 | 0.094828 | 0.103448 | 0.522422 | 0.318339 | 0.328571 | 0.408163 |
5 rows × 23 columns
f,ax = plt.subplots(1,1,figsize=(10,6))
_ax = western_link_df.plot(colormap='spectral',ax=ax)
_ax.legend(loc='center left',bbox_to_anchor=[1,.5])
f.tight_layout()
f.savefig('western_links.png',dpi=200,bbox_inches='tight')
def chunk_maker(a_list,size):
    # Break a_list into consecutive chunks of at most `size` elements each
    chunk_num = len(a_list)/size
    chunks = list()
    for c in range(chunk_num + 1):
        start = c * size
        end = (c + 1) * size
        elements = list(itertools.islice(a_list,start,end))
        if len(elements) > 0:
            chunks.append(elements)
    return chunks
# http://stackoverflow.com/a/319291/1574687
def valid_ip(address):
    try:
        parts = address.split(".")
        if len(parts) != 4:
            return False
        for item in parts:
            # Each octet must be at most 3 characters and in the range 0-255
            if len(item) > 3 or not 0 <= int(item) <= 255:
                return False
        return True
    except ValueError:
        return False
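Quick sanity checks on the two helpers above (hypothetical inputs):
print chunk_maker(range(7),3)                      # [[0, 1, 2], [3, 4, 5], [6]]
print valid_ip('192.168.0.1'), valid_ip('banana')  # True False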
The code block below is used to get the user properties and save them to user_properties.json. First filter out the users corresponding to IP addresses, as they won't have any valid information. Then use chunk_maker to make a list of lists containing 50 usernames in each list. For each chunk, convert the list of 50 usernames into a giant string with names joined by pipes (u'|') and pass this "list" of usernames to the get_user_properties function. Add each element of the returned list back to the user_properties list, then save it to disk. A minimal sketch of this step follows.
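A minimal sketch of the omitted raw cell, assuming ws.get_user_properties takes a pipe-joined string of usernames and a language code (a hypothetical signature inferred from the description above):
named_users = [u for u in revs_usernodelist.index if not valid_ip(u)]  # drop IP editors
user_properties = list()
for chunk in chunk_maker(named_users,50):
    name_string = u'|'.join(chunk)                  # 50 usernames joined by pipes
    user_properties += ws.get_user_properties(name_string,'en')
with open('user_properties.json','wb') as f:
    json.dump(user_properties,f)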
If you've completed the above step or already have user_properties.json in your directory, you can load it up and proceed from this step.
with open('user_properties.json','rb') as f:
user_properties2 = json.load(f)
user_props_df = pd.DataFrame(user_properties2).set_index('name')
user_props_df = user_props_df[user_props_df['userid'].notnull()]
user_props_df['registration'] = pd.to_datetime(user_props_df['registration'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['blockedtimestamp'] = pd.to_datetime(user_props_df['blockedtimestamp'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['account_age'] = (pd.datetime.today().date() - user_props_df['registration'])/np.timedelta64(1,'D')
user_props_df['blocked'] = user_props_df['blockexpiry'].notnull()
user_props_df['blocked_account_age'] = (user_props_df['blockedtimestamp'] - user_props_df['registration'])/np.timedelta64(1,'D')
user_props_df['editcount'] = user_props_df['editcount'].map(float)
user_props_df['permissions'] = user_props_df['groups'].apply(len) - 2
user_props_df.drop(['invalid','blockedbyid','blockid','userid','blockedby','blockexpiry','blockreason'],inplace=True,axis=1)
user_props_df.head()
blockedtimestamp | editcount | gender | groups | registration | account_age | blocked | blocked_account_age | permissions | |
---|---|---|---|---|---|---|---|---|---|
name | |||||||||
-sche | NaT | 2593 | unknown | [reviewer, *, user, autoconfirmed] | 2010-12-14 22:02:32 | 1406.081574 | False | NaN | 2 |
0Aliuk | NaT | 47 | unknown | [*, user, autoconfirmed] | 2014-02-18 09:54:43 | 244.587002 | False | NaN | 1 |
1007D | 2011-07-09 22:51:54 | 1734 | male | [*, user, autoconfirmed] | 2010-10-06 00:24:04 | 1475.983287 | True | 276.935995 | 1 |
18abruce | NaT | 5101 | unknown | [*, user, autoconfirmed] | 2010-09-07 13:09:43 | 1504.451586 | False | NaN | 1 |
1980fast | NaT | 885 | unknown | [*, user, autoconfirmed] | 2010-04-13 04:32:12 | 1651.810972 | False | NaN | 1 |
There are very few registered users in the corpus who identify as women (43) versus men (802). However, the vast majority of registered editors do not identify their gender at all (2006).
gender_count = user_props_df.groupby('gender').agg({'editcount':len})
print gender_count
sns.barplot(gender_count.index,gender_count.values,palette='muted')
plt.yscale('log')
plt.ylabel('Number of users',fontsize=15)
plt.xlabel('')
         editcount
gender
female          43
male           802
unknown       2006
<matplotlib.text.Text at 0x2d1e6278>
The editcount variable is across all edits to the English Wikipedia, not just the articles in the current 2014 news corpus. Plotting the distributions across genders, it appears women make more edits than men.
sns.boxplot(user_props_df['editcount'],groupby=user_props_df['gender'],color='muted')
plt.yscale('log')
plt.ylabel('Total revisions',fontsize=15)
plt.xlabel('')
<matplotlib.text.Text at 0x2c19ec88>
Running a Mann-Whitney rank test of the significance of the difference between men and women, it is not significant (one-tailed $p = .132$). In other words, we cannot reject the hypothesis that the difference in median values between men and women is due to random chance.
female_editcounts = user_props_df[user_props_df['gender'] == 'female']['editcount'].values
male_editcounts = user_props_df[user_props_df['gender'] == 'male']['editcount'].values
stats.mannwhitneyu(female_editcounts,male_editcounts)
(15505.0, 0.13257152370225006)
sns.boxplot(user_props_df['account_age'],groupby=user_props_df['gender'],color='muted')
plt.yscale('log')
plt.ylabel('Account age',fontsize=15)
plt.xlabel('')
<matplotlib.text.Text at 0x2c5562e8>
Running a Mann-Whitney rank test of the significance of the difference between men and women, it is not significant (one-tailed $p = .156$). In other words, we cannot reject the hypothesis that the difference in median values between men and women is due to random chance.
female_account_ages = user_props_df[user_props_df['gender'] == 'female']['account_age'].values
male_account_ages = user_props_df[user_props_df['gender'] == 'male']['account_age'].values
stats.mannwhitneyu(female_account_ages,male_account_ages)
(15668.0, 0.15630039119059119)
Among the registered editors of these articles, women get blocked ~4.7% of the time and men ~2.5% of the time, but there aren't enough observations to make any claims about these differences being significant.
blocked = user_props_df[user_props_df['blocked']]
print blocked.groupby('gender').agg({'editcount':len})
print '\n'
print blocked.groupby('gender').agg({'editcount':len})/gender_count
         editcount
gender
female           2
male            20
unknown        137

         editcount
gender
female    0.046512
male      0.024938
unknown   0.068295
Subset the usernodelist and edgelists to only include the nodes in the gt1 graph.
bp_g_gt1_usernodelist.head()
latency_min | latency_median | latency_max | ts_min | ts_max | weight | diff_median | diff_max | diff_min | link_count_median | ... | blockedtimestamp | editcount | gender | groups | registration | account_age | blocked | blocked_account_age | permissions | degree | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
username | |||||||||||||||||||||
-sche | 31 | 341 | 81251 | 4937.999317 | 5017.180903 | 358 | -1.0 | 2851 | -3266 | 229 | ... | NaT | 2593 | unknown | [reviewer, *, user, autoconfirmed] | 2010-12-14 22:02:32 | 1406.081574 | False | NaN | 2 | 2 |
0Aliuk | 111 | 536 | 31981 | 4787.828588 | 4818.636806 | 4 | 12.5 | 35 | -1 | 366 | ... | NaT | 47 | unknown | [*, user, autoconfirmed] | 2014-02-18 09:54:43 | 244.587002 | False | NaN | 1 | 1 |
1.123.194.170 | 41 | 226 | 411 | 4801.286829 | 4801.287292 | 2 | 174.5 | 322 | 27 | 127 | ... | NaT | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 1 |
1.36.102.163 | 71 | 906 | 1741 | 5007.774919 | 5007.775752 | 2 | 26.5 | 51 | 2 | 132 | ... | NaT | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 1 |
1.36.209.129 | 91 | 336 | 19731 | 5008.152789 | 5021.642234 | 16 | 41.5 | 150 | -576 | 133 | ... | NaT | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 1 |
5 rows × 26 columns
bp_g_gt1_usernodelist = revs_usernodelist[revs_usernodelist.index.isin(users)]
bp_g_gt1_usernodelist = bp_g_gt1_usernodelist.join(user_props_df,how='left')
bp_g_gt1_usernodelist['degree'] = pd.Series({k:v for k,v in idc.iteritems() if k in bp_g_gt1_usernodelist.index})
bp_g_gt1_edgelist = revs_edgelist[revs_edgelist.index.isin(coauthorship_g_gt1.edges())]
bp_g_gt1_edgelist['article_degree'] = pd.Series({i:odc[i[0]] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['editor_degree'] = pd.Series({i:idc[i[1]] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['article_age'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'ts_min'] - revs_pagenodelist.ix[i[0],'ts_min'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['editor_age'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'account_age'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['gender'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'gender'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['permissions'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'permissions'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['editcount'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'editcount'] for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['persistence'] = bp_g_gt1_edgelist['revision_max'] - bp_g_gt1_edgelist['revision_min']
bp_g_gt1_edgelist['revision_min_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_min']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)})
bp_g_gt1_edgelist['revision_max_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_max']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)})
Compare the number of revisions made per article to the number of articles edited. There's an upward trend suggesting that editors who contribute to more articles also edit articles more intensively.
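As a quick check of that trend (a sketch, not part of the original analysis), the Spearman rank correlation between articles edited and revisions per article can be computed directly:
_trend = bp_g_gt1_usernodelist[['weight','degree']].dropna()
print stats.spearmanr(_trend['degree'],_trend['weight']/_trend['degree'])  # (rho, p-value)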
ax = sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Revisions per article',fontsize=15)
<matplotlib.text.Text at 0x1bb6ce48>
ax = sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Revisions per article',fontsize=15)
<matplotlib.text.Text at 0x3416ada0>
Edit intensity can also be measured using the median latency of an editor's contributions. Editors making changes in rapid succession have lower latency and editors who take a lot of time between edits have higher latency. There's no apparent relationship between latency and number of articles edited.
ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Latency (s)',fontsize=15)
<matplotlib.text.Text at 0x2da69518>
ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Latency (s)',fontsize=15)
<matplotlib.text.Text at 0x33c5eb38>
bp_g_gt1_edgelist.head()
latency_min | latency_median | latency_max | ts_min | ts_max | weight | diff_median | diff_max | diff_min | link_count_median | ... | article_degree | editor_degree | article_age | editor_age | editor_gender | permissions | editcount | persistence | revision_min_frac | revision_max_frac | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | username | |||||||||||||||||||||
2014 Crimean crisis | 101.98.175.68 | 31 | 6021 | 12011 | 4855.264688 | 4855.265058 | 2 | 72 | 143 | 1 | 410.0 | ... | 342 | 1 | 65.554549 | NaN | NaN | NaN | NaN | 1 | 0.922298 | 0.922551 |
109.148.57.245 | 81 | 101 | 131 | 4795.964560 | 4795.970093 | 3 | 11 | 27 | 7 | 300.0 | ... | 342 | 1 | 6.254421 | NaN | NaN | NaN | NaN | 4 | 0.331309 | 0.332321 | |
109.255.139.0 | 31 | 81 | 3021 | 4797.811412 | 4813.016528 | 3 | -1 | 17 | -2 | 346.0 | ... | 342 | 1 | 8.101273 | NaN | NaN | NaN | NaN | 1233 | 0.434827 | 0.746900 | |
109.64.20.198 | 771 | 811 | 1241 | 4795.843796 | 4817.730625 | 3 | 29 | 112 | 18 | 353.0 | ... | 342 | 1 | 6.133657 | NaN | NaN | NaN | NaN | 1827 | 0.323209 | 0.785624 | |
109.78.144.177 | 731 | 4306 | 7881 | 4815.682222 | 4816.079375 | 2 | 0 | 0 | 0 | 415.5 | ... | 342 | 1 | 25.972083 | NaN | NaN | NaN | NaN | 25 | 0.768413 | 0.774741 |
5 rows × 25 columns
plt.scatter(bp_g_gt1_edgelist['weight'],bp_g_gt1_edgelist['persistence'],alpha=.5)
plt.plot((0,10**4),(0,10**4),color='k',linestyle='--',linewidth=2)
plt.xscale('symlog')
plt.yscale('symlog')
plt.xlim((0,10**4))
plt.ylim((0,10**4))
plt.xlabel('Revisions made',fontsize=15)
plt.ylabel('Persistence (revisions)',fontsize=15)
<matplotlib.text.Text at 0x29cec9b0>
plt.scatter(bp_g_gt1_edgelist['weight']*np.random.uniform(.9,1.1,size=len(bp_g_gt1_edgelist)),bp_g_gt1_edgelist['revision_min_frac'],alpha=.5)
plt.xscale('symlog')
plt.xlim((2,10**4))
plt.ylim((0,1))
plt.xlabel('Revisions made',fontsize=15)
plt.ylabel('First edit (fraction of revision history)',fontsize=15)
<matplotlib.text.Text at 0x2986df98>
ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Revisions',fontsize=15)
<matplotlib.text.Text at 0x370f8550>
ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['permissions'],color='gist_rainbow')
ax.set_yscale('symlog')
ax.set_xlabel('Permissions',fontsize=15)
ax.set_ylabel('Revisions',fontsize=15)
<matplotlib.text.Text at 0x32477470>
ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Editor age (days)',fontsize=15)
<matplotlib.text.Text at 0x3465ce10>
ax = sns.boxplot(bp_g_gt1_edgelist['article_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm')
ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('Time since first edit (days)',fontsize=15)
<matplotlib.text.Text at 0x36036da0>
ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['gender'],color='muted')
ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('Editor age (days)',fontsize=15)
<matplotlib.text.Text at 0x381c1048>
ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel')
#ax.set_yscale('symlog')
ax.set_xlabel('Permissions',fontsize=15)
ax.set_ylabel('Editor age',fontsize=15)
<matplotlib.text.Text at 0x3889ce10>
ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm')
#ax.set_yscale('symlog')
ax.set_xlabel('Articles edited',fontsize=15)
ax.set_ylabel('First edit to article',fontsize=15)
<matplotlib.text.Text at 0x3515cfd0>
ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['gender'],color='muted')
#ax.set_yscale('symlog')
ax.set_xlabel('',fontsize=15)
ax.set_ylabel('First edit to article',fontsize=15)
<matplotlib.text.Text at 0x344f22e8>
ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel')
#ax.set_yscale('symlog')
ax.set_xlabel('Permissions',fontsize=15)
ax.set_ylabel('First edit to article',fontsize=15)
<matplotlib.text.Text at 0x379b0cc0>