Analysis of Wikipedia's Coverage of 2014 News Events

By Brian Keegan, Ph.D. -- December 19, 2014

Web; @bkeegan; GitHub

In [1024]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, requests, re, itertools, urllib2, urlparse
import wikipedia_scraping as ws
import seaborn as sns
import networkx as nx

from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup, element
from collections import Counter
from IPython.display import Image
from operator import itemgetter
from scipy import stats
from matplotlib.lines import Line2D

_start = pd.datetime(2001,1,1)
_end = pd.datetime(2015,1,1)
_filedir = u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'

We begin by defining the article titles for the news events we want to collect data on. We'll also decode these strings into valid unicode to account for any wacky characters (a sketch of that step follows the lists below).

What were the top stories of 2014?

In [4]:
# http://yearinreview.fb.com/
facebook = ['World Cup','Ebola Outbreak','Elections in Brazil','Robin Williams','Ice Bucket Challenge','Conflict in Gaza','Malaysia Airlines disasters','Super Bowl','Ferguson','Sochi']

# http://www.google.com/trends/topcharts?hl=en#date=2014
google = ['Robin Williams','World Cup','Ebola','Malaysia Airlines','Flappy Bird','ALS Ice Bucket Challenge','ISIS','Ferguson','Frozen','Ukraine']

# https://2014.twitter.com/moments
twitter = ['Philip Seymour Hoffman','State of the Union','Carnaval','Malaysia Airlines','Bring Back Our Girls','India Election','Spanish Abdication','Maya Angelou','Ferguson','Robin Williams','Ice Bucket Challenge','Scottish referendum','Ebola','He for She','Hong Kong protests','Mars Orbiter','Malala Yousafzi','US elections','Berlin Wall','Philae']

# Editorial judgment, https://en.wikipedia.org/wiki/2014
wikipedia1 = ['2014 Winter Olympics','Ebola virus epidemic in West Africa','2014 Crimean crisis','Malaysia Airlines Flight 370','Chibok schoolgirl kidnapping','Sinking of the MV Sewol','Islamic State in Iraq and the Levant','2014 FIFA World Cup','Felipe VI','2014 Israel–Gaza conflict','Malaysia Airlines Flight 17','Rosetta spacecraft','Cuba-United States relations']

# Number of contributors, http://stats.wikimedia.org/EN/TablesWikipediaEN.htm#zeitgeist
# Excluding repeats like "Deaths in 2014"
wikipedia2 = ['2013–14 North American cold wave',]
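Since some of these titles contain non-ASCII characters, here's a minimal sketch of the decoding step mentioned above, assuming the literals are UTF-8 byte strings under Python 2 (the `to_unicode` helper is just for illustration).

In [ ]:
# Decode each title from a UTF-8 byte string to unicode (Python 2);
# anything that is already unicode passes through untouched.
def to_unicode(titles):
    return [t.decode('utf8') if isinstance(t, str) else t for t in titles]

facebook, google, twitter = map(to_unicode, [facebook, google, twitter])
wikipedia1, wikipedia2 = map(to_unicode, [wikipedia1, wikipedia2])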

Top articles across languages

This is irredeemably hacky HTML parsing, for which I apologize. But it generates the top-25 list for each language, which is published on this gist.

In [394]:
def top_articles(lang):
    # Read the HTML from the web and convert to soup
    # Broken URLS here: soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/{0}/TablesWikipedia{0}.htm'.format(lang.upper())).read()) 
    soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/EN/TablesWikipedia{0}.htm'.format(lang.upper())).read())

    # Look for all the paragraphs with 2014
    _p = soup.findAll('b',text=re.compile('2014'))

    # Select only those paragraph parents that have exactly 152 fields, corresponding to the top-25 lists
    _p2014 = [t.parent for t in _p if len(t.parent) == 152]

    # Get the text out of the children tags as a list of lists
    parsed = [[t.text for t in list(p.children) if type(t) != element.NavigableString] for p in _p2014]

    # Convert to a dictionary keyed by month abbreviation with values as the list of text fields
    parsed = {month[0].split(u'\xa0')[0]:month[1:] for month in parsed}

    # Do some crazy dictionary and list comprehensions with zips to convert the values in the list
    parsed = {k:[{'rank':int(a),'editors':int(b),'article':c} for a,b,c in zip(v[0::3],v[1::3],v[2::3])] for k,v in parsed.items()}

    # Convert each month into a DataFrame with month information in the index
    # and then concat all the dfs together, sorting on those with the most editors
    ranked = pd.concat([pd.DataFrame(parsed[i],index=[i]*len(parsed[i])) for i in parsed.keys()]).sort('editors',ascending=False).reset_index()

    # rename the reset index to something meaningful
    ranked.rename(columns={'index':'month'},inplace=True)

    # Group the articles by name, compute aggregate statistics
    # Rank on the total number editors and months in the top 25
    top_articles = ranked.groupby('article').agg({'month':len,'editors':np.sum,'rank':np.min})
    top_articles['editor-month'] = top_articles['month'] * top_articles['editors']
    top_articles.sort(['editor-month'],ascending=False,inplace=True)
    return top_articles
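As a quick usage sketch (not run here), the scraper can be pointed at a single language to inspect what comes back:

In [ ]:
# Example: English Wikipedia's top articles, ranked by the editor-month score
en_top = top_articles('en')
en_top.head(10)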
In [395]:
country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
                 'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portuguese','nl':'Dutch','tr':'Turkish',
                 'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
                 'uk':'Ukrainian'}

top_articles_by_country = {}
for country in country_codes.keys():
    try:
        top_articles_by_country[country] = top_articles(country)
    except urllib2.HTTPError:
        print "The '{0}' language does not have a stats page ".format(country)
        pass

for _country,_df in top_articles_by_country.items():
    _df.to_csv(_filedir + '/Data/{0}.csv'.format(_country),encoding='utf8')
In [110]:
def langlink_translater(source_lang,target_lang,article_titles):
    # On the target-language Wikipedia, look up the interlanguage links that
    # point back to the source language, batching the titles 40 at a time
    chunks = ws.chunk_maker(article_titles,40)
    translation_dict = dict()
    
    for chunk in chunks:
        result = ws.wikipedia_query({'action':'query',
                                     'prop': 'langlinks',
                                     'lllang': source_lang,
                                     'titles': '|'.join(chunk),
                                     'lllimit': '500'},target_lang)
        if result and 'pages' in result.keys():
            # Map each target-language title to its source-language counterpart
            translation_dict.update({_d['title'] : _d['langlinks'][0]['*'] for _d in result['pages'].values() if 'langlinks' in _d.keys()})
            
    return translation_dict

# This step takes a few minutes
translater_dict = {source_lang:{target_lang:langlink_translater(source_lang,target_lang,df.index) for target_lang,df in top_articles_by_country.items()} for source_lang in top_articles_by_country.keys()}

# Save the file
with open('translater_dict.json','wb') as f:
    json.dump(translater_dict,f)
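The `wikipedia_scraping` helper module isn't shown in this notebook. As a rough stand-in, `chunk_maker` is assumed to do nothing more than split a list of titles into fixed-size batches; a hypothetical equivalent would be:

In [ ]:
# Hypothetical stand-in for ws.chunk_maker: split a sequence of titles into
# lists of at most chunk_size items (assumed behavior, not the original helper).
def chunk_maker(titles, chunk_size):
    titles = list(titles)
    return [titles[i:i + chunk_size] for i in range(0, len(titles), chunk_size)]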

Load up the data crawled above.

In [400]:
_filedir
Out[400]:
u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'
In [401]:
country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
                 'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portuguese','nl':'Dutch','tr':'Turkish',
                 'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
                 'uk':'Ukrainian'}

top_articles_by_country = dict()
for country in country_codes.keys():
    top_articles_by_country[country] = pd.read_csv(_filedir + '/Data/{0}.csv'.format(country),encoding='utf8',index_col=0)

with open('translater_dict.json','rb') as f:
    translater_dict = json.load(f)
In [438]:
# For each source language, check whether every other language's top articles
# have an interlanguage link back to that source language
lang_link_exists_dict = dict()
top_articles_df = pd.DataFrame()

for source_lang,target_dictionary in translater_dict.iteritems():
    langlink_exists_df = pd.DataFrame()
    for target_lang,d in target_dictionary.iteritems():
        # Top articles in the target language, in rank order
        top_articles_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index)
        # True if the ranked article has a langlink to the source language
        langlink_exists_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index).isin(translater_dict[source_lang][target_lang].keys())
        # An article trivially "links" to its own language
        if source_lang == target_lang:
            langlink_exists_df[target_lang] = [1]*len(langlink_exists_df[target_lang])
        # Keep the language columns in a consistent alphabetical order
        langlink_exists_df = langlink_exists_df.reindex_axis(sorted(langlink_exists_df.columns), axis=1)
        lang_link_exists_dict[source_lang] = langlink_exists_df
In [462]:
# Show the top three articles for each language, with languages sorted by name
_df = top_articles_df.ix[:2].T
_df.index = [country_codes[i] for i in _df.index]
_df.columns = range(1,4)
_df.sort()
Out[462]:
1 2 3
Arabic كريستيانو رونالدو ريال مدريد السعودية
Chinese 世間情 太陽花學運 马来西亚航空370号班机空难
Czech Válka na východní Ukrajině Euromajdan Minecraft
Dutch Lijst van personen overleden in 2014 Malaysia Airlines-vlucht 17 Eurovisiesongfestival 2014
English Deaths in 2014 Malaysia Airlines Flight 370 Islamic State of Iraq and the Levant
Farsi دولت اسلامی عراق و شام ایل ملکشاهی مهران مدیری
French État islamique (organisation) Manuel Valls Dieudonné
German Krise in der Ukraine 2014 Alternative für Deutschland Fußball-Weltmeisterschaft 2014
Indonesian JKT48 NET. Joko Widodo
Italian Juventus Football Club Campionato mondiale di calcio 2014 Serie A 2013-2014
Japanese 仮面ライダー鎧武/ガイム 烈車戦隊トッキュウジャー ハピネスチャージプリキュア!
Korean 대한민국 일베저장소 세월호 침몰 사고
Polish Robert Lewandowski 2014 Euromajdan
Portuguese Em Família (telenovela) Copa do Mundo FIFA de 2014 Campeonato Brasileiro de Futebol de 2014 - Sér...
Russian Список умерших в 2014 году Вооружённый конфликт на востоке Украины (2014) Донецкая Народная Республика
Spanish Copa Mundial de Fútbol de 2014 Podemos (partido político) Copa Sudamericana 2014
Swedish Sverigedemokraterna Avlidna 2014 Feministiskt initiativ
Turkish Türkiye Recep Tayyip Erdoğan Mustafa Kemal Atatürk
Ukrainian Війна на сході України Небесна сотня Ленінопад
In [469]:
_df.ix['id'].sum(axis=1)
Out[469]:
55.0
In [465]:
_lang = 'en'
f, ax = plt.subplots(figsize=(10,5))
_df = lang_link_exists_dict[_lang].ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]

_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=1)

ax.set_frame_on(False)
ax.set_xticks(np.arange(0.5,_x+.5,10),minor=False)
ax.set_yticks(np.arange(_y)+.5,minor=False)
ax.invert_yaxis()
ax.set_xticklabels(_df.columns[::10],minor=False,fontsize=12)
ax.set_yticklabels([country_codes[x] for x in _df.index],minor=False,fontsize=12)
ax.tick_params(axis='x',direction='in',pad=-10)
ax.set_xlabel('Article rank',fontsize=15)
#f.subplots_adjust(right=0.8)
#cbar_ax = f.add_axes([.95, 0.15, 0.025, .75])
#f.colorbar(_ax, cax=cbar_ax)

f.tight_layout();
f.savefig('en_lang_link_exists.png',dpi=150)
In [474]:
# Sum the per-source-language indicator matrices: each cell now counts how many
# of the 19 languages a given top article is linked to (itself included)
sum_lang_link = pd.DataFrame(np.zeros(lang_link_exists_dict['en'].shape),columns=lang_link_exists_dict['en'].columns)
for lang,_df in lang_link_exists_dict.iteritems():
    sum_lang_link = sum_lang_link + _df.values.astype(float)
#frac_sum_lang_link = sum_lang_link.apply(lambda x:x/19)
sum_lang_link.columns = [country_codes[i] for i in sum_lang_link.columns]

f, ax = plt.subplots(figsize=(10,5))
_df = sum_lang_link.ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]
_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=19)

ax.set_frame_on(False)
ax.set_xticks(np.arange(0,_x,10),minor=False)
ax.set_xticklabels(np.arange(0,_x,10),fontsize=12)
ax.set_xlabel('Article Rank',fontsize=15)
ax.set_title('Number of Languages with Article on Topic',fontsize=20)
ax.tick_params(axis='x',direction='in',pad=-10)
ax.set_yticks(np.arange(_y)+.5,minor=False)
ax.set_yticklabels(_df.index,minor=False)
ax.invert_yaxis()

#f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([.875, 0.15, 0.025, .75])
f.colorbar(_ax, cax=cbar_ax)

f.tight_layout()
f.savefig('sum_lang_link.png',dpi=200)
In [476]:
_s = sum_lang_link.ix[:100].apply(np.average,axis=1)
ax = plt.scatter(_s.index,_s.values,s=50,cmap='rainbow')
ax.axes.set_title('Coverage for Top 100 Stories',fontsize=20)
ax.axes.set_xlabel('Article Rank',fontsize=16)
ax.axes.set_ylabel('Number of Languages Covered',fontsize=16)
ax.axes.set_xlim((-1,101))
plt.tight_layout()
plt.savefig('top100_coverage.png',dpi=200)

But all of this analysis of article coverage is only as accurate as the assumption behind it: that the interlanguage links between articles are reliable. That is, if a "Eurovision 2014" article exists in all 19 languages, each of those 19 articles should link to the other 18 languages' versions. The analysis below shows this assumption is flawed.
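One rough way to see this directly from `translater_dict` is to check whether a link recorded in one direction is also recorded in the other. This is only a sketch: a missing reverse entry can also mean the article simply never appeared in the other language's top-25 lists.

In [ ]:
# translater_dict[source][target] maps a target-language title to its
# source-language title, so a reciprocated link means the reverse lookup
# exists in translater_dict[target][source]. (Rough check only: the reverse
# entry can also be absent because the article wasn't in that language's
# top-25 lists.)
total, unreciprocated = 0, 0
for source_lang, targets in translater_dict.iteritems():
    for target_lang, mapping in targets.iteritems():
        if source_lang == target_lang:
            continue
        reverse = translater_dict.get(target_lang, {}).get(source_lang, {})
        for target_title, source_title in mapping.iteritems():
            total += 1
            if reverse.get(source_title) != target_title:
                unreciprocated += 1
print '{0} of {1} recorded links lack a reverse link'.format(unreciprocated, total)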

Article language graph

Make a network of the interlanguage links to reveal the clusters and missing links between languages.

In [479]:
article_language_graph = nx.DiGraph()
article_language_mapper = dict() # This will be helpful later

for source_lang,d in translater_dict.iteritems():
    for target_lang,mapping in d.iteritems():
        for target_lang_article,source_lang_article in mapping.iteritems():
            article_language_graph.add_edge(target_lang_article,source_lang_article)
            article_language_graph.add_node(source_lang_article,lang=source_lang)
            article_language_graph.add_node(target_lang_article,lang=target_lang)
            
            # Populate the article_language_mapper
            if source_lang_article in article_language_mapper.keys():
                article_language_mapper[source_lang_article].append(source_lang)
            else:
                article_language_mapper[source_lang_article] = [source_lang]
            if target_lang_article in article_language_mapper.keys():
                article_language_mapper[target_lang_article].append(target_lang)
            else:
                article_language_mapper[target_lang_article] = [target_lang]
            
nx.write_gexf(article_language_graph,'article_language_graph.gexf')
article_language_mapper = {k:list(set(v)) for k,v in article_language_mapper.iteritems()}

with open('article_language_mapper.json','wb') as f:
    json.dump(article_language_mapper,f)
In [14]:
Image('article_language_links.png')
Out[14]:

There are at least two kinds of problems in these topical subgraphs. The first problem is that these subgraphs are missing many links within topics: articles about the same topic are linked in some languages, but not in others. This shows up in the figure below as the blue observations falling well below the red ideal line.
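For reference, the ideal line presumably corresponds to the complete-clique case: a topic covered in n languages with fully reciprocal interlanguage links forms a clique with n*(n-1)/2 undirected links.

In [ ]:
# Ideal (complete-clique) number of undirected interlanguage links for a
# topic covered in n languages
ideal_links = lambda n: n * (n - 1) / 2
print ideal_links(19)  # 171 links if all 19 languages interlink a single topic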

The second problem is that different topics are sometimes linked together. Because we are looking at 19 languages, a cluster should contain at most 19 articles. However, three clusters have more than 19 articles in them. This is an artifact of imprecise topical linking. For example, some languages link to the article on Ebolavirus while others link to Ebola virus disease. Entities like Taiwan and China refer to complex and overlapping concepts such as political entities (PRC vs. ROC), geographies ("Formosa" vs. the Mainland), and cultures. Of course, this disambiguation problem is almost certainly present in the topical subgraphs below size 19 as well, but I'm simply going to ignore it.
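A quick way to spot these merged-topic clusters is to count, within each oversized connected component, how many article titles come from the same language; any count above one means distinct topics have been chained together. This is a sketch that reuses `article_language_graph` from the cell above.

In [ ]:
# Any component with more than 19 nodes must contain at least two articles
# from the same language; show the most duplicated languages in each one.
_undirected = article_language_graph.to_undirected()
for _component in nx.connected_components(_undirected):
    if len(_component) > 19:
        _lang_counts = Counter(article_language_graph.node[n]['lang'] for n in _component)
        print len(_component), _lang_counts.most_common(3)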

In [480]:
topic_subgraphs = list(nx.components.connected_component_subgraphs(article_language_graph.to_undirected()))
subgraph_properties = [{'edges':_subgraph.number_of_edges(),'nodes':_subgraph.number_of_nodes(),'density':nx.density(_subgraph)} for _subgraph in topic_subgraphs]

# Uncomment to see what's in these subgraphs
#for _subgraph in topic_subgraphs:
#    if _subgraph.number_of_nodes() > 19:
#        print _subgraph.nodes()

subgraph_df = pd.DataFrame(subgraph_properties)
subgraph_df = subgraph_df[subgraph_df['nodes'] > 2]

f,ax = plt.subplo