Wikipedia data scraping functions

This notebook contains a variety of functions, primarily for accessing the MediaWiki API, to extract data on page revisions, user contributions, article hyperlinks, category memberships, and pageview dynamics.

These scripts rely on several non-standard libraries: wikitools, networkx, pandas, and simplejson (see the imports in the first cell below).

This code was primarily authored by Brian Keegan ([email protected]) in 2012 and 2013 with contributions from Nick Bennett ([email protected]).

Basic functions

In [2]:
from wikitools import wiki, api
import networkx as nx
from operator import itemgetter
from collections import Counter
import re, random, datetime, urlparse, urllib2, simplejson, copy
import pandas as pd
In [5]:
def is_ip(ip_string, masked=False):
	'''
	Input:
	ip_string - A string we'd like to check if it matches the pattern of a valid IP address.
	masked - A boolean indicating whether to also match masked octets such as "xxx" or "XXX".

	Output:
	A boolean value indicating whether the input matches the pattern of a valid IP address.
	'''
	if not isinstance(ip_string, str) and not isinstance(ip_string, unicode):
		return False
	if masked:
		ip_pattern = re.compile('((([\d]{1,3})|([Xx]{1,3}))\.){3}(([\d]{1,3})|([Xx]{1,3}))', re.UNICODE)
	else:
		ip_pattern = re.compile('([\d]{1,3}\.){3}([\d]{1,3})', re.UNICODE)
	return bool(ip_pattern.match(ip_string))

def convert_to_datetime(string):
    dt = datetime.datetime.strptime(string,'%Y-%m-%dT%H:%M:%SZ')
    return dt
    
def convert_from_datetime(dt):
    string = dt.strftime('%Y%m%d%H%M%S')
    return string

def convert_datetime_to_epoch(dt):
    epochtime = (dt - datetime.datetime(1970,1,1)).total_seconds()
    return epochtime

def wikipedia_query(query_params,lang='en'):
	site = wiki.Wiki(url='http://'+lang+'.wikipedia.org/w/api.php')
	request = api.APIRequest(site, query_params)
	result = request.query()
	return result[query_params['action']]

def short_wikipedia_query(query_params,lang='en'):
	site = wiki.Wiki(url='http://'+lang+'.wikipedia.org/w/api.php')
	request = api.APIRequest(site, query_params)
	# Don't do multiple requests
	result = request.query(querycontinue=False)
	return result[query_params['action']]

def random_string(le, letters=True, numerals=False):
	def rc():
		charset = []
		cr = lambda x,y: range(ord(x), ord(y) + 1)
		if letters:
			charset += cr('a', 'z')
		if numerals:
			charset += cr('0', '9')
		return chr(random.choice(charset))
	def rcs(k):
		return [rc() for i in range(k)]
	return ''.join(rcs(le))

def clean_revision(rev):
	# We must deal with some malformed user/userid values. Some 
	# revisions have the following problems:
	# 1. no 'user' or 'userid' keys and the existence of the 'userhidden' key
	# 2. 'userid'=='0' and 'user'=='Conversion script' and 'anon'==''
	# 3. 'userid'=='0' and 'user'=='66.92.166.xxx' and 'anon'==''
	# 4. 'userid'=='0' and 'user'=='204.55.21.34' and 'anon'==''
	# In these cases, we must substitute a placeholder value
	# for 'userid' to uniquely identify the respective kind
	# of malformed revision as above. 
	revision = rev.copy()
	if 'userhidden' in revision:
		revision['user'] = random_string(15, letters=False, numerals=True)
		revision['userid'] = revision['user']
	elif 'anon' in revision:
		if revision['user']=='Conversion script':
			revision['user'] = random_string(14, letters=False, numerals=True)
			revision['userid'] = revision['user']
		elif is_ip(revision['user']):
			# Just leaving this reflection in for consistency
			revision['user'] = revision['user']
			# The weird stuff about multiplying '0' by a number is to 
			# make sure that IP addresses end up looking like this:
			# 192.168.1.1 -> 192168001001
			# This serves to prevent collisions if the numbers were
			# simply joined by removing the periods:
			# 215.1.67.240 -> 215167240
			# 21.51.67.240 -> 215167240
			# This also results in the number being exactly 12 decimal digits.
			revision['userid'] = ''.join(['0' * (3 - len(octet)) + octet \
											for octet in revision['user'].split('.')])
		elif is_ip(revision['user'], masked=True):
			# Let's distinguish masked IP addresses, like
			# 192.168.1.xxx or 255.XXX.XXX.XXX, by setting 
			# 'user'/'userid' both to a random 13 digit number
			# or 13 character string. 
			# This will probably be unique and easily 
			# distinguished from an IP address (with 12 digits
			# or characters). 
			revision['user'] = random_string(13, letters=False, numerals=True)
			revision['userid'] = revision['user']
	return revision

def cast_to_unicode(string):
    if isinstance(string,str):
        try:
            string2 = string.decode('utf8')
        except UnicodeDecodeError:
            try:
                string2 = string.decode('latin1')
            except UnicodeDecodeError:
                print "Could not decode string with utf8 or latin1"
                string2 = string.decode('utf8','replace')
    elif isinstance(string,unicode):
        string2 = string
    else:
        string2 = unicode(string)
    return string2
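
As a quick check that these helpers behave as expected, here is a minimal usage sketch, assuming the imports cell above has been run; the query parameters and timestamps are arbitrary examples.

In [ ]:
# Hypothetical usage of the basic helpers above
example_result = wikipedia_query({'action': 'query',
                                  'titles': 'Main Page',
                                  'prop': 'info'}, lang='en')

# Round-trip a MediaWiki timestamp through the datetime helpers
dt = convert_to_datetime('2013-01-01T00:00:00Z')
print convert_from_datetime(dt)        # '20130101000000'
print convert_datetime_to_epoch(dt)    # seconds since 1970-01-01
print is_ip('192.168.1.1'), is_ip('192.168.1.xxx', masked=True)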

User revisions

In [2]:
def get_user_revisions(user,dt_end,lang):
    '''
    Input: 
    user - The name of a wikipedia user with no "User:" prefix, e.g. 'Madcoverboy' 
    dt_end - a datetime object indicating the maximum datetime to return for revisions
    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl

    Output:
    revisions - A list of revisions made by the given user, each given as a dictionary. This will
            include all of the properties requested in 'ucprop', as well as the title and id of
            the page that was edited. 
    '''
    user = cast_to_unicode(user)
    revisions = list()
    dt_end_string = convert_from_datetime(dt_end)
    result = wikipedia_query({'action':'query',
                              'list': 'usercontribs',
                              'ucuser': u"User:"+user,
                              'ucprop': 'ids|title|timestamp|sizediff',
                              #'ucnamespace':'0',
                              'uclimit': '500',
                              'ucend':dt_end_string},lang)
    if result and 'usercontribs' in result.keys():
        r = result['usercontribs']
        r = sorted(r, key=lambda revision: revision['timestamp'])
        for revision in r:
            # Sometimes the size key is not present, so we'll set it to 0 in those cases
            revision['sizediff'] = revision.get('sizediff', 0)
            revision['timestamp'] = convert_to_datetime(revision['timestamp'])
            revisions.append(revision)
    return revisions

def get_user_properties(user,lang):
    '''
    Input:
    user - a string with no "User:" prefix corresponding to the username (e.g., "Madcoverboy")
    lang - a string (usually two characters) for the language version of Wikipedia to query

    Output:
    result - a dictionary containing attributes about the user
    '''
    user = cast_to_unicode(user)
    result = wikipedia_query({'action':'query',
                                'list':'users',
                                'usprop':'blockinfo|groups|editcount|registration|gender',
                                'ususers':user},lang)
    return result
    
def make_user_alters(revisions):
    '''
    Input:
    revisions - a list of revisions generated by get_user_revisions

    Output:
    alters - a dictionary keyed by page name that returns a dictionary containing
        the count of how many times the user edited the page, the timestamp of the user's
        earliest edit to the page, the timestamp of the user's latest edit to the page, and 
        the namespace of the page itself
    '''
    alters = dict()
    for rev in revisions:
        if rev['title'] not in alters.keys():
            alters[rev['title']] = dict()
            alters[rev['title']]['count'] = 1
            alters[rev['title']]['min_timestamp'] = rev['timestamp']
            alters[rev['title']]['max_timestamp'] = rev['timestamp']
            alters[rev['title']]['ns'] = rev['ns']
        else:
            alters[rev['title']]['count'] += 1
            alters[rev['title']]['max_timestamp'] = rev['timestamp']
    return alters
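
For example, the user-level helpers above might be combined as in the following sketch; the username comes from the docstring example and the cutoff date is arbitrary.

In [ ]:
# Hypothetical usage: fetch a user's contributions and summarize the pages they edited
cutoff = datetime.datetime(2013, 1, 1)
user_revs = get_user_revisions('Madcoverboy', cutoff, 'en')
user_alters = make_user_alters(user_revs)
user_info = get_user_properties('Madcoverboy', 'en')
print len(user_revs), 'revisions across', len(user_alters), 'pages'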

Page revisions

In [3]:
def rename_on_redirect(article_title,lang='en'):
    '''
    Input:
    article_title - a string with the name of the article or page that may be redirected to another title
    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl

    Output:
    article_title - a string with the name of the article or page that the redirect resolves to
    '''
    result = wikipedia_query({'titles': article_title,
                                  'prop': 'info',
                                  'action': 'query',
                                  'redirects': 'True'},lang)
    if 'redirects' in result.keys() and 'pages' in result.keys():
        article_title = result['redirects'][0]['to']
    return article_title

def get_page_revisions(article_title,dt_start,dt_end,lang):
    '''
    Input: 
    article_title - A string with the name of the article or page to crawl
    dt_start - A datetime object indicating the minimum datetime to return for revisions
    dt_end - a datetime object indicating the maximum datetime to return for revisions
    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl
    
    Output:
    revisions - A list of revisions for the given article, each given as a dictionary. This will
            include all of the properties requested in 'rvprop', and will also include the
            title and id of the source article. 
    '''
    article_title = rename_on_redirect(article_title,lang)
    dt_start_string = convert_from_datetime(dt_start)
    dt_end_string = convert_from_datetime(dt_end) 
    revisions = list()
    result = wikipedia_query({'titles': article_title,
                              'prop': 'revisions',
                              'rvprop': 'ids|timestamp|user|userid|size',
                              'rvlimit': '5000',
                              'rvstart': dt_start_string,
                              'rvend': dt_end_string,
                              'rvdir': 'newer',
                              'action': 'query'},lang)
    if result and 'pages' in result.keys():
        page_number = result['pages'].keys()[0]
        try:
            r = result['pages'][page_number]['revisions']
            for revision in r:
                revision['pageid'] = page_number
                revision['title'] = result['pages'][page_number]['title']
                # Sometimes the size key is not present, so we'll set it to 0 in those cases
                revision['size'] = revision.get('size', 0)
                revision['timestamp'] = convert_to_datetime(revision['timestamp'])
                revisions.append(revision)
        except KeyError:
            revisions = list()
    return revisions

def make_page_alters(revisions):
    '''
    Input:
    revisions - a list of revisions generated by get_page_revisions

    Output:
    alters - a dictionary keyed by user name that returns a dictionary containing
    the count of how many times the user edited the page, the timestamp of the user's
    earliest edit to the page, and the timestamp of the user's latest edit to the page
    '''
    alters = dict()
    for rev in revisions:
        if rev['user'] not in alters.keys():
            alters[rev['user']] = dict()
            alters[rev['user']]['count'] = 1
            alters[rev['user']]['min_timestamp'] = rev['timestamp']
            alters[rev['user']]['max_timestamp'] = rev['timestamp']
        else:
            alters[rev['user']]['count'] += 1
            alters[rev['user']]['max_timestamp'] = rev['timestamp']
    return alters
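
A sketch of how these two functions fit together; the article title and date range are arbitrary examples.

In [ ]:
# Hypothetical usage: crawl an article's revision history and summarize its editors
start = datetime.datetime(2012, 1, 1)
end = datetime.datetime(2013, 1, 1)
page_revs = get_page_revisions('Hurricane Sandy', start, end, 'en')
page_alters = make_page_alters(page_revs)
top_editors = sorted(page_alters.items(), key=lambda item: item[1]['count'], reverse=True)[:10]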
In [202]:
def get_page_content(page_title,lang):
    '''
    Input: 
    page_title - A string with the name of the article or page to crawl
    lang - A string (typically two characters) indicating the language version of Wikipedia to crawl

    Output:
    revisions_dict - A dictionary of revisions for the given article keyed by revision ID, returning a 
            dictionary of revision attributes. These attributes include all of the properties requested 
            in 'rvprop', the outlinks parsed from the content, and the title and id of the source article. 
    '''
    page_title = rename_on_redirect(page_title,lang)
    revisions_dict = dict()
    result = wikipedia_query({'titles': page_title,
                              'prop': 'revisions',
                              'rvprop': 'ids|timestamp|user|userid|size|content',
                              'rvlimit': '5000',
                              'action': 'query'},lang)
    if result and 'pages' in result.keys():
        page_number = result['pages'].keys()[0]
        revisions = result['pages'][page_number]['revisions']
        for revision in revisions:
            rev = dict()
            rev['pageid'] = page_number
            rev['title'] = result['pages'][page_number]['title']
            rev['size'] = revision.get('size', 0) # Sometimes the size key is not present, so we'll set it to 0 in those cases
            rev['timestamp'] = convert_to_datetime(revision['timestamp'])
            rev['content'] = revision.get('*',unicode()) # Sometimes content hidden, return with empty unicode string
            rev['links'] = link_finder(rev['content'])
            rev['username'] = revision['user']
            rev['userid'] = revision['userid']
            rev['revid'] = revision['revid']
            revisions_dict[revision['revid']] = rev
    return revisions_dict

Category members

In [4]:
def get_category_members(category_name, depth, lang='en'):
    '''
    Input: 
    category_name - The name of a Wikipedia(en) category, e.g. 'Category:2001_fires'. 
    depth - A non-negative integer reflecting how many levels of sub-categories to crawl
    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl

    Output:
    articles - A list of the titles of the articles found within the given category or one of its
        subcategories, explored recursively. 
    '''
    articles = []
    if depth < 0:
        return articles
    
    #Begin crawling articles in category
    results = wikipedia_query({'list': 'categorymembers',
                                   'cmtitle': category_name,
                                   'cmtype': 'page',
                                   'cmlimit': '500',
                                   'action': 'query'},lang)  
    if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:
        for i, page in enumerate(results['categorymembers']):
            article = page['title']
            articles.append(article)
    
    # Begin crawling subcategories
    results = wikipedia_query({'list': 'categorymembers',
                                   'cmtitle': category_name,
                                   'cmtype': 'subcat',
                                   'cmlimit': '500',
                                   'action': 'query'},lang)
    subcategories = []
    if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:
        for i, category in enumerate(results['categorymembers']):
            cat_title = category['title']
            subcategories.append(cat_title)
    for category in subcategories:
        articles += get_category_members(category,depth-1,lang)      
    return articles

def get_page_categories(page_title,lang='en'):
    '''
    Input:
    page_title - A string with the name of the article or page to crawl
    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl

    Output:
    categories - A list of the names of the categories of which the page is a member
    '''
    page_title = rename_on_redirect(page_title)
    results = wikipedia_query({'prop': 'categories',
                                   'titles': page_title,
                                   'cllimit': '500',
                                   'clshow':'!hidden',
                                   'action': 'query'},lang)
    categories = list()
    if 'pages' in results.keys():
        page_number = results['pages'].keys()[0]
        categories = results['pages'][page_number]['categories']
        categories = [i['title'] for i in categories]
        categories = [i for i in categories if i != u'Category:Living people']
    else:
        print u"{0} not found in category results".format(page_title)
    return categories
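
A minimal sketch of the category helpers; the category name comes from the docstring example and the article title is an arbitrary example.

In [ ]:
# Hypothetical usage: list the pages in a category and one level of subcategories,
# then look up the categories a single page belongs to
members = get_category_members('Category:2001_fires', 1, 'en')
page_cats = get_page_categories('Hurricane Sandy', 'en')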
In [5]:
def get_page_outlinks(page_title,lang='en'):
    '''
    Input:
    page_title - A string with the name of the article or page to crawl
    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl

    Output:
    outlinks - A list of all "alter" pages that link out from the current version of the "ego" page

    Notes:
    This uses API calls to return all [[links]] which may be slower and result in overlinking from templates
    '''
    # This approach is susceptible to 'overlinking' as it includes links from templates
    page_title = cast_to_unicode(page_title)
    page_title = rename_on_redirect(page_title)
    result = wikipedia_query({'titles': page_title,
                                  'prop': 'links',
                                  'pllimit': '500',
                                  'plnamespace':'0',
                                  'action': 'query'},lang)
    outlinks = list()
    if 'pages' in result.keys():
        page_number = result['pages'].keys()[0]
        results = result['pages'][page_number]['links']
        outlinks = [l['title'] for l in results]
    else:
        print u"Error: No links found in {0}".format(page_title)
    return outlinks

def get_page_inlinks(page_title,lang='en'):
    '''
    Input:
    page_title - A string with the name of the article or page to crawl
    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl

    Output:
    inlinks - A list of all "alter" pages that link in to the current version of the "ego" page
    '''
    page_title = cast_to_unicode(page_title)
    page_title = rename_on_redirect(page_title)
    result = wikipedia_query({'bltitle': page_title,
                                  'list': 'backlinks',
                                  'bllimit': '500',
                                  'blnamespace':'0',
                                  'blfilterredir':'nonredirects',
                                  'action': 'query'},lang)
    inlinks = list()
    if 'backlinks' in result.keys():
        results = result['backlinks']
        inlinks = [l['title'] for l in results]
    else:
        print u"Error: No links found in {0}".format(page_title)
    return inlinks

# Links inside templates are included which results in completely-connected components
# Remove links from templates by getting a list of templates used across all pages
def get_page_templates(page_title,lang='en'):
    '''
    Input:
    page_title - A string with the name of the article or page to crawl
    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl

    Output:
    templates - A list of all the templates (which contain redundant links) in the current version
    '''
    page_title = cast_to_unicode(page_title)
    page_title = rename_on_redirect(page_title)
    result = wikipedia_query({'titles': page_title,
                                  'prop': 'templates',
                                  'tllimit': '500',
                                  'action': 'query'},lang)
    templates = list()
    if 'pages' in result.keys():
        page_id = result['pages'].keys()[0]
        templates = [i['title'] for i in result['pages'][page_id]['templates']]
    return templates

def get_page_links(page_title,lang='en'):
    '''
    Input:
    page_title - A string with the name of the article or page to crawl that is the "ego" page
    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl

    Output:
    links - A dictionary keyed by ['in','out'] of all "alter" pages that link in to and out from the 
        current version of the "ego" page
    '''
    links=dict()
    links['in'] = get_page_inlinks(page_title,lang)
    links['out'] = get_page_outlinks(page_title,lang)
    return links

# Identify links based on content of revisions
def link_finder(content_string):
    '''
    Input:
    content_string - A string containing the raw wiki-markup for a page

    Output:
    links - A list of all "alter" pages that link out from the current version of the "ego" page

    Notes:
    This uses regular expressions to coarsely parse the content for instances of [[links]] and likely returns messy data
    '''
    links = list()
    for i,j in re.findall(r'\[\[([^|\]]*\|)?([^\]]+)\]\]',content_string):
        if len(i) == 0:
            links.append(j)
        elif u'#' not in i :
            links.append(i[:-1])
        elif u'#' in i:
            new_i = i[:i.index(u'#')]
            links.append(new_i)
    links = [l for l in links if u'|' not in l and u'Category:' not in l and u'File:' not in l]
    return links

def get_page_outlinks_from_content(page_title,lang='en'):
    '''
    Input:
    page_title - A string with the name of the article or page to crawl that is the "ego" page
    lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl

    Output:
    links - A list of all "alter" pages that link out from the current version of the "ego" page

    Notes:
    This uses regular expressions to coarsely parse the content for instances of [[links]] and may be messy
    '''
    page_title = cast_to_unicode(page_title)
    page_title = rename_on_redirect(page_title)
    
    # Get content from most recent revision of an article
    result = short_wikipedia_query({'titles': page_title,
                                  'prop': 'revisions',
                                  'rvlimit': '1',
                                  'rvprop':'ids|timestamp|user|userid|content',
                                  'action': 'query'},lang)
    if 'pages' in result.keys():
        page_id = result['pages'].keys()[0]
        content = result['pages'][page_id]['revisions'][0]['*']
        links = link_finder(content)
    else:
        print u'...Error in {0}'.format(page_title)
        links = list()
        
    return links
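
The two approaches to link extraction can be compared directly; a sketch with an arbitrary example article:

In [ ]:
# Hypothetical usage: compare API-reported links against links parsed from the wiki-markup
api_links = get_page_links('Hurricane Sandy', 'en')   # {'in': [...], 'out': [...]}
content_links = get_page_outlinks_from_content('Hurricane Sandy', 'en')
print len(api_links['out']), 'API outlinks vs.', len(set(content_links)), 'unique links parsed from content'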

Discussion

In [6]:
def get_user_outdiscussion(user_name,dt_end,lang='en'):
    '''
    Input:
    user_name - The name of a "ego" wikipedia user with no "User:" prefix, e.g. 'Madcoverboy' 
    dt_end - a datetime object indicating the maximum datetime to return for revisions
    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl

    Output:
    users - A dictionary keyed by the name of each "alter" user whose talk page the ego has posted to,
        returning the count of posts and the timestamps of the ego's earliest and latest posts
    '''
    # Query the ego's contributions, restricted to the User talk namespace (3)
    user_name = cast_to_unicode(user_name)
    users = dict()
    dt_end_string = convert_from_datetime(dt_end)
    result = wikipedia_query({'action':'query',
                                  'list': 'usercontribs',
                                  'ucuser': u"User:"+user_name,
                                  'ucprop': 'ids|title|timestamp|sizediff',
                                  'ucnamespace':'3',
                                  'uclimit': '500',
                                  'ucend':dt_end_string},lang)
    if result and 'usercontribs' in result.keys():
        r = result['usercontribs']
        for rev in r:
            alter = rev['title'][10:] # Ignore "User talk:"
            if alter not in users.keys():
                users[alter] = dict()
                users[alter]['count'] = 1
                users[alter]['min_timestamp'] = rev['timestamp']
                users[alter]['max_timestamp'] = rev['timestamp']
            else:
                users[alter]['count'] += 1
                users[alter]['max_timestamp'] = rev['timestamp']
    return users

def get_user_indiscussion(user_name,dt_end,lang='en'):
    '''
    Input:
    user_name - The name of a "ego" wikipedia user with no "User:" prefix, e.g. 'Madcoverboy' 
    dt_end - a datetime object indicating the maximum datetime to return for revisions
    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl

    Output:
    users - A dictionary keyed by the name of each "alter" user who has ever posted to the ego's talk page,
        returning the count of posts and the timestamps of the alter's earliest and latest posts
    '''
    # Query the revision history of the ego's own user talk page
    user_name = cast_to_unicode(user_name)
    users = dict()
    dt_end_string = convert_from_datetime(dt_end)
    result = wikipedia_query({'titles': u'User talk:'+user_name,
                                  'prop': 'revisions',
                                  'rvprop': 'ids|timestamp|user|userid|size',
                                  'rvlimit': '5000',
                                  'rvend': dt_end_string,
                                  'action': 'query'},lang)
    if result and 'pages' in result.keys():
        page_number = result['pages'].keys()[0]
        try:
            r = result['pages'][page_number]['revisions']
            for rev in r:
                if rev['user'] not in users.keys():
                    users[rev['user']] = dict()
                    users[rev['user']]['count'] = 1
                    users[rev['user']]['min_timestamp'] = rev['timestamp']
                    users[rev['user']]['max_timestamp'] = rev['timestamp']
                else:
                    users[rev['user']]['count'] += 1
                    users[rev['user']]['max_timestamp'] = rev['timestamp']
        except KeyError:
            pass
    return users

def get_user_discussion(user_name,dt_end,lang='en'):
    '''
    Input:
    user_name - The name of a "ego" wikipedia user with no "User:" prefix, e.g. 'Madcoverboy' 
    dt_end - a datetime object indicating the maximum datetime to return for revisions
    lang - a string (typically two characters) indicating the language version of Wikipedia to crawl

    Output:
    users - A dictionary keyed by the values ['in','out'] that combines both get_user_outdiscussion and
        get_user_indiscussion
    '''
    users=dict()
    users['out'] = get_user_outdiscussion(user_name,dt_end,lang)
    users['in'] = get_user_indiscussion(user_name,dt_end,lang)
    return users
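
A sketch of the discussion helpers; the username comes from the docstring example and the cutoff date is arbitrary.

In [ ]:
# Hypothetical usage: who the ego has posted to, and who has posted to the ego, back to a cutoff date
cutoff = datetime.datetime(2013, 1, 1)
talk = get_user_discussion('Madcoverboy', cutoff, 'en')
print len(talk['out']), 'talk pages posted to,', len(talk['in']), 'users posting to the ego'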

Trajectories

In [7]:
def make_article_trajectory(revisions):
    '''
    Input:
    revisions - A list of revisions generated by get_page_revisions

    Output:
    g - A NetworkX DiGraph object corresponding to the trajectory of an article moving between users
        Nodes are users and links from i to j exist when user i made a revision immediately following user j
    '''
    g = nx.DiGraph()
    # Sort revisions on ascending timestamp
    sorted_revisions = sorted(revisions,key=lambda k:k['timestamp'])

    # Don't use the last revision
    for num,rev in enumerate(sorted_revisions[:-1]):
        # Edge exists between this revision's user and the next revision's user
        edge = (rev['user'],sorted_revisions[num+1]['user'])
        if g.has_edge(*edge):
            g[edge[0]][edge[1]]['weight'] += 1
        else:
            g.add_edge(*edge,weight=1)
    return g

def make_editor_trajectory(revisions):
    '''
    Input:
    revisions - A list of revisions generated by get_user_revisions

    Output:
    g - A NetworkX DiGraph object corresponding to the trajectory of a user moving between articles
        Nodes are pages and links from i to j exist when page i was edited by the user immediately following page j
    '''
    g = nx.DiGraph()
    # Sort revisions on ascending timestamp
    sorted_revisions = sorted(revisions,key=lambda k:k['timestamp'])

    # Don't use the last revision
    for num,rev in enumerate(sorted_revisions[:-1]):
        # Edge exists between this revision's page and the next revision's page
        edge = (rev['title'],sorted_revisions[num+1]['title'])
        if g.has_edge(*edge):
            g[edge[0]][edge[1]]['weight'] += 1
        else:
            g.add_edge(*edge,weight=1)
    return g
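
A sketch of building a trajectory network from a page's revision history; the article title and dates are arbitrary examples.

In [ ]:
# Hypothetical usage: build an article trajectory network and inspect its size
start = datetime.datetime(2012, 1, 1)
end = datetime.datetime(2013, 1, 1)
traj_revs = get_page_revisions('Hurricane Sandy', start, end, 'en')
traj_g = make_article_trajectory(traj_revs)
print traj_g.number_of_nodes(), 'editors,', traj_g.number_of_edges(), 'transitions'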

Pageviews

In [8]:
def fixurl(url):
    # turn string into unicode
    if not isinstance(url,unicode):
        url = url.decode('utf8')

    # parse it
    parsed = urlparse.urlsplit(url)

    # divide the netloc further
    userpass,at,hostport = parsed.netloc.rpartition('@')
    user,colon1,pass_ = userpass.partition(':')
    host,colon2,port = hostport.partition(':')

    # encode each component
    scheme = parsed.scheme.encode('utf8')
    user = urllib2.quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = urllib2.quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    host = host.encode('idna')
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    path = '/'.join(  # could be encoded slashes!
        urllib2.quote(urllib2.unquote(pce).encode('utf8'),'')
        for pce in parsed.path.split('/')
    )
    query = urllib2.quote(urllib2.unquote(parsed.query).encode('utf8'),'=&?/')
    fragment = urllib2.quote(urllib2.unquote(parsed.fragment).encode('utf8'))

    # put it back together
    netloc = ''.join((user,colon1,pass_,at,host,colon2,port))
    return urlparse.urlunsplit((scheme,netloc,path,query,fragment))

def convert_months_to_strings(m):
	if len(str(m)) > 1:
		new_m = unicode(m)
	else:
		new_m = u'0'+unicode(m)
	return new_m

def get_url(article_name,lang,month,year):
    url = u"http://stats.grok.se/json/" + lang + u"/" + unicode(year) + convert_months_to_strings(month) + u"/" + article_name
    fixed_url = fixurl(url)
    return fixed_url

def requester(url):
    opener = urllib2.build_opener()
    req = urllib2.Request(url)
    f = opener.open(req)
    r = simplejson.load(f)
    result = pd.Series(r['daily_views'])
    return result

def clean_timestamps(df):
    to_drop = list()
    for d in df.index:
        try:
            datetime.date(int(d[0:4]),int(d[5:7]),int(d[8:10]))
        except ValueError:
            to_drop.append(d)
    df2 = df.drop(to_drop,axis=0)
    df2.index = pd.to_datetime(df2.index)
    return df2

def get_pageviews(article,lang,min_date,max_date):
    rng = pd.date_range(min_date,max_date,freq='M')
    rng2 = [(i.month,i.year) for i in rng]
    ts = pd.Series()
    for i in rng2:
        url = get_url(article,lang,i[0],i[1])
        result = requester(url)
        ts = pd.Series.append(result,ts)
    ts = ts.sort_index()
    ts = clean_timestamps(ts)
    ts = ts.asfreq('D')
    return ts

def make_pageview_df(article_list,lang,min_date,max_date):
    df = pd.DataFrame(index=pd.date_range(start=min_date,end=max_date))
    l = len(article_list)
    for num,a in enumerate(article_list):
        try:
            print "{0} / {1} : {2}".format(num+1,l,a)
            ts = get_pageviews(a,lang,min_date,max_date)
            df[a] = ts
        except:
            print u'Something happened to {0}'.format(unicode(a))
            pass
    return df
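
A sketch of the pageview helpers, assuming the stats.grok.se service is reachable (it has since been retired, so this is illustrative only); the article names and date range are arbitrary examples.

In [ ]:
# Hypothetical usage: daily pageviews for one article, then a DataFrame for several articles
views = get_pageviews('Hurricane Sandy', 'en', '2012-10-01', '2012-12-31')
views_df = make_pageview_df(['Hurricane Sandy', 'Hurricane Katrina'], 'en', '2012-10-01', '2012-12-31')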

Make networks

In [10]:
def editors_other_activity(article_title,dt_start,dt_end,ignorelist,lang):
    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
    revision_alters = make_page_alters(revisions)
    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
    
    alter_contributions = dict()
    for num,editor_alter in enumerate(revision_alters2.keys()):
        print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
        alter_contributions[editor_alter] = get_user_revisions(editor_alter,dt_start,lang)
        
    #el = directed_dict_to_edgelist(alter_discussions)
    return revisions,alter_contributions

def editing_primary_discussion_secondary(article_title,dt_start,dt_end,ignorelist,lang):
    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
    revision_alters = make_page_alters(revisions)
    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
    
    alter_discussions = dict()
    for num,editor_alter in enumerate(revision_alters2.keys()):
        print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
        alter_discussions[editor_alter] = get_user_discussion(editor_alter,dt_start,lang)
        
    #el = directed_dict_to_edgelist(alter_discussions)
    return revisions,alter_discussions
In [7]:
g = nx.DiGraph()
for user,revisions in alter_contribs.iteritems():
    #print user
    for rev in revisions:
        article = rev['title']
        
        # If edge already exists, iterate weight
        if g.has_edge(user,article):
            g[user][article]['weight'] += 1
            
        # Otherwise create editor node and properties then add new edge
        else:
            # If editor node is not invalid or an IP, do a bunch of stuff
            if 'invalid' not in user_props[user]['users'][0].keys():
                ns = rev['ns']
                gen = user_props[user]['users'][0]['gender']
                edits = user_props[user]['users'][0]['editcount']
                
                # Registration returns None sometimes
                start = user_props[user]['users'][0]['registration']
                if start is not None:
                    start = convert_datetime_to_epoch(convert_to_datetime(start))
                else:
                    start = u'unknown'
                
                # Add node
                g.add_node(user, gender = gen, startdate = start, edits = edits, nodetype = 'user', ns='user')
                g.add_node(article, gender = 'page', startdate = 'page', edits = 'page', sysop = 'page', autoconfirmed = 'page', nodetype = 'page',namespace=ns)
                
                if 'sysop' in user_props[user]['users'][0]['groups']:
                    g.node[user]['sysop'] = 1
                else:
                    g.node[user]['sysop'] = 0
                
                if 'autoconfirmed' in user_props[user]['users'][0]['groups']:
                    g.node[user]['autoconfirmed'] = 1
                else:
                    g.node[user]['autoconfirmed'] = 0
                
                g.add_edge(user,article,weight=1)
            
            # If editor node is invalid or an IP, populate fields with placeholder values
            else:
                g.add_node(user,gender=u'unknown',startdate=u'unknown',edits=u'unknown',sysop=0,autoconfirmed=0,nodetype='user')

# Remove Talk:Chelsea_Manning because it's connected to everything
g.remove_node('Talk:Chelsea Manning')
In [8]:
editors = [title for title,attribs in g.nodes(data=True) if attribs['nodetype'] == 'user']
#pages = [title for title,attribs in g.nodes(data=True) if attribs['nodetype'] == 'page']
g2 = g.to_undirected()
g3 = nx.bipartite.weighted_projected_graph(g2,editors)
#g4 = nx.bipartite.weighted_projected_graph(g2,pages)
In [9]:
nx.write_graphml(g,'Manning_talk_coauthorship.graphml')
nx.write_gexf(g,'Manning_talk_coauthorship.gexf')
nx.write_graphml(g3,'Manning_talk_coediting.graphml')
nx.write_gexf(g3,'Manning_talk_coediting.gexf')
In [11]:
def editing_primary_hyperlink_secondary(article_title,dt_start,dt_end,ignorelist,lang='en'):
    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
    revision_alters = make_page_alters(revisions)
    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
    
    alter_hyperlinks = dict()
    for num,editor_alter in enumerate(revision_alters2.keys()):
        print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
        alter_hyperlinks[editor_alter] = get_page_outlinks(editor_alter,lang)
        
    #el = directed_dict_to_edgelist(alter_hyperlinks)
    return revisions,alter_hyperlinks
In [12]:
def two_step_editing(article_title,dt_start,dt_end,ignorelist,lang='en'):
    revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
    revision_alters = make_page_alters(revisions)
    revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
    
    alter_revisions = dict()
    for num,editor_alter in enumerate(revision_alters2.keys()):
        print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
        alter_revisions[editor_alter] = get_user_revisions(editor_alter,dt_start,lang)
    return revisions, alter_revisions
In [13]:
def two_step_outlinks(page_title):
    page_alters = dict()
    templates_dict = dict()
    
    links = get_page_outlinks(page_title)
    page_alters[unicode(page_title)] = links
    
    templates = get_page_templates(page_title)
    templates_dict[page_title] = templates
    
    l = len(links)
    for num,link in enumerate(links):
        print u"{0} / {1} : {2}".format(num+1,l,link)
        try:
            page_alters[link] = get_page_outlinks(link)
            templates_dict[link] = get_page_templates(link)
        except:
            print u"...{0} doesn't exist".format(link)
            pass
    return page_alters,templates_dict
In [14]:
def two_step_outlinks_from_content(page_title):
    page_alters = dict()
    
    links = get_page_outlinks_from_content(page_title)
    unique_links = list(set(links))
    page_alters[unicode(page_title)] = unique_links
    
    l = len(unique_links)
    for num,link in enumerate(unique_links):
        print u"{0} / {1} : {2}".format(num+1,l,link)
        try:
            page_alters[link] = get_page_outlinks_from_content(link)
        except:
            print u"...{0} doesn't exist".format(link)
            pass
    return page_alters
In [15]:
def make_hyperlink_network(hyperlink_dict):
    hyperlink_g = nx.DiGraph()
    for page,links in hyperlink_dict.iteritems():
        for link in links:
            # Only include links to 1-step alter pages, not 2-step alters' alters
            if link in hyperlink_dict.keys():
                hyperlink_g.add_edge(page,link)
    return hyperlink_g
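
A sketch tying the content-based crawler to the hyperlink network builder; the seed article is an arbitrary example.

In [ ]:
# Hypothetical usage: crawl a seed page's outlinks and its alters' outlinks, then build the hyperlink network
seed_alters = two_step_outlinks_from_content('Hurricane Sandy')
hyperlink_g = make_hyperlink_network(seed_alters)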
In [16]:
def make_shared_user_editing_network(alter_revisions_dict,threshold):
    
    # Make the graph
    net = nx.DiGraph()
    for editor,revisions in alter_revisions_dict.iteritems():
        articles = [r['title'] for r in revisions]
        for num,article in enumerate(articles[:-1]):
            if net.has_edge(article,articles[num+1]):
                net[article][articles[num+1]]['weight'] += 1
            else:
                net.add_edge(article,articles[num+1],weight=1)
                
    # If an edge is below the threshold weight or is a self-loop, remove it.
    # Collect the edges first so the graph is not modified while iterating over it.
    edges_to_remove = [(i,j) for i,j,d in net.edges(data=True) if d['weight'] < threshold or i == j]
    net.remove_edges_from(edges_to_remove)
    
    # Remove resulting isolates
    isolates = nx.isolates(net)
    for isolate in isolates:
        net.remove_node(isolate)
    
    return net
In [17]:
# Take the alter_revisions_dict keyed by user with a list of revisions
# And return an inverted alter_pages keyed by page with a dictionary of users
def invert_alter_revisions(alter_revisions_dict):
    alter_pages = dict()
    for user,revisions in alter_revisions_dict.iteritems():
        temp_list = list()
        for revision in revisions:
            temp_list.append(revision['title'])
        alter_pages[user] = dict(Counter(temp_list))

    inverted_alter_pages = dict()
    for user,counts in alter_pages.iteritems():
        for article,count in counts.iteritems():
            try:
                inverted_alter_pages[article][user] = count
            except KeyError:
                inverted_alter_pages[article] = dict()
                inverted_alter_pages[article][user] = count
    
    return inverted_alter_pages
In [18]:
def make_shared_page_editing_network(alter_revisions_dict,threshold):
    
    inverted_alter_revisions_dict = invert_alter_revisions(alter_revisions_dict)
    
    # Make the graph
    g = nx.DiGraph()
    for page,users in inverted_alter_revisions_dict.iteritems():
        user_list = users.keys()
        for num,user in enumerate(user_list[:-1]):
            next_user = user_list[num+1]
            if g.has_edge(user,next_user):
                g[user][next_user]['weight'] += 1
            else:
                g.add_edge(user,next_user,weight=1)
                
    # If an edge is below the threshold weight or is a self-loop, remove it.
    # Collect the edges first so the graph is not modified while iterating over it.
    edges_to_remove = [(i,j) for i,j,d in g.edges(data=True) if d['weight'] < threshold or i == j]
    g.remove_edges_from(edges_to_remove)
    
    # Remove resulting isolates
    isolates = nx.isolates(g)
    for isolate in isolates:
        g.remove_node(isolate)
    
    return g
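
A sketch of how the shared-editing network builders consume the output of a two-step crawl such as editors_other_activity; the article title, dates, and threshold are arbitrary examples.

In [ ]:
# Hypothetical usage: crawl an article's editors' other contributions,
# then build shared-editing networks among articles and among editors
start = datetime.datetime(2012, 1, 1)
end = datetime.datetime(2013, 1, 1)
revs, alter_revisions = editors_other_activity('Hurricane Sandy', start, end, [], 'en')
shared_articles = make_shared_user_editing_network(alter_revisions, 2)
shared_editors = make_shared_page_editing_network(alter_revisions, 2)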
In [19]:
def make_category_network(categories_dict):
    '''Takes a dictionary keyed by page name with a list of categories as values
    Returns a two-mode page-category network; using a DiGraph with edges from pages to
    categories enforces the two-mode structure
    '''
    g_categories=nx.DiGraph()

    for page,categories in categories_dict.iteritems():
        for category in categories:
            g_categories.add_node(page,node_type='page')
            g_categories.add_node(category,node_type='category')
            g_categories.add_edge(page,category)

    return g_categories
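
A sketch of building a page-category network for a handful of pages; the page titles are arbitrary examples.

In [ ]:
# Hypothetical usage: build a two-mode page-category network
pages = ['Hurricane Sandy', 'Hurricane Katrina']
categories_dict = {p: get_page_categories(p, 'en') for p in pages}
category_g = make_category_network(categories_dict)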

Example

Select articles from the 2012 Mexican elections category on the Spanish Wikipedia, keeping the state elections in which more than one gubernatorial candidate has an existing article ("blue links").

In [5]:
articles = ['Elecciones estatales de 2012 en Yucatán','Elecciones estatales en Tabasco de 2012','Elecciones estatales en San Luis Potosí de 2012','Elecciones estatales de Morelos de 2012','Elecciones estatales en Jalisco de 2012','Elecciones estatales en Guanajuato de 2012','Elecciones en el Distrito Federal (México) de 2012','Elecciones estatales en Chiapas de 2012']
articles = [i.decode('utf8') for i in articles]
In [47]:
category_members = get_category_members('Categoría:Elecciones_de_México_de_2012',1,'es')
In [48]:
category_members
Out[48]:
[u'Elecciones federales en M\xe9xico de 2012',
 u'Elecciones estatales de Campeche de 2012',
 u'Elecciones estatales en Chiapas de 2012',
 u'Elecciones estatales de Colima de 2012',
 u'Elecciones en el Distrito Federal (M\xe9xico) de 2012',
 u'Elecciones estatales del Estado de M\xe9xico de 2012',
 u'Elecciones estatales en Guanajuato de 2012',
 u'Elecciones estatales de Guerrero de 2012',
 u'Elecciones estatales extraordinarias de Hidalgo de 2012',
 u'Elecciones estatales en Jalisco de 2012',
 u'Elecciones estatales extraordinarias de Michoac\xe1n de 2012',
 u'Elecciones estatales de Morelos de 2012',
 u'Elecciones estatales de Nuevo Le\xf3n de 2012',
 u'Elecciones estatales de Quer\xe9taro de 2012',
 u'Elecciones estatales en San Luis Potos\xed de 2012',
 u'Elecciones estatales en Tabasco de 2012',
 u'Elecciones estatales de 2012 en Yucat\xe1n',
 u'Elecciones estatales extraordinarias de Yucat\xe1n de 2012']
In [9]:
bots = get_category_members('Category:All Wikipedia bots',3,'en')
bots = [b[5:] for b in bots]
In [12]:
user_props = dict()
for i,user in enumerate(alter_contribs.keys()):
    print u"{0} / {1}: {2}".format(i+1,len(alter_contribs.keys()),user)
    user_props[user] = get_user_properties(user,'en')
1 / 507: Edison
2 / 507: Richard BB
3 / 507: 156.98.4.11
4 / 507: Vobedd
5 / 507: Qcomplex5
6 / 507: Skyraider
7 / 507: Adjwilley
8 / 507: Wbm1058
9 / 507: Roscelese
10 / 507: Bernarddb
11 / 507: Solarguy17
12 / 507: It Is Me Here
13 / 507: Degen Earthfast
14 / 507: Tony Webster
15 / 507: Guerillero
16 / 507: Coffeepusher
17 / 507: Vexorian
18 / 507: Rhialto
19 / 507: Sodaant
20 / 507: Jfhutson
21 / 507: Marcus Qwertyus
22 / 507: Carolmooredc
23 / 507: Cullen328
24 / 507: Benlisquare
25 / 507: Rcsprinter123
26 / 507: EvergreenFir
27 / 507: Wslack
28 / 507: BrownHairedGirl
29 / 507: Thechungling
30 / 507: Two kinds of pork
31 / 507: CaseyPenk
32 / 507: Casey.Grim85
33 / 507: Pudeo
34 / 507: KoshVorlon
35 / 507: NE Ent
36 / 507: Miranche
37 / 507: Wctaiwan
38 / 507: Rlendog
39 / 507: FT2
40 / 507: Wallie
41 / 507: Livitup
42 / 507: 190.235.87.27
43 / 507: Param Mudgal
44 / 507: Pass a Method
45 / 507: David Gerard
46 / 507: Pawyilee
47 / 507: Trinitresque
48 / 507: Daffydavid
49 / 507: Scott Martin
50 / 507: 117.199.7.24
51 / 507: Jenssey
52 / 507: Zzyzx11
53 / 507: GorillaWarfare
54 / 507: Necrothesp
55 / 507: Hullaballoo Wolfowitz
56 / 507: Brettalan
57 / 507: 97.84.222.198
58 / 507: Scottywong
59 / 507: Themfromspace
60 / 507: Shrigley
61 / 507: LtGen
62 / 507: Nick
63 / 507: Steeletrap
64 / 507: Michael Dorosh
65 / 507: Yourself In Person
66 / 507: Fs
67 / 507: Juno
68 / 507: Me and
69 / 507: Sophie means wisdom
70 / 507: Ericloewe
71 / 507: Toyokuni3
72 / 507: AnonNep
73 / 507: Ileanadu
74 / 507: Jeude54cartes
75 / 507: Zoe Brain
76 / 507: Vinithehat
77 / 507: Cengime
78 / 507: Abeg92
79 / 507: Born2cycle
80 / 507: Kevin W.
81 / 507: Sovetus
82 / 507: Sj
83 / 507: 91.153.87.155
84 / 507: Wadewitz
85 / 507: Katana geldar
86 / 507: Vigyani
87 / 507: Solomonfromfinland
88 / 507: Mareklug
89 / 507: DrCruse
90 / 507: Eopsid
91 / 507: Scray
92 / 507: Theodolite
93 / 507: Dralwik
94 / 507: Snappy
95 / 507: PublicAmpersand
96 / 507: Zaphody3k
97 / 507: Agmonaco
98 / 507: Liz
99 / 507: SqueakBox
100 / 507: Crumpled Fire
101 / 507: A Thousand Doors
102 / 507: AzureCitizen
103 / 507: Hitmonchan
104 / 507: Hamiltonstone
105 / 507: 83.128.147.107
106 / 507: Miraculouschaos
107 / 507: Dyrnych
108 / 507: Hobit
109 / 507: DanHakimi
110 / 507: Wikipeterproject
111 / 507: Cameron Scott
112 / 507: PikkoroDaimao
113 / 507: GiantSnowman
114 / 507: Kelly
115 / 507: Cimon Avaro
116 / 507: 86.16.146.123
117 / 507: ThinkEnemies
118 / 507: KTC
119 / 507: Shii
120 / 507: BHC
121 / 507: Thegreatdr
122 / 507: Joefromrandb
123 / 507: Milkunderwood
124 / 507: Maximilian Schönherr
125 / 507: Kaldari
126 / 507: DHeyward
127 / 507: Byposted
128 / 507: Almonroth
129 / 507: Srlevine1
130 / 507: BlueSalix
131 / 507: Vanisaac
132 / 507: FutureTrillionaire
133 / 507: John Cline
134 / 507: Pointillist
135 / 507: Raeven0
136 / 507: Psychologicaloric
137 / 507: Tennenrishin
138 / 507: Atshal
139 / 507: Modest Genius
140 / 507: 5minutes
141 / 507: Josepharari
142 / 507: Tbhotch
143 / 507: 70.89.234.49
144 / 507: TParis
145 / 507: JamesAM
146 / 507: Golbez
147 / 507: 208.163.239.119
148 / 507: FormerIP
149 / 507: StAnselm
150 / 507: Cyclopia
151 / 507: HiB2Bornot2B
152 / 507: Jayron32
153 / 507: Iselilja
154 / 507: Jojhutton
155 / 507: BFWB
156 / 507: Talmage
157 / 507: 24.22.47.95
158 / 507: K7L
159 / 507: Azirus
160 / 507: Smyth
161 / 507: Cavarrone
162 / 507: OtterSmith
163 / 507: Anthonyhcole
164 / 507: R. fiend
165 / 507: Michael Glass
166 / 507: Soerfm
167 / 507: Loadmaster
168 / 507: Daira Hopwood
169 / 507: 85.65.68.209
170 / 507: 99.192.64.222
171 / 507: Kiralexis
172 / 507: DPRoberts534
173 / 507: 98.157.156.137
174 / 507: Insulam Simia
175 / 507: U-Mos
176 / 507: 2001:5C0:1000:A:0:0:0:49D
177 / 507: Jburman
178 / 507: Malerooster
179 / 507: Thehistorian10
180 / 507: Fightin' Phillie
181 / 507: Safiel
182 / 507: Coemgenus
183 / 507: Jackmcbarn
184 / 507: Archaeo
185 / 507: AlexTiefling
186 / 507: NativeForeigner
187 / 507: Belorn
188 / 507: LukeSurl
189 / 507: 86.173.69.123
190 / 507: Eregli bob
191 / 507: Nicholas Perkins
192 / 507: Amatulic
193 / 507: Gtadood
194 / 507: Torquemama007
195 / 507: Casiotone
196 / 507: Jean-Jacques Georges
197 / 507: Dainamo
198 / 507: Labattblueboy
199 / 507: Phil Sandifer
200 / 507: Pez Dispens3r
201 / 507: Bob bobato
202 / 507: DragonflySixtyseven
203 / 507: Bright Darkness
204 / 507: Psychonaut
205 / 507: Sbingner
206 / 507: Thebirdlover
207 / 507: Ukrained2012
208 / 507: AutomaticStrikeout
209 / 507: Maproom
210 / 507: GeorgeLouis
211 / 507: 69.244.220.253
212 / 507: 71.231.186.92
213 / 507: Synchronism
214 / 507: JCO312
215 / 507: Tariqabjotu
216 / 507: 71.90.172.117
217 / 507: Chris G
218 / 507: Obiwankenobi
219 / 507: Mr. Stradivarius
220 / 507: GenericBob
221 / 507: TheCatalyst31
222 / 507: 71.116.34.80
223 / 507: A.amitkumar
224 / 507: Sluffs
225 / 507: Vegaswikian
226 / 507: Tombomp
227 / 507: KathrynBrooks1
228 / 507: Canoe1967
229 / 507: 71.179.167.242
230 / 507: 184.152.74.159
231 / 507: Lacarids
232 / 507: Gymnophoria
233 / 507: Miranda1989
234 / 507: Robin Lionheart
235 / 507: GrimmC
236 / 507: 7daysahead
237 / 507: Richard75
238 / 507: GregorB
239 / 507: 97.123.210.252
240 / 507: Agnosticaphid
241 / 507: MONGO
242 / 507: Mpgviolist
243 / 507: Hebel
244 / 507: NinjaRobotPirate
245 / 507: Silver seren
246 / 507: Giants27
247 / 507: Brandmeister
248 / 507: Surfer43
249 / 507: Tarc
250 / 507: BrianJ34
251 / 507: Blueboar
252 / 507: Fighter1stClass
253 / 507: Maunus
254 / 507: Walterego
255 / 507: LlywelynII
256 / 507: QuackCD
257 / 507: BabbaQ
258 / 507: Sandstein
259 / 507: BD2412
260 / 507: 74.138.45.132
261 / 507: 88.66.37.221
262 / 507: Alaric
263 / 507: Theodore!
264 / 507: Penwhale
265 / 507: Blackbird 4
266 / 507: JDiala
267 / 507: Cls14
268 / 507: Dicklyon
269 / 507: Guy Macon
270 / 507: Dorsal Axe
271 / 507: Count Iblis
272 / 507: Cymru.lass
273 / 507: Fritzendugan
274 / 507: Muboshgu
275 / 507: PauAmma
276 / 507: TripleU
277 / 507: Ajfweb
278 / 507: Taylor Trescott
279 / 507: Søren
280 / 507: Helixdq
281 / 507: Gobonobo
282 / 507: Alanscottwalker
283 / 507: 84.18.241.143
284 / 507: Mike Rosoft
285 / 507: Netcrusher88
286 / 507: 2001:558:6024:12:10BB:B8E3:A9F3:C3C3
287 / 507: White whirlwind
288 / 507: Andrewman327
289 / 507: Sportfan5000
290 / 507: Tivanir2
291 / 507: ItsZippy
292 / 507: A Quest For Knowledge
293 / 507: Yintan
294 / 507: Another Believer
295 / 507: AjaxSmack
296 / 507: 151.230.243.44
297 / 507: Berean Hunter
298 / 507: Tryptofish
299 / 507: XMattingly
300 / 507: Jonie148
301 / 507: Ὁ οἶστρος
302 / 507: Jonathandeamer
303 / 507: Emarsee
304 / 507: JasonCNJ
305 / 507: MightySaiyan
306 / 507: 108.247.32.232
307 / 507: Writegeist
308 / 507: And Adoil Descended
309 / 507: 71.68.234.176
310 / 507: TheScootz
311 / 507: Risker
312 / 507: Sam Blacketer
313 / 507: SlimVirgin
314 / 507: JASpencer
315 / 507: Woody
316 / 507: Bdell555
317 / 507: Phoebe
318 / 507: 168.12.253.66
319 / 507: Hot Stop
320 / 507: Srich32977
321 / 507: 86.153.186.25
322 / 507: 181.179.58.111
323 / 507: Count Truthstein
324 / 507: Alex Hortman
325 / 507: Thatbox
326 / 507: George Ho
327 / 507: InedibleHulk
328 / 507: Isaidnoway
329 / 507: My very best wishes
330 / 507: Gaurav
331 / 507: Saxman1984
332 / 507: Mohamed CJ
333 / 507: 65.51.209.126
334 / 507: Cindamuse
335 / 507: MaxHarmony
336 / 507: HandsomeFella
337 / 507: Yonskii
338 / 507: 198.161.2.241
339 / 507: Wnt
340 / 507: Hbdragon88
341 / 507: Martylunsford
342 / 507: Wikid77
343 / 507: Shemp Howard, Jr.
344 / 507: 173.178.34.11
345 / 507: Gaijin42
346 / 507: Eclecticology
347 / 507: Red Slash
348 / 507: 76.65.128.222
349 / 507: Baseball Bugs
350 / 507: Redrose64
351 / 507: 82.42.38.252
352 / 507: IFreedom1212
353 / 507: Jehochman
354 / 507: Ken Arromdee
355 / 507: Trystan
356 / 507: Grolltech
357 / 507: NewAccount4Me
358 / 507: Totorotroll
359 / 507: Moncrief
360 / 507: Numazİs
361 / 507: LudicrousTripe
362 / 507: Toddy1
363 / 507: Soranoch
364 / 507: M.thoriyan
365 / 507: Welshsocialist
366 / 507: Eddpayne
367 / 507: Jayen466
368 / 507: Cowcharge
369 / 507: Nil Einne
370 / 507: Jbower47
371 / 507: 159.83.196.1
372 / 507: Foofbun
373 / 507: Countered
374 / 507: McGeddon
375 / 507: Fyunck(click)
376 / 507: Iamcuriousblue
377 / 507: NickCT
378 / 507: 88.73.34.231
379 / 507: Haxwell
380 / 507: 23 editor
381 / 507: 92.29.51.58
382 / 507: Edge3
383 / 507: SarekOfVulcan
384 / 507: Smowton
385 / 507: 190.103.67.169
386 / 507: Timrollpickering
387 / 507: Cjarbo2
388 / 507: Norden1990
389 / 507: Kairi Izumi
390 / 507: FoxyOrange
391 / 507: Mark Arsten
392 / 507: 2.80.208.56
393 / 507: Bearcat
394 / 507: Labellementeuse
395 / 507: Surtsicna
396 / 507: I JethroBT
397 / 507: Anagogist
398 / 507: DracoEssentialis
399 / 507: Njardarlogar
400 / 507: ColonelHenry
401 / 507: Floydian
402 / 507: Mattgirling
403 / 507: 69.155.81.253
404 / 507: Jaakko Sivonen
405 / 507: IRWolfie-
406 / 507: KumiokoCleanStart
407 / 507: Aoidh
408 / 507: 142.161.97.237
409 / 507: PenguiN42
410 / 507: Collect
411 / 507: MrDolomite
412 / 507: Oren0
413 / 507: McPhail
414 / 507: OohBunnies!
415 / 507: Sailsbystars
416 / 507: Joseph A. Spadaro
417 / 507: Wester
418 / 507: 68.81.192.33
419 / 507: Randy2063
420 / 507: Lyo
421 / 507: StuartH
422 / 507: OSborn
423 / 507: Niemti
424 / 507: Haipa Doragon
425 / 507: Steven Zhang
426 / 507: Wasmachien
427 / 507: 71.184.71.199
428 / 507: GregJackP
429 / 507: Deep Purple Dreams
430 / 507: Robofish
431 / 507: Longsight
432 / 507: Ginsengbomb
433 / 507: PiMaster3
434 / 507: AndyTheGrump
435 / 507: Mark Miller
436 / 507: PBS
437 / 507: Rannpháirtí anaithnid
438 / 507: Thryduulf
439 / 507: Space simian
440 / 507: Morwen
441 / 507: SchreiberBike
442 / 507: CFynn
443 / 507: Badanagram
444 / 507: -sche
445 / 507: Yetisyny
446 / 507: Carrite
447 / 507: Dmarquard
448 / 507: VictusB
449 / 507: Sca
450 / 507: Dirac66
451 / 507: LionMans Account
452 / 507: Scs
453 / 507: Bwmoll3
454 / 507: Bluerasberry
455 / 507: April Arcus
456 / 507: Antonio Hazard
457 / 507: Thinking of England
458 / 507: 94.31.32.30
459 / 507: Dee Earley
460 / 507: 108.226.20.130
461 / 507: JohnValeron
462 / 507: Tocino
463 / 507: Stryn
464 / 507: 97.90.153.202
465 / 507: General Staal
466 / 507: Josh Gorand
467 / 507: Rinnenadtrosc
468 / 507: Adrian
469 / 507: JasonJack
470 / 507: Alandeus
471 / 507: Abductive
472 / 507: Ross Hill
473 / 507: Cerejota
474 / 507: LFaraone
475 / 507: Lawsonstu
476 / 507: DebashisM
477 / 507: Crisis
478 / 507: An Editor With a Self-Referential Name
479 / 507: WeldNeck
480 / 507: Shoeless Ho
481 / 507: Somchai Sun
482 / 507: Paul Erik
483 / 507: CombatWombat42
484 / 507: Neutron
485 / 507: Amitabho
486 / 507: Bob K31416
487 / 507: 202.174.184.14
488 / 507: Andy Dingley
489 / 507: 91.125.230.213
490 / 507: Uvaduck
491 / 507: Daniel32708
492 / 507: FeydHuxtable
493 / 507: Mjb
494 / 507: Ishmael reis
495 / 507: Mispy
496 / 507: NorthBySouthBaranof
497 / 507: Prototime
498 / 507: Alex Bakharev
499 / 507: Stephan Schulz
500 / 507: Hurtsmyears
501 / 507: Pigsonthewing
502 / 507: Rgrasmus
503 / 507: Sue Gardner
504 / 507: Knowledgekid87
505 / 507: Tazerdadog
506 / 507: Wing gundam
507 / 507: 90.210.192.246

Get all of the links from each of these pages.

In [50]:
hyperlink_dict = dict()
for i,a in enumerate(category_members):
    print u'{0} / {1} : {2}'.format(i+1,len(category_members),a)
    hyperlink_dict[a] = get_page_outlinks_from_content(a,'es')
1 / 18 : Elecciones federales en México de 2012
2 / 18 : Elecciones estatales de Campeche de 2012
3 / 18 : Elecciones estatales en Chiapas de 2012
4 / 18 : Elecciones estatales de Colima de 2012
5 / 18 : Elecciones en el Distrito Federal (México) de 2012
6 / 18 : Elecciones estatales del Estado de México de 2012
7 / 18 : Elecciones estatales en Guanajuato de 2012
8 / 18 : Elecciones estatales de Guerrero de 2012
9 / 18 : Elecciones estatales extraordinarias de Hidalgo de 2012
10 / 18 : Elecciones estatales en Jalisco de 2012
11 / 18 : Elecciones estatales extraordinarias de Michoacán de 2012
12 / 18 : Elecciones estatales de Morelos de 2012
13 / 18 : Elecciones estatales de Nuevo León de 2012
14 / 18 : Elecciones estatales de Querétaro de 2012
15 / 18 : Elecciones estatales en San Luis Potosí de 2012
16 / 18 : Elecciones estatales en Tabasco de 2012
17 / 18 : Elecciones estatales de 2012 en Yucatán
18 / 18 : Elecciones estatales extraordinarias de Yucatán de 2012

Create a set of alters to crawl in turn, excluding links to categories (Categoría:), annex/list pages (Anexo:), and files (Archivo:).

In [56]:
hyperlink_alters = list()
for ego,alters in hyperlink_dict.iteritems():
    alters = list(set(alters))
    for alter in alters:
        if u'Categor\xeda:' not in alter and u'Anexo:' not in alter and u'Archivo:' not in alter:
            hyperlink_alters.append(alter)

hyperlink_alters = list(set(hyperlink_alters))

Crawl these alters and add their alters to the hyperlink dictionary. Some pages may not exist, in which case ignore them.

In [57]:
for i,a in enumerate(hyperlink_alters):
    print u'{0} / {1} : {2}'.format(i+1,len(hyperlink_alters),a)
    try:
        hyperlink_dict[a] = get_page_outlinks_from_content(a,'es')
    except KeyError:
        print u"...{0} doesn't exist".format(a)
        pass
The crawl over the 847 alters was interrupted by hand (KeyboardInterrupt); the long traceback is omitted here. As a result, hyperlink_dict only contains the pages crawled before the interruption.
In [45]:
hyperlink_graph = nx.DiGraph()
for ego,alters in hyperlink_dict.iteritems():
    for alter in alters:
        if alter in hyperlink_dict.keys():
            hyperlink_graph.add_edge(ego,alter)
nx.write_graphml(hyperlink_graph,'hyperlinks.graphml')
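
As a quick sanity check on the partially-crawled link network, the sketch below (an illustrative addition, not part of the original analysis) ranks pages by in-degree, i.e. how many of the crawled pages link to them. It assumes the hyperlink_graph built in the previous cell and the networkx 1.x API, where in_degree() with no arguments returns a dict.

In [ ]:
# Rank pages in hyperlink_graph by in-degree (number of crawled pages linking to them)
indegree = hyperlink_graph.in_degree()
top_linked = sorted(indegree.iteritems(), key=itemgetter(1), reverse=True)[:10]
for page, degree in top_linked:
    print u'{0}: {1}'.format(page, degree)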
In [168]:
net = nx.DiGraph()
for article,revisions in alter_revs.iteritems():
    for revision in revisions:
        if 'user' in revision.keys() and 'bot' not in revision['user']:
            try:
                net[revision['user']][revision['title']]['weight'] += 1
            except KeyError:
                net.add_node(revision['user'],node_type='user')
                net.add_node(revision['title'],node_type='article')
                net.add_edge(revision['user'],revision['title'],weight=1)
                
net_articles = [i for i,j in net.nodes(data=True) if j['node_type'] == 'article']
net_users = [i for i,j in net.nodes(data=True) if j['node_type'] == 'user']

len(net_users)
Out[168]:
2443
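
Since edges in net run from users to articles, an article's in-degree is the number of distinct non-bot editors who touched it. The following sketch (an illustrative addition, assuming the net, net_articles, and net_users objects built above) lists the ten articles with the most distinct editors.

In [ ]:
# Edges run user -> article, so an article's in-degree counts its distinct editors
articles_by_editors = sorted(net_articles, key=net.in_degree, reverse=True)[:10]
for article in articles_by_editors:
    print u'{0}: {1}'.format(article, net.in_degree(article))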

Load from Pickle

In [13]:
import cPickle  # needed to load the pickled query result

result = cPickle.load(open('Boston_Marathon_bombings.p','rb'))
revisions_dict = dict()
page_number = result['pages'].keys()[0]
revisions = result['pages'][page_number]['revisions']
for revision in revisions:
    rev = dict()
    rev['pageid'] = page_number
    rev['title'] = result['pages'][page_number]['title']
    rev['size'] = revision.get('size', 0) # Sometimes the size key is not present, so we'll set it to 0 in those cases
    rev['timestamp'] = convert_to_datetime(revision['timestamp'])
    rev['content'] = revision.get('*',unicode()) # Sometimes content hidden, return with empty unicode string
    rev['links'] = link_finder(rev['content'])
    rev['username'] = revision['user']
    rev['userid'] = revision['userid']
    rev['revid'] = revision['revid']
    revisions_dict[revision['revid']] = rev
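
For convenience, the revision metadata can be dropped into a pandas DataFrame (pandas is already imported as pd). This is a minimal sketch assuming the revisions_dict built in the previous cell; the raw wikitext in 'content' is dropped to keep the frame small.

In [ ]:
# Build a DataFrame indexed by revision id from revisions_dict,
# dropping the bulky raw wikitext column before inspecting it
rev_df = pd.DataFrame.from_dict(revisions_dict, orient='index')
rev_df = rev_df.drop(['content'], axis=1)
rev_df.head()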
In [42]:
def adjacency_calcs(revisions):
    revisions = sorted(revisions,key=itemgetter('pageid','timestamp'))
    revisions[0]['position'] = 0
    revisions[0]['edit_lag'] = datetime.timedelta(0)
    revisions[0]['bytes_added'] = revisions[0]['size']
    revisions[0]['unique_users'] = [revisions[0]['username']]
    revisions[0]['unique_users_count'] = 1
    revisions[0]['article_age'] = datetime.timedelta(0)
    for num,rev in enumerate(revisions[:-1]):
        revisions[num+1]['position'] = rev['position'] + 1
        revisions[num+1]['edit_lag'] = revisions[num+1]['timestamp'] - rev['timestamp']
        revisions[num+1]['bytes_added'] = revisions[num+1]['size'] - rev['size']
        
        # Copy the running list of unique users rather than aliasing the
        # previous revision's list, so earlier revisions aren't mutated
        revisions[num+1]['unique_users'] = list(rev['unique_users'])
        revisions[num+1]['unique_users'].append(revisions[num+1]['username'])
        revisions[num+1]['unique_users'] = list(set(revisions[num+1]['unique_users']))
        
        revisions[num+1]['unique_users_count'] = len(revisions[num+1]['unique_users'])
        revisions[num+1]['article_age'] = revisions[num+1]['timestamp'] - revisions[0]['timestamp']
    return revisions
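
A usage sketch (an illustrative addition, assuming the revisions_dict loaded from the pickle above): run adjacency_calcs over those revisions and inspect the accumulated statistics on the most recent revision.

In [ ]:
# Apply adjacency_calcs to the revisions loaded above and look at the
# accumulated statistics on the last (most recent) revision
revs_with_stats = adjacency_calcs(revisions_dict.values())
latest = revs_with_stats[-1]
print latest['position'], latest['unique_users_count'], latest['article_age']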