# -*- coding: utf-8 -*-
import json
import codecs
from bs4 import BeautifulSoup
import pandas as pd
# Load the list of page names, one title per line; utf-8-sig strips a potential BOM.
pages = codecs.open("data/pagenames.txt", "r", "utf-8-sig").readlines()
pages = map(lambda x: x.strip(), pages)
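The resulting list holds one article title per line of the file; its first entries match the page names that appear in the final table below:

print pages[0:3]
# [u'2D computer graphics', u'2D geometric model', u'3D computer graphics']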
def basic_word_analysis(text):
    length = len(text)
    words = len(text.split(" "))
    # Rough heuristic: character count minus word count, divided by word count.
    average_word_length = float(length - words) / float(words)
    return length, words, average_word_length
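A quick sanity check on a hypothetical input:

print basic_word_analysis("the quick brown fox")
# (19, 4, 3.75): 19 characters, 4 words, (19 - 4) / 4 = 3.75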
This function performs a basic word analysis using plain Python string operations. Word counting would be more precise with an NLP framework such as NLTK; further refinement will certainly happen in later phases of the project and be documented in a dedicated notebook.
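As a minimal sketch of that direction (assuming nltk is installed and the "punkt" tokenizer data has been downloaded via nltk.download("punkt"); the function name is hypothetical):

from nltk.tokenize import word_tokenize

def nltk_word_analysis(text):
    # Keep alphanumeric tokens only, so punctuation is not counted as words.
    tokens = [t for t in word_tokenize(text) if t.isalnum()]
    words = len(tokens)
    average_word_length = sum(len(t) for t in tokens) / float(words)
    return len(text), words, average_word_length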
def basic_user_stats(page):
    revs = json.load(codecs.open("data/revisions/%s.json" % (page), "r", "utf-8-sig"))
    hidden_users = 0
    registered_users = set()
    ip_users = set()
    for r in revs:
        if "user" in r:
            if r["userid"] != 0:
                registered_users.add(r["user"])
            else:
                # Anonymous edits carry an IP address as "user" and a userid of 0.
                ip_users.add(r["user"])
        else:
            # Revision-deleted edits expose no "user" key at all.
            hidden_users += 1
    # Revisions are ordered newest first, so the last entry is the oldest.
    first_revision = revs[-1]["timestamp"]
    unique_registered_users = len(registered_users)
    unique_ip_users = len(ip_users)
    unique_users = unique_registered_users + unique_ip_users
    return unique_users, unique_registered_users, unique_ip_users, hidden_users, first_revision
print basic_user_stats("Pi")
(2880, 1799, 1081, 0, u'2002-02-01T12:26:39Z')
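basic_user_stats only relies on three fields per revision. Hypothetical entries of data/revisions/Pi.json, illustrating the three cases handled above:

registered_edit = {"user": "ExampleUser", "userid": 12345,
                   "timestamp": "2002-02-01T12:26:39Z"}
anonymous_edit = {"user": "192.0.2.1", "userid": 0,  # IP editors have userid 0
                  "timestamp": "2003-05-17T08:03:12Z"}
hidden_edit = {"timestamp": "2004-01-02T10:00:00Z"}  # "user" key suppressed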
def pageviews(page):
    views = json.load(codecs.open("data/pageviews/%s.json" % (page), "r", "utf-8-sig"))
    total = 0
    # Sum the daily counts over every month in the file.
    for m in views:
        for d, v in m.iteritems():
            total += v
    return total
print pageviews("Pi")
23309406
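The exact layout of data/pageviews/<page>.json is not shown in this notebook; from the loop above it is assumed to be a list of dicts mapping dates to daily view counts, e.g.:

# Hypothetical excerpt after json.load:
sample = [
    {"2015-07-01": 201433, "2015-07-02": 198220},
    {"2015-08-01": 187554},
]
total = 0
for m in sample:
    for d, v in m.items():
        total += v
print total  # 587207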
# Precomputed auxiliary datasets: pagerank scores, specialization scores
# and co-editor link statistics.
pageranks = pd.DataFrame.from_csv("data/pagerank.csv", sep=";", encoding="utf-8")
specialization = pd.read_json("data/specialization.json")
specialization = specialization.set_index("title")
#specialization.head()
coeditors = pd.DataFrame.from_csv("data/pages-linked-by-coeditors.stats.csv", encoding="utf-8")
data = []
for page in pages:
    p = {}
    p["pagename"] = page
    content_file = codecs.open("data/pages/%s.json" % (page), "r", "utf-8-sig")
    j = json.load(content_file)
    # The API response nests the page object under its (unknown) page id.
    r = j["query"]["pages"][j["query"]["pages"].keys()[0]]
    if "revisions" in r.keys():
        content = r["revisions"][0]["*"]
        # Strip the HTML markup to keep only the readable text.
        content = BeautifulSoup(content, "html.parser").text
        p["length"], p["words"], p["average word length"] = basic_word_analysis(content)
    p["unique users"], p["unique registered users"], p["unique ip users"], p["hidden users"], p["first revision"] = basic_user_stats(page)
    p["page views"] = pageviews(page)
    data.append(p)
table = pd.DataFrame(data)
table = table.set_index("pagename")
# Join the auxiliary datasets on the page name index; column names that the
# co-editors table shares with the others get a " (co-editors)" suffix.
table = table.join(pageranks)
table = table.join(specialization)
table = table.join(coeditors, rsuffix=" (co-editors)")
table.head(10)
pagename | average word length | first revision | hidden users | length | page views | unique ip users | unique registered users | unique users | words | Pagerank pro 0.8 | ... | nbcontributorsBot | nbcontributorsIP | nbcontributorsMembers | nbrevisions | nbrevisionsBot | nbrevisionsIP | nbrevisionsMembers | ns | pageid | quality |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2D computer graphics | 5.912321 | 2001-10-13T06:23:27Z | 0 | 24124 | 930542 | 83 | 139 | 222 | 3490 | 2.540488 | ... | 18 | 78 | 119 | 370 | 23 | 124 | 223 | 0 | 35248 | 4 |
2D geometric model | 5.786585 | 2004-03-08T00:49:59Z | 0 | 1113 | 155960 | 13 | 28 | 41 | 164 | 1.846966 | ... | 3 | 13 | 24 | 54 | 3 | 20 | 31 | 0 | 511647 | 1 |
3D computer graphics | 6.360368 | 2007-03-21T05:56:20Z | 0 | 8803 | 3174611 | 258 | 240 | 498 | 1196 | 3.665079 | ... | 28 | 250 | 211 | 893 | 65 | 368 | 460 | 0 | 10175073 | 4 |
3D projection | 5.907763 | 2003-09-07T18:48:57Z | 0 | 8987 | 1141382 | 98 | 113 | 211 | 1301 | 3.430536 | ... | 16 | 95 | 96 | 351 | 25 | 117 | 209 | 0 | 313741 | 2 |
3-sphere | 5.071429 | 2002-02-19T11:12:25Z | 0 | 17595 | 488222 | 44 | 117 | 161 | 2898 | 3.693437 | ... | 12 | 44 | 103 | 277 | 13 | 59 | 205 | 0 | 39792 | 4 |
Absolute geometry | 5.561167 | 2004-06-02T19:31:39Z | 0 | 5846 | 77451 | 22 | 59 | 81 | 891 | 2.870685 | ... | 14 | 20 | 44 | 104 | 15 | 27 | 62 | 0 | 699294 | 2 |
Acute and obtuse triangles | 5.818387 | 2014-10-10T19:11:27Z | 0 | 9048 | 5549 | 1 | 2 | 3 | 1327 | 0.829064 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 44076423 | NaN |
Affine geometry | 5.424357 | 2003-06-11T09:28:43Z | 0 | 15245 | 277910 | 30 | 82 | 112 | 2373 | 3.495030 | ... | 15 | 30 | 64 | 205 | 22 | 40 | 143 | 0 | 243890 | 3 |
Affine space | 5.325827 | 2003-08-18T04:32:19Z | 0 | 13202 | 401482 | 64 | 105 | 169 | 2087 | 4.010550 | ... | 18 | 61 | 84 | 384 | 30 | 91 | 263 | 0 | 298834 | 2 |
Affine transformation | 5.599894 | 2002-02-25T15:51:15Z | 0 | 12454 | 1333126 | 78 | 139 | 217 | 1887 | 4.072621 | ... | 21 | 76 | 111 | 371 | 26 | 107 | 238 | 0 | 38449 | 2 |
10 rows × 32 columns
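Pages absent from one of the auxiliary datasets, such as Acute and obtuse triangles above, end up with NaN in the joined columns; those rows can be listed with, for instance:

table[table["quality"].isnull()]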
We store our intermediary data in CSV rather than JSON for readability: GitHub, for example, renders CSV files in a web table viewer directly in its main interface. CSV files are also more convenient to share and open within the data processing pipeline, since we are mostly doing table computations rather than object manipulation.
table.to_csv("data/final.csv", encoding="UTF-8")
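Reading the table back in a later notebook is then a one-liner (index_col restores the pagename index):

table = pd.read_csv("data/final.csv", encoding="utf-8", index_col="pagename")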