# -*- coding: utf-8 -*-
import json
import codecs
from bs4 import BeautifulSoup
import pandas as pd
# Load the list of page names, one title per line; utf-8-sig strips a potential BOM.
pages = codecs.open("data/pagenames.txt", "r", "utf-8-sig").readlines()
pages = map(lambda x: x.strip(), pages)
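The resulting list holds one article title per line of the file; its first entries match the page names that appear in the final table below:

print pages[0:3]
# [u'2D computer graphics', u'2D geometric model', u'3D computer graphics']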
def basic_word_analysis(text):
    length = len(text)
    words = len(text.split(" "))
    # Rough heuristic: character count minus word count, divided by word count.
    average_word_length = float(length - words) / float(words)
    return length, words, average_word_length
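A quick sanity check on a hypothetical input:

print basic_word_analysis("the quick brown fox")
# (19, 4, 3.75): 19 characters, 4 words, (19 - 4) / 4 = 3.75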
This function performs a basic word analysis using plain Python string operations. Word counting would be more precise with an NLP framework such as NLTK; further refinement will certainly happen in later phases of the project and be documented in a dedicated notebook.
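As a minimal sketch of that direction (assuming nltk is installed and the "punkt" tokenizer data has been downloaded via nltk.download("punkt"); the function name is hypothetical):

from nltk.tokenize import word_tokenize

def nltk_word_analysis(text):
    # Keep alphanumeric tokens only, so punctuation is not counted as words.
    tokens = [t for t in word_tokenize(text) if t.isalnum()]
    words = len(tokens)
    average_word_length = sum(len(t) for t in tokens) / float(words)
    return len(text), words, average_word_length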
def basic_user_stats(page):
    revs = json.load(codecs.open("data/revisions/%s.json" % (page), "r", "utf-8-sig"))
    hidden_users = 0
    registered_users = set()
    ip_users = set()
    for r in revs:
        if "user" in r:
            if r["userid"] != 0:
                registered_users.add(r["user"])
            else:
                # Anonymous edits carry an IP address as "user" and a userid of 0.
                ip_users.add(r["user"])
        else:
            # Revision-deleted edits expose no "user" key at all.
            hidden_users += 1
    # Revisions are ordered newest first, so the last entry is the oldest.
    first_revision = revs[-1]["timestamp"]
    unique_registered_users = len(registered_users)
    unique_ip_users = len(ip_users)
    unique_users = unique_registered_users + unique_ip_users
    return unique_users, unique_registered_users, unique_ip_users, hidden_users, first_revision
print basic_user_stats("Pi")
(2880, 1799, 1081, 0, u'2002-02-01T12:26:39Z')
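basic_user_stats only relies on three fields per revision. Hypothetical entries of data/revisions/Pi.json, illustrating the three cases handled above:

registered_edit = {"user": "ExampleUser", "userid": 12345,
                   "timestamp": "2002-02-01T12:26:39Z"}
anonymous_edit = {"user": "192.0.2.1", "userid": 0,  # IP editors have userid 0
                  "timestamp": "2003-05-17T08:03:12Z"}
hidden_edit = {"timestamp": "2004-01-02T10:00:00Z"}  # "user" key suppressed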
def pageviews(page):
    views = json.load(codecs.open("data/pageviews/%s.json" % (page), "r", "utf-8-sig"))
    total = 0
    # Sum the daily counts over every month in the file.
    for m in views:
        for d, v in m.iteritems():
            total += v
    return total
print pageviews("Pi")
23309406
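The exact layout of data/pageviews/<page>.json is not shown in this notebook; from the loop above it is assumed to be a list of dicts mapping dates to daily view counts, e.g.:

# Hypothetical excerpt after json.load:
sample = [
    {"2015-07-01": 201433, "2015-07-02": 198220},
    {"2015-08-01": 187554},
]
total = 0
for m in sample:
    for d, v in m.items():
        total += v
print total  # 587207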
# Precomputed auxiliary datasets: pagerank scores, specialization scores
# and co-editor link statistics.
pageranks = pd.DataFrame.from_csv("data/pagerank.csv", sep=";", encoding="utf-8")
specialization = pd.read_json("data/specialization.json")
specialization = specialization.set_index("title")
#specialization.head()
coeditors = pd.DataFrame.from_csv("data/pages-linked-by-coeditors.stats.csv", encoding="utf-8")
data = []
for page in pages:
    p = {}
    p["pagename"] = page
    content_file = codecs.open("data/pages/%s.json" % (page), "r", "utf-8-sig")
    j = json.load(content_file)
    # The API response nests the page object under its (unknown) page id.
    r = j["query"]["pages"][j["query"]["pages"].keys()[0]]
    if "revisions" in r.keys():
        content = r["revisions"][0]["*"]
        # Strip the HTML markup to keep only the readable text.
        content = BeautifulSoup(content, "html.parser").text
        p["length"], p["words"], p["average word length"] = basic_word_analysis(content)
    p["unique users"], p["unique registered users"], p["unique ip users"], p["hidden users"], p["first revision"] = basic_user_stats(page)
    p["page views"] = pageviews(page)
    data.append(p)
table = pd.DataFrame(data)
table = table.set_index("pagename")
# Join the auxiliary datasets on the page name index; column names that the
# co-editors table shares with the others get a " (co-editors)" suffix.
table = table.join(pageranks)
table = table.join(specialization)
table = table.join(coeditors, rsuffix=" (co-editors)")
table.head(10)
pagename | average word length | first revision | hidden users | length | page views | unique ip users | unique registered users | unique users | words | Pagerank pro 0.8 | ... | nbcontributorsBot | nbcontributorsIP | nbcontributorsMembers | nbrevisions | nbrevisionsBot | nbrevisionsIP | nbrevisionsMembers | ns | pageid | quality |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2D computer graphics | 5.912321 | 2001-10-13T06:23:27Z | 0 | 24124 | 930542 | 83 | 139 | 222 | 3490 | 2.540488 | ... | 18 | 78 | 119 | 370 | 23 | 124 | 223 | 0 | 35248 | 4 |
2D geometric model | 5.786585 | 2004-03-08T00:49:59Z | 0 | 1113 | 155960 | 13 | 28 | 41 | 164 | 1.846966 | ... | 3 | 13 | 24 | 54 | 3 | 20 | 31 | 0 | 511647 | 1 |
3D computer graphics | 6.360368 | 2007-03-21T05:56:20Z | 0 | 8803 | 3174611 | 258 | 240 | 498 | 1196 | 3.665079 | ... | 28 | 250 | 211 | 893 | 65 | 368 | 460 | 0 | 10175073 | 4 |
3D projection | 5.907763 | 2003-09-07T18:48:57Z | 0 | 8987 | 1141382 | 98 | 113 | 211 | 1301 | 3.430536 | ... | 16 | 95 | 96 | 351 | 25 | 117 | 209 | 0 | 313741 | 2 |
3-sphere | 5.071429 | 2002-02-19T11:12:25Z | 0 | 17595 | 488222 | 44 | 117 | 161 | 2898 | 3.693437 | ... | 12 | 44 | 103 | 277 | 13 | 59 | 205 | 0 | 39792 | 4 |
Absolute geometry | 5.561167 | 2004-06-02T19:31:39Z | 0 | 5846 | 77451 | 22 | 59 | 81 | 891 | 2.870685 | ... | 14 | 20 | 44 | 104 | 15 | 27 | 62 | 0 | 699294 | 2 |
Acute and obtuse triangles | 5.818387 | 2014-10-10T19:11:27Z | 0 | 9048 | 5549 | 1 | 2 | 3 | 1327 | 0.829064 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 44076423 | NaN |
Affine geometry | 5.424357 | 2003-06-11T09:28:43Z | 0 | 15245 | 277910 | 30 | 82 | 112 | 2373 | 3.495030 | ... | 15 | 30 | 64 | 205 | 22 | 40 | 143 | 0 | 243890 | 3 |
Affine space | 5.325827 | 2003-08-18T04:32:19Z | 0 | 13202 | 401482 | 64 | 105 | 169 | 2087 | 4.010550 | ... | 18 | 61 | 84 | 384 | 30 | 91 | 263 | 0 | 298834 | 2 |
Affine transformation | 5.599894 | 2002-02-25T15:51:15Z | 0 | 12454 | 1333126 | 78 | 139 | 217 | 1887 | 4.072621 | ... | 21 | 76 | 111 | 371 | 26 | 107 | 238 | 0 | 38449 | 2 |
10 rows × 32 columns
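Pages absent from one of the auxiliary datasets, such as Acute and obtuse triangles above, end up with NaN in the joined columns; those rows can be listed with, for instance:

table[table["quality"].isnull()]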
We store our intermediary data in CSV rather than JSON for readability: GitHub, for example, renders CSV files in a web table viewer directly in its main interface. CSV files are also more convenient to share and open within the data processing pipeline, since we are mostly doing table computations rather than object manipulation.
table.to_csv("data/final.csv", encoding="UTF-8")
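Reading the table back in a later notebook is then a one-liner (index_col restores the pagename index):

table = pd.read_csv("data/final.csv", encoding="utf-8", index_col="pagename")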