In [3]:
#
# Gianluca Demartini, 2015
# http://gianlucademartini.net
# Processing of WikiR data https://wikireverse.org/data (version 2014-23-data) to see which top-level domain (e.g., .com .net) link to each Wikipedia
# The final data goes to a tab separated file which is load in to Tableau Public to generate this visualisation: http:
#

import pandas as pd
file0='./Desktop/2014-23-data/part-0'
file1='./Desktop/2014-23-data/part-1'
file2='./Desktop/2014-23-data/part-2'
file3='./Desktop/2014-23-data/part-3'
header_row=['lan','wiki','url','WebTitle', 'time']

#load data from files
try:
    wikiR0 = pd.read_csv(file0, error_bad_lines=False, header=None, names=header_row, sep='\t')
    wikiR1 = pd.read_csv(file1, error_bad_lines=False, header=None, names=header_row, sep='\t')
    wikiR2 = pd.read_csv(file2, error_bad_lines=False, header=None, names=header_row, sep='\t')
    wikiR3 = pd.read_csv(file3, error_bad_lines=False, header=None, names=header_row, sep='\t')
except pd.parser.CParserError as e:
    print e

#put all in one dataframe
wikiR = wikiR0.append(wikiR1, ignore_index=True)
wikiR = wikiR.append(wikiR2, ignore_index=True)
wikiR = wikiR.append(wikiR3, ignore_index=True)

wikiR.head()
Out[3]:
lan wiki url WebTitle time
0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... NaN 2014-07-13 08:10:00
1 ab 1939 http://rarplayer.appspot.com/wiki/1939 1939 – Wikipedia, wolna encyklopedia 2014-08-02 06:30:00
2 ab 1994 http://rarplayer.appspot.com/wiki/1994 1994 – Wikipedia, wolna encyklopedia 2014-08-01 00:02:00
3 ab 2004 http://instapedia.com/m/2004 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00
4 ab 2007 http://rarplayer.appspot.com/wiki/2007 2007 – Wikipedia, wolna encyklopedia 2014-07-31 09:37:00
In [4]:
#how much data?
len(wikiR.index)
Out[4]:
36301008
In [7]:
from __future__ import with_statement
from urlparse import urlparse
import urllib2

# load tlds, ignore comments and empty lines:
file="https://publicsuffix.org/list/effective_tld_names.dat"
filehandle = urllib2.urlopen(file) 
tlds = [line.strip() for line in filehandle if line[0] not in "/\n"]

#function to get the top level domain from the url given the list of tlds
def get_TLdomain(url, tlds):
    # print url
    url_elements = urlparse(url)[1].split('.')
    # url_elements = ["abcde","co","uk"]

    for i in range(-len(url_elements), 0):
        last_i_elements = url_elements[i:]
        #    i=-3: ["abcde","co","uk"]
        #    i=-2: ["co","uk"]
        #    i=-1: ["uk"] etc

        candidate = ".".join(last_i_elements) # abcde.co.uk, co.uk, uk
        wildcard_candidate = ".".join(["*"] + last_i_elements[1:]) # *.co.uk, *.uk, *
        exception_candidate = "!" + candidate

        # match tlds: 
        if (exception_candidate in tlds):
            return ".".join(url_elements[i:]) 
        if (candidate in tlds or wildcard_candidate in tlds):
            return ".".join(url_elements[i-0:]) # i-1 returns domain name with TLD, i-0 return TLD
            # returns "abcde.co.uk"  
#   raise ValueError("Domain not in global list of TLDs "+url)
    print "Domain not in global list of TLDs "+url
    return "null"

print get_TLdomain("http://io.abcde.com", tlds)
print wikiR.url[3]
print get_TLdomain(wikiR.url[3], tlds)
com
http://instapedia.com/m/2004
com
In [8]:
# extract the top level domain from the url for all entries of wikiR. This takes time.
wikiR.loc[:,'tld']=wikiR.apply(lambda x: get_TLdomain(x['url'], tlds), axis=1)
wikiR.head()
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs 2014-07-28 10:19:00
Domain not in global list of TLDs 2014-07-31 17:36:00
Domain not in global list of TLDs 2014-08-01 01:43:00
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs 2014-07-28 22:32:00
Domain not in global list of TLDs 2014-07-31 17:32:00
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs 2014-07-10 10:18:00
Domain not in global list of TLDs 2014-07-31 09:47:00
Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips
Domain not in global list of TLDs http://www.geni.com./people/Anna-Paquin/6000000005350378009
Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php
Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php
Domain not in global list of TLDs http://www.ativanonline.net./
Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html
Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx
Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?pageid=FD30F09484F44DE4
Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?postid=F6B7D3412CD04432
Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/CamelCase
Domain not in global list of TLDs http://rubygarage.org./
Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting
Domain not in global list of TLDs http://173.8.135.113/
Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame
Domain not in global list of TLDs http://blog.bluehost.com./blog/bluehost/clean-up-online-business-listings-with-yext-1664/
Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine
Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame
Domain not in global list of TLDs http://216.119.148.216/update/deleting-online-predators-act/
Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips
Domain not in global list of TLDs http://198.102.103.39/community/
Domain not in global list of TLDs http://198.102.103.39/community/index.php
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php
Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0518.htm
Domain not in global list of TLDs http://173.8.135.113/
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine
Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips
Domain not in global list of TLDs http://rubygarage.org./
Domain not in global list of TLDs 2014-07-30 21:17:00
Domain not in global list of TLDs http://173.8.135.113/
Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php
Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt
Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt
Domain not in global list of TLDs http://rubygarage.org./
Domain not in global list of TLDs http://www.klonopinonline.org./
Domain not in global list of TLDs http://204.14.213.185/Laptops-Notebooks/SubCategory/ID-32?Category=223
Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm
Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php
Domain not in global list of TLDs 2014-07-30 21:16:00
Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips
Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting
Domain not in global list of TLDs 2014-07-29 16:49:00
Domain not in global list of TLDs http://www.scotxblog.com./
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs http://144.76.109.211/redirect.php?dst=
Domain not in global list of TLDs http://www.thegatewaypundit.com./
Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs http://look.gvsu.edu:8000/opc
Domain not in global list of TLDs http://198.100.46.202/2009/09/
Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm
Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php
Domain not in global list of TLDs http://198.100.46.202/2009/09/
Domain not in global list of TLDs http://173.8.135.113/
Domain not in global list of TLDs http://www.russianseason.net./
Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0531.htm
Domain not in global list of TLDs http://62.143.88.190/devalco/
Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips
Domain not in global list of TLDs http://www.buy-online-viagra.us./
Domain not in global list of TLDs http://www.visualnews.com./
Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/
Domain not in global list of TLDs http://173.8.135.113/
Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html
Domain not in global list of TLDs 2014-07-29 06:44:00
Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi
Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi
Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi
Domain not in global list of TLDs 2014-08-01 15:54:00
Domain not in global list of TLDs http://test2.mafiachat.net:8800/
Domain not in global list of TLDs http://www.xn--e1adkpj5f.xn--p1ai/%D0%B8%D0%B2%D0%B0%D0%BD-%D0%BF%D0%BE%D0%BB%D0%BE%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%80%D0%B0%D0%B7%D0%B2%D0%BE%D0%B4-%D0%B4%D0%BB%D1%8F-%D0%BD%D0%BE%D0%B2%D0%B8%D1%87%D0%BA%D0%BE%D0%B2/
Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/games-for-xbox/1838-skachat-besplatno-igru-betmen-arkhem-origins-na-iksboks-360-batman-arkham-origins-xbox-360-2013-god-russkaya-licenzionnaya-versiya.html
Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html
Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44
Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44
Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44
Domain not in global list of TLDs http://xn--p1af1b.xn--p1ai/%D0%91%D0%B0%D1%88%D0%B0%D1%80_%D0%90%D1%81%D0%B0%D0%B4
Domain not in global list of TLDs http://www.xn----dtbikagememahdgab5aia4a3b3k.xn--p1ai/
Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html
Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1804-skachat-besplatno-igru-ekspediciya-konkistadorov-na-kompyuter-expeditions-conquistador-2013-god-russkaya-versiya-repak.html
Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=3
Domain not in global list of TLDs http://xn----7sbb5ahj4aiadq2m.xn--p1ai/guide/army/ta/t10.shtml
Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=25
Out[8]:
lan wiki url WebTitle time tld
0 ab ???????? http://mail-archives.apache.org/mod_mbox/openo... NaN 2014-07-13 08:10:00 org
1 ab 1939 http://rarplayer.appspot.com/wiki/1939 1939 – Wikipedia, wolna encyklopedia 2014-08-02 06:30:00 appspot.com
2 ab 1994 http://rarplayer.appspot.com/wiki/1994 1994 – Wikipedia, wolna encyklopedia 2014-08-01 00:02:00 appspot.com
3 ab 2004 http://instapedia.com/m/2004 2004 - iPhone/Mobile Wikipedia 2014-07-13 16:18:00 com
4 ab 2007 http://rarplayer.appspot.com/wiki/2007 2007 – Wikipedia, wolna encyklopedia 2014-07-31 09:37:00 appspot.com
In [9]:
# seralize dataframe now
wikiR.to_pickle('/Users/gianlucademartini/wikiRall.pickle')

#group by and count inlinks for each tld and wikipedia edition
tldCount=wikiR.groupby(['lan','tld']).count()

# save as tab separated
tldCount.reset_index()
tldCount.to_csv('tldCount.tsv', sep='\t')
In [ ]: