In [4]:
import pandas as pd, numpy as np, json
# year -> ranking table (DataFrame) for that year; filled by the cells below
data = dict()
In [52]:
# Download one ranking table per year (2003..2014) from shanghairanking.com.
for year in range(2003, 2015):
    url = 'http://www.shanghairanking.com/ARWU'+repr(year)+'.html'
    # header=0: the first table row supplies the column labels.
    # infer_types=False: presumably keeps cell values as raw text -- TODO confirm
    # against the installed pandas version.
    tables = pd.read_html(url, infer_types=False, header=0)
    # Only the first table on the page is kept; columns are sorted by label.
    data[year] = tables[0].sort_index(axis=1)
    # progress indicator
    print(year)
In [62]:
#save input data for later editing
# Each year's table is serialised to its own JSON string; the strings are then
# stored together in one JSON object keyed by year.
DATA={}
for year in range(2003,2015):
    DATA[year]=data[year].to_json()
# json.dumps turns the integer year keys into strings, so readers of this file
# must look years up by their text form.
# The context manager guarantees the file is flushed and closed; the previous
# file(...).write(...) left the handle open until garbage collection.
with open('data.json','w') as f:
    f.write(json.dumps(DATA))
In [5]:
#load data if already saved
# Read the cached tables back; the context manager closes the file handle,
# which the bare open(...).read() used to leave dangling.
with open('data.json') as f:
    DATA = json.load(f)
data={}
for year in range(2003,2015):
    # Keys in the JSON object are strings, hence the lookup by repr(year).
    # Columns are sorted by label after parsing.
    data[year]=pd.read_json(DATA[repr(year)]).sort_index(axis=1)
In [6]:
# Build two views of the ranking tables:
#   u: year -> list of {'name': ..., 'rank': ...} records
#   v: institution name -> {'years': [...], 'ranks': [...]} (parallel lists)
u={}
v={}
# The institution column is labelled 'Institution' in some editions and
# 'Institution*' in others; years[r] lists the editions that use locs[r].
locs=['Institution',u'Institution*']
years=[[2003,2004,2011],[2005,2006,2007,2008,2009,2010,2012,2013,2014]]
# Flatten the two groups into a year -> column-label lookup so the years can be
# walked in ascending order. Walking group by group (the previous behaviour)
# appended 2011 before 2005..2010, leaving each university's history in v
# out of chronological order.
column_for_year={}
for r in range(2):
    for year in years[r]:
        column_for_year[year]=locs[r]
for year in sorted(column_for_year):
    table=data[year]
    name_column=column_for_year[year]
    unis=[]
    # count() is the number of non-null 'World Rank' cells; rows are fetched by
    # the labels 0..count-1, which assumes the table carries a default integer
    # index with the ranked rows first -- TODO confirm.
    for i in range(table['World Rank'].count()):
        # Fetch the row once instead of once per field.
        row=table.loc[i]
        uni={'name':row[name_column],'rank':row['World Rank']}
        history=v.setdefault(uni['name'],{'years':[],'ranks':[]})
        history['years'].append(year)
        history['ranks'].append(uni['rank'])
        unis.append(uni)
    u[year]=unis
In [50]:
# names that none of the geocoding services could resolve
exceptions = list()
In [0]:
#geocode uni names and create list with unis
from geopy.geocoders import Bing
from geopy.geocoders import GoogleV3
from geopy.geocoders import OpenMapQuest
from geopy.geocoders import Nominatim
import os
geolocator_n = Nominatim()
geolocator_q = OpenMapQuest()
geolocator_g = GoogleV3()
# NOTE(review): a Bing Maps API key is committed in this notebook. It should be
# rotated and supplied only through the environment. The BING_MAPS_KEY variable
# takes precedence; the literal is kept as a fallback so existing runs still work.
geolocator_b = Bing(os.environ.get('BING_MAPS_KEY','AiQdfYGfIiDP0FXKQ3yQ3NXHOZBPuSVZJzpJzu1641ffd9GkzBbS_yblwqPym2WR'))
# Services are tried in this order; the first one that yields coordinates wins.
geocoders=[
    ('MapQuest',geolocator_q),
    ('Google',geolocator_g),
    ('Nominatim',geolocator_n),
    ('Bing',geolocator_b),
]
counter=0
for k in v.keys():
    counter+=1
    for label,geolocator in geocoders:
        try:
            location = geolocator.geocode(k)
            # A result without latitude/longitude (e.g. None) raises
            # AttributeError here and is treated like any other failure.
            coord = (location.latitude, location.longitude)
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt can still stop this long-running loop.
            continue
        v[k]["coord"]=coord
        print('OK %s %d' % (label,counter))
        break
    else:
        # Every service failed: remember the name for manual handling.
        exceptions.append(k)
        print('%s %d' % (k,counter))
In [52]:
# Hand-entered (latitude, longitude) pairs for names that have no coordinates
# after the automatic geocoding step.
geohelper={
'The Imperial College of Science, Technology and Medicine': (51.500229,-0.178940),
'University of Manchester Institute of Science and Technology':(53.470741,-2.235570),
'University of the Mediterranean (Aix-Marseille 2)':(43.299916, 5.374818)
}
for k in v:
    # Fill only the gaps that have a manual entry. A name missing from both
    # sources is left without 'coord' instead of raising KeyError and aborting
    # the fill midway, so it can still be detected and reported afterwards.
    if 'coord' not in v[k] and k in geohelper:
        v[k]['coord']=geohelper[k]
In [53]:
# Sanity check: print every university that still has no 'coord' entry.
missing=[name for name in v if 'coord' not in v[name]]
for k in missing:
    print(k)
#db ok
In [54]:
#save results
# Context managers make sure both files are flushed and closed; the previous
# file(...).write(...) calls never closed their handles.
with open('u.json','w') as f:
    f.write(json.dumps(u))#rankings
with open('v.json','w') as f:
    f.write(json.dumps(v))#universities