#!/usr/bin/env python
# coding: utf-8
"""Fetch the ARWU ranking tables for 2003-2014 and geocode the universities.

Run as a script, the steps are:

1. download one HTML table per year from shanghairanking.com,
2. store the raw tables in ``data.json`` and read them back,
3. build ``u`` (year -> list of ``{'name', 'rank'}``) and ``v``
   (university name -> ``{'years', 'ranks'}``),
4. add a ``'coord'`` (latitude, longitude) entry to every university in
   ``v`` using several geocoding services, with a manual fallback table,
5. write ``u.json`` and ``v.json``.
"""

import json
import os

import numpy as np
import pandas as pd

# Years for which a ranking page is downloaded.
YEARS = range(2003, 2015)

# File used to keep the raw downloaded tables.
RAW_DATA_PATH = 'data.json'

# The column holding the university name differs between yearly tables.
# The order of the groups is also the order in which years are appended to
# each university's history in ``v``.
NAME_COLUMN_YEARS = (
    ('Institution', (2003, 2004, 2011)),
    (u'Institution*', (2005, 2006, 2007, 2008, 2009, 2010, 2012, 2013, 2014)),
)

# Manually looked-up coordinates for names the geocoding services miss.
geohelper = {
    'The Imperial College of Science, Technology and Medicine':
        (51.500229, -0.178940),
    'University of Manchester Institute of Science and Technology':
        (53.470741, -2.235570),
    'University of the Mediterranean (Aix-Marseille 2)':
        (43.299916, 5.374818),
}


def download_rankings(years=YEARS):
    """Return ``{year: DataFrame}`` with the first HTML table of each page.

    Columns are sorted by label. Each year is printed once its table has
    been fetched, as a simple progress indicator.
    """
    tables = {}
    for year in years:
        url = 'http://www.shanghairanking.com/ARWU' + str(year) + '.html'
        # NOTE(review): ``infer_types`` is kept exactly as the script was
        # written; it may not be accepted by newer pandas releases - confirm
        # against the installed version.
        frame = pd.read_html(url, infer_types=False, header=0)[0]
        tables[year] = frame.sort_index(axis=1)
        print(year)
    return tables


def _write_json(path, payload):
    """Serialise *payload* with ``json.dumps`` and write it to *path*."""
    with open(path, 'w') as handle:
        handle.write(json.dumps(payload))


def save_raw_tables(tables, path=RAW_DATA_PATH):
    """Write every table in *tables* (as a JSON string) into *path*."""
    _write_json(path, {year: frame.to_json() for year, frame in tables.items()})


def load_raw_tables(years=YEARS, path=RAW_DATA_PATH):
    """Read back the tables stored by :func:`save_raw_tables`.

    Returns ``{year: DataFrame}`` with columns sorted by label. The JSON
    object keys are strings, hence the ``str(year)`` lookup.
    """
    with open(path) as handle:
        stored = json.load(handle)
    return {
        year: pd.read_json(stored[str(year)]).sort_index(axis=1)
        for year in years
    }


def build_rank_tables(tables, name_column_years=NAME_COLUMN_YEARS):
    """Build the per-year and per-university views of the rankings.

    *tables* maps year -> DataFrame; *name_column_years* is a sequence of
    ``(name_column, years)`` pairs telling which column holds the
    university name in which years.

    Returns ``(rankings, universities)`` where ``rankings[year]`` is a list
    of ``{'name': ..., 'rank': ...}`` dicts and ``universities[name]`` is
    ``{'years': [...], 'ranks': [...]}`` with parallel lists.
    """
    rankings = {}
    universities = {}
    for name_column, years in name_column_years:
        for year in years:
            frame = tables[year]
            entries = []
            # Rows are addressed by index label 0..n-1, where n is the
            # number of non-null 'World Rank' values.
            for row in range(frame['World Rank'].count()):
                name = frame.loc[row, name_column]
                rank = frame.loc[row, 'World Rank']
                history = universities.setdefault(
                    name, {'years': [], 'ranks': []})
                history['years'].append(year)
                history['ranks'].append(rank)
                entries.append({'name': name, 'rank': rank})
            rankings[year] = entries
    return rankings, universities


def build_geocoders():
    """Return the ``(label, geocoder)`` pairs in the order they are tried.

    geopy is imported here rather than at module level so the
    data-preparation helpers remain importable without it.
    """
    from geopy.geocoders import Bing
    from geopy.geocoders import GoogleV3
    from geopy.geocoders import OpenMapQuest
    from geopy.geocoders import Nominatim

    # SECURITY NOTE: an API key is committed in this file. It is kept as the
    # fallback so the script behaves as before, but it should be supplied
    # through the BING_MAPS_KEY environment variable (and the committed key
    # rotated).
    bing_key = os.environ.get(
        'BING_MAPS_KEY',
        'AiQdfYGfIiDP0FXKQ3yQ3NXHOZBPuSVZJzpJzu1641ffd9GkzBbS_yblwqPym2WR')
    return [
        ('MapQuest', OpenMapQuest()),
        ('Google', GoogleV3()),
        ('Nominatim', Nominatim()),
        ('Bing', Bing(bing_key)),
    ]


def geocode_universities(universities, geocoders, failures=None):
    """Add a ``'coord'`` tuple to each entry of *universities*.

    For every name the geocoders are tried in order; the first one that
    returns a location wins and ``'OK <label> <counter>'`` is printed. A
    geocoder that raises or returns ``None`` is skipped. Names that no
    geocoder resolves are appended to *failures* (a new list when omitted)
    and printed with their counter.

    Returns the list of failed names.
    """
    if failures is None:
        failures = []
    for counter, name in enumerate(universities, 1):
        for label, geocoder in geocoders:
            try:
                location = geocoder.geocode(name)
            except Exception:
                # Best effort: fall through to the next service. Unlike a
                # bare ``except:``, Ctrl-C / SystemExit still stop the run.
                continue
            if location is None:
                continue
            universities[name]['coord'] = (
                location.latitude, location.longitude)
            print('OK %s %s' % (label, counter))
            break
        else:
            failures.append(name)
            print('%s %s' % (name, counter))
    return failures


def apply_manual_coordinates(universities, manual_coords):
    """Fill missing ``'coord'`` entries from *manual_coords*.

    Entries that already have a ``'coord'`` are left untouched. Returns the
    names that still have no coordinates afterwards, instead of raising
    ``KeyError`` for names absent from *manual_coords*.
    """
    still_missing = []
    for name, record in universities.items():
        if 'coord' in record:
            continue
        if name in manual_coords:
            record['coord'] = manual_coords[name]
        else:
            still_missing.append(name)
    return still_missing


if __name__ == '__main__':
    # Download, keep a raw copy on disk for later editing, then work from
    # the stored copy.
    data = download_rankings()
    save_raw_tables(data)
    data = load_raw_tables()

    u, v = build_rank_tables(data)

    # Geocode the university names; ``exceptions`` collects the failures.
    exceptions = geocode_universities(v, build_geocoders())

    # Anything printed here still lacks coordinates and needs a new
    # ``geohelper`` entry.
    for k in apply_manual_coordinates(v, geohelper):
        print(k)

    # Save results.
    _write_json('u.json', u)  # rankings
    _write_json('v.json', v)  # universities