#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd, json, numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# Scrape the list of Romanian airports from Wikipedia and index it by IATA code.
url = 'http://en.wikipedia.org/wiki/List_of_airports_in_Romania'
df = pd.read_html(url)
df = df[0].loc[:17].T.set_index(0).T.loc[2:].set_index('IATA')

# In[3]:

df

# In[4]:

# Google Geocoding API key (replace with your own).
from pygeocoder import Geocoder
apik = 'AIzaSyDybC2OroTE_XDJTuxjKruxFpby5VDhEGk'

# In[5]:

# Geocode each airport by its IATA code.
locations = {}
for i in df.index:
    results = Geocoder(apik).geocode(i + ' airport romania')
    locations[i] = results[0].coordinates
    print(i)

# In[6]:

open("locations_ro.json", 'w').write(json.dumps(locations))

# In[7]:

locations = json.loads(open('locations_ro.json', 'r').read())

# In[8]:

import requests

# In[9]:

# Look up each airport's Airportia page through a Google Custom Search query.
airportialinks = {}
for i in locations:
    print(i, end=' ')
    url = 'https://cse.google.com/cse?cx=partner-pub-6479063288582225%3A8064105798&cof=FORID%3A10&ie=UTF-8&q=' + str(i) + '+airport+romania'
    m = requests.get(url).content
    z = pd.read_html(m)[5][0][0]
    z = z[z.find('http'):]
    airportialinks[i] = z
    print(z)

# In[10]:

# Reformat the scraped links into clean Airportia base URLs.
for z in airportialinks:
    airportialinks[z] = airportialinks[z].split('arrivals')[0].split('departures')[0].replace(' ', '').replace('...', '-international-')
    if airportialinks[z][-1] != '/': airportialinks[z] += '/'
    # manual fixes
    if z == 'TSR': airportialinks[z] = 'https://www.airportia.com/romania/timişoara-traian-vuia-airport/'
    print(airportialinks[z])

# In[11]:

sch = {}
# Record schedules for 2 weeks, then convert the counts to weekly flight numbers.
# Seasonal and seasonal-charter flights count as once per week for 3 months, i.e. 12/52 per week.
# TGM is handled separately, since its schedule history lies in the past.

# In[12]:

# Scrape two weeks of departure schedules from Airportia for each airport.
for i in locations:
    print(i)
    if i not in sch: sch[i] = {}
    if i != 'TGM':
        # March 11-24 = 2 weeks
        for d in range(11, 25):
            if d not in sch[i]:
                try:
                    url = airportialinks[i]
                    full = url + 'departures/201703' + str(d)
                    m = requests.get(full).content
                    sch[i][full] = pd.read_html(m)[0]
                    # print(full)
                except: pass  # print('no tables', i, d)
    else:
        # November 17-30 = 2 weeks
        for d in range(17, 31):
            if d not in sch[i]:
                try:
                    url = airportialinks[i]
                    full = url + 'departures/201611' + str(d)
                    m = requests.get(full).content
                    sch[i][full] = pd.read_html(m)[0]
                    # print(full)
                except: pass  # print('no tables', i, d)

# In[13]:

mdf = pd.DataFrame()

# In[14]:

# Merge all daily schedule tables into one DataFrame, keeping destination and airline.
for i in sch:
    for d in sch[i]:
        df = sch[i][d].drop(sch[i][d].columns[3:], axis=1).drop(sch[i][d].columns[0], axis=1)
        df['From'] = i
        df['Date'] = d
        mdf = pd.concat([mdf, df])

# In[15]:

mdf = mdf.replace('Hahn', 'Frankfurt')
mdf = mdf.replace('Hahn HHN', 'Frankfurt HHN')

# In[16]:

# Split the 'To' column into city name and airport code.
mdf['City'] = [i[:i.rfind(' ')] for i in mdf['To']]
mdf['Airport'] = [i[i.rfind(' ') + 1:] for i in mdf['To']]

# In[17]:

open("mdf_ro_dest.json", 'w').write(json.dumps(mdf.reset_index().to_json()))

# In[25]:

len(mdf)

# In[18]:

airlines = set(mdf['Airline'])

# In[19]:

cities = set(mdf['City'])

# In[20]:

open("cities_ro_dest.json", 'w').write(json.dumps(list(cities)))
open("airlines_ro_dest.json", 'w').write(json.dumps(list(airlines)))

# In[26]:

citycoords = {}

# In[27]:

# Geocode destination cities, disambiguating a few ambiguous names by hand.
for i in cities:
    if i not in citycoords:
        if i == u'Birmingham': z = 'Birmingham, UK'
        elif i == u'Valencia': z = 'Valencia, Spain'
        elif i == u'Naples': z = 'Naples, Italy'
        elif i == u'St. Petersburg': z = 'St. Petersburg, Russia'
        elif i == u'Bristol': z = 'Bristol, UK'
        else: z = i
        citycoords[i] = Geocoder(apik).geocode(z)
        print(i)

# In[28]:

citysave = {}
for i in citycoords:
    citysave[i] = {"coords": citycoords[i][0].coordinates,
                   "country": citycoords[i][0].country}

# In[29]:

open("citysave_ro_dest.json", 'w').write(json.dumps(citysave))
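
# In[ ]:

# A minimal sketch (not part of the original notebook) of the weekly-count
# normalisation described in the comment of cell In[11]: every departure row in
# mdf comes from a two-week window, so dividing the per-route row count by 2
# approximates flights per week. The 12/52 weighting for seasonal and
# seasonal-charter services would need a schedule-status column that this
# section does not scrape, so it is only indicated here as a comment.

weekly = (mdf.groupby(['From', 'City'])
             .size()          # departures observed in the two-week window
             .div(2.0)        # -> approximate departures per week
             .rename('flights_per_week')
             .reset_index())
# For seasonal / seasonal-charter rows one would weight their contribution
# by 12/52 instead of counting them once per observed week.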