#!/usr/bin/env python
# coding: utf-8
"""Scrape the list of Hungarian airports from Wikipedia, geocode each airport,
and resolve each airport's Airportia schedule-page URL via a Google CSE query.
Intermediate results are cached to JSON so later cells can reload them without
re-hitting the APIs.  (Notebook export — cells run top to bottom.)"""

# In[1]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]: load the airport table from Wikipedia.
url = 'http://en.wikipedia.org/wiki/List_of_airports_in_Hungary'
df = pd.read_html(url)
# First table on the page; promote row 0 to the header, drop the two
# header-ish rows, and index the frame by IATA code.
df = df[0].loc[:6].T.set_index(0).T.loc[2:].set_index('IATA')

# In[3]:
df

# In[4]:
from pygeocoder import Geocoder

# SECURITY NOTE(review): API key hard-coded in source — revoke this key and
# load it from an environment variable before sharing this notebook.
apik = 'AIzaSyDybC2OroTE_XDJTuxjKruxFpby5VDhEGk'

# In[5]: geocode every airport by "<IATA> airport Hungary".
locations = {}
for iata in df.index:
    results = Geocoder(apik).geocode(iata + ' airport Hungary')
    locations[iata] = results[0].coordinates
    print(iata)

# In[6]: cache the coordinates; a context manager closes the handle
# (the original used the removed Py2 `file()` builtin and leaked it).
with open("locations_hu.json", 'w') as f:
    f.write(json.dumps(locations))

# In[7]: reload from the cache.
with open('locations_hu.json', 'r') as f:
    locations = json.loads(f.read())

# In[8]/[9]: (stray display cell `i` dropped — it had no effect)

# In[10]: resolve each airport's Airportia page through a Google CSE search.
airportialinks = {}
for iata in locations:
    print(iata, end=' ')
    # QPJ (Pécs-Pogány) is indexed on Airportia under its older code PEV.
    query_code = 'PEV' if iata == 'QPJ' else str(iata)
    url = ('https://cse.google.com/cse?cx=partner-pub-6479063288582225'
           '%3A8064105798&cof=FORID%3A10&ie=UTF-8&q='
           + query_code + '+airport+hungary')
    m = requests.get(url).content
    z = pd.read_html(m)[5][0][0]
    z = z[z.find('http'):]  # strip any snippet text before the URL itself
    airportialinks[iata] = z
    print(z)

# In[11]: reformat the links — strip the arrivals/departures suffix the CSE
# snippet sometimes includes, drop stray spaces, and restore the
# '-international-' fragment the snippet elides to '...'.
for code in airportialinks:
    airportialinks[code] = (airportialinks[code]
                            .split('arrivals')[0]
                            .split('departures')[0]
                            .replace(' ', '')
                            .replace('...', '-international-'))
    if airportialinks[code][-1] != '/':
        airportialinks[code] += '/'
    # Manual fix: the Győr-Pér URL contains non-ASCII characters the
    # snippet mangles beyond repair.
    if code == 'QGY':
        airportialinks[code] = u'https://www.airportia.com/hungary/győr_pér-international-airport/'
    print(airportialinks[code])

# In[12]:
sch = {}
# Record schedules for 2 weeks, then augment count with weekly flight numbers.
# Seasonal and seasonal-charter flights count as once per week for 3 months,
# i.e. 12/52 per week.  TGM is handled separately, since its history is in the past.
# In[13]: download two weeks of departure tables (11-24 March 2017) per airport.
for iata in locations:
    print(iata)
    if iata not in sch:
        sch[iata] = {}
    # March 11-24 inclusive = exactly 2 weeks.
    for day in range(11, 25):
        full = airportialinks[iata] + 'departures/201703' + str(day)
        # NOTE(review): the original tested `day not in sch[iata]`, but keys
        # are the full URLs, so the check never skipped anything — test the
        # actual key instead.
        if full in sch[iata]:
            continue
        try:
            m = requests.get(full).content
            sch[iata][full] = pd.read_html(m)[0]
        except Exception:
            # Best-effort scrape: pd.read_html raises ValueError when the
            # page has no table; network errors are also tolerated here.
            # print('no tables', iata, day)
            pass

# In[57]: sanity-check BUD against the source site.
for day in range(11, 25):
    testurl = (u'https://www.airportia.com/hungary/'
               u'budapest-liszt-ferenc-international-airport/departures/201703' + str(day))
    print('nr. of flights on March', day, ':', len(sch['BUD'][testurl]))

testurl = u'https://www.airportia.com/hungary/budapest-liszt-ferenc-international-airport/departures/20170318'
k = sch['BUD'][testurl]
k[k['To'] == 'Frankfurt FRA']
# `sch` checks out with source

# In[38]/[39]: flatten every per-day table into one DataFrame.
# Collect the parts and concatenate ONCE — growing mdf with pd.concat inside
# the loop (as the original did) copies the whole frame each iteration.
frames = []
for iata in sch:
    for d in sch[iata]:
        part = (sch[iata][d]
                .drop(sch[iata][d].columns[3:], axis=1)
                .drop(sch[iata][d].columns[0], axis=1))
        part['From'] = iata
        part['Date'] = d
        frames.append(part)
mdf = pd.concat(frames) if frames else pd.DataFrame()

# In[40]: Hahn markets itself as Frankfurt — normalise the name.
mdf = mdf.replace('Hahn', 'Frankfurt')
mdf = mdf.replace('Hahn HHN', 'Frankfurt HHN')

# In[41]: 'To' holds "City IATA"; split on the LAST space so multi-word
# city names survive.
mdf['City'] = [s[:s.rfind(' ')] for s in mdf['To']]
mdf['Airport'] = [s[s.rfind(' ') + 1:] for s in mdf['To']]

# In[48]: cross-check one day of BUD departures against the source.
k = mdf[mdf['Date'] == testurl]
k[k['To'] == 'Frankfurt FRA']
# `mdf` checks out with source

# In[58]: cache the flattened frame.
with open("mdf_hu_dest.json", 'w') as f:
    f.write(json.dumps(mdf.reset_index().to_json()))

# In[71]:
len(mdf)

# In[72]/[73]:
airlines = set(mdf['Airline'])
cities = set(mdf['City'])

# In[74]:
with open("cities_hu_dest.json", 'w') as f:
    f.write(json.dumps(list(cities)))
with open("airlines_hu_dest.json", 'w') as f:
    f.write(json.dumps(list(airlines)))

# In[75]:
citycoords = {}

# Ambiguous city names that geocode to the wrong place by default —
# a lookup table replaces the original if/elif ladder.
_DISAMBIGUATE = {
    u'Birmingham': 'Birmingham, UK',
    u'Valencia': 'Valencia, Spain',
    u'Naples': 'Naples, Italy',
    u'St. Petersburg': 'St. Petersburg, Russia',
    u'Bristol': 'Bristol, UK',
    u'Victoria': 'Victoria, Seychelles',
    u'Washington': 'Washington, DC',
    u'Odessa': 'Odessa, Ukraine',
}

# In[76]: geocode each destination city once.
for city in cities:
    if city not in citycoords:
        citycoords[city] = Geocoder(apik).geocode(_DISAMBIGUATE.get(city, city))
        print(city)

# In[77]: keep only coordinates + country for saving.
citysave = {}
for city in citycoords:
    citysave[city] = {"coords": citycoords[city][0].coordinates,
                      "country": citycoords[city][0].country}

# In[78]:
with open("citysave_hu_dest.json", 'w') as f:
    f.write(json.dumps(citysave))