import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
maindf = pd.read_csv('data/airline_delay_causes.csv')
maindf = maindf.iloc[:, :21]
maindf.head()
year | month | carrier | carrier_name | airport | airport_name | arr_flights | arr_del15 | carrier_ct | weather_ct | ... | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2009 | 11 | AS | Alaska Airlines Inc. | BRW | Barrow, AK: Wiley Post/Will Rogers Memorial | 65.0 | 14.0 | 3.61 | 1.06 | ... | 0.0 | 6.19 | 0.0 | 1.0 | 920.0 | 592.0 | 34.0 | 78.0 | 0.0 | 216.0 |
1 | 2009 | 11 | AS | Alaska Airlines Inc. | BUR | Burbank, CA: Bob Hope | 85.0 | 5.0 | 1.00 | 0.00 | ... | 0.0 | 0.98 | 0.0 | 0.0 | 134.0 | 25.0 | 0.0 | 70.0 | 0.0 | 39.0 |
2 | 2009 | 11 | AS | Alaska Airlines Inc. | CDV | Cordova, AK: Merle K Mudhole Smith | 58.0 | 8.0 | 2.04 | 1.00 | ... | 0.0 | 2.96 | 4.0 | 2.0 | 586.0 | 174.0 | 37.0 | 62.0 | 0.0 | 313.0 |
3 | 2009 | 11 | AS | Alaska Airlines Inc. | DCA | Washington, DC: Ronald Reagan Washington National | 88.0 | 11.0 | 1.99 | 0.00 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 266.0 | 65.0 | 0.0 | 201.0 | 0.0 | 0.0 |
4 | 2009 | 11 | AS | Alaska Airlines Inc. | DEN | Denver, CO: Denver International | 123.0 | 5.0 | 3.47 | 0.00 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 81.0 | 57.0 | 0.0 | 24.0 | 0.0 | 0.0 |
5 rows × 21 columns
# Let's find the names of all individual airlines, so that we get an idea of how many unique airlines are there in the data
set(list(maindf['carrier_name']))
{'AirTran Airways Corporation', 'Alaska Airlines Inc.', 'Allegiant Air', 'American Airlines Inc.', 'American Eagle Airlines Inc.', 'Atlantic Southeast Airlines', 'Comair Inc.', 'Continental Air Lines Inc.', 'Delta Air Lines Inc.', 'Endeavor Air Inc.', 'Envoy Air', 'ExpressJet Airlines Inc.', 'Frontier Airlines Inc.', 'Hawaiian Airlines Inc.', 'JetBlue Airways', 'Mesa Airlines Inc.', 'Northwest Airlines Inc.', 'PSA Airlines Inc.', 'Pinnacle Airlines Inc.', 'Republic Airline', 'SkyWest Airlines Inc.', 'Southwest Airlines Co.', 'Spirit Air Lines', 'US Airways Inc.', 'United Air Lines Inc.', 'Virgin America'}
#Find out the column names to write them properly in code:
maindf.set_index('carrier_name').columns #since we're going to group by 'Carrier Name'
Index(['year', ' month', 'carrier', 'airport', 'airport_name', 'arr_flights', 'arr_del15', 'carrier_ct', ' weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', ' arr_delay', ' carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'], dtype='object')
#Find the index of relevant column to use in the following GroupBy statement
list(maindf.set_index('carrier_name').columns).index(' arr_delay')
14
trial = maindf.set_index('carrier_name').iloc[:, list(maindf.set_index('carrier_name').columns).index(' arr_delay')].groupby(level = 0).sum()
trial2 = trial.sort_values(ascending = False)
trial2
carrier_name Southwest Airlines Co. 115771486.0 American Airlines Inc. 78815786.0 SkyWest Airlines Inc. 72789590.0 Delta Air Lines Inc. 71621393.0 ExpressJet Airlines Inc. 68781482.0 United Air Lines Inc. 57915309.0 JetBlue Airways 38036618.0 American Eagle Airlines Inc. 26718814.0 US Airways Inc. 20966493.0 Frontier Airlines Inc. 12947465.0 Atlantic Southeast Airlines 12936567.0 Envoy Air 12111571.0 Mesa Airlines Inc. 11538648.0 Alaska Airlines Inc. 11065065.0 AirTran Airways Corporation 11019631.0 Spirit Air Lines 9062640.0 Continental Air Lines Inc. 7413086.0 Pinnacle Airlines Inc. 6645291.0 Endeavor Air Inc. 4972029.0 Virgin America 4895524.0 Republic Airline 4333426.0 PSA Airlines Inc. 4180379.0 Comair Inc. 3988422.0 Hawaiian Airlines Inc. 2809143.0 Northwest Airlines Inc. 2643000.0 Allegiant Air 1793203.0 Name: arr_delay, dtype: float64
#Let's create a Bar graph!
# First, create a trace:
x = list(trial2.index)
y = list(trial2)
iplot([go.Bar(x=x, y=y)], filename = 'Delay Bar Chart (minutes)')
#Find the index of relevant column to use in the following GroupBy statement
list(maindf.set_index('carrier_name').columns).index('arr_flights')
5
Total_flights_per_airline = maindf.set_index('carrier_name').iloc[:, list(maindf.set_index('carrier_name').columns).index('arr_flights')].groupby(level = 0).sum()
Total_flights_per_airline
carrier_name AirTran Airways Corporation 1201121.0 Alaska Airlines Inc. 1689757.0 Allegiant Air 110255.0 American Airlines Inc. 6785529.0 American Eagle Airlines Inc. 2300220.0 Atlantic Southeast Airlines 923459.0 Comair Inc. 286694.0 Continental Air Lines Inc. 721450.0 Delta Air Lines Inc. 7951812.0 Endeavor Air Inc. 404370.0 Envoy Air 932356.0 ExpressJet Airlines Inc. 4891450.0 Frontier Airlines Inc. 915000.0 Hawaiian Airlines Inc. 751950.0 JetBlue Airways 2516031.0 Mesa Airlines Inc. 1080705.0 Northwest Airlines Inc. 270188.0 PSA Airlines Inc. 323091.0 Pinnacle Airlines Inc. 678801.0 Republic Airline 365631.0 SkyWest Airlines Inc. 6346731.0 Southwest Airlines Co. 12216352.0 Spirit Air Lines 617450.0 US Airways Inc. 2624022.0 United Air Lines Inc. 4887878.0 Virgin America 389060.0 Name: arr_flights, dtype: float64
new_df = pd.DataFrame({'Total Delay': trial, 'Total No. of Flights': Total_flights_per_airline})
new_df['Delay/Flight'] = new_df['Total Delay']/new_df['Total No. of Flights']
trial3 = new_df['Delay/Flight'].sort_values(ascending = False)
# Now let's create a bar plot to compare the delay/flight of specific airlines
# Creating traces
x1 = list(trial3.index)
y1 = list(trial3)
iplot([go.Bar(x=x1, y=y1)], filename = 'Delay/Flight of Airline from Nov 2009 - Feb 2019')
# Add some color to the above graph
color = np.array(['rgb(255,255,255)']*trial3.shape[0])
color[trial3>=15] = 'rgb(130,0,0)'
color[trial3<15]= 'rgb(204,204,205)'
data=[dict(type='bar',
x=trial3.index,
y=trial3,
marker=dict(color=color.tolist()),
text = round(trial3,1),
textposition = 'auto',
name = 'Delay/Flight of Airline from Nov 2009 - Feb 2019'
)]
layout = go.Layout(bargap = 0.3, title = 'Delay/Flight of Airline from Nov 2009 - Feb 2019 <br> (Airlines with over 15 mins in red)',
yaxis = dict(title = 'No. of minutes'))
fig = go.Figure(data = data, layout = layout)
iplot(fig)
Each Hawaiin Airline flight has been delayed by around 3.7 minutes!
And Allegiant Air flights are late by around 16.2 minutes.
Now let's plot the performance of airlines across the years. Note that for 2009, we only have data from Month 11 onward. i.e. Month 11 & 12.
Thus for this, we will create and group by a 'Multi-index' of Airline name ('carrier_name') & Year.
# Now for this too, let's find out the index of the ' arr_delay' column since this group by is going to be a multi-index
new_index = list(maindf.set_index(['carrier_name', 'year']).columns).index(' arr_delay')
new_index
13
# Now, let's plot the average lateness per flight per each year.
# We are provided with only 2 months of the year 2009 so we will ignore that later on.
trial_series = maindf.set_index(['carrier_name', 'year']).iloc[:, new_index].groupby(level =[0,1]).sum()
#Let's see what the series looks like.
trial_series.head(20)
carrier_name year AirTran Airways Corporation 2009 2949355.0 2010 2250331.0 2011 1990102.0 2012 1359781.0 2013 1791450.0 2014 678612.0 Alaska Airlines Inc. 2009 859073.0 2010 741330.0 2011 766706.0 2012 872417.0 2013 848473.0 2014 989491.0 2015 1074977.0 2016 996624.0 2017 1454555.0 2018 1942070.0 2019 519349.0 Allegiant Air 2018 1565764.0 2019 227439.0 American Airlines Inc. 2009 6227014.0 Name: arr_delay, dtype: float64
# to find index of 'arr_flights' column in Multi_index:
list(maindf.set_index(['carrier_name', 'year']).columns).index('arr_flights')
4
Total_flights_per_airline_per_year = maindf.set_index(['carrier_name', 'year']).iloc[:, list(maindf.set_index(['carrier_name', 'year']).columns).index('arr_flights')].groupby(level = [0,1]).sum()
Total_flights_per_airline_per_year.head(20)
carrier_name year AirTran Airways Corporation 2009 231965.0 2010 248844.0 2011 248703.0 2012 218162.0 2013 173952.0 2014 79495.0 Alaska Airlines Inc. 2009 126110.0 2010 136950.0 2011 143726.0 2012 147569.0 2013 154743.0 2014 160257.0 2015 172521.0 2016 177280.0 2017 185068.0 2018 245761.0 2019 39772.0 Allegiant Air 2018 96221.0 2019 14034.0 American Airlines Inc. 2009 503858.0 Name: arr_flights, dtype: float64
new_df = pd.DataFrame({'Total Delay': trial_series, 'Total No. of Flights': Total_flights_per_airline_per_year})
new_df['Delay/Flight'] = new_df['Total Delay']/new_df['Total No. of Flights']
new_df.head(20)
Total Delay | Total No. of Flights | Delay/Flight | ||
---|---|---|---|---|
carrier_name | year | |||
AirTran Airways Corporation | 2009 | 2949355.0 | 231965.0 | 12.714655 |
2010 | 2250331.0 | 248844.0 | 9.043139 | |
2011 | 1990102.0 | 248703.0 | 8.001922 | |
2012 | 1359781.0 | 218162.0 | 6.232896 | |
2013 | 1791450.0 | 173952.0 | 10.298531 | |
2014 | 678612.0 | 79495.0 | 8.536537 | |
Alaska Airlines Inc. | 2009 | 859073.0 | 126110.0 | 6.812093 |
2010 | 741330.0 | 136950.0 | 5.413143 | |
2011 | 766706.0 | 143726.0 | 5.334498 | |
2012 | 872417.0 | 147569.0 | 5.911926 | |
2013 | 848473.0 | 154743.0 | 5.483111 | |
2014 | 989491.0 | 160257.0 | 6.174401 | |
2015 | 1074977.0 | 172521.0 | 6.230992 | |
2016 | 996624.0 | 177280.0 | 5.621751 | |
2017 | 1454555.0 | 185068.0 | 7.859571 | |
2018 | 1942070.0 | 245761.0 | 7.902271 | |
2019 | 519349.0 | 39772.0 | 13.058156 | |
Allegiant Air | 2018 | 1565764.0 | 96221.0 | 16.272581 |
2019 | 227439.0 | 14034.0 | 16.206285 | |
American Airlines Inc. | 2009 | 6227014.0 | 503858.0 | 12.358669 |
#AirTran Airways has data only till 2014.
trace1 = go.Scatter(x=list(new_df.loc[['AirTran Airways Corporation']].index.get_level_values('year')),
y= list(new_df.loc[['AirTran Airways Corporation']].ix[:,2]),
mode='lines+markers')
iplot([trace1],filename='line chart')
We will use data of all airlines across all years and see what plot we get. Could get a little messy, so let's split the graphs. For that we will need to create a list of all the airlines and split it.
#list of airlines
list_of_airlines = list(set(list(new_df.index.get_level_values('carrier_name'))))
split1 = list_of_airlines[:int(len(list_of_airlines)/2)]
split2 = list_of_airlines[int(len(list_of_airlines)/2):]
new_list = []
for airline in split1:
trace = go.Scatter(x=list(new_df.loc[[airline]].index.get_level_values('year')),
y= list(new_df.loc[[airline]].ix[:,2]),
text = airline, name = airline,
mode='lines+markers')
new_list.append(trace)
data = new_list
layout = dict(title = 'Delay of Each Airline per Flight by year <br> First Group of Airlines',
xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
yaxis = dict(title= 'Avg. Delay per fligh in minutes', rangemode= 'tozero'),
showlegend=True
)
fig = dict(data = data, layout = layout)
iplot(fig)
new_list = []
for airline in split2:
trace = go.Scatter(x=list(new_df.loc[[airline]].index.get_level_values('year')),
y= list(new_df.loc[[airline]].ix[:,2]),
text = airline, name = airline,
mode='lines+markers')
new_list.append(trace)
data = new_list
layout = dict(title = 'Delay of Each Airline per Flight by year <br> Second Group of Airlines',
xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
yaxis = dict(title= 'Avg. Delay per fligh in minutes', rangemode= 'tozero'),
showlegend=True
)
fig = dict(data = data, layout = layout)
iplot(fig)
#Grouping the data by year:
year_df = maindf.set_index('year').groupby(level=0).sum()
year_df.head()
month | arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||||||||
2009 | 113147 | 5917946.0 | 1109632.0 | 296652.25 | 36781.26 | 407857.51 | 2118.93 | 366223.96 | 76947.0 | 14200.0 | 60393828.0 | 16952697.0 | 2923083.0 | 18457683.0 | 72335.0 | 21988030.0 |
2010 | 115115 | 6450117.0 | 1174884.0 | 339084.44 | 35854.04 | 376616.89 | 3057.76 | 420272.65 | 113255.0 | 15474.0 | 63257498.0 | 19217505.0 | 2785212.0 | 16229118.0 | 107410.0 | 24918253.0 |
2011 | 101538 | 6085281.0 | 1109872.0 | 313296.54 | 30963.04 | 352694.64 | 2318.39 | 410600.62 | 115978.0 | 14399.0 | 61675942.0 | 18553241.0 | 2558373.0 | 15303210.0 | 81353.0 | 25179765.0 |
2012 | 93510 | 6096762.0 | 1015158.0 | 302680.22 | 28799.46 | 300169.19 | 1980.71 | 381529.39 | 78862.0 | 12519.0 | 56916692.0 | 18165456.0 | 2279810.0 | 12828822.0 | 73052.0 | 23569552.0 |
2013 | 104837 | 6369482.0 | 1269277.0 | 353206.75 | 36817.01 | 384847.91 | 2321.20 | 492086.83 | 96012.0 | 14160.0 | 71937602.0 | 21136897.0 | 2974153.0 | 17426866.0 | 103651.0 | 30296035.0 |
#Plotting the number of flights
trace = go.Scatter(x= list(year_df.index),
y= list(year_df['arr_flights']),
text = year_df['arr_flights'], mode = 'lines+markers')
data = [trace]
layout = dict(title = 'Total No. of Flights By Year - 2009-2019',
xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
yaxis = dict(title= 'Avg. Delay per fligh in minutes', rangemode= 'tozero'),
showlegend=True
)
fig = dict(data = data, layout = layout)
iplot(fig)
new_year_df = year_df.iloc[:-1, :]
new_year_df
month | arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||||||||
2009 | 113147 | 5917946.0 | 1109632.0 | 296652.25 | 36781.26 | 407857.51 | 2118.93 | 366223.96 | 76947.0 | 14200.0 | 60393828.0 | 16952697.0 | 2923083.0 | 18457683.0 | 72335.0 | 21988030.0 |
2010 | 115115 | 6450117.0 | 1174884.0 | 339084.44 | 35854.04 | 376616.89 | 3057.76 | 420272.65 | 113255.0 | 15474.0 | 63257498.0 | 19217505.0 | 2785212.0 | 16229118.0 | 107410.0 | 24918253.0 |
2011 | 101538 | 6085281.0 | 1109872.0 | 313296.54 | 30963.04 | 352694.64 | 2318.39 | 410600.62 | 115978.0 | 14399.0 | 61675942.0 | 18553241.0 | 2558373.0 | 15303210.0 | 81353.0 | 25179765.0 |
2012 | 93510 | 6096762.0 | 1015158.0 | 302680.22 | 28799.46 | 300169.19 | 1980.71 | 381529.39 | 78862.0 | 12519.0 | 56916692.0 | 18165456.0 | 2279810.0 | 12828822.0 | 73052.0 | 23569552.0 |
2013 | 104837 | 6369482.0 | 1269277.0 | 353206.75 | 36817.01 | 384847.91 | 2321.20 | 492086.83 | 96012.0 | 14160.0 | 71937602.0 | 21136897.0 | 2974153.0 | 17426866.0 | 103651.0 | 30296035.0 |
2014 | 90927 | 5819811.0 | 1240528.0 | 356587.94 | 35035.38 | 368634.53 | 1641.06 | 478629.06 | 126984.0 | 14449.0 | 70316961.0 | 21259409.0 | 2988669.0 | 16519549.0 | 65350.0 | 29483984.0 |
2015 | 86768 | 5819079.0 | 1063439.0 | 323453.86 | 36036.71 | 316684.49 | 1949.72 | 385314.94 | 89884.0 | 15187.0 | 62651867.0 | 20172956.0 | 3100233.0 | 14335762.0 | 80985.0 | 24961931.0 |
2016 | 79764 | 5617658.0 | 964239.0 | 283191.71 | 28658.42 | 301437.86 | 1811.80 | 349140.36 | 65861.0 | 13652.0 | 59850360.0 | 19533337.0 | 2606083.0 | 14170617.0 | 81925.0 | 23458398.0 |
2017 | 81724 | 5674621.0 | 1029474.0 | 285058.62 | 28580.50 | 330589.89 | 1801.53 | 383442.51 | 82693.0 | 12530.0 | 65816682.0 | 20516702.0 | 2799759.0 | 16500787.0 | 94170.0 | 25905070.0 |
2018 | 132471 | 7213446.0 | 1352710.0 | 373477.31 | 47408.01 | 438765.75 | 2518.70 | 490540.94 | 116584.0 | 17859.0 | 87540080.0 | 26316981.0 | 4919075.0 | 21488435.0 | 126531.0 | 34689058.0 |
#Plot
trace = go.Scatter(x= list(new_year_df.index),
y= list(new_year_df['arr_flights']),
text = new_year_df['arr_flights'], mode = 'lines+markers')
data = [trace]
layout = dict(title = 'Total No. of Flights By Year - 2009-2018* <br> [2019 is avoided as Data only provided until February]',
xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
yaxis = dict(title= 'Total No. Of Flights', rangemode= 'tozero'),
showlegend=True
)
fig = dict(data = data, layout = layout)
iplot(fig)
#Let's do a visual comparison of delayed flights in 2017 and 2018
stat = ['Delayed Flights', 'Carrier Del.', 'NAS Del.', 'Late Aircraft Del.']
fig = go.Figure(data=[
go.Bar(name='2017', x=stat, y=[new_year_df.loc[2017][2], new_year_df.loc[2017][3], new_year_df.loc[2017][5], new_year_df.loc[2017][7]]),
go.Bar(name='2018', x=stat, y= [new_year_df.loc[2018][2], new_year_df.loc[2018][3], new_year_df.loc[2018][5], new_year_df.loc[2018][7]])
])
fig.layout.update(barmode='group', title = 'Comparison of Delayed Flights and causes - 2017 & 2018',
yaxis = dict(title = 'No. of Flights'))
fig.show()
new_year_df
month | arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||||||||
2009 | 113147 | 5917946.0 | 1109632.0 | 296652.25 | 36781.26 | 407857.51 | 2118.93 | 366223.96 | 76947.0 | 14200.0 | 60393828.0 | 16952697.0 | 2923083.0 | 18457683.0 | 72335.0 | 21988030.0 |
2010 | 115115 | 6450117.0 | 1174884.0 | 339084.44 | 35854.04 | 376616.89 | 3057.76 | 420272.65 | 113255.0 | 15474.0 | 63257498.0 | 19217505.0 | 2785212.0 | 16229118.0 | 107410.0 | 24918253.0 |
2011 | 101538 | 6085281.0 | 1109872.0 | 313296.54 | 30963.04 | 352694.64 | 2318.39 | 410600.62 | 115978.0 | 14399.0 | 61675942.0 | 18553241.0 | 2558373.0 | 15303210.0 | 81353.0 | 25179765.0 |
2012 | 93510 | 6096762.0 | 1015158.0 | 302680.22 | 28799.46 | 300169.19 | 1980.71 | 381529.39 | 78862.0 | 12519.0 | 56916692.0 | 18165456.0 | 2279810.0 | 12828822.0 | 73052.0 | 23569552.0 |
2013 | 104837 | 6369482.0 | 1269277.0 | 353206.75 | 36817.01 | 384847.91 | 2321.20 | 492086.83 | 96012.0 | 14160.0 | 71937602.0 | 21136897.0 | 2974153.0 | 17426866.0 | 103651.0 | 30296035.0 |
2014 | 90927 | 5819811.0 | 1240528.0 | 356587.94 | 35035.38 | 368634.53 | 1641.06 | 478629.06 | 126984.0 | 14449.0 | 70316961.0 | 21259409.0 | 2988669.0 | 16519549.0 | 65350.0 | 29483984.0 |
2015 | 86768 | 5819079.0 | 1063439.0 | 323453.86 | 36036.71 | 316684.49 | 1949.72 | 385314.94 | 89884.0 | 15187.0 | 62651867.0 | 20172956.0 | 3100233.0 | 14335762.0 | 80985.0 | 24961931.0 |
2016 | 79764 | 5617658.0 | 964239.0 | 283191.71 | 28658.42 | 301437.86 | 1811.80 | 349140.36 | 65861.0 | 13652.0 | 59850360.0 | 19533337.0 | 2606083.0 | 14170617.0 | 81925.0 | 23458398.0 |
2017 | 81724 | 5674621.0 | 1029474.0 | 285058.62 | 28580.50 | 330589.89 | 1801.53 | 383442.51 | 82693.0 | 12530.0 | 65816682.0 | 20516702.0 | 2799759.0 | 16500787.0 | 94170.0 | 25905070.0 |
2018 | 132471 | 7213446.0 | 1352710.0 | 373477.31 | 47408.01 | 438765.75 | 2518.70 | 490540.94 | 116584.0 | 17859.0 | 87540080.0 | 26316981.0 | 4919075.0 | 21488435.0 | 126531.0 | 34689058.0 |
#Let's do a visual comparison of average delayed flights from 2009-2017 vs the delayed flights of 2018
stat = ['Delayed Flights', 'Carrier Del.', 'NAS Del.', 'Late Aircraft Del.']
fig = go.Figure(data=[
go.Bar(name='Avg. of 2009-2017', x=stat, y=[np.mean(new_year_df.iloc[:-1, :]).loc['arr_del15'],
np.mean(new_year_df.iloc[:-1, :]).loc['carrier_ct'],
np.mean(new_year_df.iloc[:-1, :]).loc['nas_ct'],
np.mean(new_year_df.iloc[:-1, :]).loc['late_aircraft_ct']]),
go.Bar(name='2018', x=stat, y= [new_year_df.loc[2018][2], new_year_df.loc[2018][3], new_year_df.loc[2018][5], new_year_df.loc[2018][7]])
])
fig.layout.update(barmode='group', title = 'Comparison of Delayed Flights and causes - Avg. of 2009-2017 & 2018<br>(Hover for details)',
yaxis = dict(title = 'No. of Flights'),
xaxis_tickangle = 0, bargap = 0.45)
fig.show()
For this, we will have to group by Month - pretty simple. We will again be calculating the Delay in minutes per flight in that month. As that seems to be a reasonable statistic
month_total_delay = maindf.set_index(' month').iloc[:, list(maindf.set_index(' month').columns).index(' arr_delay')].groupby(level = 0).sum()
month_total_arrivals = maindf.set_index(' month').iloc[:, list(maindf.set_index(' month').columns).index('arr_flights')].groupby(level = 0).sum()
month_df = pd.DataFrame({'Total Delay': month_total_delay, 'Total No. of Flights': month_total_arrivals})
month_df['Delay/Flight'] = month_df['Total Delay']/month_df['Total No. of Flights']
month_df
Total Delay | Total No. of Flights | Delay/Flight | |
---|---|---|---|
month | |||
1 | 55992069.0 | 5003725.0 | 11.190077 |
2 | 55421731.0 | 5109873.0 | 10.846009 |
3 | 54102640.0 | 5295087.0 | 10.217517 |
4 | 52453539.0 | 5110413.0 | 10.264051 |
5 | 57717664.0 | 5255897.0 | 10.981506 |
6 | 73834338.0 | 5331553.0 | 13.848561 |
7 | 75536793.0 | 5513319.0 | 13.700784 |
8 | 64681554.0 | 5454172.0 | 11.859097 |
9 | 40278757.0 | 4959221.0 | 8.121993 |
10 | 43951145.0 | 5178013.0 | 8.488033 |
11 | 39010669.0 | 4921185.0 | 7.927088 |
12 | 62791162.0 | 5048905.0 | 12.436590 |
month_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October',
'November','December']
month_df['Month'] = month_list
month_df
Total Delay | Total No. of Flights | Delay/Flight | Month | |
---|---|---|---|---|
month | ||||
1 | 55992069.0 | 5003725.0 | 11.190077 | January |
2 | 55421731.0 | 5109873.0 | 10.846009 | February |
3 | 54102640.0 | 5295087.0 | 10.217517 | March |
4 | 52453539.0 | 5110413.0 | 10.264051 | April |
5 | 57717664.0 | 5255897.0 | 10.981506 | May |
6 | 73834338.0 | 5331553.0 | 13.848561 | June |
7 | 75536793.0 | 5513319.0 | 13.700784 | July |
8 | 64681554.0 | 5454172.0 | 11.859097 | August |
9 | 40278757.0 | 4959221.0 | 8.121993 | September |
10 | 43951145.0 | 5178013.0 | 8.488033 | October |
11 | 39010669.0 | 4921185.0 | 7.927088 | November |
12 | 62791162.0 | 5048905.0 | 12.436590 | December |
lowest_index = np.argmin(month_df['Delay/Flight']) - 1 #since month_df's index starts from 1 and not 0
colors = ['lightslategray',] * 12
colors[lowest_index] = 'crimson'
data= go.Bar(x = month_df.Month,
y = month_df['Delay/Flight'],
text=round(month_df['Delay/Flight'], 2),
textposition = 'auto',
marker_color =colors)
fig = go.Figure()
fig.add_trace(data)
fig.layout.update(title = 'Average Delay per Flight by month',
xaxis =dict(title='Month'),
yaxis= dict(title='Avg. Delay per Flight in minutes'))
iplot(fig)
Thus September, October and November (with November being the best) are the best months to travel in. And one should avoid travelling in June, July if they're looking for least delays.
maindf.head(10)
year | month | carrier | carrier_name | airport | airport_name | arr_flights | arr_del15 | carrier_ct | weather_ct | ... | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2009 | 11 | AS | Alaska Airlines Inc. | BRW | Barrow, AK: Wiley Post/Will Rogers Memorial | 65.0 | 14.0 | 3.61 | 1.06 | ... | 0.0 | 6.19 | 0.0 | 1.0 | 920.0 | 592.0 | 34.0 | 78.0 | 0.0 | 216.0 |
1 | 2009 | 11 | AS | Alaska Airlines Inc. | BUR | Burbank, CA: Bob Hope | 85.0 | 5.0 | 1.00 | 0.00 | ... | 0.0 | 0.98 | 0.0 | 0.0 | 134.0 | 25.0 | 0.0 | 70.0 | 0.0 | 39.0 |
2 | 2009 | 11 | AS | Alaska Airlines Inc. | CDV | Cordova, AK: Merle K Mudhole Smith | 58.0 | 8.0 | 2.04 | 1.00 | ... | 0.0 | 2.96 | 4.0 | 2.0 | 586.0 | 174.0 | 37.0 | 62.0 | 0.0 | 313.0 |
3 | 2009 | 11 | AS | Alaska Airlines Inc. | DCA | Washington, DC: Ronald Reagan Washington National | 88.0 | 11.0 | 1.99 | 0.00 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 266.0 | 65.0 | 0.0 | 201.0 | 0.0 | 0.0 |
4 | 2009 | 11 | AS | Alaska Airlines Inc. | DEN | Denver, CO: Denver International | 123.0 | 5.0 | 3.47 | 0.00 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 81.0 | 57.0 | 0.0 | 24.0 | 0.0 | 0.0 |
5 | 2009 | 11 | AS | Alaska Airlines Inc. | DFW | Dallas/Fort Worth, TX: Dallas/Fort Worth Inter... | 66.0 | 6.0 | 1.28 | 0.69 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 176.0 | 64.0 | 11.0 | 101.0 | 0.0 | 0.0 |
6 | 2009 | 11 | AS | Alaska Airlines Inc. | EWR | Newark, NJ: Newark Liberty International | 59.0 | 10.0 | 0.24 | 0.00 | ... | 0.0 | 0.56 | 0.0 | 0.0 | 553.0 | 8.0 | 0.0 | 526.0 | 0.0 | 19.0 |
7 | 2009 | 11 | AS | Alaska Airlines Inc. | FAI | Fairbanks, AK: Fairbanks International | 299.0 | 41.0 | 18.35 | 1.82 | ... | 0.0 | 8.70 | 2.0 | 0.0 | 1910.0 | 707.0 | 114.0 | 386.0 | 0.0 | 703.0 |
8 | 2009 | 11 | AS | Alaska Airlines Inc. | GEG | Spokane, WA: Spokane International | 86.0 | 0.0 | 0.00 | 0.00 | ... | 0.0 | 0.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
9 | 2009 | 11 | AS | Alaska Airlines Inc. | HNL | Honolulu, HI: Daniel K Inouye International | 85.0 | 18.0 | 1.34 | 1.46 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 547.0 | 85.0 | 89.0 | 373.0 | 0.0 | 0.0 |
10 rows × 21 columns
maindf.shape
(155710, 21)
choro_df = maindf.set_index('airport').groupby(level=0).sum()
#Here is a data set containing the latitudes and longitudes of Airports
df_tp = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')
df_tp.head()
iata | airport | city | state | country | lat | long | cnt | |
---|---|---|---|---|---|---|---|---|
0 | ORD | Chicago O'Hare International | Chicago | IL | USA | 41.979595 | -87.904464 | 25129 |
1 | ATL | William B Hartsfield-Atlanta Intl | Atlanta | GA | USA | 33.640444 | -84.426944 | 21925 |
2 | DFW | Dallas-Fort Worth International | Dallas-Fort Worth | TX | USA | 32.895951 | -97.037200 | 20662 |
3 | PHX | Phoenix Sky Harbor International | Phoenix | AZ | USA | 33.434167 | -112.008056 | 17290 |
4 | DEN | Denver Intl | Denver | CO | USA | 39.858408 | -104.667002 | 13781 |
choro_df.head()
year | month | arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
airport | |||||||||||||||||
ABE | 998462 | 3189 | 31126.0 | 5976.0 | 2339.05 | 205.87 | 1463.03 | 5.40 | 1962.63 | 735.0 | 102.0 | 371118.0 | 156117.0 | 17890.0 | 63970.0 | 186.0 | 132955.0 |
ABI | 235551 | 751 | 20361.0 | 4189.0 | 1623.86 | 475.66 | 704.14 | 10.46 | 1374.79 | 559.0 | 20.0 | 217038.0 | 71571.0 | 37452.0 | 28333.0 | 366.0 | 79316.0 |
ABQ | 2349915 | 7570 | 270269.0 | 51839.0 | 18862.46 | 1112.53 | 8667.76 | 141.16 | 23055.19 | 2011.0 | 398.0 | 2543213.0 | 893157.0 | 88056.0 | 308377.0 | 4928.0 | 1248695.0 |
ABR | 173294 | 569 | 5190.0 | 532.0 | 189.76 | 24.11 | 151.00 | 0.00 | 167.17 | 81.0 | 12.0 | 38801.0 | 15362.0 | 7415.0 | 5375.0 | 0.0 | 10649.0 |
ABY | 245669 | 780 | 9758.0 | 1876.0 | 751.80 | 62.73 | 325.77 | 2.00 | 733.72 | 120.0 | 26.0 | 116642.0 | 49808.0 | 9362.0 | 14859.0 | 139.0 | 42474.0 |
We need to merge these dataframes.
merged_df = pd.merge(choro_df, df_tp, how='inner', left_index = True, right_on = 'iata') #merge on airport
merged_df.head()
year | month | arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | ... | security_delay | late_aircraft_delay | iata | airport | city | state | country | lat | long | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
203 | 998462 | 3189 | 31126.0 | 5976.0 | 2339.05 | 205.87 | 1463.03 | 5.40 | 1962.63 | 735.0 | ... | 186.0 | 132955.0 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 | 60 |
126 | 235551 | 751 | 20361.0 | 4189.0 | 1623.86 | 475.66 | 704.14 | 10.46 | 1374.79 | 559.0 | ... | 366.0 | 79316.0 | ABI | Abilene Regional | Abilene | TX | USA | 32.411320 | -99.681897 | 382 |
64 | 2349915 | 7570 | 270269.0 | 51839.0 | 18862.46 | 1112.53 | 8667.76 | 141.16 | 23055.19 | 2011.0 | ... | 4928.0 | 1248695.0 | ABQ | Albuquerque International | Albuquerque | NM | USA | 35.040222 | -106.609194 | 1258 |
176 | 243649 | 782 | 25263.0 | 6285.0 | 1333.74 | 199.48 | 913.70 | 0.91 | 3837.15 | 1295.0 | ... | 16.0 | 265217.0 | ACV | Arcata | Arcata/Eureka | CA | USA | 40.978115 | -124.108619 | 144 |
167 | 571721 | 1840 | 32882.0 | 6139.0 | 2367.37 | 292.24 | 1333.80 | 8.98 | 2136.65 | 713.0 | ... | 413.0 | 125965.0 | AEX | Alexandria International | Alexandria | LA | USA | 31.327372 | -92.548556 | 168 |
5 rows × 25 columns
merged_df = merged_df.iloc[:,2:-1] #Year and month, and cnt columns, are irrelevant.
merged_df.head(10)
arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | ... | nas_delay | security_delay | late_aircraft_delay | iata | airport | city | state | country | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
203 | 31126.0 | 5976.0 | 2339.05 | 205.87 | 1463.03 | 5.40 | 1962.63 | 735.0 | 102.0 | 371118.0 | ... | 63970.0 | 186.0 | 132955.0 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 |
126 | 20361.0 | 4189.0 | 1623.86 | 475.66 | 704.14 | 10.46 | 1374.79 | 559.0 | 20.0 | 217038.0 | ... | 28333.0 | 366.0 | 79316.0 | ABI | Abilene Regional | Abilene | TX | USA | 32.411320 | -99.681897 |
64 | 270269.0 | 51839.0 | 18862.46 | 1112.53 | 8667.76 | 141.16 | 23055.19 | 2011.0 | 398.0 | 2543213.0 | ... | 308377.0 | 4928.0 | 1248695.0 | ABQ | Albuquerque International | Albuquerque | NM | USA | 35.040222 | -106.609194 |
176 | 25263.0 | 6285.0 | 1333.74 | 199.48 | 913.70 | 0.91 | 3837.15 | 1295.0 | 261.0 | 414526.0 | ... | 33236.0 | 16.0 | 265217.0 | ACV | Arcata | Arcata/Eureka | CA | USA | 40.978115 | -124.108619 |
167 | 32882.0 | 6139.0 | 2367.37 | 292.24 | 1333.80 | 8.98 | 2136.65 | 713.0 | 106.0 | 348355.0 | ... | 54063.0 | 413.0 | 125965.0 | AEX | Alexandria International | Alexandria | LA | USA | 31.327372 | -92.548556 |
187 | 31889.0 | 6326.0 | 2677.74 | 222.85 | 1136.22 | 3.76 | 2285.29 | 475.0 | 60.0 | 388455.0 | ... | 47079.0 | 87.0 | 146011.0 | AGS | Bush | Augusta | GA | USA | 33.369955 | -81.964496 |
198 | 102056.0 | 20707.0 | 8423.85 | 663.02 | 3907.45 | 41.70 | 7671.02 | 1702.0 | 155.0 | 1144355.0 | ... | 164028.0 | 1576.0 | 469889.0 | ALB | Albany Cty | Albany | NY | USA | 42.748119 | -73.802979 |
98 | 49017.0 | 11308.0 | 3911.40 | 698.17 | 2877.50 | 20.17 | 3800.78 | 2407.0 | 1514.0 | 712832.0 | ... | 129589.0 | 1456.0 | 241399.0 | ASE | Aspen-Pitkin Co/Sardy | Aspen | CO | USA | 39.223160 | -106.868845 |
1 | 3931975.0 | 661494.0 | 151349.57 | 18379.91 | 248613.97 | 603.00 | 242548.03 | 48865.0 | 8331.0 | 42069206.0 | ... | 11303992.0 | 29836.0 | 16113325.0 | ATL | William B Hartsfield-Atlanta Intl | Atlanta | GA | USA | 33.640444 | -84.426944 |
211 | 38029.0 | 7450.0 | 2854.37 | 231.92 | 1855.38 | 4.96 | 2503.63 | 1160.0 | 107.0 | 451002.0 | ... | 75527.0 | 144.0 | 159268.0 | ATW | Outagamie County Regional | Appleton | WI | USA | 44.257408 | -88.519476 |
10 rows × 22 columns
merged_df['% of Flights15'] = (round(merged_df['arr_del15']/merged_df['arr_flights'], 4))*100
merged_df.head(10)
arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | ... | security_delay | late_aircraft_delay | iata | airport | city | state | country | lat | long | % of Flights15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
203 | 31126.0 | 5976.0 | 2339.05 | 205.87 | 1463.03 | 5.40 | 1962.63 | 735.0 | 102.0 | 371118.0 | ... | 186.0 | 132955.0 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 | 19.20 |
126 | 20361.0 | 4189.0 | 1623.86 | 475.66 | 704.14 | 10.46 | 1374.79 | 559.0 | 20.0 | 217038.0 | ... | 366.0 | 79316.0 | ABI | Abilene Regional | Abilene | TX | USA | 32.411320 | -99.681897 | 20.57 |
64 | 270269.0 | 51839.0 | 18862.46 | 1112.53 | 8667.76 | 141.16 | 23055.19 | 2011.0 | 398.0 | 2543213.0 | ... | 4928.0 | 1248695.0 | ABQ | Albuquerque International | Albuquerque | NM | USA | 35.040222 | -106.609194 | 19.18 |
176 | 25263.0 | 6285.0 | 1333.74 | 199.48 | 913.70 | 0.91 | 3837.15 | 1295.0 | 261.0 | 414526.0 | ... | 16.0 | 265217.0 | ACV | Arcata | Arcata/Eureka | CA | USA | 40.978115 | -124.108619 | 24.88 |
167 | 32882.0 | 6139.0 | 2367.37 | 292.24 | 1333.80 | 8.98 | 2136.65 | 713.0 | 106.0 | 348355.0 | ... | 413.0 | 125965.0 | AEX | Alexandria International | Alexandria | LA | USA | 31.327372 | -92.548556 | 18.67 |
187 | 31889.0 | 6326.0 | 2677.74 | 222.85 | 1136.22 | 3.76 | 2285.29 | 475.0 | 60.0 | 388455.0 | ... | 87.0 | 146011.0 | AGS | Bush | Augusta | GA | USA | 33.369955 | -81.964496 | 19.84 |
198 | 102056.0 | 20707.0 | 8423.85 | 663.02 | 3907.45 | 41.70 | 7671.02 | 1702.0 | 155.0 | 1144355.0 | ... | 1576.0 | 469889.0 | ALB | Albany Cty | Albany | NY | USA | 42.748119 | -73.802979 | 20.29 |
98 | 49017.0 | 11308.0 | 3911.40 | 698.17 | 2877.50 | 20.17 | 3800.78 | 2407.0 | 1514.0 | 712832.0 | ... | 1456.0 | 241399.0 | ASE | Aspen-Pitkin Co/Sardy | Aspen | CO | USA | 39.223160 | -106.868845 | 23.07 |
1 | 3931975.0 | 661494.0 | 151349.57 | 18379.91 | 248613.97 | 603.00 | 242548.03 | 48865.0 | 8331.0 | 42069206.0 | ... | 29836.0 | 16113325.0 | ATL | William B Hartsfield-Atlanta Intl | Atlanta | GA | USA | 33.640444 | -84.426944 | 16.82 |
211 | 38029.0 | 7450.0 | 2854.37 | 231.92 | 1855.38 | 4.96 | 2503.63 | 1160.0 | 107.0 | 451002.0 | ... | 144.0 | 159268.0 | ATW | Outagamie County Regional | Appleton | WI | USA | 44.257408 | -88.519476 | 19.59 |
10 rows × 23 columns
Plot:
trace = go.Scattergeo(locationmode = 'USA-states',
lon = merged_df['long'],
lat = merged_df['lat'],
text= merged_df['airport'] +", "+ merged_df['city']+ ", "+ merged_df['state']+", " +merged_df['% of Flights15'].astype(str)+"%",
mode='markers',
marker = dict(size = merged_df['% of Flights15'],
opacity = 0.8,
reversescale=True,
autocolorscale = False,
symbol = 'circle',
line = dict(width=1,
color='rgba(205, 102, 102)'),
colorscale = 'Rainbow',
cmin = 0,
color = merged_df['% of Flights15'],
cmax = merged_df['% of Flights15'].max(),
colorbar_title="% flights<br>Delayed by >15 mins"
))
fig = go.Figure(data = [trace])
fig.layout.update(
title = '% of flights delayed for more than 15 mins <br>(Hover for airport names)',
geo = dict(
scope='usa',
projection_type='albers usa',
showland = True,
landcolor = "rgb(250, 250, 250)",
subunitcolor = "rgb(217, 217, 217)",
countrycolor = "rgb(217, 217, 217)",
countrywidth = 0.5,
subunitwidth = 0.5
),
)
fig.show()
On hovering we can find out the delays 221 airports across the USA. But let's find out, differently, which region of airport has most significant number of delays.
choro_df.columns
Index(['year', ' month', 'arr_flights', 'arr_del15', 'carrier_ct', ' weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', ' arr_delay', ' carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'], dtype='object')
choro_df2 = choro_df[['carrier_ct', ' weather_ct', 'nas_ct',
'security_ct', 'late_aircraft_ct']]
choro_df2 = choro_df2.rename(columns = {'carrier_ct': 'Carrier', ' weather_ct':'Weather', 'nas_ct':'National Air System',
'security_ct':'Security', 'late_aircraft_ct':'Late Aircraft'})
Now, 'choro_df' is the one that has been grouped by the 'Airport' names (JFK, LAX, etc.). Now, I want to create a new column in this dataframe. The values of this column will state what has been the major cause of delay at these airports.
tp2_list = []
for num in range(choro_df2.shape[0]):
for i in choro_df2.iloc[num,:].index:
if choro_df2.iloc[num,:][i]==np.max(choro_df2.iloc[num,:]):
xx = i
tp2_list.append(xx)
len(tp2_list) #There are rows with all their entries = 0
405
choro_df2.shape
(381, 5)
Something's wrong, instead of 381 values, there are 405 values in the list.
On further analysis, we can find that there are rows (airports) that have all their values as 0. Which is why there were extra values as the code kept looping around those 0's of those particular airports. Let's get rid of those airports as they don't add anything to this piece of analysis.
choro_df2['max'] = list(choro_df2.apply(np.max, axis=1))
choro_df2 = choro_df2[choro_df2['max'] != 0]
choro_df2.shape
(375, 6)
tp2_list = []
for num in range(choro_df2.shape[0]):
for i in choro_df2.iloc[num,:5].index:
if choro_df2.iloc[num,:5][i]==np.max(choro_df2.iloc[num,:5]):
xx = i
tp2_list.append(xx)
len(tp2_list)
375
Now these values are of the same lenght! We can add 'tp2_list' as a new column.
choro_df2['Cause'] = tp2_list
choro_df2.head(15) #values of 'Cause' column represent what the major cause of delay for the airline has been over the past 10 years
Carrier | Weather | National Air System | Security | Late Aircraft | max | Cause | |
---|---|---|---|---|---|---|---|
airport | |||||||
ABE | 2339.05 | 205.87 | 1463.03 | 5.40 | 1962.63 | 2339.05 | Carrier |
ABI | 1623.86 | 475.66 | 704.14 | 10.46 | 1374.79 | 1623.86 | Carrier |
ABQ | 18862.46 | 1112.53 | 8667.76 | 141.16 | 23055.19 | 23055.19 | Late Aircraft |
ABR | 189.76 | 24.11 | 151.00 | 0.00 | 167.17 | 189.76 | Carrier |
ABY | 751.80 | 62.73 | 325.77 | 2.00 | 733.72 | 751.80 | Carrier |
ACK | 361.85 | 98.35 | 200.32 | 1.25 | 200.25 | 361.85 | Carrier |
ACT | 896.01 | 140.25 | 408.46 | 2.69 | 844.62 | 896.01 | Carrier |
ACV | 1333.74 | 199.48 | 913.70 | 0.91 | 3837.15 | 3837.15 | Late Aircraft |
ACY | 773.94 | 71.70 | 1764.72 | 11.56 | 982.03 | 1764.72 | National Air System |
ADK | 40.88 | 4.02 | 63.16 | 1.02 | 37.95 | 63.16 | National Air System |
ADQ | 210.23 | 123.61 | 179.06 | 1.00 | 116.07 | 210.23 | Carrier |
AEX | 2367.37 | 292.24 | 1333.80 | 8.98 | 2136.65 | 2367.37 | Carrier |
AGS | 2677.74 | 222.85 | 1136.22 | 3.76 | 2285.29 | 2677.74 | Carrier |
AKN | 33.37 | 0.00 | 16.52 | 0.00 | 25.11 | 33.37 | Carrier |
ALB | 8423.85 | 663.02 | 3907.45 | 41.70 | 7671.02 | 8423.85 | Carrier |
neww = pd.DataFrame({'Col': list(choro_df2['Cause'])})
neww['Col'].value_counts().index
Index(['Late Aircraft', 'Carrier', 'National Air System'], dtype='object')
data= go.Bar(x = neww['Col'].value_counts().index,
y = neww['Col'].value_counts(),
textposition = 'auto')
fig = go.Figure()
fig.add_trace(data)
fig.layout.update(title = 'Major cause of delay at Airports (Past 10 years)',
xaxis =dict(title='Major Cause of Delay'),
yaxis= dict(title='No. of Airports'))
iplot(fig)
labels = list(neww['Col'].value_counts().index)
values = neww['Col'].value_counts()
fig = go.Figure(data=[go.Pie(labels=labels, values=values,
textinfo = 'label+percent')])
fig.layout.update(title = 'Major Cause of Delay in Airports')
iplot(fig)
values2 = list(np.sum(choro_df2)[:5])
labels2 = np.sum(choro_df2)[:5].index
fig = go.Figure(data=[go.Pie(labels=labels2, values=values2,
textinfo = 'label+percent')])
fig.layout.update(title = 'Major Cause of Delay of Past 10 years')
iplot(fig)
choro_df2['Sum of Total Flights Delayed'] = list(choro_df2.iloc[:,:5].apply(np.sum, axis=1))
df_dotplot = choro_df2
df_dotplot.columns
Index(['Carrier', 'Weather', 'National Air System', 'Security', 'Late Aircraft', 'max', 'Cause', 'Sum of Total Flights Delayed'], dtype='object')
airports = list(df_dotplot.index)
carrier_delay = list(round((df_dotplot['Carrier']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
weather_delay = list(round((df_dotplot['Weather']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
nas_delay = list(round((df_dotplot['National Air System']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
security_delay = list(round((df_dotplot['Security']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
lateaircraft_delay = list(round((df_dotplot['Late Aircraft']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
fig = go.Figure()
fig.add_trace(go.Scatter(
x=carrier_delay,
y= airports,
name='Percent of flights delayed due to Carrier',
marker=dict(
color='rgba(0, 0, 255, 0.95)',
line_color='rgba(156, 165, 196, 1.0)',
)
))
fig.add_trace(go.Scatter(
x=weather_delay,
y= airports,
name='Percent of flights delayed due to Weather',
marker=dict(
color='rgba(0, 255, 0, 0.95)',
line_color='rgba(217, 217, 217, 1.0)'
)
))
fig.add_trace(go.Scatter(
x=nas_delay,
y= airports,
name='Percent of flights delayed due to National Air System',
marker=dict(
color='rgba(56, 65, 96, 0.95)',
line_color='rgba(56, 65, 96, 1.0)',
)
))
fig.add_trace(go.Scatter(
x=security_delay,
y= airports,
name='Percent of flights delayed due to Security',
marker=dict(
color='rgba(255, 255, 0, 0.95)',
line_color='rgba(156, 165, 196, 1.0)',
)
))
fig.add_trace(go.Scatter(
x=lateaircraft_delay,
y= airports,
name='Percent of flights delayed due to Late Aircraft',
marker=dict(
color='rgba(255, 144,144, 0.95)',
line_color='rgba(255, 144, 144, 1.0)',
)
))
fig.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=6.5, opacity = 0.6))
fig.layout.update(
title="% of Delay Reasons at Airports",
xaxis=dict(
showgrid=False,
showline=True,
linecolor='rgb(102, 102, 102)',
tickfont_color='rgb(102, 102, 102)',
showticklabels=True,
dtick=10,
ticks='outside',
tickcolor='rgb(102, 102, 102)',
),
yaxis = dict(title = 'Airport'),
margin=dict(l=140, r=40, b=50, t=80),
legend=dict(
font_size=10,
yanchor='middle',
xanchor='right',
),
width=800,
height=600,
paper_bgcolor='white',
plot_bgcolor='white',
hovermode='closest',
)
fig.show()
This is a little too congested. We were trying to fit in data of 375 airports. But, we can get a fair idea of what does cause the most delay. - We can see a lot of pink, blue and grey dots which shows that the main reasons for delay are Late Aircraft, Carrier and National Air Systems.
#Let's find out about the top 25 airports which have the most delay.
top25_df_dotplot = df_dotplot.sort_values(by=['Sum of Total Flights Delayed'], ascending = False).iloc[:25,:]
airports3 = list(top25_df_dotplot.index)
carrier_delay3 = list(round((top25_df_dotplot['Carrier']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
weather_delay3 = list(round((top25_df_dotplot['Weather']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
nas_delay3 = list(round((top25_df_dotplot['National Air System']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
security_delay3 = list(round((top25_df_dotplot['Security']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
lateaircraft_delay3 = list(round((top25_df_dotplot['Late Aircraft']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
fig = go.Figure()
fig.add_trace(go.Scatter(
x=carrier_delay3,
y= airports3,
name='Percent of flights delayed due to Carrier',
marker=dict(
color='rgba(0, 0, 255, 0.95)',
line_color='rgba(156, 165, 196, 1.0)',
)
))
fig.add_trace(go.Scatter(
x=weather_delay3,
y= airports3,
name='Percent of flights delayed due to Weather',
marker=dict(
color='rgba(0, 255, 0, 0.95)',
line_color='rgba(217, 217, 217, 1.0)'
)
))
fig.add_trace(go.Scatter(
x=nas_delay3,
y= airports3,
name='Percent of flights delayed due to National Air System',
marker=dict(
color='rgba(255, 128, 0, 0.95)',
line_color='rgba(255, 128, 0 1.0)',
)
))
fig.add_trace(go.Scatter(
x=security_delay3,
y= airports3,
name='Percent of flights delayed due to Security',
marker=dict(
color='rgba(255, 255, 0, 0.95)',
line_color='rgba(156, 165, 196, 1.0)',
)
))
fig.add_trace(go.Scatter(
x=lateaircraft_delay3,
y= airports3,
name='Percent of flights delayed due to Late Aircraft',
marker=dict(
color='rgba(255, 0,255, 0.95)',
line_color='rgba(255, 0,255, 1.0)',
)
))
fig.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=6.5, opacity = 0.99))
fig.layout.update(
title="% of Delay Reasons at Top 25 Airports",
xaxis=dict(
showgrid=False,
showline=True,
linecolor='rgb(102, 102, 102)',
tickfont_color='rgb(102, 102, 102)',
showticklabels=True,
dtick=10,
ticks='outside',
tickcolor='rgb(102, 102, 102)',
),
yaxis = dict(title = 'Airport'),
margin=dict(l=140, r=40, b=50, t=80),
legend=dict(
font_size=10,
yanchor='middle',
xanchor='right',
),
width=800,
height=600,
paper_bgcolor='white',
plot_bgcolor='white',
hovermode='closest',
)
fig.show()
fig = go.Figure()
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Carrier']), text=carrier_delay3,
textposition = 'auto', name='Carrier Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Weather']), text = weather_delay3,name='Weather Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['National Air System']),
text = nas_delay3, textposition = 'inside', name='National Air System Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Security']),text=security_delay3, name='Security Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Late Aircraft']),
text = lateaircraft_delay3,textposition = 'inside', name='Late Aircraft Delay'))
fig.layout.update(barmode='stack', title = "Casues of Delay at Top 25 Airports<br>[2009-2019]",xaxis=dict(title = 'Airport'),
yaxis = dict(title = 'Total Delay (in minutes)'))
fig.show()
merged_df2 = pd.merge(top25_df_dotplot, merged_df, how='inner', left_index = True, right_on = 'iata')
trace = go.Scattergeo(locationmode = 'USA-states',
lon = merged_df2['long'],
lat = merged_df2['lat'],
text= merged_df2['airport'] +", "+ merged_df2['city']+ ", "+ merged_df2['state']+",<br> " +
round(merged_df2['Sum of Total Flights Delayed']).astype(str)+" flights delayed",
mode='markers',
marker = dict(size = merged_df2['Sum of Total Flights Delayed']/7800,
opacity = 0.8,
reversescale=True,
autocolorscale = False,
symbol = 'circle',
line = dict(width=1,
color='rgba(205, 102, 102)'),
colorscale = 'Rainbow',
cmin = 0,
color = merged_df2['Sum of Total Flights Delayed'],
cmax = merged_df2['Sum of Total Flights Delayed'].max(),
colorbar_title="Total flights<br>Delayed by >15 mins"
))
fig = go.Figure(data = [trace])
fig.layout.update(
title = 'Top 25 Airports with the most delays <br>(Hover for airport names)',
geo = dict(
scope='usa',
projection_type='albers usa',
showland = True,
landcolor = "rgb(250, 250, 250)",
subunitcolor = "rgb(217, 217, 217)",
countrycolor = "rgb(217, 217, 217)",
countrywidth = 0.5,
subunitwidth = 0.5
),
)
fig.show()
fig = go.Figure()
fig.add_trace(go.Bar(x=airports, y=carrier_delay, name='% of Carrier Delay'))
fig.add_trace(go.Bar(x=airports, y=weather_delay, name='% of Weather Delay'))
fig.add_trace(go.Bar(x=airports, y=nas_delay, name='% of National Air System Delay'))
fig.add_trace(go.Bar(x=airports, y=security_delay, name='% of Security Delay'))
fig.add_trace(go.Bar(x=airports, y=lateaircraft_delay, name='% of Late Aircraft Delay'))
fig.layout.update(barmode='stack', xaxis=dict(title = 'Airport'))
fig.show()
The above plot is very bad.
Let's check if the flights are late during any particular season or not.
maindf.head(10)
year | month | carrier | carrier_name | airport | airport_name | arr_flights | arr_del15 | carrier_ct | weather_ct | ... | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2009 | 11 | AS | Alaska Airlines Inc. | BRW | Barrow, AK: Wiley Post/Will Rogers Memorial | 65.0 | 14.0 | 3.61 | 1.06 | ... | 0.0 | 6.19 | 0.0 | 1.0 | 920.0 | 592.0 | 34.0 | 78.0 | 0.0 | 216.0 |
1 | 2009 | 11 | AS | Alaska Airlines Inc. | BUR | Burbank, CA: Bob Hope | 85.0 | 5.0 | 1.00 | 0.00 | ... | 0.0 | 0.98 | 0.0 | 0.0 | 134.0 | 25.0 | 0.0 | 70.0 | 0.0 | 39.0 |
2 | 2009 | 11 | AS | Alaska Airlines Inc. | CDV | Cordova, AK: Merle K Mudhole Smith | 58.0 | 8.0 | 2.04 | 1.00 | ... | 0.0 | 2.96 | 4.0 | 2.0 | 586.0 | 174.0 | 37.0 | 62.0 | 0.0 | 313.0 |
3 | 2009 | 11 | AS | Alaska Airlines Inc. | DCA | Washington, DC: Ronald Reagan Washington National | 88.0 | 11.0 | 1.99 | 0.00 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 266.0 | 65.0 | 0.0 | 201.0 | 0.0 | 0.0 |
4 | 2009 | 11 | AS | Alaska Airlines Inc. | DEN | Denver, CO: Denver International | 123.0 | 5.0 | 3.47 | 0.00 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 81.0 | 57.0 | 0.0 | 24.0 | 0.0 | 0.0 |
5 | 2009 | 11 | AS | Alaska Airlines Inc. | DFW | Dallas/Fort Worth, TX: Dallas/Fort Worth Inter... | 66.0 | 6.0 | 1.28 | 0.69 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 176.0 | 64.0 | 11.0 | 101.0 | 0.0 | 0.0 |
6 | 2009 | 11 | AS | Alaska Airlines Inc. | EWR | Newark, NJ: Newark Liberty International | 59.0 | 10.0 | 0.24 | 0.00 | ... | 0.0 | 0.56 | 0.0 | 0.0 | 553.0 | 8.0 | 0.0 | 526.0 | 0.0 | 19.0 |
7 | 2009 | 11 | AS | Alaska Airlines Inc. | FAI | Fairbanks, AK: Fairbanks International | 299.0 | 41.0 | 18.35 | 1.82 | ... | 0.0 | 8.70 | 2.0 | 0.0 | 1910.0 | 707.0 | 114.0 | 386.0 | 0.0 | 703.0 |
8 | 2009 | 11 | AS | Alaska Airlines Inc. | GEG | Spokane, WA: Spokane International | 86.0 | 0.0 | 0.00 | 0.00 | ... | 0.0 | 0.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
9 | 2009 | 11 | AS | Alaska Airlines Inc. | HNL | Honolulu, HI: Daniel K Inouye International | 85.0 | 18.0 | 1.34 | 1.46 | ... | 0.0 | 0.00 | 0.0 | 0.0 | 547.0 | 85.0 | 89.0 | 373.0 | 0.0 | 0.0 |
10 rows × 21 columns
season = maindf.set_index(' month').groupby(level=0).sum()
#Let's classify each month according to what season they belong.
season_list = ['Winter', 'Winter', 'Spring', 'Spring', 'Spring', 'Summer',
'Summer','Summer', 'Fall', 'Fall', 'Fall', 'Winter']
season['season'] = season_list
season['Toal No. of Delays'] = list(np.sum(season.iloc[:,3:8], axis=1))
season
year | arr_flights | arr_del15 | carrier_ct | weather_ct | nas_ct | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | season | Toal No. of Delays | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
month | ||||||||||||||||||
1 | 26286960 | 5003725.0 | 940632.0 | 269868.34 | 34788.07 | 300533.86 | 2078.15 | 333364.48 | 143912.0 | 11863.0 | 55992069.0 | 17648065.0 | 3237030.0 | 13292328.0 | 87846.0 | 21726800.0 | Winter | 940632.90 |
2 | 28951706 | 5109873.0 | 947700.0 | 262137.99 | 32457.62 | 317828.98 | 1678.34 | 333598.64 | 152163.0 | 11706.0 | 55421731.0 | 16800522.0 | 3030289.0 | 14163755.0 | 73100.0 | 21354065.0 | Winter | 947701.57 |
3 | 25578033 | 5295087.0 | 965403.0 | 274431.42 | 24002.72 | 309844.17 | 1868.82 | 355257.15 | 92317.0 | 10851.0 | 54102640.0 | 16750864.0 | 2063992.0 | 13502611.0 | 68530.0 | 21716643.0 | Spring | 965404.28 |
4 | 25777447 | 5110413.0 | 899946.0 | 249682.64 | 24973.76 | 295246.16 | 1534.69 | 328509.08 | 64034.0 | 11247.0 | 52453539.0 | 15710201.0 | 2277923.0 | 13560137.0 | 56562.0 | 20848716.0 | Spring | 899946.33 |
5 | 25515644 | 5255897.0 | 978608.0 | 269082.06 | 31622.09 | 319143.13 | 1718.72 | 357042.67 | 61889.0 | 14261.0 | 57717664.0 | 16678287.0 | 2811017.0 | 15234498.0 | 66324.0 | 22927538.0 | Spring | 978608.67 |
6 | 26280980 | 5331553.0 | 1182531.0 | 334109.52 | 42109.14 | 348799.63 | 1987.73 | 455525.05 | 81797.0 | 17349.0 | 73834338.0 | 21607992.0 | 3726750.0 | 17585519.0 | 84823.0 | 30829254.0 | Summer | 1182531.07 |
7 | 25783383 | 5513319.0 | 1209872.0 | 345103.19 | 43003.61 | 350846.20 | 2007.58 | 468912.61 | 80894.0 | 17180.0 | 75536793.0 | 22286591.0 | 3632895.0 | 17872721.0 | 77322.0 | 31667264.0 | Summer | 1209873.19 |
8 | 26087607 | 5454172.0 | 1070528.0 | 308297.32 | 35628.99 | 321577.87 | 2208.85 | 402814.44 | 78847.0 | 14543.0 | 64681554.0 | 19779804.0 | 2917264.0 | 15606459.0 | 98777.0 | 26279056.0 | Summer | 1070527.47 |
9 | 25821607 | 4959221.0 | 726022.0 | 210617.89 | 20702.56 | 244856.56 | 1308.84 | 248537.82 | 53883.0 | 9629.0 | 40278757.0 | 12951173.0 | 1617002.0 | 10564292.0 | 62629.0 | 15083661.0 | Fall | 726023.67 |
10 | 25576272 | 5178013.0 | 819560.0 | 233194.71 | 15604.41 | 279873.59 | 1356.47 | 289531.98 | 52331.0 | 8553.0 | 43951145.0 | 13923859.0 | 1307822.0 | 11712200.0 | 54128.0 | 16953136.0 | Fall | 819561.16 |
11 | 25775466 | 4921185.0 | 722517.0 | 213385.97 | 15510.71 | 237676.36 | 1494.14 | 254450.45 | 37784.0 | 8020.0 | 39010669.0 | 12834692.0 | 1352039.0 | 9659274.0 | 83315.0 | 15081349.0 | Fall | 722517.63 |
12 | 26095567 | 5048905.0 | 1090593.0 | 315854.20 | 33005.22 | 329933.65 | 2683.49 | 409118.31 | 95190.0 | 12129.0 | 62791162.0 | 19420704.0 | 2989235.0 | 14477237.0 | 99861.0 | 25804125.0 | Winter | 1090594.87 |
plotdf = season.set_index('season').iloc[:, 2:8].groupby(level=0).sum()
fig = go.Figure()
fig.add_trace(go.Bar(
y= list(plotdf.index),
x= list(plotdf['carrier_ct']),
name='Carrier Delay',
orientation='h',
marker=dict(
color='rgba(246, 78, 139, 0.6)',
line=dict(color='rgba(246, 78, 139, 1.0)', width=3)
)))
fig.add_trace(go.Bar(
y= list(plotdf.index),
x= list(plotdf[' weather_ct']),
name='Weather Delay',
orientation='h',
marker=dict(
color='rgba(178, 102, 255, 0.6)',
line=dict(color='rgba(178, 102, 255, 1.0)', width=3)
)))
fig.add_trace(go.Bar(
y= list(plotdf.index),
x= list(plotdf['nas_ct']),
name='National Air System Delay',
orientation='h',
marker=dict(
color='rgba(255, 178, 102, 0.6)',
line=dict(color='rgba(255, 178, 102, 1.0)', width=3)
)
))
fig.add_trace(go.Bar(
y= list(plotdf.index),
x= list(plotdf['security_ct']),
name='Security Delay',
orientation='h',
marker=dict(
color='rgba(102, 204, 0, 0.6)',
line=dict(color='rgba(102, 204, 0, 1.0)', width=3)
)
))
fig.add_trace(go.Bar(
y= list(plotdf.index),
x= list(plotdf['late_aircraft_ct']),
name='Late Aircraft Delay',
orientation='h',
marker=dict(
color='rgba(51,153,255, 0.6)',
line=dict(color='rgba(51, 153, 255, 1.0)', width=3)
)
))
fig.layout.update(barmode='stack',
title = 'Amount of Delay by Season<br> 2009-2019',
xaxis = dict(title = 'Total No. of Delays'))
iplot(fig)