In [96]:
import pandas as pd 
import numpy as np 
import plotly.plotly as py  
import plotly.graph_objs as go  
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)  
Importing the dataset from the 'Data' folder from Coursera's cloud that I have access to.
In [97]:
maindf = pd.read_csv('data/airline_delay_causes.csv')
maindf = maindf.iloc[:, :21]
maindf.head()
Out[97]:
year month carrier carrier_name airport airport_name arr_flights arr_del15 carrier_ct weather_ct ... security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
0 2009 11 AS Alaska Airlines Inc. BRW Barrow, AK: Wiley Post/Will Rogers Memorial 65.0 14.0 3.61 1.06 ... 0.0 6.19 0.0 1.0 920.0 592.0 34.0 78.0 0.0 216.0
1 2009 11 AS Alaska Airlines Inc. BUR Burbank, CA: Bob Hope 85.0 5.0 1.00 0.00 ... 0.0 0.98 0.0 0.0 134.0 25.0 0.0 70.0 0.0 39.0
2 2009 11 AS Alaska Airlines Inc. CDV Cordova, AK: Merle K Mudhole Smith 58.0 8.0 2.04 1.00 ... 0.0 2.96 4.0 2.0 586.0 174.0 37.0 62.0 0.0 313.0
3 2009 11 AS Alaska Airlines Inc. DCA Washington, DC: Ronald Reagan Washington National 88.0 11.0 1.99 0.00 ... 0.0 0.00 0.0 0.0 266.0 65.0 0.0 201.0 0.0 0.0
4 2009 11 AS Alaska Airlines Inc. DEN Denver, CO: Denver International 123.0 5.0 3.47 0.00 ... 0.0 0.00 0.0 0.0 81.0 57.0 0.0 24.0 0.0 0.0

5 rows × 21 columns

In [98]:
# Let's find the names of all individual airlines, so that we get an idea of how many unique airlines are there in the data
set(list(maindf['carrier_name']))
Out[98]:
{'AirTran Airways Corporation',
 'Alaska Airlines Inc.',
 'Allegiant Air',
 'American Airlines Inc.',
 'American Eagle Airlines Inc.',
 'Atlantic Southeast Airlines',
 'Comair Inc.',
 'Continental Air Lines Inc.',
 'Delta Air Lines Inc.',
 'Endeavor Air Inc.',
 'Envoy Air',
 'ExpressJet Airlines Inc.',
 'Frontier Airlines Inc.',
 'Hawaiian Airlines Inc.',
 'JetBlue Airways',
 'Mesa Airlines Inc.',
 'Northwest Airlines Inc.',
 'PSA Airlines Inc.',
 'Pinnacle Airlines Inc.',
 'Republic Airline',
 'SkyWest Airlines Inc.',
 'Southwest Airlines Co.',
 'Spirit Air Lines',
 'US Airways Inc.',
 'United Air Lines Inc.',
 'Virgin America'}

Now, I want to group the data set using 'groupby' on the Airlines. But the column names are not consistent as some have a space before the name starts. So let's print out the column names which we can then use to copy and paste in our code for smooth functioning.

In [99]:
#Find out the column names to write them properly in code:
maindf.set_index('carrier_name').columns #since we're going to group by 'Carrier Name'
Out[99]:
Index(['year', ' month', 'carrier', 'airport', 'airport_name', 'arr_flights',
       'arr_del15', 'carrier_ct', ' weather_ct', 'nas_ct', 'security_ct',
       'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', ' arr_delay',
       ' carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay'],
      dtype='object')
Now, I'm interested in the ' arr_delay' column which is the Sum of Delay Minutes. We will be using this column and the airline names ('carrier_names') in our initial plots.
In [100]:
#Find the index of relevant column to use in the following GroupBy statement
list(maindf.set_index('carrier_name').columns).index(' arr_delay')
Out[100]:
14
Trial, Trial2, and so on are just names to new series, and objects. Not to be confused for anything.
In [101]:
trial = maindf.set_index('carrier_name').iloc[:, list(maindf.set_index('carrier_name').columns).index(' arr_delay')].groupby(level = 0).sum()
trial2 = trial.sort_values(ascending = False)
trial2
Out[101]:
carrier_name
Southwest Airlines Co.          115771486.0
American Airlines Inc.           78815786.0
SkyWest Airlines Inc.            72789590.0
Delta Air Lines Inc.             71621393.0
ExpressJet Airlines Inc.         68781482.0
United Air Lines Inc.            57915309.0
JetBlue Airways                  38036618.0
American Eagle Airlines Inc.     26718814.0
US Airways Inc.                  20966493.0
Frontier Airlines Inc.           12947465.0
Atlantic Southeast Airlines      12936567.0
Envoy Air                        12111571.0
Mesa Airlines Inc.               11538648.0
Alaska Airlines Inc.             11065065.0
AirTran Airways Corporation      11019631.0
Spirit Air Lines                  9062640.0
Continental Air Lines Inc.        7413086.0
Pinnacle Airlines Inc.            6645291.0
Endeavor Air Inc.                 4972029.0
Virgin America                    4895524.0
Republic Airline                  4333426.0
PSA Airlines Inc.                 4180379.0
Comair Inc.                       3988422.0
Hawaiian Airlines Inc.            2809143.0
Northwest Airlines Inc.           2643000.0
Allegiant Air                     1793203.0
Name:  arr_delay, dtype: float64

First Few Plots

In [102]:
#Let's create a Bar graph! 
# First, create a trace: 
x = list(trial2.index)
y = list(trial2)
iplot([go.Bar(x=x, y=y)], filename = 'Delay Bar Chart (minutes)')

But this is not an accurate representation. Let's calculate the Average Lateness per each flight of airlines. But before that we need to find out the total number of flights per airline.

In [103]:
#Find the index of relevant column to use in the following GroupBy statement
list(maindf.set_index('carrier_name').columns).index('arr_flights')
Out[103]:
5
In [104]:
Total_flights_per_airline = maindf.set_index('carrier_name').iloc[:, list(maindf.set_index('carrier_name').columns).index('arr_flights')].groupby(level = 0).sum()
Total_flights_per_airline
Out[104]:
carrier_name
AirTran Airways Corporation      1201121.0
Alaska Airlines Inc.             1689757.0
Allegiant Air                     110255.0
American Airlines Inc.           6785529.0
American Eagle Airlines Inc.     2300220.0
Atlantic Southeast Airlines       923459.0
Comair Inc.                       286694.0
Continental Air Lines Inc.        721450.0
Delta Air Lines Inc.             7951812.0
Endeavor Air Inc.                 404370.0
Envoy Air                         932356.0
ExpressJet Airlines Inc.         4891450.0
Frontier Airlines Inc.            915000.0
Hawaiian Airlines Inc.            751950.0
JetBlue Airways                  2516031.0
Mesa Airlines Inc.               1080705.0
Northwest Airlines Inc.           270188.0
PSA Airlines Inc.                 323091.0
Pinnacle Airlines Inc.            678801.0
Republic Airline                  365631.0
SkyWest Airlines Inc.            6346731.0
Southwest Airlines Co.          12216352.0
Spirit Air Lines                  617450.0
US Airways Inc.                  2624022.0
United Air Lines Inc.            4887878.0
Virgin America                    389060.0
Name: arr_flights, dtype: float64
In [105]:
new_df = pd.DataFrame({'Total Delay': trial, 'Total No. of Flights': Total_flights_per_airline})
new_df['Delay/Flight'] = new_df['Total Delay']/new_df['Total No. of Flights']
trial3 = new_df['Delay/Flight'].sort_values(ascending = False)
In [106]:
# Now let's create a bar plot to compare the delay/flight of specific airlines
# Creating traces

x1 = list(trial3.index)
y1 = list(trial3)
iplot([go.Bar(x=x1, y=y1)], filename = 'Delay/Flight of Airline from Nov 2009 - Feb 2019')
In [107]:
# Add some color to the above graph 
color = np.array(['rgb(255,255,255)']*trial3.shape[0])
color[trial3>=15] = 'rgb(130,0,0)'
color[trial3<15]= 'rgb(204,204,205)'
data=[dict(type='bar',
           x=trial3.index,
           y=trial3,
           marker=dict(color=color.tolist()),
           text = round(trial3,1), 
           textposition = 'auto',
           name = 'Delay/Flight of Airline from Nov 2009 - Feb 2019'       
      )]
layout = go.Layout(bargap = 0.3, title = 'Delay/Flight of Airline from Nov 2009 - Feb 2019 <br> (Airlines with over 15 mins in red)', 
                  yaxis = dict(title = 'No. of minutes'))
fig = go.Figure(data = data, layout = layout)
iplot(fig)
From the above visualization it is safe to say that over the past 10 years (approx):

Each Hawaiin Airline flight has been delayed by around 3.7 minutes!

And Allegiant Air flights are late by around 16.2 minutes.

Line/Scatter Plot

Now let's plot the performance of airlines across the years. Note that for 2009, we only have data from Month 11 onward. i.e. Month 11 & 12.

Thus for this, we will create and group by a 'Multi-index' of Airline name ('carrier_name') & Year.

In [108]:
# Now for this too, let's find out the index of the ' arr_delay' column since this group by is going to be a multi-index
new_index = list(maindf.set_index(['carrier_name', 'year']).columns).index(' arr_delay')
new_index
Out[108]:
13
In [109]:
# Now, let's plot the average lateness per flight per each year. 
# We are provided with only 2 months of the year 2009 so we will ignore that later on. 
trial_series = maindf.set_index(['carrier_name', 'year']).iloc[:, new_index].groupby(level =[0,1]).sum()
In [110]:
#Let's see what the series looks like. 
trial_series.head(20)
Out[110]:
carrier_name                 year
AirTran Airways Corporation  2009    2949355.0
                             2010    2250331.0
                             2011    1990102.0
                             2012    1359781.0
                             2013    1791450.0
                             2014     678612.0
Alaska Airlines Inc.         2009     859073.0
                             2010     741330.0
                             2011     766706.0
                             2012     872417.0
                             2013     848473.0
                             2014     989491.0
                             2015    1074977.0
                             2016     996624.0
                             2017    1454555.0
                             2018    1942070.0
                             2019     519349.0
Allegiant Air                2018    1565764.0
                             2019     227439.0
American Airlines Inc.       2009    6227014.0
Name:  arr_delay, dtype: float64
I know, we can groupby the dataframe and create a new Data Frame, and then select the appropriate columns but I'm just doing it this way because it's slightly more comprehensible, comapred to the clutter the groupby statement creates (which you will notice later on in the project)
In [111]:
# to find index of 'arr_flights' column in Multi_index: 
list(maindf.set_index(['carrier_name', 'year']).columns).index('arr_flights')
Out[111]:
4
In [112]:
Total_flights_per_airline_per_year = maindf.set_index(['carrier_name', 'year']).iloc[:, list(maindf.set_index(['carrier_name', 'year']).columns).index('arr_flights')].groupby(level = [0,1]).sum()
Total_flights_per_airline_per_year.head(20)
Out[112]:
carrier_name                 year
AirTran Airways Corporation  2009    231965.0
                             2010    248844.0
                             2011    248703.0
                             2012    218162.0
                             2013    173952.0
                             2014     79495.0
Alaska Airlines Inc.         2009    126110.0
                             2010    136950.0
                             2011    143726.0
                             2012    147569.0
                             2013    154743.0
                             2014    160257.0
                             2015    172521.0
                             2016    177280.0
                             2017    185068.0
                             2018    245761.0
                             2019     39772.0
Allegiant Air                2018     96221.0
                             2019     14034.0
American Airlines Inc.       2009    503858.0
Name: arr_flights, dtype: float64
In [113]:
new_df = pd.DataFrame({'Total Delay': trial_series, 'Total No. of Flights': Total_flights_per_airline_per_year})
new_df['Delay/Flight'] = new_df['Total Delay']/new_df['Total No. of Flights']
new_df.head(20)
Out[113]:
Total Delay Total No. of Flights Delay/Flight
carrier_name year
AirTran Airways Corporation 2009 2949355.0 231965.0 12.714655
2010 2250331.0 248844.0 9.043139
2011 1990102.0 248703.0 8.001922
2012 1359781.0 218162.0 6.232896
2013 1791450.0 173952.0 10.298531
2014 678612.0 79495.0 8.536537
Alaska Airlines Inc. 2009 859073.0 126110.0 6.812093
2010 741330.0 136950.0 5.413143
2011 766706.0 143726.0 5.334498
2012 872417.0 147569.0 5.911926
2013 848473.0 154743.0 5.483111
2014 989491.0 160257.0 6.174401
2015 1074977.0 172521.0 6.230992
2016 996624.0 177280.0 5.621751
2017 1454555.0 185068.0 7.859571
2018 1942070.0 245761.0 7.902271
2019 519349.0 39772.0 13.058156
Allegiant Air 2018 1565764.0 96221.0 16.272581
2019 227439.0 14034.0 16.206285
American Airlines Inc. 2009 6227014.0 503858.0 12.358669
Let's try out an initial scatter plot for AirTran Airways Corporation as an example:
In [114]:
#AirTran Airways has data only till 2014. 
trace1 = go.Scatter(x=list(new_df.loc[['AirTran Airways Corporation']].index.get_level_values('year')),
                    y= list(new_df.loc[['AirTran Airways Corporation']].ix[:,2]),
                    mode='lines+markers')

iplot([trace1],filename='line chart')

Let's try out a Grouped Scatter Plot

We will use data of all airlines across all years and see what plot we get. Could get a little messy, so let's split the graphs. For that we will need to create a list of all the airlines and split it.

In [115]:
#list of airlines
list_of_airlines = list(set(list(new_df.index.get_level_values('carrier_name'))))
split1 = list_of_airlines[:int(len(list_of_airlines)/2)]
split2 = list_of_airlines[int(len(list_of_airlines)/2):]
In [116]:
new_list = [] 
for airline in split1:
    trace = go.Scatter(x=list(new_df.loc[[airline]].index.get_level_values('year')),
                       y= list(new_df.loc[[airline]].ix[:,2]),
                       text = airline, name = airline,
                       mode='lines+markers')
    new_list.append(trace)

    
data = new_list
layout = dict(title = 'Delay of Each Airline per Flight by year <br> First Group of Airlines',
              xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
              yaxis = dict(title= 'Avg. Delay per fligh in minutes', rangemode= 'tozero'),
              showlegend=True
             )

fig = dict(data = data, layout = layout)

iplot(fig)
    
In [117]:
new_list = [] 
for airline in split2:
    trace = go.Scatter(x=list(new_df.loc[[airline]].index.get_level_values('year')),
                       y= list(new_df.loc[[airline]].ix[:,2]),
                       text = airline, name = airline,
                       mode='lines+markers')
    new_list.append(trace)

    
data = new_list
layout = dict(title = 'Delay of Each Airline per Flight by year <br> Second Group of Airlines',
              xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
              yaxis = dict(title= 'Avg. Delay per fligh in minutes', rangemode= 'tozero'),
              showlegend=True
             )

fig = dict(data = data, layout = layout)

iplot(fig)

Nearly all Airlines have a spike from 2017 to 2018, which means that all of them are getting delayed. Let's find out why (from the data provided)

In [118]:
#Grouping the data by year:
year_df = maindf.set_index('year').groupby(level=0).sum()
year_df.head()
Out[118]:
month arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
year
2009 113147 5917946.0 1109632.0 296652.25 36781.26 407857.51 2118.93 366223.96 76947.0 14200.0 60393828.0 16952697.0 2923083.0 18457683.0 72335.0 21988030.0
2010 115115 6450117.0 1174884.0 339084.44 35854.04 376616.89 3057.76 420272.65 113255.0 15474.0 63257498.0 19217505.0 2785212.0 16229118.0 107410.0 24918253.0
2011 101538 6085281.0 1109872.0 313296.54 30963.04 352694.64 2318.39 410600.62 115978.0 14399.0 61675942.0 18553241.0 2558373.0 15303210.0 81353.0 25179765.0
2012 93510 6096762.0 1015158.0 302680.22 28799.46 300169.19 1980.71 381529.39 78862.0 12519.0 56916692.0 18165456.0 2279810.0 12828822.0 73052.0 23569552.0
2013 104837 6369482.0 1269277.0 353206.75 36817.01 384847.91 2321.20 492086.83 96012.0 14160.0 71937602.0 21136897.0 2974153.0 17426866.0 103651.0 30296035.0
In [119]:
#Plotting the number of flights
trace = go.Scatter(x= list(year_df.index), 
                   y= list(year_df['arr_flights']), 
                   text = year_df['arr_flights'], mode = 'lines+markers')
    
data = [trace]
layout = dict(title = 'Total No. of Flights By Year - 2009-2019',
              xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
              yaxis = dict(title= 'Avg. Delay per fligh in minutes', rangemode= 'tozero'),
              showlegend=True
             )

fig = dict(data = data, layout = layout)

iplot(fig)

Well, in 2019, we're only provided the data till February, which is why we there is such a drop in the numbers. Let's get rid of 2019

In [120]:
new_year_df = year_df.iloc[:-1, :]
In [121]:
new_year_df
Out[121]:
month arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
year
2009 113147 5917946.0 1109632.0 296652.25 36781.26 407857.51 2118.93 366223.96 76947.0 14200.0 60393828.0 16952697.0 2923083.0 18457683.0 72335.0 21988030.0
2010 115115 6450117.0 1174884.0 339084.44 35854.04 376616.89 3057.76 420272.65 113255.0 15474.0 63257498.0 19217505.0 2785212.0 16229118.0 107410.0 24918253.0
2011 101538 6085281.0 1109872.0 313296.54 30963.04 352694.64 2318.39 410600.62 115978.0 14399.0 61675942.0 18553241.0 2558373.0 15303210.0 81353.0 25179765.0
2012 93510 6096762.0 1015158.0 302680.22 28799.46 300169.19 1980.71 381529.39 78862.0 12519.0 56916692.0 18165456.0 2279810.0 12828822.0 73052.0 23569552.0
2013 104837 6369482.0 1269277.0 353206.75 36817.01 384847.91 2321.20 492086.83 96012.0 14160.0 71937602.0 21136897.0 2974153.0 17426866.0 103651.0 30296035.0
2014 90927 5819811.0 1240528.0 356587.94 35035.38 368634.53 1641.06 478629.06 126984.0 14449.0 70316961.0 21259409.0 2988669.0 16519549.0 65350.0 29483984.0
2015 86768 5819079.0 1063439.0 323453.86 36036.71 316684.49 1949.72 385314.94 89884.0 15187.0 62651867.0 20172956.0 3100233.0 14335762.0 80985.0 24961931.0
2016 79764 5617658.0 964239.0 283191.71 28658.42 301437.86 1811.80 349140.36 65861.0 13652.0 59850360.0 19533337.0 2606083.0 14170617.0 81925.0 23458398.0
2017 81724 5674621.0 1029474.0 285058.62 28580.50 330589.89 1801.53 383442.51 82693.0 12530.0 65816682.0 20516702.0 2799759.0 16500787.0 94170.0 25905070.0
2018 132471 7213446.0 1352710.0 373477.31 47408.01 438765.75 2518.70 490540.94 116584.0 17859.0 87540080.0 26316981.0 4919075.0 21488435.0 126531.0 34689058.0
In [122]:
#Plot
trace = go.Scatter(x= list(new_year_df.index), 
                   y= list(new_year_df['arr_flights']), 
                   text = new_year_df['arr_flights'], mode = 'lines+markers')
    
data = [trace]
layout = dict(title = 'Total No. of Flights By Year - 2009-2018* <br> [2019 is avoided as Data only provided until February]',
              xaxis= dict(title= 'Year',ticklen= 5,showgrid=True),
              yaxis = dict(title= 'Total No. Of Flights', rangemode= 'tozero'),
              showlegend=True
             )

fig = dict(data = data, layout = layout)

iplot(fig)
we can see that the total number of flights in 2018 is so much higher than that of 2017, the latter appearing pretty close to the average of all the previous years combined. [at least, visually]
In [200]:
#Let's do a visual comparison of delayed flights in 2017 and 2018
stat = ['Delayed Flights', 'Carrier Del.', 'NAS Del.', 'Late Aircraft Del.']
fig = go.Figure(data=[
    go.Bar(name='2017', x=stat, y=[new_year_df.loc[2017][2], new_year_df.loc[2017][3], new_year_df.loc[2017][5], new_year_df.loc[2017][7]]),
    go.Bar(name='2018', x=stat, y= [new_year_df.loc[2018][2], new_year_df.loc[2018][3], new_year_df.loc[2018][5], new_year_df.loc[2018][7]])
])

fig.layout.update(barmode='group', title = 'Comparison of Delayed Flights and causes - 2017 & 2018',
                  yaxis = dict(title = 'No. of Flights'))
fig.show()
Let's take the average of the years of 2009-2017 and compare that with 2018's numbers
In [124]:
new_year_df
Out[124]:
month arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
year
2009 113147 5917946.0 1109632.0 296652.25 36781.26 407857.51 2118.93 366223.96 76947.0 14200.0 60393828.0 16952697.0 2923083.0 18457683.0 72335.0 21988030.0
2010 115115 6450117.0 1174884.0 339084.44 35854.04 376616.89 3057.76 420272.65 113255.0 15474.0 63257498.0 19217505.0 2785212.0 16229118.0 107410.0 24918253.0
2011 101538 6085281.0 1109872.0 313296.54 30963.04 352694.64 2318.39 410600.62 115978.0 14399.0 61675942.0 18553241.0 2558373.0 15303210.0 81353.0 25179765.0
2012 93510 6096762.0 1015158.0 302680.22 28799.46 300169.19 1980.71 381529.39 78862.0 12519.0 56916692.0 18165456.0 2279810.0 12828822.0 73052.0 23569552.0
2013 104837 6369482.0 1269277.0 353206.75 36817.01 384847.91 2321.20 492086.83 96012.0 14160.0 71937602.0 21136897.0 2974153.0 17426866.0 103651.0 30296035.0
2014 90927 5819811.0 1240528.0 356587.94 35035.38 368634.53 1641.06 478629.06 126984.0 14449.0 70316961.0 21259409.0 2988669.0 16519549.0 65350.0 29483984.0
2015 86768 5819079.0 1063439.0 323453.86 36036.71 316684.49 1949.72 385314.94 89884.0 15187.0 62651867.0 20172956.0 3100233.0 14335762.0 80985.0 24961931.0
2016 79764 5617658.0 964239.0 283191.71 28658.42 301437.86 1811.80 349140.36 65861.0 13652.0 59850360.0 19533337.0 2606083.0 14170617.0 81925.0 23458398.0
2017 81724 5674621.0 1029474.0 285058.62 28580.50 330589.89 1801.53 383442.51 82693.0 12530.0 65816682.0 20516702.0 2799759.0 16500787.0 94170.0 25905070.0
2018 132471 7213446.0 1352710.0 373477.31 47408.01 438765.75 2518.70 490540.94 116584.0 17859.0 87540080.0 26316981.0 4919075.0 21488435.0 126531.0 34689058.0
In [199]:
#Let's do a visual comparison of average delayed flights from 2009-2017 vs the delayed flights of 2018
stat = ['Delayed Flights', 'Carrier Del.', 'NAS Del.', 'Late Aircraft Del.']
fig = go.Figure(data=[
    go.Bar(name='Avg. of 2009-2017', x=stat, y=[np.mean(new_year_df.iloc[:-1, :]).loc['arr_del15'],
                                                np.mean(new_year_df.iloc[:-1, :]).loc['carrier_ct'], 
                                                np.mean(new_year_df.iloc[:-1, :]).loc['nas_ct'],
                                                np.mean(new_year_df.iloc[:-1, :]).loc['late_aircraft_ct']]),    
    go.Bar(name='2018', x=stat, y= [new_year_df.loc[2018][2], new_year_df.loc[2018][3], new_year_df.loc[2018][5], new_year_df.loc[2018][7]])
])

fig.layout.update(barmode='group', title = 'Comparison of Delayed Flights and causes - Avg. of 2009-2017 & 2018<br>(Hover for details)',
                  yaxis = dict(title = 'No. of Flights'), 
                 xaxis_tickangle = 0, bargap = 0.45)

fig.show()

Which months should we fly to avoid significant delays?

For this, we will have to group by Month - pretty simple. We will again be calculating the Delay in minutes per flight in that month. As that seems to be a reasonable statistic

In [126]:
month_total_delay = maindf.set_index(' month').iloc[:, list(maindf.set_index(' month').columns).index(' arr_delay')].groupby(level = 0).sum()
In [127]:
month_total_arrivals = maindf.set_index(' month').iloc[:, list(maindf.set_index(' month').columns).index('arr_flights')].groupby(level = 0).sum()
In [128]:
month_df = pd.DataFrame({'Total Delay': month_total_delay, 'Total No. of Flights': month_total_arrivals})
month_df['Delay/Flight'] = month_df['Total Delay']/month_df['Total No. of Flights']

month_df
Out[128]:
Total Delay Total No. of Flights Delay/Flight
month
1 55992069.0 5003725.0 11.190077
2 55421731.0 5109873.0 10.846009
3 54102640.0 5295087.0 10.217517
4 52453539.0 5110413.0 10.264051
5 57717664.0 5255897.0 10.981506
6 73834338.0 5331553.0 13.848561
7 75536793.0 5513319.0 13.700784
8 64681554.0 5454172.0 11.859097
9 40278757.0 4959221.0 8.121993
10 43951145.0 5178013.0 8.488033
11 39010669.0 4921185.0 7.927088
12 62791162.0 5048905.0 12.436590
In [129]:
month_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 
             'November','December']
month_df['Month'] = month_list 
month_df
Out[129]:
Total Delay Total No. of Flights Delay/Flight Month
month
1 55992069.0 5003725.0 11.190077 January
2 55421731.0 5109873.0 10.846009 February
3 54102640.0 5295087.0 10.217517 March
4 52453539.0 5110413.0 10.264051 April
5 57717664.0 5255897.0 10.981506 May
6 73834338.0 5331553.0 13.848561 June
7 75536793.0 5513319.0 13.700784 July
8 64681554.0 5454172.0 11.859097 August
9 40278757.0 4959221.0 8.121993 September
10 43951145.0 5178013.0 8.488033 October
11 39010669.0 4921185.0 7.927088 November
12 62791162.0 5048905.0 12.436590 December
In [130]:
lowest_index = np.argmin(month_df['Delay/Flight']) - 1 #since month_df's index starts from 1 and not 0
colors =  ['lightslategray',] * 12
colors[lowest_index] = 'crimson'

data= go.Bar(x = month_df.Month, 
             y = month_df['Delay/Flight'],
             text=round(month_df['Delay/Flight'], 2),
             textposition = 'auto', 
             marker_color =colors)



fig = go.Figure()
fig.add_trace(data)
fig.layout.update(title = 'Average Delay per Flight by month', 
                 xaxis =dict(title='Month'), 
                 yaxis= dict(title='Avg. Delay per Flight in minutes'))
iplot(fig)

Thus September, October and November (with November being the best) are the best months to travel in. And one should avoid travelling in June, July if they're looking for least delays.

% of Flights Delayed (>15 mins) by Airport. Map Plot

In [131]:
 
maindf.head(10)
Out[131]:
year month carrier carrier_name airport airport_name arr_flights arr_del15 carrier_ct weather_ct ... security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
0 2009 11 AS Alaska Airlines Inc. BRW Barrow, AK: Wiley Post/Will Rogers Memorial 65.0 14.0 3.61 1.06 ... 0.0 6.19 0.0 1.0 920.0 592.0 34.0 78.0 0.0 216.0
1 2009 11 AS Alaska Airlines Inc. BUR Burbank, CA: Bob Hope 85.0 5.0 1.00 0.00 ... 0.0 0.98 0.0 0.0 134.0 25.0 0.0 70.0 0.0 39.0
2 2009 11 AS Alaska Airlines Inc. CDV Cordova, AK: Merle K Mudhole Smith 58.0 8.0 2.04 1.00 ... 0.0 2.96 4.0 2.0 586.0 174.0 37.0 62.0 0.0 313.0
3 2009 11 AS Alaska Airlines Inc. DCA Washington, DC: Ronald Reagan Washington National 88.0 11.0 1.99 0.00 ... 0.0 0.00 0.0 0.0 266.0 65.0 0.0 201.0 0.0 0.0
4 2009 11 AS Alaska Airlines Inc. DEN Denver, CO: Denver International 123.0 5.0 3.47 0.00 ... 0.0 0.00 0.0 0.0 81.0 57.0 0.0 24.0 0.0 0.0
5 2009 11 AS Alaska Airlines Inc. DFW Dallas/Fort Worth, TX: Dallas/Fort Worth Inter... 66.0 6.0 1.28 0.69 ... 0.0 0.00 0.0 0.0 176.0 64.0 11.0 101.0 0.0 0.0
6 2009 11 AS Alaska Airlines Inc. EWR Newark, NJ: Newark Liberty International 59.0 10.0 0.24 0.00 ... 0.0 0.56 0.0 0.0 553.0 8.0 0.0 526.0 0.0 19.0
7 2009 11 AS Alaska Airlines Inc. FAI Fairbanks, AK: Fairbanks International 299.0 41.0 18.35 1.82 ... 0.0 8.70 2.0 0.0 1910.0 707.0 114.0 386.0 0.0 703.0
8 2009 11 AS Alaska Airlines Inc. GEG Spokane, WA: Spokane International 86.0 0.0 0.00 0.00 ... 0.0 0.00 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 2009 11 AS Alaska Airlines Inc. HNL Honolulu, HI: Daniel K Inouye International 85.0 18.0 1.34 1.46 ... 0.0 0.00 0.0 0.0 547.0 85.0 89.0 373.0 0.0 0.0

10 rows × 21 columns

In [132]:
maindf.shape
Out[132]:
(155710, 21)
In [133]:
choro_df = maindf.set_index('airport').groupby(level=0).sum()
I found a data set containing co-ordinates and more info of airports. But these are only 221 airports as compared to 381 airports. However, it's very much likely that these 221 airports are rather, the main ones.
In [134]:
#Here is a data set containing the latitudes and longitudes of Airports

df_tp = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')
In [135]:
df_tp.head()
Out[135]:
iata airport city state country lat long cnt
0 ORD Chicago O'Hare International Chicago IL USA 41.979595 -87.904464 25129
1 ATL William B Hartsfield-Atlanta Intl Atlanta GA USA 33.640444 -84.426944 21925
2 DFW Dallas-Fort Worth International Dallas-Fort Worth TX USA 32.895951 -97.037200 20662
3 PHX Phoenix Sky Harbor International Phoenix AZ USA 33.434167 -112.008056 17290
4 DEN Denver Intl Denver CO USA 39.858408 -104.667002 13781
In [136]:
choro_df.head()
Out[136]:
year month arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
airport
ABE 998462 3189 31126.0 5976.0 2339.05 205.87 1463.03 5.40 1962.63 735.0 102.0 371118.0 156117.0 17890.0 63970.0 186.0 132955.0
ABI 235551 751 20361.0 4189.0 1623.86 475.66 704.14 10.46 1374.79 559.0 20.0 217038.0 71571.0 37452.0 28333.0 366.0 79316.0
ABQ 2349915 7570 270269.0 51839.0 18862.46 1112.53 8667.76 141.16 23055.19 2011.0 398.0 2543213.0 893157.0 88056.0 308377.0 4928.0 1248695.0
ABR 173294 569 5190.0 532.0 189.76 24.11 151.00 0.00 167.17 81.0 12.0 38801.0 15362.0 7415.0 5375.0 0.0 10649.0
ABY 245669 780 9758.0 1876.0 751.80 62.73 325.77 2.00 733.72 120.0 26.0 116642.0 49808.0 9362.0 14859.0 139.0 42474.0

We need to merge these dataframes.

In [137]:
merged_df = pd.merge(choro_df, df_tp, how='inner', left_index = True, right_on = 'iata') #merge on airport
merged_df.head()
Out[137]:
year month arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled ... security_delay late_aircraft_delay iata airport city state country lat long cnt
203 998462 3189 31126.0 5976.0 2339.05 205.87 1463.03 5.40 1962.63 735.0 ... 186.0 132955.0 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402 60
126 235551 751 20361.0 4189.0 1623.86 475.66 704.14 10.46 1374.79 559.0 ... 366.0 79316.0 ABI Abilene Regional Abilene TX USA 32.411320 -99.681897 382
64 2349915 7570 270269.0 51839.0 18862.46 1112.53 8667.76 141.16 23055.19 2011.0 ... 4928.0 1248695.0 ABQ Albuquerque International Albuquerque NM USA 35.040222 -106.609194 1258
176 243649 782 25263.0 6285.0 1333.74 199.48 913.70 0.91 3837.15 1295.0 ... 16.0 265217.0 ACV Arcata Arcata/Eureka CA USA 40.978115 -124.108619 144
167 571721 1840 32882.0 6139.0 2367.37 292.24 1333.80 8.98 2136.65 713.0 ... 413.0 125965.0 AEX Alexandria International Alexandria LA USA 31.327372 -92.548556 168

5 rows × 25 columns

In [138]:
merged_df = merged_df.iloc[:,2:-1] #Year and month, and cnt columns, are irrelevant.
merged_df.head(10)
Out[138]:
arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay ... nas_delay security_delay late_aircraft_delay iata airport city state country lat long
203 31126.0 5976.0 2339.05 205.87 1463.03 5.40 1962.63 735.0 102.0 371118.0 ... 63970.0 186.0 132955.0 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402
126 20361.0 4189.0 1623.86 475.66 704.14 10.46 1374.79 559.0 20.0 217038.0 ... 28333.0 366.0 79316.0 ABI Abilene Regional Abilene TX USA 32.411320 -99.681897
64 270269.0 51839.0 18862.46 1112.53 8667.76 141.16 23055.19 2011.0 398.0 2543213.0 ... 308377.0 4928.0 1248695.0 ABQ Albuquerque International Albuquerque NM USA 35.040222 -106.609194
176 25263.0 6285.0 1333.74 199.48 913.70 0.91 3837.15 1295.0 261.0 414526.0 ... 33236.0 16.0 265217.0 ACV Arcata Arcata/Eureka CA USA 40.978115 -124.108619
167 32882.0 6139.0 2367.37 292.24 1333.80 8.98 2136.65 713.0 106.0 348355.0 ... 54063.0 413.0 125965.0 AEX Alexandria International Alexandria LA USA 31.327372 -92.548556
187 31889.0 6326.0 2677.74 222.85 1136.22 3.76 2285.29 475.0 60.0 388455.0 ... 47079.0 87.0 146011.0 AGS Bush Augusta GA USA 33.369955 -81.964496
198 102056.0 20707.0 8423.85 663.02 3907.45 41.70 7671.02 1702.0 155.0 1144355.0 ... 164028.0 1576.0 469889.0 ALB Albany Cty Albany NY USA 42.748119 -73.802979
98 49017.0 11308.0 3911.40 698.17 2877.50 20.17 3800.78 2407.0 1514.0 712832.0 ... 129589.0 1456.0 241399.0 ASE Aspen-Pitkin Co/Sardy Aspen CO USA 39.223160 -106.868845
1 3931975.0 661494.0 151349.57 18379.91 248613.97 603.00 242548.03 48865.0 8331.0 42069206.0 ... 11303992.0 29836.0 16113325.0 ATL William B Hartsfield-Atlanta Intl Atlanta GA USA 33.640444 -84.426944
211 38029.0 7450.0 2854.37 231.92 1855.38 4.96 2503.63 1160.0 107.0 451002.0 ... 75527.0 144.0 159268.0 ATW Outagamie County Regional Appleton WI USA 44.257408 -88.519476

10 rows × 22 columns

In [139]:
merged_df['% of Flights15'] = (round(merged_df['arr_del15']/merged_df['arr_flights'], 4))*100
merged_df.head(10)
Out[139]:
arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay ... security_delay late_aircraft_delay iata airport city state country lat long % of Flights15
203 31126.0 5976.0 2339.05 205.87 1463.03 5.40 1962.63 735.0 102.0 371118.0 ... 186.0 132955.0 ABE Lehigh Valley International Allentown PA USA 40.652363 -75.440402 19.20
126 20361.0 4189.0 1623.86 475.66 704.14 10.46 1374.79 559.0 20.0 217038.0 ... 366.0 79316.0 ABI Abilene Regional Abilene TX USA 32.411320 -99.681897 20.57
64 270269.0 51839.0 18862.46 1112.53 8667.76 141.16 23055.19 2011.0 398.0 2543213.0 ... 4928.0 1248695.0 ABQ Albuquerque International Albuquerque NM USA 35.040222 -106.609194 19.18
176 25263.0 6285.0 1333.74 199.48 913.70 0.91 3837.15 1295.0 261.0 414526.0 ... 16.0 265217.0 ACV Arcata Arcata/Eureka CA USA 40.978115 -124.108619 24.88
167 32882.0 6139.0 2367.37 292.24 1333.80 8.98 2136.65 713.0 106.0 348355.0 ... 413.0 125965.0 AEX Alexandria International Alexandria LA USA 31.327372 -92.548556 18.67
187 31889.0 6326.0 2677.74 222.85 1136.22 3.76 2285.29 475.0 60.0 388455.0 ... 87.0 146011.0 AGS Bush Augusta GA USA 33.369955 -81.964496 19.84
198 102056.0 20707.0 8423.85 663.02 3907.45 41.70 7671.02 1702.0 155.0 1144355.0 ... 1576.0 469889.0 ALB Albany Cty Albany NY USA 42.748119 -73.802979 20.29
98 49017.0 11308.0 3911.40 698.17 2877.50 20.17 3800.78 2407.0 1514.0 712832.0 ... 1456.0 241399.0 ASE Aspen-Pitkin Co/Sardy Aspen CO USA 39.223160 -106.868845 23.07
1 3931975.0 661494.0 151349.57 18379.91 248613.97 603.00 242548.03 48865.0 8331.0 42069206.0 ... 29836.0 16113325.0 ATL William B Hartsfield-Atlanta Intl Atlanta GA USA 33.640444 -84.426944 16.82
211 38029.0 7450.0 2854.37 231.92 1855.38 4.96 2503.63 1160.0 107.0 451002.0 ... 144.0 159268.0 ATW Outagamie County Regional Appleton WI USA 44.257408 -88.519476 19.59

10 rows × 23 columns

Plot:

In [140]:
trace = go.Scattergeo(locationmode = 'USA-states',
                     lon = merged_df['long'], 
                     lat = merged_df['lat'], 
                     text= merged_df['airport'] +", "+ merged_df['city']+ ", "+ merged_df['state']+", " +merged_df['% of Flights15'].astype(str)+"%",
                     mode='markers',
                     marker = dict(size = merged_df['% of Flights15'],
                                   opacity = 0.8,
                                   reversescale=True,
                                   autocolorscale = False,
                                   symbol = 'circle',
                                   line = dict(width=1,
                                               color='rgba(205, 102, 102)'),
                                   colorscale = 'Rainbow',
                                   cmin = 0,
                                   color = merged_df['% of Flights15'],
                                   cmax = merged_df['% of Flights15'].max(),
                                   colorbar_title="% flights<br>Delayed by >15 mins"
                                  ))
fig = go.Figure(data = [trace])

fig.layout.update(
        title = '% of flights delayed for more than 15 mins <br>(Hover for airport names)',
        geo = dict(
            scope='usa',
            projection_type='albers usa',
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )
fig.show()

On hovering we can find out the delays 221 airports across the USA. But let's find out, differently, which region of airport has most significant number of delays.

In [141]:
choro_df.columns
Out[141]:
Index(['year', ' month', 'arr_flights', 'arr_del15', 'carrier_ct',
       ' weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct',
       'arr_cancelled', 'arr_diverted', ' arr_delay', ' carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'],
      dtype='object')
In [142]:
choro_df2 = choro_df[['carrier_ct', ' weather_ct', 'nas_ct',
                        'security_ct', 'late_aircraft_ct']]
In [143]:
choro_df2 = choro_df2.rename(columns = {'carrier_ct': 'Carrier', ' weather_ct':'Weather', 'nas_ct':'National Air System',
                                        'security_ct':'Security', 'late_aircraft_ct':'Late Aircraft'})

Now, 'choro_df' is the one that has been grouped by the 'Airport' names (JFK, LAX, etc.). Now, I want to create a new column in this dataframe. The values of this column will state what has been the major cause of delay at these airports.

In [144]:
tp2_list = []
for num in range(choro_df2.shape[0]): 
    for i in choro_df2.iloc[num,:].index: 
        if choro_df2.iloc[num,:][i]==np.max(choro_df2.iloc[num,:]):
            xx = i
            tp2_list.append(xx)
In [145]:
len(tp2_list) #There are rows with all their entries = 0
Out[145]:
405
In [146]:
choro_df2.shape
Out[146]:
(381, 5)

Something's wrong, instead of 381 values, there are 405 values in the list.

On further analysis, we can find that there are rows (airports) that have all their values as 0. Which is why there were extra values as the code kept looping around those 0's of those particular airports. Let's get rid of those airports as they don't add anything to this piece of analysis.

In [147]:
choro_df2['max'] = list(choro_df2.apply(np.max, axis=1))
In [148]:
choro_df2 = choro_df2[choro_df2['max'] != 0] 
choro_df2.shape
Out[148]:
(375, 6)
In [149]:
tp2_list = []
for num in range(choro_df2.shape[0]): 
    for i in choro_df2.iloc[num,:5].index: 
        if choro_df2.iloc[num,:5][i]==np.max(choro_df2.iloc[num,:5]):
            xx = i
            tp2_list.append(xx)
In [150]:
len(tp2_list)
Out[150]:
375

Now these values are of the same lenght! We can add 'tp2_list' as a new column.

Let's call this column: "Cause"
In [151]:
choro_df2['Cause'] = tp2_list
choro_df2.head(15) #values of 'Cause' column represent what the major cause of delay for the airline has been over the past 10 years
Out[151]:
Carrier Weather National Air System Security Late Aircraft max Cause
airport
ABE 2339.05 205.87 1463.03 5.40 1962.63 2339.05 Carrier
ABI 1623.86 475.66 704.14 10.46 1374.79 1623.86 Carrier
ABQ 18862.46 1112.53 8667.76 141.16 23055.19 23055.19 Late Aircraft
ABR 189.76 24.11 151.00 0.00 167.17 189.76 Carrier
ABY 751.80 62.73 325.77 2.00 733.72 751.80 Carrier
ACK 361.85 98.35 200.32 1.25 200.25 361.85 Carrier
ACT 896.01 140.25 408.46 2.69 844.62 896.01 Carrier
ACV 1333.74 199.48 913.70 0.91 3837.15 3837.15 Late Aircraft
ACY 773.94 71.70 1764.72 11.56 982.03 1764.72 National Air System
ADK 40.88 4.02 63.16 1.02 37.95 63.16 National Air System
ADQ 210.23 123.61 179.06 1.00 116.07 210.23 Carrier
AEX 2367.37 292.24 1333.80 8.98 2136.65 2367.37 Carrier
AGS 2677.74 222.85 1136.22 3.76 2285.29 2677.74 Carrier
AKN 33.37 0.00 16.52 0.00 25.11 33.37 Carrier
ALB 8423.85 663.02 3907.45 41.70 7671.02 8423.85 Carrier
In [152]:
neww = pd.DataFrame({'Col': list(choro_df2['Cause'])})
In [153]:
neww['Col'].value_counts().index
Out[153]:
Index(['Late Aircraft', 'Carrier', 'National Air System'], dtype='object')
In [154]:
data= go.Bar(x = neww['Col'].value_counts().index, 
             y = neww['Col'].value_counts(),
             
             textposition = 'auto')



fig = go.Figure()
fig.add_trace(data)
fig.layout.update(title = 'Major cause of delay at Airports (Past 10 years)', 
                 xaxis =dict(title='Major Cause of Delay'), 
                 yaxis= dict(title='No. of Airports'))
iplot(fig)

I don't like the Bar chart for this, let's try the Pie Chart.

In [155]:
labels = list(neww['Col'].value_counts().index)
values = neww['Col'].value_counts()

fig = go.Figure(data=[go.Pie(labels=labels, values=values, 
                            textinfo = 'label+percent')])

fig.layout.update(title = 'Major Cause of Delay in Airports')
iplot(fig)
In [156]:
values2 = list(np.sum(choro_df2)[:5])
labels2 = np.sum(choro_df2)[:5].index

fig = go.Figure(data=[go.Pie(labels=labels2, values=values2, 
                            textinfo = 'label+percent')])

fig.layout.update(title = 'Major Cause of Delay of Past 10 years')
iplot(fig)

This is a good plot and as you may think, it basically invalidates the 'Cause' column. It was redundant and not necessary. Let's stick with this.

Dot Plot

Let's visualize this data rather differently, using a dot plot and dividing the data according to the Airports

In [157]:
choro_df2['Sum of Total Flights Delayed'] = list(choro_df2.iloc[:,:5].apply(np.sum, axis=1))
In [158]:
df_dotplot = choro_df2
In [159]:
df_dotplot.columns
Out[159]:
Index(['Carrier', 'Weather', 'National Air System', 'Security',
       'Late Aircraft', 'max', 'Cause', 'Sum of Total Flights Delayed'],
      dtype='object')
In [160]:
airports = list(df_dotplot.index)
carrier_delay = list(round((df_dotplot['Carrier']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
weather_delay = list(round((df_dotplot['Weather']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
nas_delay = list(round((df_dotplot['National Air System']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
security_delay = list(round((df_dotplot['Security']/df_dotplot['Sum of Total Flights Delayed'])*100,4))
lateaircraft_delay = list(round((df_dotplot['Late Aircraft']/df_dotplot['Sum of Total Flights Delayed'])*100,4))

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=carrier_delay,
    y= airports,
    name='Percent of flights delayed due to Carrier',
    marker=dict(
        color='rgba(0, 0, 255, 0.95)',
        line_color='rgba(156, 165, 196, 1.0)',
    )
))

fig.add_trace(go.Scatter(
    x=weather_delay,
    y= airports,
    name='Percent of flights delayed due to Weather',
    marker=dict(
        color='rgba(0, 255, 0, 0.95)',
        line_color='rgba(217, 217, 217, 1.0)'
    )
))

fig.add_trace(go.Scatter(
    x=nas_delay,
    y= airports,
    name='Percent of flights delayed due to National Air System',
    marker=dict(
        color='rgba(56, 65, 96, 0.95)',
        line_color='rgba(56, 65, 96, 1.0)',
    )
))

fig.add_trace(go.Scatter(
    x=security_delay,
    y= airports,
    name='Percent of flights delayed due to Security',
    marker=dict(
        color='rgba(255, 255, 0, 0.95)',
        line_color='rgba(156, 165, 196, 1.0)',
    )
))

fig.add_trace(go.Scatter(
    x=lateaircraft_delay,
    y= airports,
    name='Percent of flights delayed due to Late Aircraft',
    marker=dict(
        color='rgba(255, 144,144, 0.95)',
        line_color='rgba(255, 144, 144, 1.0)',
    )
))

fig.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=6.5, opacity = 0.6))

fig.layout.update(
    title="% of Delay Reasons at Airports",
    xaxis=dict(
        showgrid=False,
        showline=True,
        linecolor='rgb(102, 102, 102)',
        tickfont_color='rgb(102, 102, 102)',
        showticklabels=True,
        dtick=10,
        ticks='outside',
        tickcolor='rgb(102, 102, 102)',
    ),
    yaxis = dict(title = 'Airport'),
    margin=dict(l=140, r=40, b=50, t=80),
    legend=dict(
        font_size=10,
        yanchor='middle',
        xanchor='right',
    ),
    width=800,
    height=600,
    paper_bgcolor='white',
    plot_bgcolor='white',
    hovermode='closest',
)
fig.show()

This is a little too congested. We were trying to fit in data of 375 airports. But, we can get a fair idea of what does cause the most delay. - We can see a lot of pink, blue and grey dots which shows that the main reasons for delay are Late Aircraft, Carrier and National Air Systems.

Let's do the same but for the Top 25 airports (with respect to the delay)

In [161]:
#Let's find out about the top 25 airports which have the most delay. 
top25_df_dotplot = df_dotplot.sort_values(by=['Sum of Total Flights Delayed'], ascending = False).iloc[:25,:]
In [162]:
airports3 = list(top25_df_dotplot.index)
carrier_delay3 = list(round((top25_df_dotplot['Carrier']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
weather_delay3 = list(round((top25_df_dotplot['Weather']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
nas_delay3 = list(round((top25_df_dotplot['National Air System']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
security_delay3 = list(round((top25_df_dotplot['Security']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))
lateaircraft_delay3 = list(round((top25_df_dotplot['Late Aircraft']/top25_df_dotplot['Sum of Total Flights Delayed'])*100,4))

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=carrier_delay3,
    y= airports3,
    name='Percent of flights delayed due to Carrier',
    marker=dict(
        color='rgba(0, 0, 255, 0.95)',
        line_color='rgba(156, 165, 196, 1.0)',
    )
))

fig.add_trace(go.Scatter(
    x=weather_delay3,
    y= airports3,
    name='Percent of flights delayed due to Weather',
    marker=dict(
        color='rgba(0, 255, 0, 0.95)',
        line_color='rgba(217, 217, 217, 1.0)'
    )
))

fig.add_trace(go.Scatter(
    x=nas_delay3,
    y= airports3,
    name='Percent of flights delayed due to National Air System',
    marker=dict(
        color='rgba(255, 128, 0, 0.95)',
        line_color='rgba(255, 128, 0 1.0)',
    )
))

fig.add_trace(go.Scatter(
    x=security_delay3,
    y= airports3,
    name='Percent of flights delayed due to Security',
    marker=dict(
        color='rgba(255, 255, 0, 0.95)',
        line_color='rgba(156, 165, 196, 1.0)',
    )
))

fig.add_trace(go.Scatter(
    x=lateaircraft_delay3,
    y= airports3,
    name='Percent of flights delayed due to Late Aircraft',
    marker=dict(
        color='rgba(255, 0,255, 0.95)',
        line_color='rgba(255, 0,255, 1.0)',
    )
))

fig.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=6.5, opacity = 0.99))

fig.layout.update(
    title="% of Delay Reasons at Top 25 Airports",
    xaxis=dict(
        showgrid=False,
        showline=True,
        linecolor='rgb(102, 102, 102)',
        tickfont_color='rgb(102, 102, 102)',
        showticklabels=True,
        dtick=10,
        ticks='outside',
        tickcolor='rgb(102, 102, 102)',
    ),
    yaxis = dict(title = 'Airport'),
    margin=dict(l=140, r=40, b=50, t=80),
    legend=dict(
        font_size=10,
        yanchor='middle',
        xanchor='right',
    ),
    width=800,
    height=600,
    paper_bgcolor='white',
    plot_bgcolor='white',
    hovermode='closest',
)
fig.show()

Let's try a stacked Bar plot

In [163]:
fig = go.Figure()
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Carrier']), text=carrier_delay3, 
                     textposition = 'auto', name='Carrier Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Weather']), text = weather_delay3,name='Weather Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['National Air System']), 
                     text = nas_delay3, textposition = 'inside', name='National Air System Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Security']),text=security_delay3, name='Security Delay'))
fig.add_trace(go.Bar(x=airports3, y=list(top25_df_dotplot['Late Aircraft']), 
                     text = lateaircraft_delay3,textposition = 'inside', name='Late Aircraft Delay'))

fig.layout.update(barmode='stack', title = "Casues of Delay at Top 25 Airports<br>[2009-2019]",xaxis=dict(title = 'Airport'),
                 yaxis = dict(title = 'Total Delay (in minutes)'))
fig.show()
In [164]:
merged_df2 = pd.merge(top25_df_dotplot, merged_df, how='inner', left_index = True, right_on = 'iata')
In [165]:
trace = go.Scattergeo(locationmode = 'USA-states',
                     lon = merged_df2['long'], 
                     lat = merged_df2['lat'], 
                     text= merged_df2['airport'] +", "+ merged_df2['city']+ ", "+ merged_df2['state']+",<br> " + 
                      round(merged_df2['Sum of Total Flights Delayed']).astype(str)+" flights delayed",
                     mode='markers',
                     marker = dict(size = merged_df2['Sum of Total Flights Delayed']/7800,
                                   opacity = 0.8,
                                   reversescale=True,
                                   autocolorscale = False,
                                   symbol = 'circle',
                                   line = dict(width=1,
                                               color='rgba(205, 102, 102)'),
                                   colorscale = 'Rainbow',
                                   cmin = 0,
                                   color = merged_df2['Sum of Total Flights Delayed'],
                                   cmax = merged_df2['Sum of Total Flights Delayed'].max(),
                                   colorbar_title="Total flights<br>Delayed by >15 mins"
                                  ))
fig = go.Figure(data = [trace])

fig.layout.update(
        title = 'Top 25 Airports with the most delays <br>(Hover for airport names)',
        geo = dict(
            scope='usa',
            projection_type='albers usa',
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )
fig.show()
In [166]:
fig = go.Figure()
fig.add_trace(go.Bar(x=airports, y=carrier_delay, name='% of Carrier Delay'))
fig.add_trace(go.Bar(x=airports, y=weather_delay, name='% of Weather Delay'))
fig.add_trace(go.Bar(x=airports, y=nas_delay, name='% of National Air System Delay'))
fig.add_trace(go.Bar(x=airports, y=security_delay, name='% of Security Delay'))
fig.add_trace(go.Bar(x=airports, y=lateaircraft_delay, name='% of Late Aircraft Delay'))

fig.layout.update(barmode='stack', xaxis=dict(title = 'Airport'))
fig.show()

The above plot is very bad.

Relationship Between Seasons and Flight Delays?

Let's check if the flights are late during any particular season or not.

In [167]:
maindf.head(10)
Out[167]:
year month carrier carrier_name airport airport_name arr_flights arr_del15 carrier_ct weather_ct ... security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
0 2009 11 AS Alaska Airlines Inc. BRW Barrow, AK: Wiley Post/Will Rogers Memorial 65.0 14.0 3.61 1.06 ... 0.0 6.19 0.0 1.0 920.0 592.0 34.0 78.0 0.0 216.0
1 2009 11 AS Alaska Airlines Inc. BUR Burbank, CA: Bob Hope 85.0 5.0 1.00 0.00 ... 0.0 0.98 0.0 0.0 134.0 25.0 0.0 70.0 0.0 39.0
2 2009 11 AS Alaska Airlines Inc. CDV Cordova, AK: Merle K Mudhole Smith 58.0 8.0 2.04 1.00 ... 0.0 2.96 4.0 2.0 586.0 174.0 37.0 62.0 0.0 313.0
3 2009 11 AS Alaska Airlines Inc. DCA Washington, DC: Ronald Reagan Washington National 88.0 11.0 1.99 0.00 ... 0.0 0.00 0.0 0.0 266.0 65.0 0.0 201.0 0.0 0.0
4 2009 11 AS Alaska Airlines Inc. DEN Denver, CO: Denver International 123.0 5.0 3.47 0.00 ... 0.0 0.00 0.0 0.0 81.0 57.0 0.0 24.0 0.0 0.0
5 2009 11 AS Alaska Airlines Inc. DFW Dallas/Fort Worth, TX: Dallas/Fort Worth Inter... 66.0 6.0 1.28 0.69 ... 0.0 0.00 0.0 0.0 176.0 64.0 11.0 101.0 0.0 0.0
6 2009 11 AS Alaska Airlines Inc. EWR Newark, NJ: Newark Liberty International 59.0 10.0 0.24 0.00 ... 0.0 0.56 0.0 0.0 553.0 8.0 0.0 526.0 0.0 19.0
7 2009 11 AS Alaska Airlines Inc. FAI Fairbanks, AK: Fairbanks International 299.0 41.0 18.35 1.82 ... 0.0 8.70 2.0 0.0 1910.0 707.0 114.0 386.0 0.0 703.0
8 2009 11 AS Alaska Airlines Inc. GEG Spokane, WA: Spokane International 86.0 0.0 0.00 0.00 ... 0.0 0.00 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 2009 11 AS Alaska Airlines Inc. HNL Honolulu, HI: Daniel K Inouye International 85.0 18.0 1.34 1.46 ... 0.0 0.00 0.0 0.0 547.0 85.0 89.0 373.0 0.0 0.0

10 rows × 21 columns

In [168]:
season = maindf.set_index(' month').groupby(level=0).sum()
In [169]:
#Let's classify each month according to what season they belong. 
season_list = ['Winter', 'Winter', 'Spring', 'Spring', 'Spring', 'Summer',
              'Summer','Summer', 'Fall', 'Fall', 'Fall', 'Winter']
season['season'] = season_list
In [170]:
season['Toal No. of Delays'] = list(np.sum(season.iloc[:,3:8], axis=1))
In [171]:
season
Out[171]:
year arr_flights arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay late_aircraft_delay season Toal No. of Delays
month
1 26286960 5003725.0 940632.0 269868.34 34788.07 300533.86 2078.15 333364.48 143912.0 11863.0 55992069.0 17648065.0 3237030.0 13292328.0 87846.0 21726800.0 Winter 940632.90
2 28951706 5109873.0 947700.0 262137.99 32457.62 317828.98 1678.34 333598.64 152163.0 11706.0 55421731.0 16800522.0 3030289.0 14163755.0 73100.0 21354065.0 Winter 947701.57
3 25578033 5295087.0 965403.0 274431.42 24002.72 309844.17 1868.82 355257.15 92317.0 10851.0 54102640.0 16750864.0 2063992.0 13502611.0 68530.0 21716643.0 Spring 965404.28
4 25777447 5110413.0 899946.0 249682.64 24973.76 295246.16 1534.69 328509.08 64034.0 11247.0 52453539.0 15710201.0 2277923.0 13560137.0 56562.0 20848716.0 Spring 899946.33
5 25515644 5255897.0 978608.0 269082.06 31622.09 319143.13 1718.72 357042.67 61889.0 14261.0 57717664.0 16678287.0 2811017.0 15234498.0 66324.0 22927538.0 Spring 978608.67
6 26280980 5331553.0 1182531.0 334109.52 42109.14 348799.63 1987.73 455525.05 81797.0 17349.0 73834338.0 21607992.0 3726750.0 17585519.0 84823.0 30829254.0 Summer 1182531.07
7 25783383 5513319.0 1209872.0 345103.19 43003.61 350846.20 2007.58 468912.61 80894.0 17180.0 75536793.0 22286591.0 3632895.0 17872721.0 77322.0 31667264.0 Summer 1209873.19
8 26087607 5454172.0 1070528.0 308297.32 35628.99 321577.87 2208.85 402814.44 78847.0 14543.0 64681554.0 19779804.0 2917264.0 15606459.0 98777.0 26279056.0 Summer 1070527.47
9 25821607 4959221.0 726022.0 210617.89 20702.56 244856.56 1308.84 248537.82 53883.0 9629.0 40278757.0 12951173.0 1617002.0 10564292.0 62629.0 15083661.0 Fall 726023.67
10 25576272 5178013.0 819560.0 233194.71 15604.41 279873.59 1356.47 289531.98 52331.0 8553.0 43951145.0 13923859.0 1307822.0 11712200.0 54128.0 16953136.0 Fall 819561.16
11 25775466 4921185.0 722517.0 213385.97 15510.71 237676.36 1494.14 254450.45 37784.0 8020.0 39010669.0 12834692.0 1352039.0 9659274.0 83315.0 15081349.0 Fall 722517.63
12 26095567 5048905.0 1090593.0 315854.20 33005.22 329933.65 2683.49 409118.31 95190.0 12129.0 62791162.0 19420704.0 2989235.0 14477237.0 99861.0 25804125.0 Winter 1090594.87
In [172]:
plotdf = season.set_index('season').iloc[:, 2:8].groupby(level=0).sum()
In [173]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y= list(plotdf.index), 
    x= list(plotdf['carrier_ct']),
    name='Carrier Delay',
    orientation='h',
    marker=dict(
        color='rgba(246, 78, 139, 0.6)',
        line=dict(color='rgba(246, 78, 139, 1.0)', width=3)
    )))
            
fig.add_trace(go.Bar(
    y= list(plotdf.index), 
    x= list(plotdf[' weather_ct']),
    name='Weather Delay',
    orientation='h',
    marker=dict(
        color='rgba(178, 102, 255, 0.6)',
        line=dict(color='rgba(178, 102, 255, 1.0)', width=3)
    )))

fig.add_trace(go.Bar(
    y= list(plotdf.index), 
    x= list(plotdf['nas_ct']),
    name='National Air System Delay',
    orientation='h',
    marker=dict(
        color='rgba(255, 178, 102, 0.6)',
        line=dict(color='rgba(255, 178, 102, 1.0)', width=3)
    )
))
              
fig.add_trace(go.Bar(
    y= list(plotdf.index), 
    x= list(plotdf['security_ct']),
    name='Security Delay',
    orientation='h',
    marker=dict(
        color='rgba(102, 204, 0, 0.6)',
        line=dict(color='rgba(102, 204, 0, 1.0)', width=3)
    )
))
              
fig.add_trace(go.Bar(
    y= list(plotdf.index), 
    x= list(plotdf['late_aircraft_ct']),
    name='Late Aircraft Delay',
    orientation='h',
    marker=dict(
        color='rgba(51,153,255, 0.6)',
        line=dict(color='rgba(51, 153, 255, 1.0)', width=3)
    )
))
              
fig.layout.update(barmode='stack',
                  title = 'Amount of Delay by Season<br> 2009-2019',
                  xaxis = dict(title = 'Total No. of Delays'))

iplot(fig)

Thus, we can see that during the summer, there are more delays as compared to Spring and Winter, whereas there are minimum delays during Fall season!

End