import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import zipfile
df_all_flights =pd.read_csv("C:\\Users\sarita\Desktop\MSBA\Q4\MSIS\DelayedFlights\\2008.csv")
df_delayed_flights = pd.read_csv("C:\\Users\sarita\Desktop\MSBA\Q4\MSIS\DelayedFlights\\DelayedFlights.csv")
df_delayed_flights =df_delayed_flights.fillna(0)
df_all_flights = df_all_flights.fillna(0)
df_delayed_flights_by_origin = df_delayed_flights.groupby('Origin').count()
df_all_flights_by_origin = df_all_flights.groupby('Origin').count()
df_all_flights['Delayed']=df_all_flights['DepDelay']>=10
df_all_flights['CRSDate'] = df_all_flights.apply(lambda x: dt.date(x['Year'], x['Month'], x['DayofMonth']), axis=1)
df_flights_by_date = pd.DataFrame(df_all_flights.groupby('CRSDate')['Year'].count())
df_delayed_flights_by_date = pd.DataFrame(df_all_flights[df_all_flights['Delayed']==True].groupby('CRSDate')['Year'].count())
df_flights_by_date['Date']=df_flights_by_date.index
df_delayed_flights_by_date['Date']=df_delayed_flights_by_date.index
df_flights_by_date2=df_flights_by_date.merge(df_delayed_flights_by_date, how='inner', on='Date')
df_flights_by_date2=df_flights_by_date2.rename(columns={'Year_x':'All_Flights', 'Year_y':'Delayed_Flights'})
df_flights_by_date2['Week_Num']=df_flights_by_date2['Date'].apply(lambda x: x.isocalendar()[1])
df_flights_by_week = df_flights_by_date2[['Week_Num', 'All_Flights', 'Delayed_Flights']]
df_flights_by_week2=df_flights_by_week.groupby(['Week_Num']).sum()
df_flights_by_week2['Week_Num']= df_flights_by_week2.index
import matplotlib.ticker as tick
#Format function to show numbers morethan 1000 in the ticks in K
def fmt_x(x,y):
if x > 1000:
val = int(x) / 1000
return '{val:d}K'.format(val=int(val))
else:
return int(x)
plt.style.use('fivethirtyeight')
ax=df_flights_by_week2.set_index(['Week_Num']).plot(kind='line', figsize=(14, 10), grid=True, fontsize=16)
ax.yaxis.set_major_formatter(tick.FuncFormatter(fmt_x))
plt.title("Delayed Flights vs. Total Flights", fontsize=24, y=1.01)
plt.xlabel("Week Number", fontsize=20, labelpad=12)
plt.ylabel("Number of Flights", fontsize=20, labelpad=12)
plt.show()
ANALYSIS: It is apparent from the graph of all flights and delayed flights that there is a correlation between the two as expected. Surprisingly, the volatility in the delayed curve not present in the all-flights curve indicates that delayed flights is not a perfect function of the total number of flights. For example, the variance (or increases and decreases) in the delayed curve between weeks 10 and 20, 50 and 52 and to a lesser extent 36 and 40, is not reciprocated in the all-flights curve.
df_all_flights=df_all_flights.fillna(0)
df_delayed_flights=df_delayed_flights.fillna(0)
df_all_flights_car=pd.DataFrame(df_delayed_flights.groupby("UniqueCarrier")["CarrierDelay"].mean())
df_all_flights_car["Carrier"]=df_all_flights_car.index
df_airline_codes=pd.read_csv("C:\\Users\sarita\Desktop\MSBA\Q4\MSIS\DelayedFlights\\airline_codes.csv")
df_all_flights_car=pd.merge(df_all_flights_car, df_airline_codes, how='left', left_on="Carrier", right_on="ATA")
df_all_flights_car=df_all_flights_car[["Carrier", "Airline", "CarrierDelay"]]
df_all_flights_car=df_all_flights_car.dropna()
df_all_flights_car=df_all_flights_car[df_all_flights_car.index!=23]
df_all_flights_car=df_all_flights_car.sort_values(by='CarrierDelay')
df_all_flights_car['Average_Carrier_Delay']= df_all_flights_car['CarrierDelay'].mean()
df_all_flights_car.set_index(['Airline'])['CarrierDelay'].plot(kind='barh', figsize=(14, 10), grid=True, fontsize=18, color='salmon')
plt.axvline(df_all_flights_car['Average_Carrier_Delay'].iloc[0], linestyle='--')
plt.title("Average Airline Carrier Delays", fontsize=24, y=1.01)
plt.xlabel("Average Carrier Delay (minutes)", fontsize=20, labelpad=15)
plt.ylabel("Airline", fontsize=20, labelpad=15)
plt.legend(['Avg. Delay for all carriers', 'Avg Delay for this Carrier'], loc=0, fontsize='small');
plt.show()
ANALYSIS: The major airline carriers, US Airways, United Airlines, and Southwest Airlines appear to have limited carrier-caused delay times on average. Conversely, all the carrier who shoulder the blame for the longest extended wait times are smaller airlines like Hawaii, Comair, and Atlantic Southeast. Lastly, Mesa Airlines is clearly the worst airline for those fliers who lack patience or loose schedules.
df_car_perc=pd.DataFrame(100*df_delayed_flights.groupby("UniqueCarrier")['Year'].count()/df_all_flights.groupby("UniqueCarrier")['Year'].count())
df_car_perc["Carrier"]=df_car_perc.index
df_car_perc=df_car_perc.rename(columns={'Year':'Percent Delayed'})
df_car_perc_2=pd.merge(df_car_perc, df_airline_codes, how='left', left_on="Carrier", right_on="ATA")
df_car_perc_2=df_car_perc_2[["Carrier", "Airline", "Percent Delayed"]]
sns.set(font_scale=1.5)
df_car_perc_2=df_car_perc_2[df_car_perc_2.index!=23]
df_car_perc_2=df_car_perc_2.dropna()
df_car_perc_2=df_car_perc_2.sort_values(by='Percent Delayed')
df_car_perc_2['Average_Percent_Delayed']= df_car_perc_2['Percent Delayed'].mean()
df_car_perc_2.set_index(['Airline'])['Percent Delayed'].plot(kind='barh', figsize=(14, 10), grid=True, fontsize=18, color='steelblue')
plt.axvline(df_car_perc_2['Average_Percent_Delayed'].iloc[0], color='red', linestyle='--')
plt.title("Airline Percent Flights Delayed", fontsize=24, y=1.01)
plt.xlabel("Percent of Flights Delayed", fontsize=20, labelpad=15)
plt.ylabel("Airline", fontsize=20, labelpad=15)
plt.legend(['Average Percent Delayed', 'Percent Delayed for this carrier'], loc=0, fontsize='small');
plt.show()
ANALYSIS: Unlike the average carrier delay times, the major airlines have the highest percentage of flights delayed. Interestingly, contrary to their laid-back image, the airlines representing Hawaii boast the most punctual departures of any carrier.
df_all_flights['CRSDepTime1'] = df_all_flights['CRSDepTime'].apply(lambda x: '{0:0>4}'.format(x)) #pad with 0 to convert to hhmm
df_all_flights['CRSDepTime2'] = df_all_flights['CRSDepTime1'].astype(str) #convert object to string
df_all_flights.CRSDepTime2 = df_all_flights.CRSDepTime2.str.slice(0, 2) #Get Hour from CRS Dep time
df_all_flights['CRSDepTimeHr'] = df_all_flights['CRSDepTime2'].astype(int)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-3671f8ad1e15> in <module>() ----> 1 df_all_flights['CRSDepTime1'] = df_all_flights['CRSDepTime'].apply(lambda x: '{0:0>4}'.format(x)) #pad with 0 to convert to hhmm 2 df_all_flights['CRSDepTime2'] = df_all_flights['CRSDepTime1'].astype(str) #convert object to string 3 df_all_flights.CRSDepTime2 = df_all_flights.CRSDepTime2.str.slice(0, 2) #Get Hour from CRS Dep time 4 df_all_flights['CRSDepTimeHr'] = df_all_flights['CRSDepTime2'].astype(int) NameError: name 'df_all_flights' is not defined
#Create df with percent of flights that are delayed for each day of the week
df_delays_perc = pd.DataFrame(df_all_flights[df_all_flights['Delayed']==True].groupby(['DayOfWeek', 'CRSDepTimeHr'])['Year'].count()*100/df_all_flights.groupby(['DayOfWeek', 'CRSDepTimeHr'])['Year'].count())
df_delays_byhour = df_delays_perc.unstack(level=0)
day_short_names = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
day_Hours = ['12am', '1am', '2am', '3am', '4am', '5am', '6am','7am','8am','9am', '10am', '11am', '12pm', '1pm', '2pm', '3pm', '4pm', '5pm', '6pm', '7pm', '8pm', '9pm', '10pm', '11pm']
#Create heatmap of the percent of flights delayed for each time of each day of the week
sns.set_context("poster")
f, ax = plt.subplots(figsize=(14, 14))
ax = sns.heatmap(df_delays_byhour, annot=True, fmt="0.0f", linewidths=.5, ax=ax, xticklabels=day_short_names, yticklabels=day_Hours, cmap="Blues")
ax.axes.set_title("Percentage of Flight Delays by Day and Hour of Day", fontsize=20, y=1.01)
ax.set(xlabel='Day of Week', ylabel='Scheduled Departure time');
ANALYSIS: 1. Flights in the afternoon and evening are more delayed than early morning and night on any day of the week. 2. Flights delays are maximum on Friday and Sunday evenings at 8 pm. 3. The percentage of flights delayed at 3am shows higher than normal, but that is a result of bias created by considering percentages instead of the raw numbers.
df_delays_perc_Month = pd.DataFrame(df_delayed_flights.groupby(['Month', 'DayofMonth'])['Year'].count()*100/df_all_flights.groupby(['Month', 'DayofMonth'])['Year'].count())
df_delays_by_month = df_delays_perc_Month.unstack(level=0)
month_short_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
sns.set_context("poster")
f, ax1 = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(dfDelaysByMonth, annot=True, fmt="0.0f", ax=ax1, xticklabels=month_short_names, cmap="Greens")
ax.axes.set_title("Percentage of Flight Delays by Month and Day Of Month", fontsize=20, y=1.01)
ax.set(xlabel='Month', ylabel='Day Of Month');
ANALYSIS: 1. Flight delays are maximum in the around holiday season . 2. During Christmas and Thanksgiving, the delays could go up to 57% 3. We have some outliers in the middle of February and end of March, where there are no holidays or any reason that stands out.
df_airport_location.columns = ['AirportID',
'Name',
'City',
'Country',
'IATA',
'ICAO',
'Lat',
'Lon',
'Alt',
'TimeZone',
'DST',
'time',
'zone',
'type',
]
df_airport_location.drop(['AirportID'],axis=1,inplace=True)
df_airport_location.drop(['Name'],axis=1,inplace=True)
df_airport_location.drop(['Country'],axis=1,inplace=True)
df_airport_location.drop(['ICAO'],axis=1,inplace=True)
df_airport_location.drop(['Alt'],axis=1,inplace=True)
df_airport_location.drop(['TimeZone'],axis=1,inplace=True)
df_airport_location.drop(['DST'],axis=1,inplace=True)
df_airport_location.drop(['time'],axis=1,inplace=True)
df_airport_location.drop(['zone'],axis=1,inplace=True)
df_airport_location.drop(['type'],axis=1,inplace=True)
df_airport_location['city'] = df_airport_location['City'] # create City as last column of dataframe
df_airport_location.drop(['City'],axis=1,inplace=True)# drop city since it is added as the last column of the dataframe
# rename columns
df_airport_location.columns = ['Airport',
'Lat',
'Lon',
'City']
# make a list of all the unique airports that had flight delays
Airports = df_delayed_flights['Origin'].unique()
Airports.sort()
# add a column to delayed flights data set with the total number of flights from
# each airport. Add a column with the names of all the airport
df_delayed_flights_by_origin['TotalFlights'] = df_all_flights_by_origin['LateAircraftDelay']
df_delayed_flights_by_origin['Airport'] = Airports
# calculate percentage of delayed flights from each airport
df_delayed_flights_by_origin['PercentofDelayedFlights'] = round(((df_delayed_flights_by_origin['LateAircraftDelay']/df_delayed_flights_by_origin['TotalFlights'])*100),2)
df_delayed_flights_by_origin.drop(df_delayed_flights_by_origin.iloc[:, 0:28], inplace=True, axis=1)
df_merge_airport_location_delayed_flights= pd.merge( df_airport_location, df_delayed_flights_by_origin, on = 'Airport', how = 'right')
df_merge_airport_location_delayed_flights = df_merge_airport_location_delayed_flights.sort_values(by=['LateAircraftDelay'], ascending = False)
df_merge_airport_location_delayed_flights is sorted from most amount of delays to least delays. The limits variable below is used to separate the two airports with the highest amount of delays (in blue on the map) from the rest of the airports (in orange). The size of each bubble is determined by the number of delays from each airport.
df_merge_airport_location_delayed_flights['text'] = df_merge_airport_location_delayed_flights['Airport'] + ': ' + df_merge_airport_location_delayed_flights['City'] + ' <br>Number Of Delays: ' + (df_merge_airport_location_delayed_flights['LateAircraftDelay']).astype(str) + '<br>' + (df_merge_airport_location_delayed_flights['PercentofDelayedFlights']).astype(str) + '% of flights delayed'
limits = [(0,2), (3,302)]
colors = ["rgb(255,65,54)","rgb(0,116,217)"]
names = ['Greater than 125,000 delays', 'Less than 125,000 delays']
airports = []
scale = 50
for i in range(len(limits)):
lim = limits[i]
df_sub1 = df_merge_airport_location_delayed_flights[lim[0]:lim[1]]
airport = dict(
type = 'scattergeo',
locationmode = 'USA-states',
lon = df_sub1['Lon'],
lat = df_sub1['Lat'],
text = df_sub1['text'],
marker = dict(
size = df_sub1['LateAircraftDelay']/scale,
colors = colors [i],
line = dict(width=1, color='rgb(40,40,40)'),
sizemode = 'area'
),
name = names[i])
airports.append(airport)
layout = dict(
title = 'Amount of Delays per US Airport in 2008<br>(Click Legend to Toggle Traces)',
showlegend = (True),
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showland = True,
landcolor = 'rgb(217, 217, 217)',
subunitwidth=1,
countrywidth=1,
subunitcolor="rgb(255, 255, 255)",
countrycolor="rgb(255, 255, 255)",
),
)
fig2 = dict( data=airports, layout=layout )
py.iplot(fig2, validate=False, filename='d3-bubble-map-populations')
#py.offline.iplot( fig2, validate=False, filename='d3-bubble-map-populations' )
High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~skenny/0 or inside your plot.ly account where it is named 'd3-bubble-map-populations'
Each bubble represents an airport and the size of the bubbles indicate the amount of the delays relative to the other airports. The airports with most amount of delays are ATL in Atlanta and ORD in Chicago.
ATL had 131,613 delays which constitutes 31.75% of all flights from ATL. ORD had 125,979 delays which constitutes 35.95% of all flights from ORD.
Some airports with a lower amount of delays had a higher percentage of delayed flights. For example, AKN in King Salmon had 72 delays which constitutes 62.07% of all flights from AKN.