import vaex
from vaex.ui.colormaps import cm_plusmin
import numpy as np
import pylab as plt
import seaborn as sns
# matplotlib parameters
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
The original data can be downloaded from the United States Department of Transportation. The selected columns are the same as in this older, better-known dataset used for competitions, available here.
Please refer to this notebook on how to convert the raw data to the memory-mappable HDF5 format.
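In short, the conversion amounts to something like the sketch below; the file path here is hypothetical, and the linked notebook is the authoritative recipe.
# Read the raw CSV in chunks and write a memory-mappable HDF5 file alongside it (hypothetical path)
df = vaex.from_csv('./original_data/csv/airline_data_1988_2018.csv', convert=True)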
# read in the data
df = vaex.open('./original_data/hdf5/airline_data_1988_2018.hd5')
# Get a high level overview of the DataFrame
df.describe()
This filters out unphysical measurements of durations and distances.
# Keep missing values if they fall within the selection criteria.
df_filtered = df[((df.ActualElapsedTime>=0).fillmissing(True)) &
((df.CRSElapsedTime>=0).fillmissing(True)) &
((df.AirTime>0).fillmissing(True)) &
((df.Distance > 0).fillmissing(True))]
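The trailing fillmissing(True) makes each comparison evaluate to True wherever the underlying value is missing, so rows with missing durations or distances are kept rather than silently dropped. A toy sketch of the idea (made-up data; behavior as I understand vaex's expression API):
# Toy DataFrame with one masked (missing) value
toy = vaex.from_arrays(x=np.ma.MaskedArray([1.0, -5.0, 2.0], mask=[False, False, True]))
print(len(toy[toy.x >= 0]))                      # 1: the masked row is dropped
print(len(toy[(toy.x >= 0).fillmissing(True)]))  # 2: the masked row is kept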
# Describe the filtered dataset
df_filtered.describe()
Let's start by looking at the number of scheduled flights per year.
flights_years = df_filtered.Year.value_counts()
plt.figure(figsize=(18,4))
sns.barplot(x=flights_years.index, y=flights_years.values)
plt.xlabel('Year')
plt.ylabel('Total number of flights')
plt.xticks(rotation='vertical')
plt.show()
The dip in flights in 2002 is due to new regulations following the tragedy in 2001. After that, following a year of adjustment to the new protocols, the number of flights started increasing again.
Let's find the busiest airports.
origin_value_counts = df_filtered.Origin.value_counts()
dest_value_counts = df_filtered.Dest.value_counts()
plt.figure(figsize=(12, 5))
plt.subplot(121)
sns.barplot(x=origin_value_counts.index[:11], y=origin_value_counts.values[:11])
plt.title('Most frequent origins')
plt.xlabel('Airport code', fontsize=14)
plt.ylabel('Number of take-offs')
plt.subplot(122)
sns.barplot(x=dest_value_counts.index[:11], y=dest_value_counts.values[:11])
plt.title('Most frequent destinations')
plt.xlabel('Airport code')
plt.ylabel('Number of landings')
plt.tight_layout()
plt.show()
Let's look at the number of flights per day of week and month, as well as the number of flights per hour of day and day of week.
df_filtered['Month'] = df_filtered.Month - 1 # to make it start from 0
df_filtered['DayOfWeek'] = df_filtered.DayOfWeek - 1 # to make it start from 0
# label them as categories
df_filtered.categorize(column='Month')
df_filtered.categorize(column='DayOfWeek')
# Helper lists for labelling the plots
label_month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
label_day_list = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Plot number of flights per month vs day of week
df_filtered.plot('Month', 'DayOfWeek', colorbar=True, colormap=cm_plusmin, figsize=(15, 5))
plt.xticks(np.arange(12), label_month_list)
plt.yticks(np.arange(7), label_day_list)
plt.xlabel('Month')
plt.ylabel('Day of week')
plt.tick_params(labelsize=12)
plt.show()
# Extract CRS Hour of departure
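# CRSDepTime is stored as an hhmm integer (e.g. 1435 means 14:35), so // 100 extracts the hour,
# and % 24 presumably folds the occasional 2400 entry back to hour 0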
df_filtered['CRSDepHour'] = df_filtered.CRSDepTime // 100 % 24
# Treat as a categorical
df_filtered.categorize(column='CRSDepHour')
# Plot number of flights per departure hour vs day of week
df_filtered.plot('CRSDepHour', 'DayOfWeek', colorbar=True, colormap=cm_plusmin, figsize=(15, 5))
plt.xticks(np.arange(24), np.arange(24))
plt.yticks(np.arange(7), label_day_list)
plt.tick_params(labelsize=12)
plt.show()
Now, let's look at the overall distribution of the flight durations, distances and speed of flight.
For some of the distributions the limits are adjusted to clip obvious outliers due to likely faulty data recordings.
plt.figure(figsize=(18, 5))
plt.subplot(131)
df_filtered.plot1d('AirTime', limits=[0, 500], lw=3, shape=64)
plt.xlabel('AirTime [min]')
plt.subplot(132)
df_filtered.plot1d('Distance', limits='minmax', lw=3, shape=64)
plt.xlabel('Distance [miles]')
plt.subplot(133)
# Calculate the mean speed of the aircraft
df_filtered['Speed'] = df_filtered.Distance / (df_filtered.AirTime/60.) # this is in miles per hour
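# Unit sanity check with made-up numbers: 500 miles in 75 minutes -> 500 / (75/60) = 400 mph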
df_filtered.plot1d('Speed', limits=[100, 700], lw=3, shape=64)
plt.xlabel('Speed [miles/hour]')
plt.tight_layout()
plt.show()
Out of curiosity, let's find the closest airports that had a regular connection between them.
# Check if there are frequent flights at uber-short distances
df_filtered[df_filtered.Distance <= 20].Distance.value_counts(progress=True)
OK, that is a rather large number of flights for a distance of 11 miles. Let's find the airports in question.
# Let's see the origins for this short route
df_filtered[df_filtered.Distance == 11.].Origin.value_counts()
It looks like there are quite a large number of flights between SFO and OAK airports. Let's see how that connection evolved throughout the years.
# How the number of the shortest destination flights evolved through the years
nf_sfo_oak = df_filtered[(df_filtered.Distance == 11.) & ((df_filtered.Origin=='SFO') | (df_filtered.Origin == 'OAK'))]['Year'].value_counts()
nf_jfk_lga = df_filtered[(df_filtered.Distance == 11.) & ((df_filtered.Origin=='JFK') | (df_filtered.Origin == 'LGA'))]['Year'].value_counts()
# plot it
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(16, 4)
sns.barplot(x=nf_sfo_oak.index, y=nf_sfo_oak.values, ax=ax1)
sns.barplot(x=nf_jfk_lga.index, y=nf_jfk_lga.values, ax=ax2)
ax1.set_title('Number of flights between San Francisco and Oakland airports')
ax2.set_title('Number of flights between John F. Kennedy and LaGuardia airports')
ax1.set_ylabel('Number of flights')
ax1.set_xlabel('Year')
ax2.set_xlabel('Year')
plt.tight_layout()
plt.show()
It looks like a regular connection between San Francisco and Oakland existed for at least 5 years!
In NYC, only a handful of flights were scheduled over the years, probably private or special services.
Let's find the most distant airports that have a regular connection.
# Check if there are frequent flights at uber-long distances
df_filtered[df_filtered.Distance > 4900].Distance.value_counts(progress=True)
So we have a few common long-distance routes. Let's examine them.
# Group by Origin, Destination and Distance, and count the number of flights
df_filtered[df_filtered.Distance > 4900].groupby(by=['Origin', 'Dest', 'Distance'], agg={'Origin':'count'})
There are essentially two pairs of airports: Newark (EWR) --- Honolulu (HNL), and John F. Kennedy (JFK) --- Honolulu (HNL).
Notice that between EWR and HNL there are two recorded distances, which are very close to each other. We will examine this in turn.
Now let's look at the number of flights per year between these airport pairs.
# Flights between Newark and Honolulu airports (the more common distance of 4962 miles)
nf_ewr_nhl = df_filtered[(df_filtered.Distance == 4962)]['Year'].value_counts()
plt.figure(figsize=(14, 4))
sns.barplot(x=nf_ewr_nhl.index, y=nf_ewr_nhl.values)
plt.title('Number of flights between Newark Liberty International and Honolulu International airports')
plt.ylabel('Number of flights')
plt.xlabel('Year')
plt.show()
# Flights between Newark and Honolulu airports (the less common distance of 4963 miles)
nf_ewr_nhl = df_filtered[(df_filtered.Distance == 4963)]['Year'].value_counts()
plt.figure(figsize=(14, 4))
sns.barplot(x=nf_ewr_nhl.index, y=nf_ewr_nhl.values)
plt.title('Number of flights between Newark Liberty International and Honolulu International airports')
plt.ylabel('Number of flights')
plt.xlabel('Year')
plt.show()
The flights between Honolulu International Airport and Newark Liberty International should travel exactly 4962 miles. However, due to the tragedy in 2001, the trajectory of the route was changed slightly, accounting for the change in the distance flown. If we sum up the above two histograms, we can see that the service was never suspended: 2 flights per day since 1999!
# Flights between Newark and Honolulu airports (all distances)
nf_ewr_nhl = df_filtered[(df_filtered.Distance == 4962) |
                         (df_filtered.Distance == 4963)]['Year'].value_counts()
plt.figure(figsize=(14, 4))
sns.barplot(x=nf_ewr_nhl.index, y=nf_ewr_nhl.values)
plt.title('Number of flights between Newark Liberty International and Honolulu International airports')
plt.ylabel('Number of flights')
plt.xlabel('Year')
plt.show()
# Pretty much 2 per day since 1999
Let's do the same, now for the JFK --- HNL pair.
# Flights between John F. Kennedy and Honolulu airports
nf_jfk_nhl = df_filtered[(df_filtered.Distance == 4983)]['Year'].value_counts()
plt.figure(figsize=(14, 4))
sns.barplot(x=nf_jfk_nhl.index, y=nf_jfk_nhl.values)
plt.title('Number of flights between John F. Kennedy International and Honolulu International airports')
plt.ylabel('Number of flights')
plt.xlabel('Year')
plt.show()
Pretty much 2 flights per day since 2013 as well.
There are a lot of airports in this dataset, but most of them do not contribute much to the overall traffic.
# Start by checking the number of flights per origin
n_flights_per_origin = df_filtered.Origin.value_counts(progress=True)
# See how many of the airports "contribute" to the total traffic.
print('Total number of flights:', n_flights_per_origin.sum())
print('Total number of unique airports:', len(n_flights_per_origin))
for n in (10, 30, 50, 100):
    perc = n_flights_per_origin[:n].sum() / n_flights_per_origin.sum() * 100
    print('Percentage of total flights contributed by the %d most frequent airports: %.1f%%' % (n, perc))
Now we want to turn our attention to just the dominant transport hubs, i.e. the more frequently used airports. To do this, let's construct a DataFrame in which we aggregate some of the flight data we are interested in, based on the point of Origin of each flight. We will also count the number of flights departing from each Origin.
From this grouped DataFrame, let's filter out airports with fewer than 200,000 departures. The rationale is the following: for an airport to be considered frequently used, we require it to have, on average, about 20 departures per day (one daily flight to each of roughly 20 destinations) for each of the more than 30 years in the data. These criteria are of course arbitrary; see the quick check below.
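As a quick back-of-the-envelope check of that threshold (just the rationale spelled out, no data involved):
# ~20 departures per day, every day, for ~30 years
print(20 * 365 * 30)  # 219000, the same order of magnitude as the 200_000 cut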
# Group by origin, calculate some aggregate stats
df_group_by_origin = df_filtered.groupby(by='Origin', agg={'Year': 'count',
'Distance': ['mean', 'std'],
'TaxiIn': ['mean', 'std'],
'TaxiOut': ['mean', 'std'],
'DepDelay': ['mean', 'std'],
})
# Require to have at least 200k flights, to be considered a typical commercial airport
# (~20 destinations with one flight per day for at least 30 years)
df_group_by_origin = df_group_by_origin[(df_group_by_origin['count'] > 200_000)]
df_group_by_origin
Using the aggregations, we can examine, for instance, which are the most far-reaching airports.
# Sort the grouped dataframe by the mean distance, and show the top 10 rows.
df_group_by_origin.sort(by='Distance_mean', ascending=False).head(10)
The furthest reaching airports are of course those located in remote places like Hawaii, Alaska, or Puerto Rico. In addition to these, the largest central hubs like LA, NY, SF, Miami are also far-reaching.
Next, let's find the mean of all the mean distances in the grouped DataFrame. This gives a "natural" distance scale between big cities and popular destinations in the U.S., i.e. a sense of the typical separation between important destinations.
# Mean Distance
df_group_by_origin.Distance_mean.mean()
# Median distance
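# The grouped DataFrame is tiny (~115 rows), so pulling the column into memory with .values is cheap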
np.median(df_group_by_origin.Distance_mean.values)
Let's look at the distribution of mean taxi-in and taxi-out times for the frequent airports.
# Taxiing
plt.figure(figsize=(8, 4))
df_group_by_origin.plot1d('TaxiIn_mean', limits=[0, 30], lw=2, shape=64, label='Taxi-in')
df_group_by_origin.plot1d('TaxiOut_mean', limits=[0, 30], lw=2, shape=64, label='Taxi-out')
plt.xlabel('Taxi [min]')
plt.legend()
plt.show()
# Taxiing out on average takes much longer than taxiing in:
# makes sense, you don't wait long once the plane lands, but you can often wait in a queue before taking off
Let's point fingers and find the airports with the shortest and longest taxi times.
# Shortest taxi in
print('Shortest Taxi-In:')
display(
df_group_by_origin.sort(by='TaxiIn_mean').head(1)
)
# Longest taxi in
print('Longest Taxi-In:')
display(
df_group_by_origin.sort(by='TaxiIn_mean', ascending=False).head(1)
)
# shortest taxi out:
print('Shortest Taxi-out:')
display(
df_group_by_origin.sort(by='TaxiOut_mean').head(1)
)
print('Longest Taxi-out:')
display(
df_group_by_origin.sort(by='TaxiOut_mean', ascending=False).head(1)
)
# DAL - Dallas Love Field Airport (Texas)
# ICT - Wichita Dwight D. Eisenhower (Kansas)
# LBB - Lubbock Preston Smith International Airport (Texas)
# JFK - John F. Kennedy International Airport (NYC)
To create a DataFrame containing only the most frequent origins, as selected above, we do an inner join between the main DataFrame and the grouped DataFrame, from which the less frequently used airports have already been filtered out.
# Now only consider the flights departing from the top 115 Origins.
df_top_origins = df_filtered.join(other=df_group_by_origin,
on='Origin',
how='inner',
rsuffix='_')
Now let's create a new virtual column (takes no memory!) to indicate whether a flight has been delayed or not. Note that such a column already exists regarding cancellations.
# Whether a flight was delayed
df_top_origins['Delayed'] = (df_top_origins.DepDelay > 0).astype('int')
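Since Delayed is virtual, vaex stores only its defining expression rather than a new array; a quick way to confirm this (a sketch, assuming the virtual_columns mapping vaex exposes on DataFrames):
# The column is backed by an expression, not by data in memory
print(df_top_origins.virtual_columns['Delayed'])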
Let's see how some of the flight properties change through time. To do this, let's group them by year.
# group by year
df_group_by_year = df_top_origins.groupby(by='Year',
agg={'Year': 'count',
'Distance': ['mean', 'std'],
'TaxiIn': ['mean', 'std'],
'TaxiOut': ['mean', 'std'],
'DepDelay': ['mean', 'std', 'sum'],
'Cancelled': ['sum'],
'Delayed': ['sum'],
'Diverted': ['sum']})
How have the mean taxi times changed throughout the years? Note that these measurements were not recorded prior to 1995.
plt.figure(figsize=(18, 4))
plt.subplot(121)
sns.barplot(x=df_group_by_year.Year.values, y=df_group_by_year.TaxiIn_mean.values)
plt.xticks(rotation='vertical')
plt.xlabel('Year')
plt.ylabel('Duration [min]')
plt.title('Mean Taxi-in duration')
plt.subplot(122)
sns.barplot(x=df_group_by_year.Year.values, y=df_group_by_year.TaxiOut_mean.values)
plt.xticks(rotation='vertical')
plt.xlabel('Year')
plt.title('Mean Taxi-out duration')
plt.tight_layout()
plt.show()
The evolution of the mean distance over time gives an indication of whether airports are being connected to more distant destinations.
plt.figure(figsize=(12, 4))
sns.barplot(x=df_group_by_year.Year.values, y=df_group_by_year.Distance_mean.values)
plt.xticks(rotation='vertical')
plt.xlabel('Year')
plt.ylabel('Mean Distance [miles]')
plt.title('Mean distance traveled by the flights')
plt.show()
Interesting: the mean distance increases with time, then becomes somewhat stable, and increases again after 2014. Probably new destinations are being added or connected to each other.
Now let's turn to something more interesting: the number of flight delays and cancellations over time.
plt.figure(figsize=(14, 4))
sns.barplot(x=df_group_by_year.Year.values, y=df_group_by_year.Delayed_sum.values)
plt.xticks(rotation='vertical')
plt.xlabel('Year')
plt.ylabel('Delays')
plt.title('Number of delays per year')
plt.show()
plt.figure(figsize=(14, 4))
sns.barplot(x=df_group_by_year.Year.values, y=df_group_by_year.Cancelled_sum.values)
plt.xticks(rotation='vertical')
plt.xlabel('Year')
plt.ylabel('Cancellations')
plt.title('Number of cancellations per year')
plt.show()
Find the total number of cancellations per year for each cancellation type, for these most common airports.
First let's check if the number of cancellations and cancellation codes match:
# Number of flights that have cancellation codes:
df_top_origins.CancellationCode.value_counts(progress=True)
We see that there are two types of "missing" values for the cancellation codes: empty strings and masked values. Let's see whether either shows any indication of cancelled flights.
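Both kinds can also be counted directly with the expression API (a sketch; summing a boolean expression counts its True values):
# Count each kind of "missing" cancellation code
print('empty strings:', (df_top_origins.CancellationCode == '').sum())
print('masked values:', df_top_origins.CancellationCode.ismissing().sum())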
# There are cancellation codes that are empty strings. Let's see what they contain
df_top_origins[df_top_origins.CancellationCode==''].Cancelled.value_counts(progress=True)
So in the current dataset, all cancellation codes that are empty strings are flights which were not cancelled.
Now let's do the same for the masked values.
# Check for cancelled flights masked values of the cancellation codes
df_top_origins[df_top_origins.CancellationCode.ismissing()].Cancelled.value_counts(progress=True)
So there are 2_547_952 flights which were cancelled, for which there is no cancellation code. We will interpret this as "reason unknown" and will simply ignore these.
Next up, let's check the total number of cancelled flights.
# Check the total number of cancelled flights
df_top_origins.Cancelled.value_counts(progress=True)
Let's check for consistency: the total number of cancelled flights should be equal to the sum of known cancellation codes (A, B, C, D) + the cancelled flights with unknown cancellation code.
# Check for consistency
# Sum the value counts from the different cancellation codes + those of cancelled flights with missing codes
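# Terms: the 2547952 cancelled flights with a missing code, plus the counts of the four known
# cancellation codes from the value counts above, minus the 2986709 total cancelled flights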
print(2547952 + 216133 + 147315 + 75065 + 257 - 2986709)
This means that there are 13 flights that do have a cancellation code, but were not marked as cancelled. Let's check this in another way: count the number of flights which were not cancelled, but have some cancellation code.
# Verify: count flights not marked as cancelled that nevertheless carry a cancellation code
((df_top_origins.Cancelled == 0) & ((df_top_origins.CancellationCode == 'A') |
(df_top_origins.CancellationCode == 'B') |
(df_top_origins.CancellationCode == 'C') |
(df_top_origins.CancellationCode == 'D'))).sum()
# This is such a small discrepancy that we decide to let it slide.
# In the following analysis of the cancellation code, we will ignore the unknown reason for the cancelled flights
# with unspecified cancellation code
We recover the same number of flights (13) which do have a cancellation code but were marked as not cancelled. This is such a small discrepancy that we decide to ignore it for the present analysis.
Now that we have all this figured out, let's count the number of cancelled flights for each reason per year. Note that the cancellation code field was introduced in the dataset in 2003, so we don't need to consider earlier years for this grouping.
# Now groupby year and a cancellation code
# The cancellation code was introduced in the database after 2002, hence the filtering
df_year_cancel = df_top_origins[df_top_origins.Year > 2002].groupby(by=['Year', 'CancellationCode'],
agg={'Cancelled': ['sum']})
# Keep only the portion of the DataFrame that contains a proper cancellation code:
df_year_cancel = df_year_cancel[~((df_year_cancel.CancellationCode == '') |
(df_year_cancel.CancellationCode == 'null') )]
Let's add more meaningful names to the cancellation codes. The designations come from the documentation available at the archive from which this dataset was obtained.
# Add more meaningful names to the cancellation codes
canc_code_mapper = {'A': 'carrier',
'B': 'weather',
'C': 'National Airspace System',
'D': 'Security'}
df_year_cancel['CancellationCode_'] = df_year_cancel.CancellationCode.map(canc_code_mapper, allow_missing=False)
Now let's make a plot showing the number of cancellations per reason, per year. The seaborn library is quite useful for this.
plt.figure(figsize=(18, 5))
sns.barplot(x='Year',
y='Cancelled_sum',
hue='CancellationCode_',
data=df_year_cancel.to_pandas_df(virtual=True),
palette=sns.color_palette(['C0', 'C1', 'C2', 'C3', 'C6']))
plt.yscale('log')
plt.legend(bbox_to_anchor=(0.00, 1.1), loc=2, borderaxespad=0., ncol=4)
plt.xticks(rotation='vertical')
plt.ylabel('Number of cancellations')
plt.show()
Let's point fingers again and see at which airports flights experience the most delays and cancellations.
# Group by Origin to investigate delays and cancellations
df_group_by_origin_cancel = df_top_origins.groupby(by='Origin').agg({'Delayed': ['count', 'sum'],
'Cancelled': ['sum']})
Let's add a couple of new features: the percentage of flights that are delayed or cancelled, for each Origin. Then plot the results.
df_group_by_origin_cancel['delayed_perc'] = df_group_by_origin_cancel.Delayed_sum/df_group_by_origin_cancel['count'] * 100.
df_group_by_origin_cancel['cancelled_perc'] = df_group_by_origin_cancel.Cancelled_sum/df_group_by_origin_cancel['count'] * 100.
plt.figure(figsize=(14, 4))
plt.subplot(121)
sns.barplot(x=df_group_by_origin_cancel.sort(by='delayed_perc', ascending=False)['Origin'].tolist()[:11],
            y=df_group_by_origin_cancel.sort(by='delayed_perc', ascending=False)['delayed_perc'].tolist()[:11])
# plt.xticks(rotation='vertical')
plt.xlabel('Origin airport code')
plt.ylabel('Delayed flights (%)')
plt.title('Origin of most commonly delayed flights')
plt.subplot(122)
sns.barplot(x=df_group_by_origin_cancel.sort(by='cancelled_perc', ascending=False)['Origin'].tolist()[:11],
            y=df_group_by_origin_cancel.sort(by='cancelled_perc', ascending=False)['cancelled_perc'].tolist()[:11])
# plt.xticks(rotation='vertical')
plt.xlabel('Origin airport code')
plt.ylabel('Cancelled flights (%)')
plt.title('Origin of most commonly cancelled flights')
plt.tight_layout()
plt.show()
Pittsburgh International (PIT), Chicago Midway International (MDW) and Dallas/Fort Worth International (DFW) are the top 3 airports with the largest percentage of delayed flights.
Westchester County Airport (HPN), LaGuardia Airport (LGA) and Portland International Jetport (PWM) are the top 3 airports with the largest percentage of cancelled flights.
Now let's really point fingers and figure out which carriers experience the most delays and cancellations, regardless of the point of origin.
# Group by carrier to investigate delays and cancellations
df_group_by_carrier_cancel = df_top_origins.groupby(by='UniqueCarrier', agg={'Delayed': ['count', 'sum'],
'Cancelled': ['sum']})
# Add new features: percentage of flights cancelled or delayed
df_group_by_carrier_cancel['delayed_perc'] = df_group_by_carrier_cancel.Delayed_sum/df_group_by_carrier_cancel['count'] * 100.
df_group_by_carrier_cancel['cancelled_perc'] = df_group_by_carrier_cancel.Cancelled_sum/df_group_by_carrier_cancel['count'] * 100.
To attach the carrier names we will use the airline carrier auxiliary table from http://stat-computing.org/dataexpo/2009/supplemental-data.html. Now, this website is rather old, so it is possible that some airlines have merged or obtained new codes, or that new airlines have emerged. Bottom line: the table may not be complete, but let's try it out here anyway.
# Load the carrier name data
aux_df_carrers = vaex.from_csv('./data/aux-carriers.csv', copy_index=False)
# Create a mapper
mapper_carriers = dict(zip(aux_df_carrers.Code.values, aux_df_carrers.Description.values))
# Do the mapping
df_group_by_carrier_cancel['CarrierName'] = df_group_by_carrier_cancel.UniqueCarrier.map(mapper=mapper_carriers)
# Some of the carriers have very long names, so let's shorten them a bit
df_group_by_carrier_cancel['CarrierName_short'] = df_group_by_carrier_cancel.CarrierName.str.slice(start=0, stop=23)
plt.figure(figsize=(14, 6))
plt.subplot(121)
sns.barplot(x=df_group_by_carrier_cancel.sort(by='delayed_perc', ascending=False)['delayed_perc'].tolist()[:11],
            y=df_group_by_carrier_cancel.sort(by='delayed_perc', ascending=False)['CarrierName_short'].tolist()[:11])
plt.ylabel('Carrier')
plt.xlabel('% of delayed flights')
plt.title('Airlines with the most delays')
plt.subplot(122)
sns.barplot(x=df_group_by_carrier_cancel.sort(by='cancelled_perc', ascending=False)['cancelled_perc'].tolist()[:11],
            y=df_group_by_carrier_cancel.sort(by='cancelled_perc', ascending=False)['CarrierName_short'].tolist()[:11])
plt.ylabel('Carrier')
plt.xlabel('% of cancelled flights')
plt.title('Airlines with the most cancellations')
plt.tight_layout()
plt.show()
Enough of pointing fingers, let's give some kudos. Let's determine which airlines have the fewest delays and cancellations.
plt.figure(figsize=(14, 6))
plt.subplot(121)
sns.barplot(x=df_group_by_carrier_cancel.sort(by='delayed_perc', ascending=True)['delayed_perc'].tolist()[:11],
            y=df_group_by_carrier_cancel.sort(by='delayed_perc', ascending=True)['CarrierName_short'].tolist()[:11])
plt.ylabel('Carrier')
plt.xlabel('% of delayed flights')
plt.title('Airlines with the fewest delays')
plt.subplot(122)
sns.barplot(x=df_group_by_carrier_cancel.sort(by='cancelled_perc', ascending=True)['cancelled_perc'].tolist()[:11],
            y=df_group_by_carrier_cancel.sort(by='cancelled_perc', ascending=True)['CarrierName_short'].tolist()[:11])
plt.ylabel('Carrier')
plt.xlabel('% of cancelled flights')
plt.title('Airlines with the fewest cancellations')
plt.tight_layout()
plt.show()
"Safest" airlies in terms of delays or cancellations are the Hawaian. Makes sense, not much traffic jam happening on the islands, and if flights go outside, they travel a long way to they are probably better cared for.
Next up, let's see the mean number of unique destinations each airport connects to, and how this number evolves through time.
# Aggregate the number of unique destination for each origin for each year
tmp_group = df_top_origins.groupby(by=['Year', 'Origin'], agg={'Year': 'count',
'Dest': [vaex.agg.nunique(df_top_origins.Dest,
dropna=True)]})
# Calculate the mean number of destinations for all origins per year
df_mean_num_dest = tmp_group.groupby('Year').agg({'Dest_nunique': ['mean']})
plt.figure(figsize=(14, 4))
sns.barplot(x=df_mean_num_dest.Year.values, y=df_mean_num_dest.Dest_nunique_mean.values)
plt.xticks(rotation='vertical')
plt.xlabel('Year')
plt.ylabel('Mean number of destinations')
plt.show()
A surge of new destinations in 2018! The world is getting more connected than ever.
Let's take the auxiliary plane data from http://stat-computing.org/dataexpo/2009/supplemental-data.html and look at the typical performance of the aircraft used.
Warning: this data is certainly out of date by now.
# Load the data about the airplane models
aux_df_planes = vaex.from_csv(filename_or_buffer='./data/aux-plane-data.csv', copy_index=False)
# A more convenient display of the model/type of each plane
aux_df_planes['model_type'] = aux_df_planes.manufacturer + ' ' + aux_df_planes.model
# Join the auxiliary plane data to the df_top_origins DataFrame
df_top_origins_aux = df_top_origins.join(other=aux_df_planes,
left_on='TailNum',
right_on='tailnum',
how='left',
rsuffix='_')
Let's make a group based on the model type of the aircraft used. We are interested in the typical speed and distance travelled by each model.
# Group by model type
df_model_type = df_top_origins_aux.groupby(by='model_type',
agg={'Speed': 'mean', 'Distance': 'mean'})
Let's display the fastest and furthest-reaching aircraft models (on average).
# Sort and plot the top 11 fastest and far reaching aircraft.
model_top_speed = df_model_type.sort(by='Speed', ascending=False)[['model_type', 'Speed']]
model_top_distance = df_model_type.sort(by='Distance', ascending=False)[['model_type', 'Distance']]
plt.figure(figsize=(16, 5))
plt.subplot(121)
sns.barplot(x=model_top_distance.Distance[:11].tolist(),
y=model_top_distance.model_type[:11].tolist())
plt.xlabel('Mean distance [miles]')
plt.subplot(122)
sns.barplot(x=model_top_speed.Speed[:11].tolist(),
y=model_top_speed.model_type[:11].tolist())
plt.xlabel('Mean speed [miles/hour]')
plt.tight_layout()
plt.show()