Conver the original data

obtained from https://www.transtats.bts.gov/DL_SelectFields.asp

on Sunday, 8-Sept-2019

Converting the original airlines data to hdf5

In [1]:
import os 
import glob
from zipfile import ZipFile

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', 500)

import vaex

from tqdm import  tqdm_notebook as tqdm

Settings

In [2]:
# Columns to read
columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 
           'Reporting_Airline', 'Flight_Number_Reporting_Airline', 'Tail_Number', 
           'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',
           'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 
           'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

# Force dtypes for these columns - some choices are made to optimize disk space
dtypes = {
'Year': np.int16,
'Month': np.int8,
'DayofMonth': np.int16,
'DayOfWeek': np.int8,
'DepTime': 'Int16',
'CRSDepTime': 'Int16',
'ArrTime': 'Int16',
'CRSArrTime': 'Int16',
'Reporting_Airline': np.object,
'Flight_Number_Reporting_Airline': np.int32,
'Tail_Number': np.object,
'ActualElapsedTime': 'Int32',
'CRSElapsedTime': 'Int32',
'AirTime': 'Int32',
'ArrDelay': 'Int32',
'DepDelay': 'Int32',
'Origin': np.object,
'Dest': np.object,
'Distance': 'Int32',
'TaxiIn': 'Int32',
'TaxiOut': 'Int32',
'Cancelled': 'Int8',
'CancellationCode': np.object,
'Diverted': 'Int8',
'CarrierDelay': 'Int32',
'WeatherDelay': 'Int32',
'NASDelay': 'Int32',
'SecurityDelay': 'Int32',
'LateAircraftDelay': 'Int32',
}

# Set up a renaming dictionary, in order to make the column names to match the well known data from 
# http://stat-computing.org/dataexpo/2009/the-data.html
rename_dict = {
    'DayofMonth': 'DayOfMonth',
    'Reporting_Airline': 'UniqueCarrier',
    'Flight_Number_Reporting_Airline': 'FlightNum',
    'Tail_Number': 'TailNum'
}

Conversion

In [3]:
# Set up the list of zip files to be opened and converted
zip_list = np.sort(np.array(glob.glob('./airlines-us-original/raw/*.zip')))[::-1]

# The output directory
output_dir = './airlines-us-original/hdf5/'
In [4]:
# The magic happens heree:
for file in tqdm(zip_list, leave=False, desc='Converting to hdf5...'):
    # Setting up the files, and directories
    zip_file = ZipFile(file)
    output_file = file.split('/')[-1][:-3]+'hdf5'
    output = output_dir + output_file
    
    # Check if a converted file already exists: if it does skip it, otherwise read in the raw csv and convert it
    if (os.path.exists(output) and os.path.isfile(output)):
        pass
    else:
        # Importing the data into pandas 
        pandas_df = [pd.read_csv(zip_file.open(text_file.filename), 
                                 encoding='latin',
                                 usecols=columns,
                                 dtype=dtypes,)
                     for text_file in zip_file.infolist()
                     if text_file.filename.endswith('.csv')][0]
        # Rename some columns to match the more well known dataset from 
        # http://stat-computing.org/dataexpo/2009/the-data.html
        pandas_df.rename(columns=rename_dict, inplace=True)

        # Importing the data from pandas to vaex
        vaex_df = vaex.from_pandas(pandas_df, copy_index=False)
        
        # Export the data with vaex to hdf5
        vaex_df.export_hdf5(path=output, progress=False)

Notes:

  • number of columns in the Data expo version: 29

Here we go:

  1. Year -> Year
  • Month -> Month
  • DayOfMonth -> DayofMonth
  • DayOfWeek -> DayOfWeek
  • DepTime -> DepTime
  • CRSDepTime -> CRSDepTime
  • ArrTime -> ArrTime
  • CRSArrTime -> CRSArrTime
  • Reporting_Airline -> UniqueCarrier
  • Flight_Number_Reporting_Airline -> FlightNum
  • Tail_Number -> TailNum
  • ActualElapsedTime -> ActualElapsedTime
  • CRSElapsedTime -> CRSElapsedTime
  • AirTime -> AirTime
  • ArrDelay -> ArrDelay
  • DepDelay -> DepDelay
  • Origin -> Origin
  • Dest -> Dest
  • Distance -> Distance
  • TaxiIn -> TaxiIn
  • TaxiOut -> TaxiOut
  • Cancelled -> Cancelled
  • CancellationCode -> CancellationCode
  • Diverted -> Diverted
  • CarrierDelay -> CarrierDelay
  • WeatherDelay -> WeatherDelay
  • NASDelay -> NASDelay
  • SecurityDelay -> SecurityDelay
  • LateAircraftDelay -> LateAircraftDelay

Now merge all the little files together

Note that macOS has a rather low limit on the number of files one can open at one time. To circumvent this issue, in a terminal run:

>ulimit -n 9999

It should be the same terminal from which later the jupyter server is started.

In [1]:
import re
import glob
import vaex
import numpy as np

def tryint(s):
    try:
        return int(s)
    except:
        return s

def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [ tryint(c) for c in re.split('([0-9]+)', s) ]
In [2]:
hdf5_list = glob.glob('./airlines-us-original/hdf5/*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)

assert len(hdf5_list) == 372, "Incorrect number of files"
In [7]:
# This is an important step
master_df = vaex.open_many(hdf5_list)

# exporting
master_df.export_hdf5(path='./airline_data_1988_2018.hd5', progress=True)
[########################################]:  100.00% estimated time:        0s =  0.0m =  0.0h                                                                   
In [10]:
# Check how the single file looks like:
df = vaex.open('./airline_data_1988_2018.hd5')
df
Out[10]:
# Year Month DayOfMonth DayOfWeek UniqueCarrier TailNum FlightNum Origin Dest CRSDepTime DepTime DepDelay TaxiOut TaxiIn CRSArrTime ArrTime ArrDelay Cancelled CancellationCode Diverted CRSElapsedTime ActualElapsedTime AirTime Distance CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay
0 1988 1 8 5 PI None 930 BGM ITH 1525 1532 7 -- -- 1545 1555 10 0 None 0 20 23 -- 32 -- -- -- -- --
1 1988 1 9 6 PI None 930 BGM ITH 1525 1522 -3 -- -- 1545 1535 -10 0 None 0 20 13 -- 32 -- -- -- -- --
2 1988 1 10 7 PI None 930 BGM ITH 1525 1522 -3 -- -- 1545 1534 -11 0 None 0 20 12 -- 32 -- -- -- -- --
3 1988 1 11 1 PI None 930 BGM ITH 1525 -- -- -- -- 1545 -- -- 1 None 0 20 -- -- 32 -- -- -- -- --
4 1988 1 12 2 PI None 930 BGM ITH 1525 1524 -1 -- -- 1545 1540 -5 0 None 0 20 16 -- 32 -- -- -- -- --
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
183,821,9212018 12 27 4 DL N705TW 864 JFK SLC 935 929 -6 28 9 1311 1230 -41 0 None 0 336 301 264 1990 -- -- -- -- --
183,821,9222018 12 27 4 DL N336NB 865 MSP SLC 1035 1030 -5 14 10 1240 1214 -26 0 None 0 185 164 140 991 -- -- -- -- --
183,821,9232018 12 27 4 DL N945DN 866 DEN MSP 1054 1100 6 10 5 1353 1350 -3 0 None 0 119 110 95 680 -- -- -- -- --
183,821,9242018 12 27 4 DL N945DN 866 MSP DEN 850 850 0 15 8 1010 951 -19 0 None 0 140 121 98 680 -- -- -- -- --
183,821,9252018 12 27 4 DL N901DE 867 ATL SYR 1455 1500 5 19 3 1703 1703 0 0 None 0 128 123 101 794 -- -- -- -- --