Data obtained from https://www.transtats.bts.gov/DL_SelectFields.asp
on Sunday, 8 September 2019.
Convert the raw CSV archives to HDF5
import os
import glob
from zipfile import ZipFile
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import vaex
from tqdm import tqdm_notebook as tqdm
# Columns to read from the raw BTS on-time-performance CSV files.
columns = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'Reporting_Airline', 'Flight_Number_Reporting_Airline', 'Tail_Number',
    'ActualElapsedTime', 'CRSElapsedTime', 'AirTime',
    'ArrDelay', 'DepDelay', 'Origin', 'Dest',
    'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode',
    'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
    'SecurityDelay', 'LateAircraftDelay',
]
# Force dtypes for these columns - some choices are made to optimize disk space.
# Capitalised names ('Int16', 'Int32', 'Int8') are pandas nullable extension
# dtypes, used for columns that may contain missing values; plain numpy dtypes
# are used for columns that are always present.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (using it raises AttributeError there).
dtypes = {
    'Year': np.int16,
    'Month': np.int8,
    'DayofMonth': np.int16,
    'DayOfWeek': np.int8,
    'DepTime': 'Int16',
    'CRSDepTime': 'Int16',
    'ArrTime': 'Int16',
    'CRSArrTime': 'Int16',
    'Reporting_Airline': object,
    'Flight_Number_Reporting_Airline': np.int32,
    'Tail_Number': object,
    'ActualElapsedTime': 'Int32',
    'CRSElapsedTime': 'Int32',
    'AirTime': 'Int32',
    'ArrDelay': 'Int32',
    'DepDelay': 'Int32',
    'Origin': object,
    'Dest': object,
    'Distance': 'Int32',
    'TaxiIn': 'Int32',
    'TaxiOut': 'Int32',
    'Cancelled': 'Int8',
    'CancellationCode': object,
    'Diverted': 'Int8',
    'CarrierDelay': 'Int32',
    'WeatherDelay': 'Int32',
    'NASDelay': 'Int32',
    'SecurityDelay': 'Int32',
    'LateAircraftDelay': 'Int32',
}
# Renaming map so the column names match the well known data from
# http://stat-computing.org/dataexpo/2009/the-data.html
rename_dict = dict(
    DayofMonth='DayOfMonth',
    Reporting_Airline='UniqueCarrier',
    Flight_Number_Reporting_Airline='FlightNum',
    Tail_Number='TailNum',
)
# Gather the raw zip archives to convert, in reverse (newest-first) name order.
zip_list = np.array(sorted(glob.glob('./airlines-us-original/raw/*.zip'), reverse=True))
# Directory where the converted HDF5 files are written.
output_dir = './airlines-us-original/hdf5/'
# Convert each raw zip archive to an HDF5 file, skipping already-converted ones.
for file in tqdm(zip_list, leave=False, desc='Converting to hdf5...'):
    # Destination path: same base name as the zip, with an .hdf5 extension
    # ('name.zip'[:-3] leaves 'name.', then 'hdf5' is appended).
    output_file = os.path.basename(file)[:-3] + 'hdf5'
    output = output_dir + output_file
    # Skip archives that were already converted (guard clause instead of the
    # original `if exists: pass else:` inversion; isfile() implies exists()).
    if os.path.isfile(output):
        continue
    # Close the archive deterministically — the original leaked the ZipFile
    # handle, which matters given macOS's low open-file limit (see notes below).
    with ZipFile(file) as zip_file:
        # Read only the FIRST csv member: the original list comprehension
        # parsed every csv in the archive into pandas and then kept just [0].
        csv_name = next(info.filename for info in zip_file.infolist()
                        if info.filename.endswith('.csv'))
        pandas_df = pd.read_csv(zip_file.open(csv_name),
                                encoding='latin',
                                usecols=columns,
                                dtype=dtypes)
    # Rename some columns to match the more well known dataset from
    # http://stat-computing.org/dataexpo/2009/the-data.html
    pandas_df.rename(columns=rename_dict, inplace=True)
    # Import the data from pandas into vaex and export it to HDF5.
    vaex_df = vaex.from_pandas(pandas_df, copy_index=False)
    vaex_df.export_hdf5(path=output, progress=False)
Notes:
macOS has a rather low limit on the number of files one can open at a time. To circumvent this issue, run in a terminal:
>ulimit -n 9999
It should be the same terminal from which the Jupyter server is later started.
import re
import glob
import vaex
import numpy as np
def tryint(s):
    """Return int(s) when s is a numeric string, otherwise return s unchanged.

    Helper for natural (alphanumeric) sort keys: numeric chunks compare as
    integers, everything else compares as strings.
    """
    try:
        return int(s)
    # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt
    # and SystemExit). ValueError covers non-numeric strings; TypeError covers
    # non-string/non-number inputs, preserving the original tolerance.
    except (ValueError, TypeError):
        return s
def alphanum_key(s):
    """Split a string into alternating string and integer chunks.

    "z23a" -> ["z", 23, "a"]

    Usable as a sort key for natural (human) ordering of file names.
    """
    return list(map(tryint, re.split('([0-9]+)', s)))
# Collect the per-month HDF5 files in natural (alphanumeric) order so the
# data is concatenated chronologically.
hdf5_list = glob.glob('./airlines-us-original/hdf5/*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)
# 1988-2018 inclusive: 31 years x 12 months = 372 files expected.
# Explicit check instead of `assert`, which is silently stripped under `python -O`.
if len(hdf5_list) != 372:
    raise RuntimeError("Incorrect number of files")
# Open all files lazily as one concatenated vaex DataFrame, then export it
# as a single file.
master_df = vaex.open_many(hdf5_list)
# NOTE(review): the output extension is '.hd5' (not '.hdf5') — kept as-is
# since the read below uses the same path.
master_df.export_hdf5(path='./airline_data_1988_2018.hd5', progress=True)
# Check how the single file looks like:
df = vaex.open('./airline_data_1988_2018.hd5')
df