#!/usr/bin/env python
# coding: utf-8

# # Compile EPA emissions data
# Convert the data from hourly to monthly and export all years as a single file.
#
# ## Instructions
# Choose the file format (`csv` or `feather`) that was used when saving hourly
# EPA data. Feather is much faster for read/write.

# In[ ]:

import pandas as pd
import numpy as np
import os
from os.path import join
from joblib import Parallel, delayed
import sys

# Every data location below is resolved relative to the current working
# directory, so the script must be launched from the folder it lives in.
cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')


# In[ ]:

# Extension of the per-year input files; it is substituted into the file
# names built in the main section below.
file_format = 'csv'
# file_format = 'feather'


# In[ ]:

# Record interpreter and package versions (requires the `watermark` extension).
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-iv -v')


# In[ ]:

# Load the "autoreload" extension
get_ipython().run_line_magic('load_ext', 'autoreload')

# always reload modules marked with "%aimport"
get_ipython().run_line_magic('autoreload', '1')

# add the 'src' directory as one where we can import modules
src_dir = join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)


# In[ ]:

# These imports must stay below the sys.path.append above: the packages are
# looked up in the 'src' directory that was just added to the search path.
get_ipython().run_line_magic('aimport', 'Data.data_extraction')
from Data.data_extraction import import_group_epa, unit_conversion
get_ipython().run_line_magic('aimport', 'Analysis.index')
from Analysis.index import add_datetime, add_quarter


# ## Change years if necessary

# In[ ]:

# Inclusive range of years to compile.
start_year = 2001
end_year = 2017

if __name__ == '__main__':
    # One input file per year: '<data_path>/EPA emissions/EPA emissions <year>.<ext>'
    base_path = join(data_path, 'EPA emissions')
    paths = [
        join(base_path, 'EPA emissions {}.{}'.format(year, file_format))
        for year in range(start_year, end_year + 1)
    ]

    # Process the yearly files in parallel; n_jobs=-1 asks joblib for all CPUs.
    # NOTE(review): import_group_epa presumably reads one hourly file and
    # returns its monthly aggregate -- confirm in Data.data_extraction.
    df_list = Parallel(n_jobs=-1)(
        delayed(import_group_epa)(path) for path in paths
    )

    # The concat/export steps live under the same guard as the import step:
    # df_list is only bound when running as __main__, so leaving them at
    # module level raises NameError whenever this file is imported instead.
    df = pd.concat(df_list)

    # Write all years out as a single CSV file.
    path = os.path.join(data_path, 'Derived data',
                        'Monthly EPA emissions 2018-03-06.csv')
    df.to_csv(path, index=False)


# In[ ]: