#!/usr/bin/env python
# coding: utf-8

# # Compile EPA emissions data
# Convert the data from hourly to monthly and export all years as a single file.
# 
# ## Instructions
# Choose the file format (`csv` or `feather`) that was used when saving hourly EPA data. Feather is much faster for read/write.

# In[ ]:


import pandas as pd
import numpy as np
import os
from os.path import join
from joblib import Parallel, delayed
import sys

# Input/output data is kept in 'Data storage' under the parent of the
# current working directory.
cwd = os.getcwd()
data_path = os.path.join(cwd, '..', 'Data storage')


# In[ ]:


# Format the hourly EPA files were saved in; swap the two lines below to
# read feather files instead of csv.
# file_format = 'feather'
file_format = 'csv'


# In[ ]:


# Load the `watermark` IPython extension and run it with the '-iv -v' flags
# (presumably to record the Python and imported-package versions used for
# this run -- confirm against the watermark docs).
# NOTE(review): `get_ipython` is not imported anywhere in the visible file;
# it appears to rely on the name IPython/Jupyter injects, so these lines would
# fail under a plain `python` interpreter.
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-iv -v')


# In[ ]:


# Load the "autoreload" extension (must happen before the mode is set below)
get_ipython().run_line_magic('load_ext', 'autoreload')

# Mode '1': always reload modules marked with "%aimport", so edits to those
# modules are picked up without restarting the kernel
get_ipython().run_line_magic('autoreload', '1')

# add the 'src' directory (next to the current working directory's parent)
# as one where we can import modules; skip the append when the entry is
# already present so re-running this cell does not pile up duplicate
# sys.path entries
src_dir = join(os.getcwd(), os.pardir, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)


# In[ ]:


# Mark Data.data_extraction with %aimport, then pull in the EPA helpers from it.
# NOTE(review): these packages are presumably resolved through the 'src'
# directory appended to sys.path earlier in the file -- confirm.
# NOTE(review): only `import_group_epa` is used in the visible part of this
# file; `unit_conversion`, `add_datetime` and `add_quarter` may be used
# further down or may be leftovers.
get_ipython().run_line_magic('aimport', 'Data.data_extraction')
from Data.data_extraction import import_group_epa, unit_conversion

# Same treatment for Analysis.index.
get_ipython().run_line_magic('aimport', 'Analysis.index')
from Analysis.index import add_datetime, add_quarter


# ## Change years if necessary

# In[ ]:


# Inclusive range of years to compile.
start_year = 2001
end_year = 2017

# The whole import -> concat -> export pipeline sits under the guard.
# Previously only the parallel import was guarded, so whenever the guard was
# false `df_list` was never bound and the concat below it raised NameError.
# Names assigned here are still module-level, so `df`, `df_list`, `paths`
# and `path` remain available to later notebook cells.
if __name__ == '__main__':
    # One saved file per year: 'EPA emissions <year>.<file_format>'.
    base_path = join(data_path, 'EPA emissions')
    paths = [join(base_path, 'EPA emissions {}.{}'.format(year, file_format))
             for year in range(start_year, end_year + 1)]

    # Import each year's file in parallel; n_jobs=-1 lets joblib use all
    # available CPUs. Results come back in the same order as `paths`.
    df_list = Parallel(n_jobs=-1)(delayed(import_group_epa)(path)
                                  for path in paths)

    # Stack the per-year frames into a single frame covering all years.
    df = pd.concat(df_list)

    # Export everything as one csv; index=False leaves the row index out of
    # the file.
    path = os.path.join(data_path, 'Derived data',
                        'Monthly EPA emissions 2018-03-06.csv')
    df.to_csv(path, index=False)


# In[ ]: