#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import glob
import numpy as np


# I've been using state boundaries to define approximate NERC regions. How accurate is this? Use 2015 EIA 923 data to check.

# ## Read NERC by state data

# In[2]:


path = os.path.join('Data storage', 'final NERC data from states', 'Monthly gen*')
mg_fns = glob.glob(path)


# In[3]:


df_list = []
for f in mg_fns:
    region = f.split()[-1][:-4]
    df = pd.read_csv(f)
    df['region'] = region
    df_list.append(df)
full_mg = pd.concat(df_list)
full_mg.reset_index(inplace=True, drop=True)
full_mg['datetime'] = pd.to_datetime(full_mg['datetime'])

monthly_gen = pd.pivot_table(full_mg, index=['region', 'datetime'], 
                             values='generation (MWh)', columns='fuel category 1')
monthly_gen.reset_index(inplace=True, drop=False)
monthly_gen['Year'] = monthly_gen['datetime'].dt.year
monthly_gen.replace(np.nan, 0, inplace=True)


# In[5]:


monthly_gen.head()


# In[54]:


annual_gen = monthly_gen.groupby(['region', 'Year']).sum()
annual_gen.reset_index(inplace=True, drop=False)


# In[55]:


annual_gen = annual_gen.loc[annual_gen['Year'] == 2015]
annual_gen.index = annual_gen['region']
annual_gen.drop(['region', 'Year'], axis=1, inplace=True)
annual_gen['Total'] = annual_gen.sum(axis=1)
annual_gen


# ## Read EIA-923 data

# In[6]:


path = os.path.join('Data storage', 'EIA923_Schedules_2_3_4_5_M_12_2015_Final.xlsx')
eia = pd.read_excel(path, header=5, parse_cols='A,G,I,N,O,P,CR')


# In[31]:


eia.columns = [col.replace('\n', ' ') for col in eia.columns]
eia.rename(columns={'Net Generation (Megawatthours)': 'Generation (MWh)',
                  'NERC Region': 'region'}, inplace=True)


# In[32]:


eia.head()


# In[13]:


eia['AER Fuel Type Code'].unique()


# In[15]:


fuel_cats = {'Coal': ['COL'],
             'Natural Gas': ['NG'],
             'Hydro': ['HYC', 'HPS'],
             'Nuclear': ['NUC'],
             'Wind': ['WND'],
             'Solar': ['SUN'],
             'Other Renewables': ['GEO', 'ORW', 'WWW', 'MLG'],
             'Other': ['WOO', 'DFO', 'RFO', 'PC', 'OTH', 'OOG', 'WOC']}


# In[22]:


for cat, fuels in fuel_cats.iteritems():
    eia.loc[eia['AER Fuel Type Code'].isin(fuels), 'Fuel category'] = cat


# In[56]:


eia_nerc = eia.groupby(['region', 'Fuel category']).sum()
eia_nerc.reset_index(inplace=True)


# In[57]:


eia_nerc = eia_nerc.pivot_table(index='region', columns=['Fuel category'],
                     values='Generation (MWh)')
eia_nerc.rename(index={'TRE': 'ERCOT'}, inplace=True)
eia_nerc['Total'] = eia_nerc.sum(axis=1)
eia_nerc


# ## Compare 2 data sets

# In[60]:


(annual_gen - eia_nerc) / eia_nerc


# In[ ]: