#!/usr/bin/env python
# coding: utf-8

# In[74]:


get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
sns.set(style='whitegrid')


# ## Import facility data and NERC labels

# In[2]:


path = os.path.join('Data storage', 'Facility gen fuels and CO2 2017-05-25.zip')
facility_df = pd.read_csv(path, parse_dates=['datetime'])


# In[3]:


facility_df.head()


# In[4]:


facility_df.dropna(inplace=True, subset=['lat', 'lon'])


# In[5]:


cols = ['lat', 'lon', 'plant id', 'year']
small_facility = facility_df.loc[:, cols].drop_duplicates()


# In[6]:


geometry = [Point(xy) for xy in zip(small_facility.lon, small_facility.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(small_facility, crs=crs, geometry=geometry)


# ## Read NERC shapefile and merge with `geo_df`

# In[7]:


path = os.path.join('Data storage', 'NERC_Regions_EIA', 'NercRegions_201610.shp')
regions = gpd.read_file(path)


# In[8]:


facility_nerc = gpd.sjoin(geo_df, regions, how='inner', op='within')


# In[9]:


facility_nerc.head()


# ## Merge NERC labels into the facility df

# In[10]:


cols = ['plant id', 'year', 'NERC']
facility_df = facility_df.merge(facility_nerc.loc[:, cols],
                                on=['plant id', 'year'], how='left')


# In[11]:


facility_df.head()


# Filter out data older than 2014 to reduce size

# In[12]:


facility_df['state'] = facility_df['geography'].str[-2:]
keep_cols = ['fuel', 'year', 'month', 'datetime', 'state', 'plant id', 'NERC',
             'generation (MWh)', 'total fuel (mmbtu)', 'elec fuel (mmbtu)']
facility_df = facility_df.loc[facility_df['year'] >= 2014, keep_cols]


# In[13]:


facility_fuel_cats = {'COW': ['SUB', 'BIT', 'LIG', 'WC', 'SC', 'RC', 'SGC'],
                      'NG': ['NG'],
                      'PEL': ['DFO', 'RFO', 'KER', 'JF',
                              'PG', 'WO', 'SGP'],
                      'PC': ['PC'],
                      'HYC': ['WAT'],
                      'HPS': [],
                      'GEO': ['GEO'],
                      'NUC': ['NUC'],
                      'OOG': ['BFG', 'OG', 'LFG'],
                      'OTH': ['OTH', 'MSN', 'MSW', 'PUR', 'TDF', 'WH'],
                      'SUN': ['SUN'],
                      'DPV': [],
                      'WAS': ['OBL', 'OBS', 'OBG', 'MSB', 'SLW'],
                      'WND': ['WND'],
                      'WWW': ['WDL', 'WDS', 'AB', 'BLQ']
                      }


# In[14]:


for category in facility_fuel_cats.keys():
    fuels = facility_fuel_cats[category]
    facility_df.loc[facility_df['fuel'].isin(fuels),
                    'fuel category'] = category


# In[15]:


facility_df.head()


# In[16]:


facility_df.dtypes


# In[17]:


facility_df.loc[facility_df['NERC'].isnull(), 'state'].unique()


# ## Import state-level generation data

# In[26]:


folder = os.path.join('Data storage', 'Derived data', 'state gen data')
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", 
          "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", 
          "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", 
          "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", 
          "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]


# In[29]:


state_list = []
for state in states:
    path = os.path.join(folder, '{} fuels gen.csv'.format(state))
    df = pd.read_csv(path, parse_dates=['datetime'])
    state_list.append(df)
state_df = pd.concat(state_list)
state_df.reset_index(inplace=True, drop=True)


# In[30]:


state_df.dtypes


# In[31]:


state_df['state'] = state_df['geography'].str[-2:]
keep_cols = ['state', 'type', 'year', 'datetime', 'generation (MWh)',
             'elec fuel (mmbtu)']

fuel_cats = facility_fuel_cats.keys()
state_df = state_df.loc[(state_df['year'] >= 2014) &
                        (state_df['type'].isin(fuel_cats)), keep_cols]


# In[32]:


state_df['type'].unique()


# ## Total generation and fuel consumption for each fuel category

# ### Annual

# In[33]:


annual_facility = facility_df.groupby(['year', 'state', 'fuel category']).sum()
# annual_facility.reset_index(inplace=True)
annual_facility.drop('plant id', axis=1, inplace=True)


# In[34]:


annual_facility.head()


# In[35]:


annual_state = state_df.groupby(['year', 'state', 'type']).sum()
# annual_state.reset_index(inplace=True)


# In[36]:


annual_state.head(n=25)


# It's interesting that the facility data has fuel consumption for solar generation and the state data doesn't. Looking at a 923 data file, it's clear that the fuel consumption is just based on a conversion efficiency of 36.6% across all facilities.

# In[37]:


annual_state.loc[2016, 'CA', 'SUN']


# In[38]:


annual_facility.loc[2016, 'CA', 'SUN']


# How much generation from large sources (Hydro, wind, coal, natural gas, and nuclear) is missed by monthly 923 data? 

# In[40]:


for fuel in ['HYC', 'WND', 'COW', 'NG', 'NUC', 'SUN']:
    state_total = annual_state.loc[2016, :, fuel]['generation (MWh)'].sum()
    facility_total = annual_facility.loc[2016, :, fuel]['generation (MWh)'].sum()
    
    error = (state_total - facility_total) / state_total
    print('{} has an error of {:.2f}%'.format(fuel, error * 100))


# ## 2015 generation and fuel consumption from annual vs monthly reporting plants
# The goal here is to figure out how much of generation and fuel consumption from facilities that only report annually is in each NERC region (by state)

# In[18]:


path = os.path.join('Data storage', 'EIA923_Schedules_2_3_4_5_M_12_2015_Final.xlsx')
frequency = pd.read_excel(path, sheetname='Page 6 Plant Frame', header=4)


# In[19]:


frequency.head()


# In[20]:


frequency.rename(columns={'Plant Id': 'plant id',
                          'Plant State': 'state',
                          'YEAR': 'year',
                          'Reporting\nFrequency': 'Reporting Frequency'}, inplace=True)


# In[99]:


frequency.head()


# In[21]:


frequency.dtypes


# Make a dataframe with generation, fuel consumption, and reporting frequency of facilities in 2015 

# In[22]:


freq_cols = ['year', 'plant id', 'Reporting Frequency']
df = pd.merge(facility_df, frequency.loc[:, freq_cols], on=['year', 'plant id'])


# In[80]:


df.head()


# In[78]:


g = sns.factorplot(x='fuel category', y='generation (MWh)', hue='Reporting Frequency',
                   col='NERC', col_wrap=3, data=df, estimator=np.sum, ci=0, kind='bar',
                   palette='tab10')

g.set_xticklabels(rotation=30)


# Number of NERC regions in a state

# In[24]:


df.loc[df['state'] == 'TX', 'NERC'].nunique()


# ### Fraction of generation/consumption from Annual reporting facilities in each NERC region of a state
# This is development of a method that will be used to approximate the fraction of EIA-estimated generation and consumption within each state that gets apportioned to each NERC regions (when there is more than one). The idea is to take data from the most recent "final" EIA-923 and use the annual reporting facilities to approximate the divisions for more recent data. I still need to figure out if it's better to do the calculation by month within a year or just for the year as a whole.
# 
# Determining if it's better to do month-by-month vs a single value for the whole year will depend on if the share of generation/consumption from Annual reporting facilities in each NERC changes much over the course of the year. There is the potential for error either way, and maybe even differences by state. Annual is certainly simpler.
# 
# While looking at data for Texas, I've discovered that generation from Annual reporting facilities can be negative. Need to figure out how (if?) to deal with this...
# 
# #### Conclusion
# While there can be variation of % generation in each NERC within a state over the course of 2015, most fuel categories across most states are quite stable. And when fuels do a have a wide spread over the year, they also tend to not be a large fraction of total generation within the NERC region. Given these observations, I'm going to stick with a split calculated as the average over an entire year.

# In[81]:


def annual(df, state):
    """Return the percent of gen & consumption by fuel type in each NERC region
    for a state"""
    a = df.loc[(df.state == state) & 
                     (df['Reporting Frequency'] == 'A')].copy()
    a.drop(['plant id', 'year'], axis=1, inplace=True)
    a = a.groupby(['NERC', 'fuel category']).sum()
    
    fuels = set(a.index.get_level_values('fuel category'))
    
    temp_list = []
    for fuel in fuels:
        temp = (a.xs(fuel, level='fuel category')
                / a.xs(fuel, level='fuel category').sum())
        temp['fuel category'] = fuel
        temp_list.append(temp)
    
    result = pd.concat(temp_list)
    result.reset_index(inplace=True)
    result['state'] = state
    
    rename_cols = {'generation (MWh)': '% generation',
                   'total fuel (mmbtu)': '% total fuel',
                   'elec fuel (mmbtu)': '% elec fuel'}
    
    result.rename(columns=rename_cols, inplace=True)
    
    return result


# In[30]:


def annual_month(df, state):
    """Return the percent of gen & consumption by fuel type and month in each 
    NERC region for a state"""
    a = df.loc[(df.state == state) & 
                     (df['Reporting Frequency'] == 'A')].copy()
    a.drop(['plant id', 'year'], axis=1, inplace=True)
    a = a.groupby(['NERC', 'fuel category', 'month']).sum()
    
    fuels = set(a.index.get_level_values('fuel category'))
    
    temp_list = []
    for fuel in fuels:
        for month in range(1, 13):
            temp = (a.xs(fuel, level='fuel category')
                     .xs(month, level='month')
                    / a.xs(fuel, level='fuel category')
                       .xs(month, level='month')
                       .sum())
            temp['fuel category'] = fuel
            temp['month'] = month
            temp_list.append(temp)
    
    result = pd.concat(temp_list)
    result.reset_index(inplace=True)
    result['state'] = state
    
    rename_cols = {'generation (MWh)': '% generation',
                   'total fuel (mmbtu)': '% total fuel',
                   'elec fuel (mmbtu)': '% elec fuel'}
    
    result.rename(columns=rename_cols, inplace=True)
    
    return result


# This is the percent of generation, total fuel consumption, and electric fuel consumption from facilities that report annually to EIA-923

# In[82]:


df_list = []
for state in states:
    num_nerc = df.loc[df.state == state, 'NERC'].nunique()
    if num_nerc > 1:
        df_list.append(annual(df, state))


# In[31]:


df_list = []
for state in states:
    num_nerc = df.loc[df.state == state, 'NERC'].nunique()
    if num_nerc > 1:
        df_list.append(annual_month(df, state))


# In[32]:


fuel_by_nerc_month = pd.concat(df_list).reset_index(drop=True)


# In[83]:


fuel_by_nerc = pd.concat(df_list).reset_index(drop=True)


# In[84]:


fuel_by_nerc.head()


# In[214]:


fuel_by_nerc_month.tail()


# In[ ]:


st


# In[27]:


split_states = []
for state in states:
    if df.loc[df.state == state, 'NERC'].nunique() > 1:
        split_states.append(state)


# In[28]:


split_states


# In[56]:


cols = ['state', 'NERC', 'fuel category']
a = fuel_by_nerc_month.groupby(cols).std()
a.drop('month', axis=1, inplace=True)


# In[63]:


a.xs('AR', level='state')


# In[70]:


a[a > .1].dropna(how='all')


# In[75]:


fuels = ['NG', 'HYC', 'COW', 'GEO', 'WND', 'SUN']

sns.factorplot(x='month', y='% generation', hue='fuel category', col='NERC',
               row='state',
               data=fuel_by_nerc_month.loc[(fuel_by_nerc_month['fuel category'].isin(fuels)) &
                                           (fuel_by_nerc_month['NERC'] != '-')],
               n_boot=1)
path = os.path.join('Figures', 'SI', 'Annual facility seasonal gen variation.pdf')
# plt.savefig(path, bbox_inches='tight')


# In[222]:


fuel_by_nerc_month.loc[(fuel_by_nerc_month.state=='TX') &
                       (fuel_by_nerc_month['fuel category'] == 'WWW')]


# In[232]:


df.loc[(df.state == 'TX') &
       (df['fuel category'] == 'WWW') &
       (df['Reporting Frequency'] == 'A')].groupby(['NERC', 'month', 'fuel category']).sum()


# In[230]:


df.loc[(df.state == 'TX') &
       (df['fuel category'] == 'WWW') &
       (df['Reporting Frequency'] == 'A')].groupby(['NERC', 'fuel category']).sum()


# #### States that include more than one NERC region

# In[79]:


NERC_states = ['WY', 'SD', 'NE', 'OK', 'TX', 'NM', 'LA', 'AR',
               'MO', 'MN', 'IL', 'KY', 'VA', 'FL']


# In[93]:


error_list = []
for state in NERC_states:
    error = (annual_state.loc[2016, state]
             - annual_facility.loc[2016, state]) / annual_state.loc[2016, state]
    error['state'] = state
    
    for col in ['generation (MWh)']:#, 'elec fuel (mmbtu)']:
        if error.loc[error[col] > 0.05, col].any():
            error_list.append(error.loc[error[col] > 0.05])


# The dataframe below shows all states with more than one NERC region where facility generation is at least 5% below EIA's state-level estimate in 2016. 

# In[94]:


pd.concat(error_list)