#!/usr/bin/env python
# coding: utf-8

# # Assign NERC labels to plants using 860 data and k-nearest neighbors

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import os
from os.path import join
import pandas as pd
from sklearn import neighbors, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter

cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')


# ### Date string for filenames
# This will be inserted into all filenames (reading and writing).

# In[2]:


file_date = '2018-03-06'


# ## Load data
# This loads facility data that has been assembled from the EIA bulk data file and the EIA-860 Excel files. The EIA-860 Excel files need to be downloaded manually.

# ### Load EIA facility data
# Only keep the plant id, year (as a check that plants don't move between years), lat/lon, and state.

# In[3]:


path = os.path.join(data_path,
                    'Facility gen fuels and CO2 {}.csv'.format(file_date))
facility_df = pd.read_csv(path)
facility_df['state'] = facility_df['geography'].str[-2:]


# In[4]:


plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon', 'state']]
plants.drop_duplicates(inplace=True)


# ### Load known NERC labels from EIA-860
# NERC labels in the 860 files go back to 2012. Use the 2012 through 2016 files (2016 is the early release).

# In[5]:


eia_base_path = join(data_path, 'EIA downloads')

file_860_info = {
    # 2011: {'io': join(eia_base_path, 'eia8602011', 'Plant.xlsx'),
    #        'skiprows': 0,
    #        'parse_cols': 'B,J'},
    2012: {'io': join(eia_base_path, 'eia8602012', 'PlantY2012.xlsx'),
           'skiprows': 0,
           'usecols': 'B,J'},
    2013: {'io': join(eia_base_path, 'eia8602013', '2___Plant_Y2013.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'},
    2014: {'io': join(eia_base_path, 'eia8602014', '2___Plant_Y2014.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'},
    2015: {'io': join(eia_base_path, 'eia8602015', '2___Plant_Y2015.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'},
    2016: {'io': join(eia_base_path, 'eia8602016', '2___Plant_Y2016.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'}
}


# In[6]:


eia_nercs = {}
for key, args in file_860_info.items():
    eia_nercs[key] = pd.read_excel(**args)
    eia_nercs[key].columns = ['plant id', 'nerc']
    eia_nercs[key]['year'] = key


# In[7]:


nercs = pd.concat(eia_nercs.values()).drop_duplicates(subset=['plant id', 'nerc'])


# ### Look for plants listed with different NERC labels
# There are 30 duplicated plants. Five of them don't have a NERC label in one of the years. The largest movement is out of MRO to other regions (12 plants), with most of those going to SPP (7). After that, moves from RFC (5) to MRO (3) and SERC (2). There are also some moves from WECC and FRCC to HICC/ASCC - these might be diesel generators that were relocated.
#
# The plants with duplicate NERC region labels represent a small fraction of national generation, but one that is growing over time. By 2016 they account for 0.15% of national generation.
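# Note that the dedup above keeps one row per (plant id, nerc) pair, so a plant that switched regions still appears once per region. A minimal sketch of that behavior with made-up plant ids (not EIA data):

# In[ ]:


toy = pd.DataFrame({'plant id': [1, 1, 1, 2, 2],
                    'nerc': ['MRO', 'MRO', 'SPP', 'WECC', 'WECC'],
                    'year': [2012, 2013, 2014, 2012, 2013]})
# Plant 1 keeps two rows (MRO and SPP); plant 2 keeps one (WECC)
toy.drop_duplicates(subset=['plant id', 'nerc'])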
# In[8]:


for df_ in list(eia_nercs.values()) + [nercs]:
    print('{} total records'.format(len(df_)))
    print('{} unique plants'.format(len(df_['plant id'].unique())))


# In[9]:


dup_plants = nercs.loc[nercs['plant id'].duplicated(keep=False), 'plant id'].unique()
dup_plants


# In[10]:


region_list = []
for plant in dup_plants:
    regions = nercs.loc[nercs['plant id'] == plant, 'nerc'].unique()
    # regions = regions.tolist()
    region_list.append(regions)

Counter(tuple(x) for x in region_list)


# In[11]:


(facility_df.loc[facility_df['plant id'].isin(dup_plants), :]
            .groupby('year')['generation (MWh)'].sum()
 / facility_df.groupby('year')['generation (MWh)'].sum())


# ### Some plants in EIA-860 don't have NERC labels. Drop them now.
# This is my training data. All of these plants should still be in my `plants` dataframe.

# In[12]:


nan_plants = nercs.loc[nercs.isnull().any(axis=1)]
len(nan_plants)


# In[13]:


nercs.loc[nercs['plant id'].isin(nan_plants['plant id'])]


# In[14]:


nercs.dropna(inplace=True)


# ## Load EIA-860m for some info on recent facilities
# SPP and TRE have the lowest prediction accuracy (see the KNN results below). I'm going to assume that anything in TX or OK with the SWPP balancing authority is in SPP. On the flip side, anything in TX with the ERCO balancing authority gets assigned to TRE.
#
# Only do this for plants that came online after the most recent 860 annual data.

# In[15]:


path = join(data_path, 'EIA downloads', 'december_generator2017.xlsx')
m860 = pd.read_excel(path, sheet_name='Operating', skipfooter=1,
                     usecols='C,F,P,AE', skiprows=0)
m860.columns = m860.columns.str.lower()


# In[16]:


m860 = m860.loc[m860['operating year'] == 2017]
m860.tail()


# In[17]:


m860.loc[(m860['plant state'].isin(['TX', 'OK'])) &
         (m860['balancing authority code'] == 'SWPP'), 'nerc'] = 'SPP'
m860.loc[(m860['plant state'].isin(['TX'])) &
         (m860['balancing authority code'] == 'ERCO'), 'nerc'] = 'TRE'


# In[18]:


m860.dropna(inplace=True)
m860


# Make lists of plant codes for SPP and TRE facilities.

# In[19]:


m860_spp_plants = (m860.loc[m860['nerc'] == 'SPP', 'plant id']
                       .drop_duplicates()
                       .tolist())
m860_tre_plants = (m860.loc[m860['nerc'] == 'TRE', 'plant id']
                       .drop_duplicates()
                       .tolist())


# In[20]:


m860_spp_plants


# In[25]:


m860_tre_plants


# In[28]:


nan_plants.loc[nan_plants['plant id'].isin(m860_spp_plants)]


# ## Clean and prep data for KNN

# In[21]:


df = pd.merge(plants, nercs.drop('year', axis=1), on=['plant id'], how='left')


# In[22]:


df.columns


# Drop plants that don't have lat/lon data (using lon to check), then drop duplicates. If a plant has kept the same plant id but moved over time (maybe a diesel generator?) or switched NERC regions, it will show up twice.

# In[23]:


df.loc[df.lon.isnull()].drop_duplicates(subset='plant id')


# In[24]:


df.loc[df.lat.isnull()].drop_duplicates(subset='plant id')


# In[25]:


cols = ['plant id', 'lat', 'lon', 'nerc', 'state']
df_slim = (df.loc[:, cols].dropna(subset=['lon'])
             .drop_duplicates(subset=['plant id', 'nerc']))


# In[26]:


len(df_slim)


# In[27]:


df_slim.head()


# Separate out the plants that don't have NERC labels from EIA-860.

# In[28]:


# .copy() avoids a SettingWithCopyWarning when predicted labels are assigned below
unknown = df_slim.loc[df_slim.nerc.isnull()].copy()


# In[29]:


print("{} plants don't have NERC labels\n".format(len(unknown)))
print(unknown.head())
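# As a quick consistency check (a sketch, not part of the original analysis): the labeled and unlabeled subsets should exactly partition `df_slim`.

# In[ ]:


known = df_slim.loc[df_slim.nerc.notnull()]
# Every row of df_slim has a NERC label or it doesn't, so the counts must add up
assert len(known) + len(unknown) == len(df_slim)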
# ### Create X and y matrices
# X is lat/lon
#
# y is the NERC label
#
# For both, I'm only using plants where all data is present (no `NaN`s). I'm not transforming the lat/lon values at this point. There is certainly some error here, since KNN will use the Euclidean distance between lat/lon points to find nearest neighbors. Not sure how I'll deal with this, or whether it's even necessary.

# In[30]:


X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']

# le = LabelEncoder()
# le.fit(y)
# y = le.transform(y)


# In[31]:


len(X)


# In[32]:


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)


# ## GridSearch to find the best parameters

# ### Regular KNN classifier
# Run a grid search over weights, n_neighbors, and p (p=2 is Euclidean distance, p=1 is Manhattan).
#
# With 15 neighbors, weights by distance, and Euclidean distance, the model predicts the test-sample NERC regions with 96% accuracy. This varies by region, with the lowest accuracy scores for TRE and SPP (89% and 87%) and the highest for WECC and NPCC (each 99%). F1 scores tend to be similar to the accuracy scores, although TRE's F1 is slightly higher (0.94 vs 0.89).

# In[33]:


knn = neighbors.KNeighborsClassifier()
params = {'weights': ['uniform', 'distance'],
          'n_neighbors': [10, 15, 20],
          'p': [1, 2]}

clf_knn = GridSearchCV(knn, params, n_jobs=-1, iid=False, verbose=1)
clf_knn.fit(X_train, y_train)


# In[34]:


clf_knn.best_estimator_, clf_knn.best_score_


# In[35]:


clf_knn.score(X_test, y_test)


# In[36]:


nerc_labels = nercs.nerc.dropna().unique()


# Accuracy score by region

# In[37]:


for region in nerc_labels:
    mask = y_test == region
    X_masked = X_test[mask]
    y_hat_masked = clf_knn.predict(X_masked)
    y_test_masked = y_test[mask]

    accuracy = metrics.accuracy_score(y_test_masked, y_hat_masked)
    print('{} : {}'.format(region, accuracy))


# F1 score by region

# In[38]:


y_hat = clf_knn.predict(X_test)

for region in nerc_labels:
    f1 = metrics.f1_score(y_test, y_hat, labels=[region], average='macro')
    print('{} : {}'.format(region, f1))


# In[39]:


metrics.f1_score(y_test, y_hat, average='micro')


# In[40]:


metrics.f1_score(y_test, y_hat, average='macro')


# ## Use best KNN parameters to predict NERC for unknown plants

# In[41]:


unknown.loc[:, 'nerc'] = clf_knn.predict(unknown.loc[:, ['lat', 'lon']])


# Ensure that no plants in Alaska or Hawaii are assigned to continental NERC regions, or vice versa.

# In[42]:


print(unknown.loc[unknown.state.isin(['AK', 'HI']), 'nerc'].unique())
print(unknown.loc[unknown.nerc.isin(['HICC', 'ASCC']), 'state'].unique())


# In[43]:


Counter(unknown['nerc'])


# ## Export plants with lat/lon, state, and nerc

# In[44]:


unknown.head()


# In[45]:


unknown.tail()


# In[46]:


df_slim.head()


# In[47]:


labeled = pd.concat([df_slim.loc[df_slim.notnull().all(axis=1)], unknown])


# In[48]:


labeled.loc[labeled.nerc.isnull()]


# There are 11 facilities that don't show up in my labeled data - they didn't have lat/lon info.

# In[49]:


facility_df.loc[~facility_df['plant id'].isin(labeled['plant id']),
                'plant id'].unique()


# In[50]:


path = join(data_path, 'Facility labels', 'Facility locations_knn.csv')
labeled.to_csv(path, index=False)
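# One possible follow-up for the Euclidean-distance caveat noted above: scikit-learn's ball tree supports the haversine metric, which measures true great-circle distance between lat/lon points. This is a minimal sketch of that idea (not part of the original analysis), reusing the train/test split from above; haversine expects [lat, lon] in radians.

# In[ ]:


import numpy as np

knn_hav = neighbors.KNeighborsClassifier(n_neighbors=15, weights='distance',
                                         algorithm='ball_tree',
                                         metric='haversine')
# Convert degrees to radians before fitting and scoring
knn_hav.fit(np.radians(X_train), y_train)
knn_hav.score(np.radians(X_test), y_test)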