%matplotlib inline
import matplotlib.pyplot as plt
import os
from os.path import join
import pandas as pd
from sklearn import neighbors, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter
cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')
This will be inserted into all filenames (reading and writing)
file_date = '2018-03-06'
This loads facility data that has been assembled from the EIA bulk data file, and EIA-860 excel files. The EIA-860 excel files need to be downloaded manually.
Only need to keep the plant id, year (as a check that plants don't move between years), and lat/lon
path = os.path.join(data_path, 'Facility gen fuels and CO2 {}.csv'.format(file_date))
facility_df = pd.read_csv(path)
facility_df['state'] = facility_df['geography'].str[-2:]
plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon', 'state']]
plants.drop_duplicates(inplace=True)
Current NERCS go back to 2012. Use that, 2015, and the 2016 early release.
eia_base_path = join(data_path, 'EIA downloads')
file_860_info = {
# 2011: {'io': join(eia_base_path, 'eia8602011', 'Plant.xlsx'),
# 'skiprows': 0,
# 'parse_cols': 'B,J'},
2012: {'io': join(eia_base_path, 'eia8602012', 'PlantY2012.xlsx'),
'skiprows': 0,
'usecols': 'B,J'},
2013: {'io': join(eia_base_path, 'eia8602013', '2___Plant_Y2013.xlsx'),
'skiprows': 0,
'usecols': 'C,L'},
2014: {'io': join(eia_base_path, 'eia8602014', '2___Plant_Y2014.xlsx'),
'skiprows': 0,
'usecols': 'C,L'},
2015: {'io': join(eia_base_path, 'eia8602015', '2___Plant_Y2015.xlsx'),
'skiprows': 0,
'usecols': 'C,L'},
2016: {'io': join(eia_base_path, 'eia8602016', '2___Plant_Y2016.xlsx'),
'skiprows': 0,
'usecols': 'C,L'}
}
eia_nercs = {}
for key, args in file_860_info.items():
eia_nercs[key] = pd.read_excel(**args)
eia_nercs[key].columns = ['plant id', 'nerc']
eia_nercs[key]['year'] = key
nercs = pd.concat(eia_nercs.values()).drop_duplicates(subset=['plant id', 'nerc'])
There are 30 plants duplicated. Five of them don't have a NERC label in one of the years. The largest move is from MRO to other regions (12), with most of those to SPP (7). After that, moves from RFC (5) to MRO (3) and SERC (2). There are also some moves from WECC and FRCC to HICC/ASCC - these might be diesel generators that get moved.
The plants that have duplicate NERC region labels represent a small fraction of national generation, but one that is growing over time. By 2016 they consist of 0.15% of national generation.
for df_ in list(eia_nercs.values()) + [nercs]:
print('{} total records'.format(len(df_)))
print('{} unique plants'.format(len(df_['plant id'].unique())))
7289 total records 7289 unique plants 8060 total records 8060 unique plants 8520 total records 8520 unique plants 8928 total records 8928 unique plants 9711 total records 9711 unique plants 10104 total records 10068 unique plants
dup_plants = nercs.loc[nercs['plant id'].duplicated(keep=False), 'plant id'].unique()
dup_plants
array([ 66, 1120, 1121, 7757, 7848, 7847, 6280, 57251, 57252, 70, 899, 1168, 57449, 58469, 55836, 56266, 56106, 56856, 56985, 57622, 57623, 57651, 57650, 57983, 58117, 58278, 58511, 59027, 59037, 58690, 58655, 58676], dtype=int64)
region_list = []
for plant in dup_plants:
regions = nercs.loc[nercs['plant id'] == plant, 'nerc'].unique()
# regions = regions.tolist()
region_list.append(regions)
Counter(tuple(x) for x in region_list)
Counter({('ASCC', nan): 2, ('FRCC', 'HICC'): 1, ('MRO', 'RFC'): 2, ('MRO', 'SERC'): 1, ('MRO', 'SPP'): 7, ('MRO', 'WECC'): 3, ('RFC', 'MRO'): 3, ('RFC', 'SERC'): 2, ('SERC', 'SPP'): 1, ('SPP', 'SERC'): 2, ('SPP', 'TRE'): 1, ('WECC', 'ASCC'): 2, ('WECC', 'HICC'): 1, (nan, 'WECC', 'ASCC'): 3, (nan, 'WECC', 'HICC'): 1})
(facility_df.loc[facility_df['plant id'].isin(dup_plants), :]
.groupby('year')['generation (MWh)'].sum()
/ facility_df.loc[:, :]
.groupby('year')['generation (MWh)'].sum())
year 2001 0.000345 2002 0.000269 2003 0.000262 2004 0.000313 2005 0.000426 2006 0.000514 2007 0.000509 2008 0.000527 2009 0.000631 2010 0.000683 2011 0.000763 2012 0.001286 2013 0.001138 2014 0.001079 2015 0.001701 2016 0.001962 2017 0.001460 Name: generation (MWh), dtype: float64
This is my training data. All of these plants should still be in my plants
dataframe.
nan_plants = nercs.loc[nercs.isnull().any(axis=1)]
len(nan_plants)
41
nercs.loc[nercs['plant id'].isin(nan_plants['plant id'])]
plant id | nerc | year | |
---|---|---|---|
65 | 66 | ASCC | 2012 |
1637 | 58277 | NaN | 2012 |
1928 | 70 | ASCC | 2012 |
1961 | 58405 | NaN | 2012 |
4145 | 58469 | NaN | 2012 |
6945 | 58117 | NaN | 2012 |
6946 | 58278 | NaN | 2012 |
7164 | 58380 | NaN | 2012 |
7194 | 58425 | NaN | 2012 |
7262 | 58511 | NaN | 2012 |
64 | 66 | NaN | 2013 |
828 | 58982 | NaN | 2013 |
1917 | 58639 | NaN | 2013 |
1918 | 58640 | NaN | 2013 |
1938 | 58977 | NaN | 2013 |
2205 | 58989 | NaN | 2013 |
2213 | 58549 | NaN | 2013 |
4127 | 58469 | WECC | 2013 |
6954 | 58117 | WECC | 2013 |
6955 | 58278 | WECC | 2013 |
7301 | 58511 | WECC | 2013 |
7414 | 58651 | NaN | 2013 |
7422 | 58656 | NaN | 2013 |
7424 | 58659 | NaN | 2013 |
7427 | 58662 | NaN | 2013 |
7432 | 58684 | NaN | 2013 |
7457 | 58705 | NaN | 2013 |
7603 | 58837 | NaN | 2013 |
7750 | 59035 | NaN | 2013 |
4129 | 58469 | HICC | 2014 |
6929 | 58117 | ASCC | 2014 |
6930 | 58278 | ASCC | 2014 |
7272 | 58511 | ASCC | 2014 |
8453 | 60024 | NaN | 2015 |
8550 | 60125 | NaN | 2015 |
8667 | 60243 | NaN | 2015 |
8668 | 60244 | NaN | 2015 |
8669 | 60245 | NaN | 2015 |
8674 | 60250 | NaN | 2015 |
8684 | 60260 | NaN | 2015 |
8749 | 60328 | NaN | 2015 |
40 | 70 | NaN | 2016 |
8888 | 60563 | NaN | 2016 |
8913 | 60588 | NaN | 2016 |
9128 | 60814 | NaN | 2016 |
9371 | 61068 | NaN | 2016 |
9400 | 61099 | NaN | 2016 |
9402 | 61101 | NaN | 2016 |
9448 | 61166 | NaN | 2016 |
9453 | 61172 | NaN | 2016 |
9626 | 61364 | NaN | 2016 |
nercs.dropna(inplace=True)
SPP and TRE have the lowest accuracy. I'm going to assume that anything in TX or OK and SWPP balancing authority is in SPP. On the flip side, if it's in TX and ERCOT I'll assign it to TRE.
Only do this for plants that come online since the most recent 860 annual data.
path = join(data_path, 'EIA downloads', 'december_generator2017.xlsx')
m860 = pd.read_excel(path, sheet_name='Operating',skip_footer=1,
usecols='C,F,P,AE', skiprows=0)
m860.columns = m860.columns.str.lower()
m860 = m860.loc[m860['operating year'] == 2017]
m860.tail()
plant id | plant state | operating year | balancing authority code | |
---|---|---|---|---|
21373 | 61632 | IA | 2017 | MISO |
21374 | 61633 | MA | 2017 | ISNE |
21375 | 61634 | MA | 2017 | ISNE |
21376 | 61635 | MA | 2017 | ISNE |
21377 | 61636 | MA | 2017 | ISNE |
m860.loc[(m860['plant state'].isin(['TX', 'OK'])) &
(m860['balancing authority code'] == 'SWPP'), 'nerc'] = 'SPP'
m860.loc[(m860['plant state'].isin(['TX'])) &
(m860['balancing authority code'] == 'ERCO'), 'nerc'] = 'TRE'
m860.dropna(inplace=True)
m860
plant id | plant state | operating year | balancing authority code | nerc | |
---|---|---|---|---|---|
343 | 165 | OK | 2017 | SWPP | SPP |
344 | 165 | OK | 2017 | SWPP | SPP |
4836 | 2953 | OK | 2017 | SWPP | SPP |
4837 | 2953 | OK | 2017 | SWPP | SPP |
4838 | 2953 | OK | 2017 | SWPP | SPP |
4839 | 2953 | OK | 2017 | SWPP | SPP |
4840 | 2953 | OK | 2017 | SWPP | SPP |
4841 | 2953 | OK | 2017 | SWPP | SPP |
4842 | 2953 | OK | 2017 | SWPP | SPP |
15993 | 56984 | TX | 2017 | ERCO | TRE |
18994 | 59066 | TX | 2017 | ERCO | TRE |
19143 | 59193 | TX | 2017 | ERCO | TRE |
19157 | 59206 | TX | 2017 | ERCO | TRE |
19187 | 59245 | TX | 2017 | ERCO | TRE |
19665 | 59712 | TX | 2017 | ERCO | TRE |
19791 | 59812 | TX | 2017 | ERCO | TRE |
19792 | 59812 | TX | 2017 | ERCO | TRE |
19793 | 59812 | TX | 2017 | ERCO | TRE |
20091 | 60122 | TX | 2017 | ERCO | TRE |
20092 | 60122 | TX | 2017 | ERCO | TRE |
20093 | 60122 | TX | 2017 | ERCO | TRE |
20147 | 60210 | TX | 2017 | ERCO | TRE |
20149 | 60217 | TX | 2017 | ERCO | TRE |
20296 | 60366 | TX | 2017 | ERCO | TRE |
20301 | 60372 | TX | 2017 | ERCO | TRE |
20334 | 60414 | TX | 2017 | SWPP | SPP |
20350 | 60436 | TX | 2017 | ERCO | TRE |
20371 | 60459 | TX | 2017 | ERCO | TRE |
20372 | 60459 | TX | 2017 | ERCO | TRE |
20373 | 60460 | TX | 2017 | ERCO | TRE |
20374 | 60460 | TX | 2017 | ERCO | TRE |
20476 | 60581 | TX | 2017 | ERCO | TRE |
20556 | 60682 | TX | 2017 | ERCO | TRE |
20566 | 60690 | TX | 2017 | ERCO | TRE |
20681 | 60774 | TX | 2017 | ERCO | TRE |
20785 | 60901 | TX | 2017 | ERCO | TRE |
20786 | 60902 | TX | 2017 | ERCO | TRE |
20865 | 60983 | TX | 2017 | ERCO | TRE |
20870 | 60989 | TX | 2017 | ERCO | TRE |
21010 | 61205 | TX | 2017 | ERCO | TRE |
21023 | 61221 | OK | 2017 | SWPP | SPP |
21050 | 61261 | OK | 2017 | SWPP | SPP |
21077 | 61309 | TX | 2017 | ERCO | TRE |
21114 | 61362 | TX | 2017 | ERCO | TRE |
21171 | 61409 | TX | 2017 | ERCO | TRE |
21172 | 61410 | TX | 2017 | ERCO | TRE |
21173 | 61411 | TX | 2017 | ERCO | TRE |
21362 | 61614 | OK | 2017 | SWPP | SPP |
21363 | 61615 | OK | 2017 | SWPP | SPP |
21364 | 61616 | OK | 2017 | SWPP | SPP |
21365 | 61617 | OK | 2017 | SWPP | SPP |
21366 | 61618 | OK | 2017 | SWPP | SPP |
Make lists of plant codes for SPP and TRE facilities
m860_spp_plants = (m860.loc[m860['nerc'] == 'SPP', 'plant id']
.drop_duplicates()
.tolist())
m860_tre_plants = (m860.loc[m860['nerc'] == 'TRE', 'plant id']
.drop_duplicates()
.tolist())
m860_spp_plants
[165, 2953, 60414, 61221, 61261, 61614, 61615, 61616, 61617, 61618]
m860_tre_plants
[56984, 59066, 59193, 59206, 59245, 59712, 59812, 60122, 60210, 60217, 60366, 60372, 60436, 60459, 60460, 60581, 60682, 60690, 60774, 60901, 60902, 60983, 60989, 61205, 61309, 61362, 61409, 61410, 61411]
nan_plants.loc[nan_plants['plant id'].isin(m860_spp_plants)]
plant id | nerc | year |
---|
df = pd.merge(plants, nercs.drop('year', axis=1), on=['plant id'], how='left')
df.columns
Index(['plant id', 'year', 'lat', 'lon', 'state', 'nerc'], dtype='object')
Drop plants that don't have lat/lon data (using just lon to check), and then drop duplicates. If any plants have kept the same plant id but moved over time (maybe a diesel generator?) or switched NERC they will show up twice.
df.loc[df.lon.isnull()].drop_duplicates(subset='plant id')
plant id | year | lat | lon | state | nerc | |
---|---|---|---|---|---|---|
86900 | 10851 | 2006 | NaN | NaN | NJ | NaN |
87088 | 50291 | 2005 | NaN | NaN | FL | NaN |
87607 | 56672 | 2010 | NaN | NaN | MN | NaN |
87808 | 55982 | 2004 | NaN | NaN | AK | NaN |
87857 | 55150 | 2004 | NaN | NaN | LA | NaN |
87890 | 55082 | 2006 | NaN | NaN | MS | NaN |
87955 | 55521 | 2005 | NaN | NaN | CA | NaN |
88021 | 55314 | 2003 | NaN | NaN | NaN | NaN |
88318 | 54243 | 2005 | NaN | NaN | GA | NaN |
88441 | 50726 | 2007 | NaN | NaN | LA | NaN |
88690 | 50249 | 2004 | NaN | NaN | TX | NaN |
88917 | 2666 | 2004 | NaN | NaN | NY | NaN |
91857 | 55952 | 2014 | NaN | NaN | NaN | WECC |
92742 | 7704 | 2003 | NaN | NaN | NaN | NaN |
92808 | 6339 | 2004 | NaN | NaN | NaN | NaN |
92930 | 55840 | 2003 | NaN | NaN | AZ | NaN |
92969 | 55883 | 2004 | NaN | NaN | KY | NaN |
92980 | 55955 | 2003 | NaN | NaN | AZ | NaN |
92981 | 55958 | 2004 | NaN | NaN | LA | NaN |
93017 | 56043 | 2004 | NaN | NaN | CA | NaN |
93048 | 56175 | 2007 | NaN | NaN | CA | NaN |
93061 | 55975 | 2004 | NaN | NaN | NaN | NaN |
93066 | 56042 | 2005 | NaN | NaN | NY | NaN |
93101 | 56168 | 2003 | NaN | NaN | MN | NaN |
93129 | 55073 | 2004 | NaN | NaN | NaN | NaN |
93152 | 54516 | 2006 | 41.8875 | NaN | IL | NaN |
93165 | 55303 | 2008 | NaN | NaN | MS | NaN |
93251 | 55301 | 2003 | NaN | NaN | AZ | NaN |
93431 | 52164 | 2006 | NaN | NaN | CA | NaN |
93552 | 54222 | 2007 | NaN | NaN | AK | NaN |
93734 | 50728 | 2003 | NaN | NaN | LA | SERC |
93786 | 50040 | 2005 | NaN | NaN | IL | NaN |
93858 | 50030 | 2003 | NaN | NaN | LA | NaN |
93906 | 50168 | 2003 | NaN | NaN | LA | NaN |
94017 | 50313 | 2004 | NaN | NaN | NJ | NaN |
94423 | 10683 | 2007 | NaN | NaN | CO | NaN |
94647 | 10257 | 2005 | NaN | NaN | CA | NaN |
94815 | 56508 | 2007 | NaN | NaN | CA | NaN |
df.loc[df.lat.isnull()].drop_duplicates(subset='plant id')
plant id | year | lat | lon | state | nerc | |
---|---|---|---|---|---|---|
86900 | 10851 | 2006 | NaN | NaN | NJ | NaN |
87088 | 50291 | 2005 | NaN | NaN | FL | NaN |
87607 | 56672 | 2010 | NaN | NaN | MN | NaN |
87808 | 55982 | 2004 | NaN | NaN | AK | NaN |
87857 | 55150 | 2004 | NaN | NaN | LA | NaN |
87890 | 55082 | 2006 | NaN | NaN | MS | NaN |
87955 | 55521 | 2005 | NaN | NaN | CA | NaN |
88021 | 55314 | 2003 | NaN | NaN | NaN | NaN |
88318 | 54243 | 2005 | NaN | NaN | GA | NaN |
88441 | 50726 | 2007 | NaN | NaN | LA | NaN |
88690 | 50249 | 2004 | NaN | NaN | TX | NaN |
88917 | 2666 | 2004 | NaN | NaN | NY | NaN |
91857 | 55952 | 2014 | NaN | NaN | NaN | WECC |
92742 | 7704 | 2003 | NaN | NaN | NaN | NaN |
92808 | 6339 | 2004 | NaN | NaN | NaN | NaN |
92930 | 55840 | 2003 | NaN | NaN | AZ | NaN |
92969 | 55883 | 2004 | NaN | NaN | KY | NaN |
92980 | 55955 | 2003 | NaN | NaN | AZ | NaN |
92981 | 55958 | 2004 | NaN | NaN | LA | NaN |
93017 | 56043 | 2004 | NaN | NaN | CA | NaN |
93048 | 56175 | 2007 | NaN | NaN | CA | NaN |
93061 | 55975 | 2004 | NaN | NaN | NaN | NaN |
93066 | 56042 | 2005 | NaN | NaN | NY | NaN |
93101 | 56168 | 2003 | NaN | NaN | MN | NaN |
93129 | 55073 | 2004 | NaN | NaN | NaN | NaN |
93165 | 55303 | 2008 | NaN | NaN | MS | NaN |
93251 | 55301 | 2003 | NaN | NaN | AZ | NaN |
93431 | 52164 | 2006 | NaN | NaN | CA | NaN |
93552 | 54222 | 2007 | NaN | NaN | AK | NaN |
93734 | 50728 | 2003 | NaN | NaN | LA | SERC |
93786 | 50040 | 2005 | NaN | NaN | IL | NaN |
93858 | 50030 | 2003 | NaN | NaN | LA | NaN |
93906 | 50168 | 2003 | NaN | NaN | LA | NaN |
94017 | 50313 | 2004 | NaN | NaN | NJ | NaN |
94423 | 10683 | 2007 | NaN | NaN | CO | NaN |
94647 | 10257 | 2005 | NaN | NaN | CA | NaN |
94815 | 56508 | 2007 | NaN | NaN | CA | NaN |
cols = ['plant id', 'lat', 'lon', 'nerc', 'state']
df_slim = (df.loc[:, cols].dropna(subset=['lon'])
.drop_duplicates(subset=['plant id', 'nerc']))
len(df_slim)
8982
df_slim.head()
plant id | lat | lon | nerc | state | |
---|---|---|---|---|---|
0 | 1001 | 39.924200 | -87.424400 | RFC | IN |
17 | 10003 | 39.760600 | -105.215000 | WECC | CO |
34 | 10005 | 37.047800 | -121.170800 | WECC | CA |
49 | 10008 | 30.314467 | -81.662705 | FRCC | FL |
62 | 10091 | 33.768300 | -118.283600 | WECC | CA |
Separate out the list of plants where we don't have NERC labels from EIA-860.
unknown = df_slim.loc[df_slim.nerc.isnull()]
print("{} plants don't have NERC labels\n".format(len(unknown)))
print(unknown.head())
279 plants don't have NERC labels plant id lat lon nerc state 38520 52080 37.9725 122.058333 NaN CA 48717 55647 36.1844 -86.854200 NaN TN 56706 58277 20.8867 -156.337800 NaN HI 57035 58380 61.2860 -149.610000 NaN AK 57071 58425 61.1300 -150.243611 NaN AK
X is lat/lon
y is the NERC label
For both, I'm only using plants where we have all data (no NaN
s). Not doing any transformation of the lat/lon at this time. There is certainly some error here, as KNN will use the Euclidian distance to calculate nearest neighbors. Not sure how I plan on dealing with this, or if it is even necessary.
X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']
# le = LabelEncoder()
# le.fit(y)
# y = le.transform(y)
len(X)
8703
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
Run gridsearch testing parameter values for weights, n_neighbors, and p (use Euclidean or Manhattan distance).
With 15 neighbors, weights by distance, and Euclidean distance, the model is able to accurately predict the test sample NERC region with 96% accuracy. This varies by region, with the lowest accuracy scores for TRE and SPP (89% and 87%), and the highest accuracy scores for WECC and NPCC (each 99%). F1 scores tend to be similar to the accuracy, although TRE has slightly higher F1 (0.94 vs 0.89).
knn = neighbors.KNeighborsClassifier()
params = {'weights': ['uniform', 'distance'],
'n_neighbors': [10, 15, 20],
'p': [1, 2]
}
clf_knn = GridSearchCV(knn, params, n_jobs=-1, iid=False, verbose=1)
clf_knn.fit(X_train, y_train)
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[Parallel(n_jobs=-1)]: Done 20 out of 36 | elapsed: 12.5s remaining: 10.0s [Parallel(n_jobs=-1)]: Done 36 out of 36 | elapsed: 21.1s finished
GridSearchCV(cv=None, error_score='raise', estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'), fit_params=None, iid=False, n_jobs=-1, param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [10, 15, 20], 'p': [1, 2]}, pre_dispatch='2*n_jobs', refit=True, return_train_score='warn', scoring=None, verbose=1)
clf_knn.best_estimator_, clf_knn.best_score_
(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=10, p=1, weights='distance'), 0.9590098375679993)
clf_knn.score(X_test, y_test)
0.9589136490250696
nerc_labels = nercs.nerc.dropna().unique()
Accuracy score by region
for region in nerc_labels:
mask = y_test == region
X_masked = X_test[mask]
y_hat_masked = clf_knn.predict(X_masked)
y_test_masked = y_test[mask]
accuracy = metrics.accuracy_score(y_test_masked, y_hat_masked)
print('{} : {}'.format(region, accuracy))
SERC : 0.9553752535496958 RFC : 0.9523809523809523 SPP : 0.8098591549295775 NPCC : 0.9768115942028985 WECC : 0.9904191616766467 MRO : 0.9469964664310954 TRE : 0.927007299270073 HICC : 1.0 ASCC : 0.975609756097561 FRCC : 0.9629629629629629
F1 score by region
y_hat = clf_knn.predict(X_test)
for region in nerc_labels:
f1 = metrics.f1_score(y_test, y_hat, labels=[region], average='macro')
print('{} : {}'.format(region, f1))
SERC : 0.9534412955465588 RFC : 0.949667616334283 SPP : 0.8273381294964028 NPCC : 0.9810771470160117 WECC : 0.9904191616766467 MRO : 0.9337979094076655 TRE : 0.9407407407407407 HICC : 0.9444444444444444 ASCC : 0.975609756097561 FRCC : 0.9811320754716981
metrics.f1_score(y_test, y_hat, average='micro')
0.9589136490250696
metrics.f1_score(y_test, y_hat, average='macro')
0.9477668276232013
unknown.loc[:, 'nerc'] = clf_knn.predict(unknown.loc[:, ['lat', 'lon']])
C:\Users\gschivley\Anaconda2\envs\psci\lib\site-packages\pandas\core\indexing.py:537: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self.obj[item] = s
Ensuring that no plants in Alaska or Hawaii are assigned to continental NERCs, or the other way around.
print(unknown.loc[unknown.state.isin(['AK', 'HI']), 'nerc'].unique())
print(unknown.loc[unknown.nerc.isin(['HICC', 'ASCC']), 'state'].unique())
['HICC' 'ASCC' 'WECC'] ['HI' 'AK']
Counter(unknown['nerc'])
Counter({'ASCC': 11, 'FRCC': 2, 'HICC': 11, 'MRO': 8, 'NPCC': 57, 'RFC': 33, 'SERC': 41, 'SPP': 22, 'TRE': 29, 'WECC': 65})
unknown.head()
plant id | lat | lon | nerc | state | |
---|---|---|---|---|---|
38520 | 52080 | 37.9725 | 122.058333 | WECC | CA |
48717 | 55647 | 36.1844 | -86.854200 | SERC | TN |
56706 | 58277 | 20.8867 | -156.337800 | HICC | HI |
57035 | 58380 | 61.2860 | -149.610000 | ASCC | AK |
57071 | 58425 | 61.1300 | -150.243611 | ASCC | AK |
unknown.tail()
plant id | lat | lon | nerc | state | |
---|---|---|---|---|---|
94762 | 499 | 37.643611 | 120.757500 | WECC | CA |
94774 | 7478 | 32.738889 | 114.700278 | WECC | AZ |
94792 | 56197 | 35.301389 | 77.631111 | SERC | NC |
94796 | 52205 | 36.796389 | 121.448889 | WECC | CA |
94894 | 596 | 39.733889 | 75.564444 | NPCC | DE |
df_slim.head()
plant id | lat | lon | nerc | state | |
---|---|---|---|---|---|
0 | 1001 | 39.924200 | -87.424400 | RFC | IN |
17 | 10003 | 39.760600 | -105.215000 | WECC | CO |
34 | 10005 | 37.047800 | -121.170800 | WECC | CA |
49 | 10008 | 30.314467 | -81.662705 | FRCC | FL |
62 | 10091 | 33.768300 | -118.283600 | WECC | CA |
labeled = pd.concat([df_slim.loc[df_slim.notnull().all(axis=1)], unknown])
labeled.loc[labeled.nerc.isnull()]
plant id | lat | lon | nerc | state |
---|
There are 11 facilities that don't show up in my labeled data - they didn't have lat/lon info.
facility_df.loc[~facility_df['plant id'].isin(labeled['plant id']),
'plant id'].unique()
array([10851, 50291, 56672, 55982, 55150, 55082, 55521, 55314, 54243, 50726, 50249, 2666, 55952, 7704, 6339, 55840, 55883, 55955, 55958, 56043, 56175, 55975, 56042, 56168, 55073, 54516, 55303, 55301, 52164, 54222, 50728, 50040, 50030, 50168, 50313, 10683, 10257, 56508], dtype=int64)
path = join(data_path, 'Facility labels', 'Facility locations_knn.csv')
labeled.to_csv(path, index=False)