In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
from os.path import join
import pandas as pd
import fiona
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

sns.set(style='white')
cwd = os.getcwd()
data_path = join(cwd, '..', '..', 'Data storage')
In [5]:
%load_ext watermark
%watermark -v -p pandas,geopandas,shapely
CPython 3.6.2
IPython 6.2.1

pandas 0.20.2
geopandas 0.3.0
shapely 1.5.17.post1

Read NERC regions shapefile

Link to the shapefile for local download

In [2]:
path = os.path.join(data_path, 'NERC_Regions_EIA', 'NercRegions_201610.shp')
regions = gpd.read_file(path)
In [3]:
regions.crs
Out[3]:
{'init': 'epsg:4326'}
In [4]:
regions
Out[4]:
NERC NERC_Label geometry
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000...
1 FRCC Florida Reliability Coordinating Council (FRCC) (POLYGON ((-81.95506602699999 24.5196900770000...
2 MRO Midwest Reliability Organization (MRO) POLYGON ((-95.07148604699995 49.36451082200006...
3 NPCC Northeast Power Coordinating Council (NPCC) (POLYGON ((-73.82134384999995 40.6045004290000...
4 RFC ReliabilityFirst Corporation (RFC) (POLYGON ((-90.89359863199996 29.0467829150000...
5 SERC SERC Reliability Corporation (SERC) (POLYGON ((-91.71222639199999 32.9756623780000...
6 SPP Southwest Power Pool (SPP) (POLYGON ((-93.73310749999996 30.4137588960000...
7 TRE Texas Reliability Entity (TRE) POLYGON ((-100.0008629659999 35.02950123100004...
8 WECC Western Electricity Coordinating Council (WECC) (POLYGON ((-117.215183729 32.77737140300007, -...
In [12]:
region_pairs = [
    (8, 7),
    (8, 6),
    (8, 2),
    (1, 5),
    (2, 4),
    (2, 5),
    (2, 6),
    (7, 6),
    (3, 4)
]
In [17]:
for pairs in region_pairs:
    contains = regions.loc[pairs[0], 'geometry'].intersects(regions.loc[pairs[1], 'geometry'])
    if contains:
        region0 = regions.loc[pairs[0], 'NERC']
        region1 = regions.loc[pairs[1], 'NERC']
        
        print('{} intersects with {}'.format(region0, region1))
    


# regions.loc[0, 'geometry'].contains(regions.loc[2, 'geometry'])
WECC intersects with TRE
WECC intersects with SPP
WECC intersects with MRO
FRCC intersects with SERC
MRO intersects with RFC
MRO intersects with SERC
MRO intersects with SPP
TRE intersects with SPP
NPCC intersects with RFC
In [16]:
regions.loc[1, 'geometry'].intersects(regions.loc[5, 'geometry'])
Out[16]:
True
In [44]:
fig, ax = plt.subplots(figsize=(5,5))
regions.loc[[1], :].plot(ax=ax, color='b', alpha=0.7)
regions.loc[[5], :].plot(ax=ax, color='r', alpha=0.5)
plt.xlim(-92.5, -77.5)
plt.ylim(25, 37)
sns.despine()
In [41]:
fig, ax = plt.subplots(figsize=(5,5))
regions.loc[[1], :].plot(ax=ax, color='b', alpha=0.7)
regions.loc[[5], :].plot(ax=ax, color='r', alpha=0.5)
# plt.xlim(-92.5, -77.5)
# plt.ylim(25, 35)
sns.despine()
In [38]:
fig, ax = plt.subplots(figsize=(5,5))
regions.loc[[5], :].plot(ax=ax, color='b', alpha=0.5)
regions.loc[[4], :].plot(ax=ax, color='r', alpha=0.5)
sns.despine(trim=True)
In [5]:
regions.plot()
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ac65518>

Read file with facility location, generation, and fuel data

In [12]:
path = os.path.join(data_path, 'Facility gen fuels and CO2 2017-08-31.zip')
facility_df = pd.read_csv(path)

Drop plants without lat/lon data

From 2001-mid 2017, 9 facilities (out of 8,435) don't have valid lat/lon data. All of them are from 2010 or earlier. Drop these rows or the spatial join fails. The amount of generation from these plants is tiny - well under 0.05% in all years.

In [7]:
print(len(facility_df['plant id'].unique()), 'total plants')
print(len(facility_df.loc[facility_df['lat'].isnull(), 'plant id'].unique()),
      'plants without lat/lon')
8435 total plants
9 plants without lat/lon
In [8]:
years = facility_df.loc[facility_df['lat'].isnull(), 'year'].unique()
for year in years:
    total_gen = facility_df.loc[facility_df['year'] == year, 'generation (MWh)'].sum()
    
    # Plant ids with no 'lat' in year
    no_loc_plants = facility_df.loc[(facility_df['lat'].isnull()) & 
                                    (facility_df['year'] == year), 'plant id'].unique()
    
    no_loc_gen = facility_df.loc[(facility_df['year'] == year) &
                                 (facility_df['plant id'].isin(no_loc_plants)),
                                  'generation (MWh)'].sum()
    
    percent_dropped = no_loc_gen / total_gen * 100
    
    print('In {}, {:.3f}% of generation is from plants without lat/lon'.format(year, percent_dropped))
In 2006, 0.000% of generation is from plants without lat/lon
In 2005, 0.001% of generation is from plants without lat/lon
In 2004, 0.001% of generation is from plants without lat/lon
In 2002, 0.024% of generation is from plants without lat/lon
In 2001, 0.031% of generation is from plants without lat/lon
In 2010, 0.000% of generation is from plants without lat/lon
In 2009, 0.000% of generation is from plants without lat/lon
In 2007, 0.000% of generation is from plants without lat/lon
In 2003, 0.011% of generation is from plants without lat/lon
In [9]:
facility_df.dropna(inplace=True, subset=['lat', 'lon'])
In [10]:
facility_df.columns
Out[10]:
Index(['f', 'fuel', 'month', 'plant id', 'total fuel (mmbtu)', 'year',
       'generation (MWh)', 'elec fuel (mmbtu)', 'geography', 'last_updated',
       'lat', 'lon', 'prime mover', 'datetime', 'quarter',
       'all fuel fossil CO2 (kg)', 'elec fuel fossil CO2 (kg)',
       'all fuel total CO2 (kg)', 'elec fuel total CO2 (kg)'],
      dtype='object')

Because I have monthly data for every facility from 2001-2017, there are lots of duplicate rows. No need to do a spatial join on every row. Just keep one instance of each facility in each year.

In [11]:
cols = ['lat', 'lon', 'plant id', 'year']
small_facility = facility_df.loc[:, cols].drop_duplicates()

Use Point from Shapely to create the geometry list of facility locations. crs is the coordinate reference system that translates lat/lon into a specific map projection.

In [12]:
geometry = [Point(xy) for xy in zip(small_facility.lon, small_facility.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(small_facility, crs=crs, geometry=geometry)
In [13]:
geo_df.head()
Out[13]:
lat lon plant id year geometry
0 44.4936 -88.0303 10360 2017 POINT (-88.0303 44.4936)
6 44.4936 -88.0303 10360 2016 POINT (-88.0303 44.4936)
18 44.4936 -88.0303 10360 2015 POINT (-88.0303 44.4936)
30 44.4936 -88.0303 10360 2014 POINT (-88.0303 44.4936)
38 44.4936 -88.0303 10360 2008 POINT (-88.0303 44.4936)
In [14]:
len(geo_df)
Out[14]:
88918

Spatial join of NERC dataframe with polygons and facility dataframe with points

Joining the 9 regions (NERC) with 100,810 records takes much longer with facilities as the left dataframe in the join (2 min vs 12 seconds). Not quite sure why this is. The faster method leaves me with polygons rather than points tho. There might be a better way to rectify this, but I just create a new geodataframe with the geometry set as the lat/lon points.

EDIT Although the GeoPandas documentation says that the op parameter doesn't matter for point-in-polygon operations, using 'within' made the operation much faster (12 seconds). The first method below is probably preferable since it keeps the point geometry.

Method 1 (slow unless you use op='within')

In [15]:
facility_nerc = gpd.sjoin(geo_df, regions, how='inner', op='within')
In [16]:
facility_nerc.head()
Out[16]:
lat lon plant id year geometry index_right NERC NERC_Label
0 44.4936 -88.0303 10360 2017 POINT (-88.0303 44.4936) 2 MRO Midwest Reliability Organization (MRO)
6 44.4936 -88.0303 10360 2016 POINT (-88.0303 44.4936) 2 MRO Midwest Reliability Organization (MRO)
18 44.4936 -88.0303 10360 2015 POINT (-88.0303 44.4936) 2 MRO Midwest Reliability Organization (MRO)
30 44.4936 -88.0303 10360 2014 POINT (-88.0303 44.4936) 2 MRO Midwest Reliability Organization (MRO)
38 44.4936 -88.0303 10360 2008 POINT (-88.0303 44.4936) 2 MRO Midwest Reliability Organization (MRO)

Method 2 (faster when using the default operation)

In [10]:
facility_nerc = gpd.sjoin(df, geo_df, how="inner")
In [11]:
facility_nerc.head()
Out[11]:
NERC NERC_Label geometry index_right lat lon plant id year
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 1200931 40.389167 -91.394167 57953 2014
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 1200919 40.389167 -91.394167 57953 2015
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 1200907 40.389167 -91.394167 57953 2016
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 1200904 40.389167 -91.394167 57953 2017
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 1201030 40.389167 -91.394167 57953 2011

Make the new geometry of facility locations

In [12]:
geometry = [Point(xy) for xy in zip(facility_nerc.lon, facility_nerc.lat)]

Create new dataframe with the data I want to keep and the new geometry

In [13]:
crs = {'init': 'epsg:4326'}
keep_cols = ['NERC_Label', 'plant id', 'year']
facility_nerc = GeoDataFrame(facility_nerc[keep_cols], crs=crs, geometry=geometry)
In [14]:
facility_nerc.head()
Out[14]:
NERC_Label plant id year geometry
0 Indeterminate - various NERC membership 57953 2014 POINT (-91.39416700000001 40.389167)
0 Indeterminate - various NERC membership 57953 2015 POINT (-91.39416700000001 40.389167)
0 Indeterminate - various NERC membership 57953 2016 POINT (-91.39416700000001 40.389167)
0 Indeterminate - various NERC membership 57953 2017 POINT (-91.39416700000001 40.389167)
0 Indeterminate - various NERC membership 57953 2011 POINT (-91.39416700000001 40.389167)

Changing the crs to see what happens

Just making sure that it does something

In [31]:
df_test = df.to_crs({'init': 'epsg:3395'})
In [32]:
df.plot()
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1301c5f28>
In [33]:
df_test.plot()
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1334af7b8>

NEW - read file with only plant id, state, and lat/lon

Link to data file

If you need to load the shapefile, download it here.

In [ ]:
# If loading the shapefile
shape_path = ''
regions = gpd.read_file(shape_path)
In [3]:
path = join(data_path, 'Facility labels', 'Facility locations.csv')
location = pd.read_csv(path)
In [20]:
len(location)
Out[20]:
8217

The nerc column is labels from a spatial join using this same shapefile and lat/lon data in QGIS.

In [21]:
location.head()
Out[21]:
plant id state lat lon nerc
0 2 AL 33.458665 -87.356823 SERC
1 3 AL 31.006900 -88.010300 SERC
2 4 AL 32.583889 -86.283056 SERC
3 7 AL 34.012800 -85.970800 SERC
4 8 AL 33.644344 -87.196486 SERC
In [22]:
location.loc[location['lat'].isnull()]
Out[22]:
plant id state lat lon nerc
In [4]:
geometry = [Point(xy) for xy in zip(location.lon, location.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(location, crs=crs, geometry=geometry)
In [5]:
geo_df.head()
Out[5]:
plant id state lat lon nerc geometry
0 2 AL 33.458665 -87.356823 SERC POINT (-87.35682299999999 33.458665)
1 3 AL 31.006900 -88.010300 SERC POINT (-88.0103 31.0069)
2 4 AL 32.583889 -86.283056 SERC POINT (-86.283056 32.583889)
3 7 AL 34.012800 -85.970800 SERC POINT (-85.9708 34.0128)
4 8 AL 33.644344 -87.196486 SERC POINT (-87.19648599999999 33.644344)

Every plant id is unique

In [24]:
len(geo_df)
Out[24]:
8099
In [25]:
len(geo_df['plant id'].unique())
Out[25]:
8099

Method 1 - use defaults

In [26]:
df1 = gpd.sjoin(regions, geo_df)
In [27]:
df1.head()
Out[27]:
NERC NERC_Label geometry index_right plant id state lat lon nerc
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 4381 54930 IA 40.584150 -91.424530 -
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 639 1127 IA 40.703100 -92.425300 -
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 629 1104 IA 40.741200 -91.116667 -
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 630 1105 IA 40.747800 -92.873056 -
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 2562 7706 IA 40.816736 -91.146667 -

We can already see that there are more rows in this dataframe than there were power plants.

In [28]:
len(df1)
Out[28]:
8135

Method 2 - use 'within'

In [6]:
df2 = gpd.sjoin(geo_df, regions, op='within')
In [31]:
df2.head()
Out[31]:
plant id state lat lon nerc geometry index_right NERC NERC_Label
0 2 AL 33.458665 -87.356823 SERC POINT (-87.35682299999999 33.458665) 5 SERC SERC Reliability Corporation (SERC)
1 3 AL 31.006900 -88.010300 SERC POINT (-88.0103 31.0069) 5 SERC SERC Reliability Corporation (SERC)
2 4 AL 32.583889 -86.283056 SERC POINT (-86.283056 32.583889) 5 SERC SERC Reliability Corporation (SERC)
3 7 AL 34.012800 -85.970800 SERC POINT (-85.9708 34.0128) 5 SERC SERC Reliability Corporation (SERC)
4 8 AL 33.644344 -87.196486 SERC POINT (-87.19648599999999 33.644344) 5 SERC SERC Reliability Corporation (SERC)

Same size dataframe as from Method 1. Yet again, there appear to be extra results.

In [26]:
len(df2)
Out[26]:
8135

Method 3 - use 'contains'

In [39]:
df3 = gpd.sjoin(regions, geo_df, op='contains')
In [40]:
df3.head()
Out[40]:
NERC NERC_Label geometry index_right plant id state lat lon
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 4387 55340 AR 35.861900 -90.025300
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 6136 172 AR 35.689300 -89.994000
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 152 8055 AR 35.311000 -93.235100
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 6217 187 AR 34.211913 -93.110963
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000... 2928 59947 AR 33.631538 -92.703611

Still too many results

In [41]:
len(df3)
Out[41]:
8135

What facilities are missing?

In [32]:
regions
Out[32]:
NERC NERC_Label geometry
index_right
0 - Indeterminate - various NERC membership (POLYGON ((-91.71222639199999 32.9756623780000...
1 FRCC Florida Reliability Coordinating Council (FRCC) (POLYGON ((-81.95506602699999 24.5196900770000...
2 MRO Midwest Reliability Organization (MRO) POLYGON ((-95.07148604699995 49.36451082200006...
3 NPCC Northeast Power Coordinating Council (NPCC) (POLYGON ((-73.82134384999995 40.6045004290000...
4 RFC ReliabilityFirst Corporation (RFC) (POLYGON ((-90.89359863199996 29.0467829150000...
5 SERC SERC Reliability Corporation (SERC) (POLYGON ((-91.71222639199999 32.9756623780000...
6 SPP Southwest Power Pool (SPP) (POLYGON ((-93.73310749999996 30.4137588960000...
7 TRE Texas Reliability Entity (TRE) POLYGON ((-100.0008629659999 35.02950123100004...
8 WECC Western Electricity Coordinating Council (WECC) (POLYGON ((-117.215183729 32.77737140300007, -...
In [23]:
frcc = regions.loc[1, 'geometry']
serc = regions.loc[5, 'geometry']

I figured out that plant id 641 is one that shows up in both FRCC and SERC

In [27]:
plant_641 = geo_df.loc[geo_df['plant id'] == 641, 'geometry'].values[0]
In [28]:
frcc.contains(plant_641)
Out[28]:
True
In [29]:
serc.contains(plant_641)
Out[29]:
True

So 641 definitely is in both NERC regions. And apparently the regions overlap? But they aren't supposed to. And I didn't have any plants show up in multiple regions when doing the spatial join in QGIS.

In [37]:
frcc.intersects(serc)
Out[37]:
True

There are 36 plants with duplicate NERC regions.

In [38]:
len(df2.loc[df2['plant id'].duplicated()].sort_values('plant id'))
Out[38]:
36
In [10]:
pd.set_option('display.max_rows', 200)
In [11]:
df2.loc[df2['plant id'].duplicated(keep=False)].sort_values('plant id')
Out[11]:
plant id state lat lon nerc geometry index_right NERC NERC_Label
406 641 FL 30.566100 -87.224400 FRCC POINT (-87.2244 30.5661) 5 SERC SERC Reliability Corporation (SERC)
406 641 FL 30.566100 -87.224400 FRCC POINT (-87.2244 30.5661) 1 FRCC Florida Reliability Coordinating Council (FRCC)
407 642 FL 30.669200 -84.886900 FRCC POINT (-84.8869 30.6692) 5 SERC SERC Reliability Corporation (SERC)
407 642 FL 30.669200 -84.886900 FRCC POINT (-84.8869 30.6692) 1 FRCC Florida Reliability Coordinating Council (FRCC)
408 643 FL 30.268600 -85.700300 FRCC POINT (-85.7003 30.2686) 5 SERC SERC Reliability Corporation (SERC)
408 643 FL 30.268600 -85.700300 FRCC POINT (-85.7003 30.2686) 1 FRCC Florida Reliability Coordinating Council (FRCC)
432 690 FL 30.708571 -84.863869 FRCC POINT (-84.86386899999999 30.708571) 5 SERC SERC Reliability Corporation (SERC)
432 690 FL 30.708571 -84.863869 FRCC POINT (-84.86386899999999 30.708571) 1 FRCC Florida Reliability Coordinating Council (FRCC)
963 1773 MI 46.726100 -88.662500 MRO POINT (-88.66249999999999 46.7261) 4 RFC ReliabilityFirst Corporation (RFC)
963 1773 MI 46.726100 -88.662500 MRO POINT (-88.66249999999999 46.7261) 2 MRO Midwest Reliability Organization (MRO)
964 1774 MI 46.696400 -89.208900 MRO POINT (-89.2089 46.6964) 2 MRO Midwest Reliability Organization (MRO)
964 1774 MI 46.696400 -89.208900 MRO POINT (-89.2089 46.6964) 4 RFC ReliabilityFirst Corporation (RFC)
965 1775 WI 45.947200 -88.218900 MRO POINT (-88.2189 45.9472) 4 RFC ReliabilityFirst Corporation (RFC)
965 1775 WI 45.947200 -88.218900 MRO POINT (-88.2189 45.9472) 2 MRO Midwest Reliability Organization (MRO)
967 1777 MI 46.131100 -88.225000 MRO POINT (-88.22499999999999 46.1311) 4 RFC ReliabilityFirst Corporation (RFC)
967 1777 MI 46.131100 -88.225000 MRO POINT (-88.22499999999999 46.1311) 2 MRO Midwest Reliability Organization (MRO)
969 1780 MI 45.955300 -88.195800 MRO POINT (-88.19580000000001 45.9553) 4 RFC ReliabilityFirst Corporation (RFC)
969 1780 MI 45.955300 -88.195800 MRO POINT (-88.19580000000001 45.9553) 2 MRO Midwest Reliability Organization (MRO)
970 1781 MI 45.990781 -88.210514 MRO POINT (-88.210514 45.990781) 4 RFC ReliabilityFirst Corporation (RFC)
970 1781 MI 45.990781 -88.210514 MRO POINT (-88.210514 45.990781) 2 MRO Midwest Reliability Organization (MRO)
971 1784 MI 45.871900 -88.069400 MRO POINT (-88.0694 45.8719) 4 RFC ReliabilityFirst Corporation (RFC)
971 1784 MI 45.871900 -88.069400 MRO POINT (-88.0694 45.8719) 2 MRO Midwest Reliability Organization (MRO)
972 1785 MI 46.159200 -88.235300 MRO POINT (-88.2353 46.1592) 2 MRO Midwest Reliability Organization (MRO)
972 1785 MI 46.159200 -88.235300 MRO POINT (-88.2353 46.1592) 4 RFC ReliabilityFirst Corporation (RFC)
977 1821 MI 46.106278 -88.334531 MRO POINT (-88.33453100000001 46.106278) 4 RFC ReliabilityFirst Corporation (RFC)
977 1821 MI 46.106278 -88.334531 MRO POINT (-88.33453100000001 46.106278) 2 MRO Midwest Reliability Organization (MRO)
992 1848 MI 45.788900 -87.905800 MRO POINT (-87.9058 45.7889) 2 MRO Midwest Reliability Organization (MRO)
992 1848 MI 45.788900 -87.905800 MRO POINT (-87.9058 45.7889) 4 RFC ReliabilityFirst Corporation (RFC)
1978 4043 WI 44.253930 -88.409560 MRO POINT (-88.40956 44.25393) 2 MRO Midwest Reliability Organization (MRO)
1978 4043 WI 44.253930 -88.409560 MRO POINT (-88.40956 44.25393) 4 RFC ReliabilityFirst Corporation (RFC)
2017 4122 WI 44.316400 -88.314700 MRO POINT (-88.3147 44.3164) 2 MRO Midwest Reliability Organization (MRO)
2017 4122 WI 44.316400 -88.314700 MRO POINT (-88.3147 44.3164) 4 RFC ReliabilityFirst Corporation (RFC)
2020 4127 WI 44.200020 -88.458171 MRO POINT (-88.45817099999999 44.20002) 4 RFC ReliabilityFirst Corporation (RFC)
2020 4127 WI 44.200020 -88.458171 MRO POINT (-88.45817099999999 44.20002) 2 MRO Midwest Reliability Organization (MRO)
2191 6192 FL 30.573611 -86.215833 FRCC POINT (-86.21583299999999 30.573611) 1 FRCC Florida Reliability Coordinating Council (FRCC)
2191 6192 FL 30.573611 -86.215833 FRCC POINT (-86.21583299999999 30.573611) 5 SERC SERC Reliability Corporation (SERC)
2565 7715 FL 30.592080 -87.135391 FRCC POINT (-87.135391 30.59208) 1 FRCC Florida Reliability Coordinating Council (FRCC)
2565 7715 FL 30.592080 -87.135391 FRCC POINT (-87.135391 30.59208) 5 SERC SERC Reliability Corporation (SERC)
2614 7822 WI 44.274582 -88.319074 MRO POINT (-88.319074 44.274582) 2 MRO Midwest Reliability Organization (MRO)
2614 7822 WI 44.274582 -88.319074 MRO POINT (-88.319074 44.274582) 4 RFC ReliabilityFirst Corporation (RFC)
2947 10250 FL 30.266057 -85.520670 FRCC POINT (-85.52067 30.266057) 1 FRCC Florida Reliability Coordinating Council (FRCC)
2947 10250 FL 30.266057 -85.520670 FRCC POINT (-85.52067 30.266057) 5 SERC SERC Reliability Corporation (SERC)
3027 10416 FL 30.595800 -87.252500 FRCC POINT (-87.2525 30.5958) 5 SERC SERC Reliability Corporation (SERC)
3027 10416 FL 30.595800 -87.252500 FRCC POINT (-87.2525 30.5958) 1 FRCC Florida Reliability Coordinating Council (FRCC)
3434 50250 FL 30.596600 -87.326400 FRCC POINT (-87.32640000000001 30.5966) 5 SERC SERC Reliability Corporation (SERC)
3434 50250 FL 30.596600 -87.326400 FRCC POINT (-87.32640000000001 30.5966) 1 FRCC Florida Reliability Coordinating Council (FRCC)
3435 50251 MI 45.795600 -87.955600 MRO POINT (-87.9556 45.7956) 4 RFC ReliabilityFirst Corporation (RFC)
3435 50251 MI 45.795600 -87.955600 MRO POINT (-87.9556 45.7956) 2 MRO Midwest Reliability Organization (MRO)
3471 50310 FL 30.473600 -87.236100 FRCC POINT (-87.23609999999999 30.4736) 5 SERC SERC Reliability Corporation (SERC)
3471 50310 FL 30.473600 -87.236100 FRCC POINT (-87.23609999999999 30.4736) 1 FRCC Florida Reliability Coordinating Council (FRCC)
3681 50725 FL 30.848333 -87.113333 FRCC POINT (-87.113333 30.848333) 1 FRCC Florida Reliability Coordinating Council (FRCC)
3681 50725 FL 30.848333 -87.113333 FRCC POINT (-87.113333 30.848333) 5 SERC SERC Reliability Corporation (SERC)
3723 50807 FL 30.141984 -85.621103 FRCC POINT (-85.62110300000001 30.141984) 1 FRCC Florida Reliability Coordinating Council (FRCC)
3723 50807 FL 30.141984 -85.621103 FRCC POINT (-85.62110300000001 30.141984) 5 SERC SERC Reliability Corporation (SERC)
4327 54842 WI 44.286900 -88.333900 MRO POINT (-88.3339 44.2869) 4 RFC ReliabilityFirst Corporation (RFC)
4327 54842 WI 44.286900 -88.333900 MRO POINT (-88.3339 44.2869) 2 MRO Midwest Reliability Organization (MRO)
4352 54885 WI 44.276900 -88.334400 MRO POINT (-88.3344 44.2769) 2 MRO Midwest Reliability Organization (MRO)
4352 54885 WI 44.276900 -88.334400 MRO POINT (-88.3344 44.2769) 4 RFC ReliabilityFirst Corporation (RFC)
4527 55135 WI 44.193600 -88.506400 MRO POINT (-88.5064 44.1936) 4 RFC ReliabilityFirst Corporation (RFC)
4527 55135 WI 44.193600 -88.506400 MRO POINT (-88.5064 44.1936) 2 MRO Midwest Reliability Organization (MRO)
4610 55242 FL 30.566400 -87.115000 FRCC POINT (-87.11499999999999 30.5664) 1 FRCC Florida Reliability Coordinating Council (FRCC)
4610 55242 FL 30.566400 -87.115000 FRCC POINT (-87.11499999999999 30.5664) 5 SERC SERC Reliability Corporation (SERC)
5022 56037 WI 44.189036 -88.468492 MRO POINT (-88.468492 44.189036) 4 RFC ReliabilityFirst Corporation (RFC)
5022 56037 WI 44.189036 -88.468492 MRO POINT (-88.468492 44.189036) 2 MRO Midwest Reliability Organization (MRO)
5397 56522 FL 30.928333 -85.426111 FRCC POINT (-85.42611099999999 30.928333) 1 FRCC Florida Reliability Coordinating Council (FRCC)
5397 56522 FL 30.928333 -85.426111 FRCC POINT (-85.42611099999999 30.928333) 5 SERC SERC Reliability Corporation (SERC)
6162 57502 FL 30.570000 -87.390000 FRCC POINT (-87.39 30.57) 1 FRCC Florida Reliability Coordinating Council (FRCC)
6162 57502 FL 30.570000 -87.390000 FRCC POINT (-87.39 30.57) 5 SERC SERC Reliability Corporation (SERC)
7160 58663 WI 44.115000 -88.217222 MRO POINT (-88.21722199999999 44.115) 2 MRO Midwest Reliability Organization (MRO)
7160 58663 WI 44.115000 -88.217222 MRO POINT (-88.21722199999999 44.115) 4 RFC ReliabilityFirst Corporation (RFC)
7859 59689 FL 30.515448 -86.518407 FRCC POINT (-86.518407 30.515448) 5 SERC SERC Reliability Corporation (SERC)
7859 59689 FL 30.515448 -86.518407 FRCC POINT (-86.518407 30.515448) 1 FRCC Florida Reliability Coordinating Council (FRCC)
In [17]:
facility_df.head()
Out[17]:
f fuel month plant id total fuel (mmbtu) year generation (MWh) elec fuel (mmbtu) geography last_updated lat lon prime mover datetime quarter all fuel fossil CO2 (kg) elec fuel fossil CO2 (kg) all fuel total CO2 (kg) elec fuel total CO2 (kg)
0 M SUB 6 10360 0.0 2017 0.0 0.0 USA-WI 2017-08-24T11:46:12-04:00 44.4936 -88.0303 ALL 2017-06-01 2 0.0 0.0 0.0 0.0
1 M SUB 5 10360 0.0 2017 0.0 0.0 USA-WI 2017-08-24T11:46:12-04:00 44.4936 -88.0303 ALL 2017-05-01 2 0.0 0.0 0.0 0.0
2 M SUB 4 10360 0.0 2017 0.0 0.0 USA-WI 2017-08-24T11:46:12-04:00 44.4936 -88.0303 ALL 2017-04-01 2 0.0 0.0 0.0 0.0
3 M SUB 3 10360 0.0 2017 0.0 0.0 USA-WI 2017-08-24T11:46:12-04:00 44.4936 -88.0303 ALL 2017-03-01 1 0.0 0.0 0.0 0.0
4 M SUB 2 10360 0.0 2017 0.0 0.0 USA-WI 2017-08-24T11:46:12-04:00 44.4936 -88.0303 ALL 2017-02-01 1 0.0 0.0 0.0 0.0
In [49]:
plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon']]
plants.drop_duplicates(inplace=True)
In [24]:
plants.groupby(['plant id']).max().hist()
Out[24]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1165f8ef0>]], dtype=object)
In [116]:
path = join(data_path, 'EIA downloads', 'EIA-860 2015', '2___Plant_Y2015.xlsx')
nercs2015 = pd.read_excel(path, skiprows=0, parse_cols='C,L')
nercs2015.columns = ['plant id', 'nerc']

path = join(data_path, 'EIA downloads', 'eia8602012', 'PlantY2012.xlsx')
nercs2012 = pd.read_excel(path, skiprows=0, parse_cols='B,J')
nercs2012.columns = ['plant id', 'nerc']
In [117]:
nercs2012.head()
Out[117]:
plant id nerc
0 10867 SERC
1 50903 RFC
2 10671 SPP
3 2527 NPCC
4 3305 SERC
In [118]:
nercs = pd.concat([nercs2012, nercs2015]).drop_duplicates()
In [119]:
len(nercs), len(nercs2012), len(nercs2015)
Out[119]:
(9178, 7289, 8928)
In [120]:
df = pd.merge(plants, nercs, on=['plant id'], how='left')
In [121]:
plants.tail()
Out[121]:
plant id year lat lon
1544548 1275 2006 38.947222 -99.528611
1544560 1275 2005 38.947222 -99.528611
1544572 1275 2004 38.947222 -99.528611
1544584 1275 2003 38.947222 -99.528611
1544596 1275 2002 38.947222 -99.528611
In [122]:
nercs.tail()
Out[122]:
plant id nerc
8923 60510 SERC
8924 60511 WECC
8925 60514 SERC
8926 60550 SERC
8927 60571 WECC
In [123]:
nercs.nerc.unique()
Out[123]:
array(['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'HICC', 'ASCC',
       'FRCC', nan], dtype=object)
In [67]:
df.tail()
Out[67]:
plant id year lat lon nerc
89242 1275 2006 38.947222 -99.528611 SPP
89243 1275 2005 38.947222 -99.528611 SPP
89244 1275 2004 38.947222 -99.528611 SPP
89245 1275 2003 38.947222 -99.528611 SPP
89246 1275 2002 38.947222 -99.528611 SPP
In [63]:
len(df.loc[df.isnull().any(axis=1), 'plant id'].unique())
Out[63]:
88
In [148]:
from sklearn import neighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
In [124]:
nercs = ['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'FRCC']
In [125]:
df_slim = df.loc[df.nerc.isin(nercs), ['plant id', 'lat', 'lon', 'nerc']].dropna(subset=['lon']).drop_duplicates()
In [126]:
unknown = df_slim.loc[df_slim.nerc.isnull()]
In [69]:
n_neighbors = 10
In [155]:
X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']

X_scale = StandardScaler().fit_transform(X)
In [164]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
In [128]:
y.unique()
Out[128]:
array(['MRO', 'WECC', 'NPCC', 'SERC', 'SPP', 'RFC', 'FRCC', 'TRE'], dtype=object)
In [165]:
knn = neighbors.KNeighborsClassifier()
In [ ]:
params = {'weights': ['uniform', 'distance'],
          'n_neighbors': [10, 15, 20, 30],
          'leaf_size': [3, 5, 10, 30],
          'p': [1, 2]}

clf = GridSearchCV(knn, params, n_jobs=-1)
In [160]:
clf.fit(X_train, y_train)
Out[160]:
GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [10, 15, 20, 30], 'leaf_size': [3, 5, 10, 30], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
In [161]:
clf.best_estimator_, clf.best_params_, clf.best_score_
Out[161]:
(KNeighborsClassifier(algorithm='auto', leaf_size=3, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=15, p=1,
            weights='distance'),
 {'leaf_size': 3, 'n_neighbors': 15, 'p': 1, 'weights': 'distance'},
 0.96590695046148045)

No difference in score when X is preprocessed with StandardScaler

In [163]:
clf.score(X_scale, y)
Out[163]:
0.98498422712933753
In [102]:
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf.fit(X_train, y_train)
Out[102]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
In [103]:
clf.score(X_test, y_test)
Out[103]:
0.95693087224031848
In [104]:
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf.fit(X_train, y_train)
Out[104]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='distance')
In [105]:
clf.score(X_test, y_test)
Out[105]:
0.95946435034382915
In [95]:
h = .02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/anaconda/envs/py36/lib/python3.6/site-packages/numpy/ma/core.py in min(obj, axis, out, fill_value, keepdims)
   6441     try:
-> 6442         return obj.min(axis=axis, fill_value=fill_value, out=out, **kwargs)
   6443     except (AttributeError, TypeError):

TypeError: _amin() got an unexpected keyword argument 'fill_value'

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
<ipython-input-95-cf8de1f4ef1e> in <module>()
     21     Z = Z.reshape(xx.shape)
     22     plt.figure()
---> 23     plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
     24 
     25     # Plot also the training points

~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/pyplot.py in pcolormesh(*args, **kwargs)
   3242                       mplDeprecation)
   3243     try:
-> 3244         ret = ax.pcolormesh(*args, **kwargs)
   3245     finally:
   3246         ax._hold = washold

~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1896                     warnings.warn(msg % (label_namer, func.__name__),
   1897                                   RuntimeWarning, stacklevel=2)
-> 1898             return func(ax, *args, **kwargs)
   1899         pre_doc = inner.__doc__
   1900         if pre_doc is None:

~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/axes/_axes.py in pcolormesh(self, *args, **kwargs)
   5601         collection.set_norm(norm)
   5602         collection.set_clim(vmin, vmax)
-> 5603         collection.autoscale_None()
   5604 
   5605         self.grid(False)

~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/cm.py in autoscale_None(self)
    343         if self._A is None:
    344             raise TypeError('You must first set_array for mappable')
--> 345         self.norm.autoscale_None(self._A)
    346         self.changed()
    347 

~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/colors.py in autoscale_None(self, A)
    887         ' autoscale only None-valued vmin or vmax'
    888         if self.vmin is None and np.size(A) > 0:
--> 889             self.vmin = ma.min(A)
    890         if self.vmax is None and np.size(A) > 0:
    891             self.vmax = ma.max(A)

~/anaconda/envs/py36/lib/python3.6/site-packages/numpy/ma/core.py in min(obj, axis, out, fill_value, keepdims)
   6445         # fill_value argument
   6446         return asanyarray(obj).min(axis=axis, fill_value=fill_value,
-> 6447                                    out=out, **kwargs)
   6448 min.__doc__ = MaskedArray.min.__doc__
   6449 

~/anaconda/envs/py36/lib/python3.6/site-packages/numpy/ma/core.py in min(self, axis, out, fill_value, keepdims)
   5592         if out is None:
   5593             result = self.filled(fill_value).min(
-> 5594                 axis=axis, out=out, **kwargs).view(type(self))
   5595             if result.ndim:
   5596                 # Set the mask

AttributeError: 'str' object has no attribute 'view'