%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import geopandas as gpd
# from shapely.geometry import Point
# from geopandas import GeoDataFrame
import os
from os.path import join
import pandas as pd
import fiona
# from cartopy import crs as ccrs
# from urllib.request import urlopen
# from zipfile import ZipFile
# from io import BytesIO
# from pathlib import Path
sns.set(style='white')
cwd = os.getcwd()
data_path = join(cwd, '..', '..', 'Data storage')
figure_path = join(cwd, '..', '..', 'Figures')
file_date = '2018-03-06'
%load_ext watermark
%watermark -v -p pandas,geopandas,shapely
CPython 3.6.4 IPython 6.2.1 pandas 0.22.0 geopandas 0.3.0 shapely 1.6.4.post1
Link to the shapefile for local download
# EIA NERC region shapefile, which has an "Indeterminate" region
# path = os.path.join(data_path, 'NERC_Regions_EIA', 'NercRegions_201610.shp')
# regions = gpd.read_file(path)
# regions.crs
{'init': 'epsg:4326'}
path = os.path.join(data_path, 'nercregions', 'NERCregions.shp')
regions_nerc = gpd.read_file(path)
regions_nerc['nerc'] = regions_nerc['NERCregion']
regions_nerc
OBJECTID | NERCregion | SHAPE_Leng | SHAPE_Area | geometry | nerc | |
---|---|---|---|---|---|---|
0 | 1 | FRCC | 22.488030 | 11.459289 | POLYGON ((-84.86300370899994 30.71266453000004... | FRCC |
1 | 2 | MRO | 76.997961 | 130.425939 | POLYGON ((-94.8320392469999 49.33080593000005,... | MRO |
2 | 3 | NPCC | 48.574281 | 32.754394 | (POLYGON ((-72.5509712409999 40.96617993100006... | NPCC |
3 | 4 | RFC | 99.332877 | 71.157869 | (POLYGON ((-87.92640864299995 44.5391396020000... | RFC |
4 | 5 | SERC | 128.958444 | 142.139208 | POLYGON ((-80.20680177299994 36.54903791300006... | SERC |
5 | 6 | SPP | 61.523164 | 56.496471 | POLYGON ((-97.56098691099993 36.38533196100008... | SPP |
6 | 7 | TRE | 44.145641 | 50.605118 | POLYGON ((-104.4498774409999 32.00686615000006... | TRE |
7 | 8 | WECC | 102.110211 | 323.227276 | (POLYGON ((-122.6656125679999 48.3967777520000... | WECC |
regions_nerc.boundary.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1a75666ac8>
Now using NERC region shapefiles created by DHS
path = os.path.join(data_path, 'NERC_Regions', 'NERC_Regions.shp')
regions = gpd.read_file(path)
regions.columns = regions.columns.str.lower()
regions['nerc'] = regions['name'].str.split().str[-1].str.strip('()')
# regions = regions.loc[:, ['nerc', 'geometry']]
regions.head()
objectid | id | subname | name | address | city | state | zipcode | country | website | method | date | source | shape__are | shape__len | geometry | nerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | WECC SOUTHWEST | WESTERN ELECTRICITY COORDINATING COUNCIL (WECC) | 155 NORTH 400 WEST SUITE 200 | SALT LAKE CITY | UT | 84103 | USA | https://www.wecc.biz/Pages/home.aspx | OTHER | 2017-04-21T00:00:00.000Z | EIA 861, eGRID 2010, http://www.nerc.com/ | 84.129616 | 60.209543 | (POLYGON ((-106.940414613533 38.5297752935145,... | WECC |
1 | 2 | 1 | WECC CALIFORNIA | WESTERN ELECTRICITY COORDINATING COUNCIL (WECC) | 156 NORTH 400 WEST SUITE 200 | SALT LAKE CITY | UT | 84103 | USA | https://www.wecc.biz/Pages/home.aspx | OTHER | 2017-04-21T00:00:00.000Z | EIA 861, eGRID 2010, http://www.nerc.com/ | 43.141024 | 59.568593 | (POLYGON ((-124.166090785201 41.0947342364264,... | WECC |
2 | 3 | 2 | ERCOT ALL | TEXAS RELIABILITY ENTITY (TRE) | 800 AIRPORT ROAD | TAYLOR | TX | 76574 | USA | http://www.ercot.com/ | OTHER | 2017-04-21T00:00:00.000Z | EIA 861, eGRID 2010, http://www.nerc.com/ | 61.102506 | 131.535864 | (POLYGON ((-123.715621602827 44.0113707388719,... | TRE |
3 | 4 | 3 | FRCC ALL | FLORIDA RELIABILITY COORDINATING COUNCIL (FRCC) | 3000 BAYPORT DRIVE, SUITE 600 | TAMPA | FL | 33607 | USA | https://www.frcc.com/default.aspx | OTHER | 2017-04-21T00:00:00.000Z | EIA 861, eGRID 2010, http://www.nerc.com/ | 12.385353 | 170.579871 | (POLYGON ((-85.7661778899989 30.3156300105005,... | FRCC |
4 | 5 | 4 | NPCC NEW ENGLAND | NORTHEAST POWER COORDINATING COUNCIL (NPCC) | 1040 AVENUE OF THE AMERICAS, 10TH FLOOR | NEW YORK | NY | 10018 | USA | https://www.npcc.org/default.aspx | OTHER | 2017-04-21T00:00:00.000Z | EIA 861, eGRID 2010, http://www.nerc.com/ | 19.384108 | 151.655607 | (POLYGON ((-67.1335131587897 45.1268843657456,... | NPCC |
regions.loc[regions['nerc']=='WECC'].plot(cmap='tab20', alpha=0.5)
<matplotlib.axes._subplots.AxesSubplot at 0x1a7d1fe908>
regions.plot()
regions.to_crs(epsg=2163).plot(cmap='tab20', alpha=0.5,
linewidth=0.5, edgecolor='0.1')
<matplotlib.axes._subplots.AxesSubplot at 0x1a7d997c50>
# This is slow!
regions_nerc = regions.dissolve(by='nerc', as_index=False)
regions_nerc.to_crs(epsg=2163).plot()
<matplotlib.axes._subplots.AxesSubplot at 0x68c397ba58>
path = join(data_path, 'NERC_Regions', 'Aggregated NERC Regions.geojson')
regions_nerc.to_file(path, driver='GeoJSON')
fiona.open()
path = join(data_path, 'NERC_Regions', 'Aggregated NERC Regions.geojson')
regions_nerc = gpd.read_file(path, driver='GeoJSON')
path = os.path.join(data_path, 'cb_2016_us_state_20m', 'cb_2016_us_state_20m.shp')
states = gpd.read_file(path)
states.crs
{'init': 'epsg:4269'}
drop_states = ['Alaska', 'Hawaii', 'Puerto Rico']
states = states.loc[~states['NAME'].isin(drop_states)]
states.to_crs(epsg=2163).plot()
<matplotlib.axes._subplots.AxesSubplot at 0x118104c88>
path = join(data_path, 'final NERC data',
'Summary table {}.csv'.format(file_date))
index = pd.read_csv(path)
index
nerc | 2001 | 2017 | Reduction | Percent Reduction | Annual Reduction | |
---|---|---|---|---|---|---|
0 | TRE | 609.480997 | 445.396025 | 164.084972 | 0.269221 | 10.255311 |
1 | WECC | 520.628493 | 346.627554 | 174.000939 | 0.334213 | 10.875059 |
2 | USA | 629.016312 | 438.047363 | 190.968949 | 0.303599 | 11.935559 |
3 | SERC | 633.368212 | 426.814179 | 206.554033 | 0.326120 | 12.909627 |
4 | RFC | 676.181487 | 461.065520 | 215.115966 | 0.318133 | 13.444748 |
5 | FRCC | 648.223415 | 410.133060 | 238.090355 | 0.367297 | 14.880647 |
6 | NPCC | 414.395302 | 172.744904 | 241.650399 | 0.583140 | 15.103150 |
7 | MRO | 855.109267 | 541.516736 | 313.592531 | 0.366728 | 19.599533 |
8 | SPP | 853.801162 | 494.664676 | 359.136486 | 0.420632 | 22.446030 |
for nerc in regions_nerc['nerc'].unique():
try:
val_2017 = index.loc[index['nerc']==nerc, '2017'].values[0]
val_2001 = index.loc[index['nerc']==nerc, '2001'].values[0]
reduce = index.loc[index['nerc']==nerc, 'Percent Reduction'].values[0]
# print(val)
regions_nerc.loc[regions_nerc['nerc']==nerc, 2017] = val_2017
regions_nerc.loc[regions_nerc['nerc']==nerc, 2001] = val_2001
regions_nerc.loc[regions_nerc['nerc']==nerc, 'reduction'] = '{:.0%}'.format(reduce)
regions_nerc.loc[regions_nerc['nerc']==nerc, 'reduction value'] = reduce
except:
pass
regions_nerc
OBJECTID | NERCregion | SHAPE_Leng | SHAPE_Area | geometry | nerc | 2017 | 2001 | reduction | reduction value | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | FRCC | 22.488030 | 11.459289 | POLYGON ((-84.86300370899994 30.71266453000004... | FRCC | 410.133060 | 648.223415 | 37% | 0.367297 |
1 | 2 | MRO | 76.997961 | 130.425939 | POLYGON ((-94.8320392469999 49.33080593000005,... | MRO | 541.516736 | 855.109267 | 37% | 0.366728 |
2 | 3 | NPCC | 48.574281 | 32.754394 | (POLYGON ((-72.5509712409999 40.96617993100006... | NPCC | 172.744904 | 414.395302 | 58% | 0.583140 |
3 | 4 | RFC | 99.332877 | 71.157869 | (POLYGON ((-87.92640864299995 44.5391396020000... | RFC | 461.065520 | 676.181487 | 32% | 0.318133 |
4 | 5 | SERC | 128.958444 | 142.139208 | POLYGON ((-80.20680177299994 36.54903791300006... | SERC | 426.814179 | 633.368212 | 33% | 0.326120 |
5 | 6 | SPP | 61.523164 | 56.496471 | POLYGON ((-97.56098691099993 36.38533196100008... | SPP | 494.664676 | 853.801162 | 42% | 0.420632 |
6 | 7 | TRE | 44.145641 | 50.605118 | POLYGON ((-104.4498774409999 32.00686615000006... | TRE | 445.396025 | 609.480997 | 27% | 0.269221 |
7 | 8 | WECC | 102.110211 | 323.227276 | (POLYGON ((-122.6656125679999 48.3967777520000... | WECC | 346.627554 | 520.628493 | 33% | 0.334213 |
regions_albers = regions_nerc.to_crs(epsg=2163)
states_albers = states.to_crs(epsg=2163)
regions_albers.plot(column=2001, cmap='cividis_r', edgecolor='0.1', linewidth=1)
<matplotlib.axes._subplots.AxesSubplot at 0x1a76c7f908>
def plot_nerc_annual(regions_proj, states_proj, data_col, text_col,
cmap='cividis_r', vmin=None, vmax=None, title=None,
cbar_title=None, **kwargs):
states_ec = kwargs.get('states_ec', '0.6')
regions_ec = kwargs.get('regions_ec', '0.2')
regions_lw = kwargs.get('regions_lw', 0.75)
font_size = kwargs.get('font_size', 9)
bbox_alpha = kwargs.get('bbox_alpha', 0.7)
FRCC_x = kwargs.get('FRCC_x', 4.75)
SERC_x = kwargs.get('SERC_x', 2)
SERC_y = kwargs.get('SERC_y', -2)
SPP_y = kwargs.get('SPP_y', 1.75)
RFC_y = kwargs.get('RFC_y', -0.5)
fig, ax = plt.subplots(figsize=(8,3.5))
# set aspect to equal. This is done automatically
# when using *geopandas* plot on it's own, but not when
# working with pyplot directly.
ax.set_aspect('equal')
regions_proj.plot(ax=ax, column=data_col, cmap=cmap, legend=True,
vmin=vmin, vmax=vmax)
states_proj.plot(ax=ax, color='none', edgecolor=states_ec)
regions_proj.plot(ax=ax, color='none', edgecolor=regions_ec,
linewidth=regions_lw)
# plt.text(x=1.1, y=1.01, s=cbar_title, transform=ax.transAxes,
# ha='center', va='bottom', fontdict={'size':font_size})
plt.title(title)
for point, nerc in zip(regions_proj.centroid, regions_proj['nerc'].values):
text = regions_proj.loc[regions_proj['nerc']==nerc, text_col].values[0]
# text = '{}'.format(nerc, reduce)
x = point.x
y = point.y
if nerc == 'FRCC':
x = x + conv_lon(4.75)#-79
y = y - conv_lat(1)#28
rot = -67
plt.text(x, y, text, ha='center', va='center',
fontdict={'size':font_size})
elif nerc == 'NPCC':
x = x - conv_lon(1.5)
y = y + conv_lat(2.1)
plt.text(x, y, text, ha='center',
fontdict={'size':font_size})
elif nerc == 'SERC':
x = x + conv_lon(SERC_x)
y = y + conv_lat(SERC_y)
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
elif nerc == 'RFC':
# x = x + conv_lon(RFC_x)
y = y + conv_lat(RFC_y)
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
elif nerc == 'SPP':
# x = x + 2
y = y + conv_lat(SPP_y)
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
else:
plt.text(x, y, text, ha='center', va='center',
bbox=dict(facecolor='white',
alpha=bbox_alpha,
boxstyle="square"),
fontdict={'size':font_size})
sns.despine(left=True, bottom=True)
ax.set_yticklabels(labels=[])
ax.set_xticklabels(labels=[])
cax = fig.get_axes()[-1]
cax.set_title(cbar_title, fontdict={'size':font_size})
# https://gist.github.com/springmeyer/871897
def conv_lon(x):
newx = x * 20037508.34 / 180
return newx
def conv_lat(y):
newy = np.log(np.tan((90 + y) * np.pi / 360)) / (np.pi / 180)
newy *= 20037508.34 / 180
return newy
title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
kwargs = dict(
regions_lw = 1,
regions_ec = '0.1',
SERC_y = -1.5,
SPP_y = 2.25
)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2001,
text_col='nerc', vmin=vmin, vmax=vmax, title=title,
cbar_title='g CO$_2$/kWh', **kwargs)
path = join(figure_path, 'NERC map_cividis_2001.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
kwargs = dict(
regions_lw = 1,
regions_ec = '0.1',
SERC_y = -1.5,
SPP_y = 2.25
)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2017,
text_col='arrow reduction', vmin=vmin, vmax=vmax, title=title,
cbar_title='g CO$_2$/kWh', **kwargs)
path = join(figure_path, 'NERC map_cividis_2017_change.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2001,
text_col='NERC', vmin=vmin, vmax=vmax, cmap='Reds',
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_reds_2001.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col=2017,
text_col='arrow reduction', vmin=vmin, vmax=vmax, cmap='Reds',
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_reds_2017_change.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
vmax = max(abs(regions_albers['2017 diff'].min()), regions_albers['2017 diff'].max())
vmin = -vmax
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col='2001 diff',
text_col='NERC', cmap='PRGn_r', vmin=vmin, vmax=vmax,
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_2001_2017_bounds.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmax = max(abs(regions_albers['2017 diff'].min()), regions_albers['2017 diff'].max())
vmin = -vmax
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col='2017 diff',
text_col='arrow reduction', cmap='PRGn_r', vmin=vmin, vmax=vmax,
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_2017.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmax = max(abs(regions_albers['2017 diff'].min()), regions_albers['2017 diff'].max())
vmin = -vmax
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']
plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers, data_col='2017 diff',
text_col='arrow reduction', cmap='BrBG_r', vmin=vmin, vmax=vmax,
title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_BrBG_2017.png')
plt.savefig(path, bbox_inches='tight', dpi=350)
path = os.path.join(data_path, 'Facility gen fuels and CO2 2017-08-31.zip')
facility_df = pd.read_csv(path)
From 2001-mid 2017, 9 facilities (out of 8,435) don't have valid lat/lon data. All of them are from 2010 or earlier. Drop these rows or the spatial join fails. The amount of generation from these plants is tiny - well under 0.05% in all years.
print(len(facility_df['plant id'].unique()), 'total plants')
print(len(facility_df.loc[facility_df['lat'].isnull(), 'plant id'].unique()),
'plants without lat/lon')
8435 total plants 9 plants without lat/lon
years = facility_df.loc[facility_df['lat'].isnull(), 'year'].unique()
for year in years:
total_gen = facility_df.loc[facility_df['year'] == year, 'generation (MWh)'].sum()
# Plant ids with no 'lat' in year
no_loc_plants = facility_df.loc[(facility_df['lat'].isnull()) &
(facility_df['year'] == year), 'plant id'].unique()
no_loc_gen = facility_df.loc[(facility_df['year'] == year) &
(facility_df['plant id'].isin(no_loc_plants)),
'generation (MWh)'].sum()
percent_dropped = no_loc_gen / total_gen * 100
print('In {}, {:.3f}% of generation is from plants without lat/lon'.format(year, percent_dropped))
In 2006, 0.000% of generation is from plants without lat/lon In 2005, 0.001% of generation is from plants without lat/lon In 2004, 0.001% of generation is from plants without lat/lon In 2002, 0.024% of generation is from plants without lat/lon In 2001, 0.031% of generation is from plants without lat/lon In 2010, 0.000% of generation is from plants without lat/lon In 2009, 0.000% of generation is from plants without lat/lon In 2007, 0.000% of generation is from plants without lat/lon In 2003, 0.011% of generation is from plants without lat/lon
facility_df.dropna(inplace=True, subset=['lat', 'lon'])
facility_df.columns
Index(['f', 'fuel', 'month', 'plant id', 'total fuel (mmbtu)', 'year', 'generation (MWh)', 'elec fuel (mmbtu)', 'geography', 'last_updated', 'lat', 'lon', 'prime mover', 'datetime', 'quarter', 'all fuel fossil CO2 (kg)', 'elec fuel fossil CO2 (kg)', 'all fuel total CO2 (kg)', 'elec fuel total CO2 (kg)'], dtype='object')
Because I have monthly data for every facility from 2001-2017, there are lots of duplicate rows. No need to do a spatial join on every row. Just keep one instance of each facility in each year.
cols = ['lat', 'lon', 'plant id', 'year']
small_facility = facility_df.loc[:, cols].drop_duplicates()
Use Point
from Shapely to create the geometry
list of facility locations. crs
is the coordinate reference system that translates lat/lon into a specific map projection.
geometry = [Point(xy) for xy in zip(small_facility.lon, small_facility.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(small_facility, crs=crs, geometry=geometry)
geo_df.head()
lat | lon | plant id | year | geometry | |
---|---|---|---|---|---|
0 | 44.4936 | -88.0303 | 10360 | 2017 | POINT (-88.0303 44.4936) |
6 | 44.4936 | -88.0303 | 10360 | 2016 | POINT (-88.0303 44.4936) |
18 | 44.4936 | -88.0303 | 10360 | 2015 | POINT (-88.0303 44.4936) |
30 | 44.4936 | -88.0303 | 10360 | 2014 | POINT (-88.0303 44.4936) |
38 | 44.4936 | -88.0303 | 10360 | 2008 | POINT (-88.0303 44.4936) |
len(geo_df)
88918
Joining the 9 regions (NERC) with 100,810 records takes much longer with facilities as the left dataframe in the join (2 min vs 12 seconds). Not quite sure why this is. The faster method leaves me with polygons rather than points tho. There might be a better way to rectify this, but I just create a new geodataframe with the geometry set as the lat/lon points.
EDIT
Although the GeoPandas documentation says that the op
parameter doesn't matter for point-in-polygon operations, using 'within' made the operation much faster (12 seconds). The first method below is probably preferable since it keeps the point geometry.
Method 1 (slow unless you use op='within'
)
facility_nerc = gpd.sjoin(geo_df, regions, how='inner', op='within')
facility_nerc.head()
lat | lon | plant id | year | geometry | index_right | NERC | NERC_Label | |
---|---|---|---|---|---|---|---|---|
0 | 44.4936 | -88.0303 | 10360 | 2017 | POINT (-88.0303 44.4936) | 2 | MRO | Midwest Reliability Organization (MRO) |
6 | 44.4936 | -88.0303 | 10360 | 2016 | POINT (-88.0303 44.4936) | 2 | MRO | Midwest Reliability Organization (MRO) |
18 | 44.4936 | -88.0303 | 10360 | 2015 | POINT (-88.0303 44.4936) | 2 | MRO | Midwest Reliability Organization (MRO) |
30 | 44.4936 | -88.0303 | 10360 | 2014 | POINT (-88.0303 44.4936) | 2 | MRO | Midwest Reliability Organization (MRO) |
38 | 44.4936 | -88.0303 | 10360 | 2008 | POINT (-88.0303 44.4936) | 2 | MRO | Midwest Reliability Organization (MRO) |
Method 2 (faster when using the default operation)
facility_nerc = gpd.sjoin(df, geo_df, how="inner")
facility_nerc.head()
NERC | NERC_Label | geometry | index_right | lat | lon | plant id | year | |
---|---|---|---|---|---|---|---|---|
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 1200931 | 40.389167 | -91.394167 | 57953 | 2014 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 1200919 | 40.389167 | -91.394167 | 57953 | 2015 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 1200907 | 40.389167 | -91.394167 | 57953 | 2016 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 1200904 | 40.389167 | -91.394167 | 57953 | 2017 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 1201030 | 40.389167 | -91.394167 | 57953 | 2011 |
Make the new geometry of facility locations
geometry = [Point(xy) for xy in zip(facility_nerc.lon, facility_nerc.lat)]
Create new dataframe with the data I want to keep and the new geometry
crs = {'init': 'epsg:4326'}
keep_cols = ['NERC_Label', 'plant id', 'year']
facility_nerc = GeoDataFrame(facility_nerc[keep_cols], crs=crs, geometry=geometry)
facility_nerc.head()
NERC_Label | plant id | year | geometry | |
---|---|---|---|---|
0 | Indeterminate - various NERC membership | 57953 | 2014 | POINT (-91.39416700000001 40.389167) |
0 | Indeterminate - various NERC membership | 57953 | 2015 | POINT (-91.39416700000001 40.389167) |
0 | Indeterminate - various NERC membership | 57953 | 2016 | POINT (-91.39416700000001 40.389167) |
0 | Indeterminate - various NERC membership | 57953 | 2017 | POINT (-91.39416700000001 40.389167) |
0 | Indeterminate - various NERC membership | 57953 | 2011 | POINT (-91.39416700000001 40.389167) |
Just making sure that it does something
df_test = df.to_crs({'init': 'epsg:3395'})
df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1301c5f28>
df_test.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1334af7b8>
If you need to load the shapefile, download it here.
# If loading the shapefile
shape_path = ''
regions = gpd.read_file(shape_path)
path = join(data_path, 'Facility labels', 'Facility locations.csv')
location = pd.read_csv(path)
len(location)
8217
The nerc
column is labels from a spatial join using this same shapefile and lat/lon data in QGIS.
location.head()
plant id | state | lat | lon | nerc | |
---|---|---|---|---|---|
0 | 2 | AL | 33.458665 | -87.356823 | SERC |
1 | 3 | AL | 31.006900 | -88.010300 | SERC |
2 | 4 | AL | 32.583889 | -86.283056 | SERC |
3 | 7 | AL | 34.012800 | -85.970800 | SERC |
4 | 8 | AL | 33.644344 | -87.196486 | SERC |
location.loc[location['lat'].isnull()]
plant id | state | lat | lon | nerc |
---|
geometry = [Point(xy) for xy in zip(location.lon, location.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(location, crs=crs, geometry=geometry)
geo_df.head()
plant id | state | lat | lon | nerc | geometry | |
---|---|---|---|---|---|---|
0 | 2 | AL | 33.458665 | -87.356823 | SERC | POINT (-87.35682299999999 33.458665) |
1 | 3 | AL | 31.006900 | -88.010300 | SERC | POINT (-88.0103 31.0069) |
2 | 4 | AL | 32.583889 | -86.283056 | SERC | POINT (-86.283056 32.583889) |
3 | 7 | AL | 34.012800 | -85.970800 | SERC | POINT (-85.9708 34.0128) |
4 | 8 | AL | 33.644344 | -87.196486 | SERC | POINT (-87.19648599999999 33.644344) |
Every plant id is unique
len(geo_df)
8099
len(geo_df['plant id'].unique())
8099
df1 = gpd.sjoin(regions, geo_df)
df1.head()
NERC | NERC_Label | geometry | index_right | plant id | state | lat | lon | nerc | |
---|---|---|---|---|---|---|---|---|---|
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 4381 | 54930 | IA | 40.584150 | -91.424530 | - |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 639 | 1127 | IA | 40.703100 | -92.425300 | - |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 629 | 1104 | IA | 40.741200 | -91.116667 | - |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 630 | 1105 | IA | 40.747800 | -92.873056 | - |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 2562 | 7706 | IA | 40.816736 | -91.146667 | - |
We can already see that there are more rows in this dataframe than there were power plants.
len(df1)
8135
df2 = gpd.sjoin(geo_df, regions, op='within')
df2.head()
plant id | state | lat | lon | nerc | geometry | index_right | NERC | NERC_Label | |
---|---|---|---|---|---|---|---|---|---|
0 | 2 | AL | 33.458665 | -87.356823 | SERC | POINT (-87.35682299999999 33.458665) | 5 | SERC | SERC Reliability Corporation (SERC) |
1 | 3 | AL | 31.006900 | -88.010300 | SERC | POINT (-88.0103 31.0069) | 5 | SERC | SERC Reliability Corporation (SERC) |
2 | 4 | AL | 32.583889 | -86.283056 | SERC | POINT (-86.283056 32.583889) | 5 | SERC | SERC Reliability Corporation (SERC) |
3 | 7 | AL | 34.012800 | -85.970800 | SERC | POINT (-85.9708 34.0128) | 5 | SERC | SERC Reliability Corporation (SERC) |
4 | 8 | AL | 33.644344 | -87.196486 | SERC | POINT (-87.19648599999999 33.644344) | 5 | SERC | SERC Reliability Corporation (SERC) |
Same size dataframe as from Method 1. Yet again, there appear to be extra results.
len(df2)
8135
df3 = gpd.sjoin(regions, geo_df, op='contains')
df3.head()
NERC | NERC_Label | geometry | index_right | plant id | state | lat | lon | |
---|---|---|---|---|---|---|---|---|
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 4387 | 55340 | AR | 35.861900 | -90.025300 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 6136 | 172 | AR | 35.689300 | -89.994000 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 152 | 8055 | AR | 35.311000 | -93.235100 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 6217 | 187 | AR | 34.211913 | -93.110963 |
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... | 2928 | 59947 | AR | 33.631538 | -92.703611 |
Still too many results
len(df3)
8135
regions
NERC | NERC_Label | geometry | |
---|---|---|---|
index_right | |||
0 | - | Indeterminate - various NERC membership | (POLYGON ((-91.71222639199999 32.9756623780000... |
1 | FRCC | Florida Reliability Coordinating Council (FRCC) | (POLYGON ((-81.95506602699999 24.5196900770000... |
2 | MRO | Midwest Reliability Organization (MRO) | POLYGON ((-95.07148604699995 49.36451082200006... |
3 | NPCC | Northeast Power Coordinating Council (NPCC) | (POLYGON ((-73.82134384999995 40.6045004290000... |
4 | RFC | ReliabilityFirst Corporation (RFC) | (POLYGON ((-90.89359863199996 29.0467829150000... |
5 | SERC | SERC Reliability Corporation (SERC) | (POLYGON ((-91.71222639199999 32.9756623780000... |
6 | SPP | Southwest Power Pool (SPP) | (POLYGON ((-93.73310749999996 30.4137588960000... |
7 | TRE | Texas Reliability Entity (TRE) | POLYGON ((-100.0008629659999 35.02950123100004... |
8 | WECC | Western Electricity Coordinating Council (WECC) | (POLYGON ((-117.215183729 32.77737140300007, -... |
frcc = regions.loc[1, 'geometry']
serc = regions.loc[5, 'geometry']
I figured out that plant id 641 is one that shows up in both FRCC and SERC
plant_641 = geo_df.loc[geo_df['plant id'] == 641, 'geometry'].values[0]
frcc.contains(plant_641)
True
serc.contains(plant_641)
True
So 641 definitely is in both NERC regions. And apparently the regions overlap? But they aren't supposed to. And I didn't have any plants show up in multiple regions when doing the spatial join in QGIS.
frcc.intersects(serc)
True
There are 36 plants with duplicate NERC regions.
len(df2.loc[df2['plant id'].duplicated()].sort_values('plant id'))
36
pd.set_option('display.max_rows', 200)
df2.loc[df2['plant id'].duplicated(keep=False)].sort_values('plant id')
plant id | state | lat | lon | nerc | geometry | index_right | NERC | NERC_Label | |
---|---|---|---|---|---|---|---|---|---|
406 | 641 | FL | 30.566100 | -87.224400 | FRCC | POINT (-87.2244 30.5661) | 5 | SERC | SERC Reliability Corporation (SERC) |
406 | 641 | FL | 30.566100 | -87.224400 | FRCC | POINT (-87.2244 30.5661) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
407 | 642 | FL | 30.669200 | -84.886900 | FRCC | POINT (-84.8869 30.6692) | 5 | SERC | SERC Reliability Corporation (SERC) |
407 | 642 | FL | 30.669200 | -84.886900 | FRCC | POINT (-84.8869 30.6692) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
408 | 643 | FL | 30.268600 | -85.700300 | FRCC | POINT (-85.7003 30.2686) | 5 | SERC | SERC Reliability Corporation (SERC) |
408 | 643 | FL | 30.268600 | -85.700300 | FRCC | POINT (-85.7003 30.2686) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
432 | 690 | FL | 30.708571 | -84.863869 | FRCC | POINT (-84.86386899999999 30.708571) | 5 | SERC | SERC Reliability Corporation (SERC) |
432 | 690 | FL | 30.708571 | -84.863869 | FRCC | POINT (-84.86386899999999 30.708571) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
963 | 1773 | MI | 46.726100 | -88.662500 | MRO | POINT (-88.66249999999999 46.7261) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
963 | 1773 | MI | 46.726100 | -88.662500 | MRO | POINT (-88.66249999999999 46.7261) | 2 | MRO | Midwest Reliability Organization (MRO) |
964 | 1774 | MI | 46.696400 | -89.208900 | MRO | POINT (-89.2089 46.6964) | 2 | MRO | Midwest Reliability Organization (MRO) |
964 | 1774 | MI | 46.696400 | -89.208900 | MRO | POINT (-89.2089 46.6964) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
965 | 1775 | WI | 45.947200 | -88.218900 | MRO | POINT (-88.2189 45.9472) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
965 | 1775 | WI | 45.947200 | -88.218900 | MRO | POINT (-88.2189 45.9472) | 2 | MRO | Midwest Reliability Organization (MRO) |
967 | 1777 | MI | 46.131100 | -88.225000 | MRO | POINT (-88.22499999999999 46.1311) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
967 | 1777 | MI | 46.131100 | -88.225000 | MRO | POINT (-88.22499999999999 46.1311) | 2 | MRO | Midwest Reliability Organization (MRO) |
969 | 1780 | MI | 45.955300 | -88.195800 | MRO | POINT (-88.19580000000001 45.9553) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
969 | 1780 | MI | 45.955300 | -88.195800 | MRO | POINT (-88.19580000000001 45.9553) | 2 | MRO | Midwest Reliability Organization (MRO) |
970 | 1781 | MI | 45.990781 | -88.210514 | MRO | POINT (-88.210514 45.990781) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
970 | 1781 | MI | 45.990781 | -88.210514 | MRO | POINT (-88.210514 45.990781) | 2 | MRO | Midwest Reliability Organization (MRO) |
971 | 1784 | MI | 45.871900 | -88.069400 | MRO | POINT (-88.0694 45.8719) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
971 | 1784 | MI | 45.871900 | -88.069400 | MRO | POINT (-88.0694 45.8719) | 2 | MRO | Midwest Reliability Organization (MRO) |
972 | 1785 | MI | 46.159200 | -88.235300 | MRO | POINT (-88.2353 46.1592) | 2 | MRO | Midwest Reliability Organization (MRO) |
972 | 1785 | MI | 46.159200 | -88.235300 | MRO | POINT (-88.2353 46.1592) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
977 | 1821 | MI | 46.106278 | -88.334531 | MRO | POINT (-88.33453100000001 46.106278) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
977 | 1821 | MI | 46.106278 | -88.334531 | MRO | POINT (-88.33453100000001 46.106278) | 2 | MRO | Midwest Reliability Organization (MRO) |
992 | 1848 | MI | 45.788900 | -87.905800 | MRO | POINT (-87.9058 45.7889) | 2 | MRO | Midwest Reliability Organization (MRO) |
992 | 1848 | MI | 45.788900 | -87.905800 | MRO | POINT (-87.9058 45.7889) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
1978 | 4043 | WI | 44.253930 | -88.409560 | MRO | POINT (-88.40956 44.25393) | 2 | MRO | Midwest Reliability Organization (MRO) |
1978 | 4043 | WI | 44.253930 | -88.409560 | MRO | POINT (-88.40956 44.25393) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
2017 | 4122 | WI | 44.316400 | -88.314700 | MRO | POINT (-88.3147 44.3164) | 2 | MRO | Midwest Reliability Organization (MRO) |
2017 | 4122 | WI | 44.316400 | -88.314700 | MRO | POINT (-88.3147 44.3164) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
2020 | 4127 | WI | 44.200020 | -88.458171 | MRO | POINT (-88.45817099999999 44.20002) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
2020 | 4127 | WI | 44.200020 | -88.458171 | MRO | POINT (-88.45817099999999 44.20002) | 2 | MRO | Midwest Reliability Organization (MRO) |
2191 | 6192 | FL | 30.573611 | -86.215833 | FRCC | POINT (-86.21583299999999 30.573611) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
2191 | 6192 | FL | 30.573611 | -86.215833 | FRCC | POINT (-86.21583299999999 30.573611) | 5 | SERC | SERC Reliability Corporation (SERC) |
2565 | 7715 | FL | 30.592080 | -87.135391 | FRCC | POINT (-87.135391 30.59208) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
2565 | 7715 | FL | 30.592080 | -87.135391 | FRCC | POINT (-87.135391 30.59208) | 5 | SERC | SERC Reliability Corporation (SERC) |
2614 | 7822 | WI | 44.274582 | -88.319074 | MRO | POINT (-88.319074 44.274582) | 2 | MRO | Midwest Reliability Organization (MRO) |
2614 | 7822 | WI | 44.274582 | -88.319074 | MRO | POINT (-88.319074 44.274582) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
2947 | 10250 | FL | 30.266057 | -85.520670 | FRCC | POINT (-85.52067 30.266057) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
2947 | 10250 | FL | 30.266057 | -85.520670 | FRCC | POINT (-85.52067 30.266057) | 5 | SERC | SERC Reliability Corporation (SERC) |
3027 | 10416 | FL | 30.595800 | -87.252500 | FRCC | POINT (-87.2525 30.5958) | 5 | SERC | SERC Reliability Corporation (SERC) |
3027 | 10416 | FL | 30.595800 | -87.252500 | FRCC | POINT (-87.2525 30.5958) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
3434 | 50250 | FL | 30.596600 | -87.326400 | FRCC | POINT (-87.32640000000001 30.5966) | 5 | SERC | SERC Reliability Corporation (SERC) |
3434 | 50250 | FL | 30.596600 | -87.326400 | FRCC | POINT (-87.32640000000001 30.5966) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
3435 | 50251 | MI | 45.795600 | -87.955600 | MRO | POINT (-87.9556 45.7956) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
3435 | 50251 | MI | 45.795600 | -87.955600 | MRO | POINT (-87.9556 45.7956) | 2 | MRO | Midwest Reliability Organization (MRO) |
3471 | 50310 | FL | 30.473600 | -87.236100 | FRCC | POINT (-87.23609999999999 30.4736) | 5 | SERC | SERC Reliability Corporation (SERC) |
3471 | 50310 | FL | 30.473600 | -87.236100 | FRCC | POINT (-87.23609999999999 30.4736) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
3681 | 50725 | FL | 30.848333 | -87.113333 | FRCC | POINT (-87.113333 30.848333) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
3681 | 50725 | FL | 30.848333 | -87.113333 | FRCC | POINT (-87.113333 30.848333) | 5 | SERC | SERC Reliability Corporation (SERC) |
3723 | 50807 | FL | 30.141984 | -85.621103 | FRCC | POINT (-85.62110300000001 30.141984) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
3723 | 50807 | FL | 30.141984 | -85.621103 | FRCC | POINT (-85.62110300000001 30.141984) | 5 | SERC | SERC Reliability Corporation (SERC) |
4327 | 54842 | WI | 44.286900 | -88.333900 | MRO | POINT (-88.3339 44.2869) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
4327 | 54842 | WI | 44.286900 | -88.333900 | MRO | POINT (-88.3339 44.2869) | 2 | MRO | Midwest Reliability Organization (MRO) |
4352 | 54885 | WI | 44.276900 | -88.334400 | MRO | POINT (-88.3344 44.2769) | 2 | MRO | Midwest Reliability Organization (MRO) |
4352 | 54885 | WI | 44.276900 | -88.334400 | MRO | POINT (-88.3344 44.2769) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
4527 | 55135 | WI | 44.193600 | -88.506400 | MRO | POINT (-88.5064 44.1936) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
4527 | 55135 | WI | 44.193600 | -88.506400 | MRO | POINT (-88.5064 44.1936) | 2 | MRO | Midwest Reliability Organization (MRO) |
4610 | 55242 | FL | 30.566400 | -87.115000 | FRCC | POINT (-87.11499999999999 30.5664) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
4610 | 55242 | FL | 30.566400 | -87.115000 | FRCC | POINT (-87.11499999999999 30.5664) | 5 | SERC | SERC Reliability Corporation (SERC) |
5022 | 56037 | WI | 44.189036 | -88.468492 | MRO | POINT (-88.468492 44.189036) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
5022 | 56037 | WI | 44.189036 | -88.468492 | MRO | POINT (-88.468492 44.189036) | 2 | MRO | Midwest Reliability Organization (MRO) |
5397 | 56522 | FL | 30.928333 | -85.426111 | FRCC | POINT (-85.42611099999999 30.928333) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
5397 | 56522 | FL | 30.928333 | -85.426111 | FRCC | POINT (-85.42611099999999 30.928333) | 5 | SERC | SERC Reliability Corporation (SERC) |
6162 | 57502 | FL | 30.570000 | -87.390000 | FRCC | POINT (-87.39 30.57) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
6162 | 57502 | FL | 30.570000 | -87.390000 | FRCC | POINT (-87.39 30.57) | 5 | SERC | SERC Reliability Corporation (SERC) |
7160 | 58663 | WI | 44.115000 | -88.217222 | MRO | POINT (-88.21722199999999 44.115) | 2 | MRO | Midwest Reliability Organization (MRO) |
7160 | 58663 | WI | 44.115000 | -88.217222 | MRO | POINT (-88.21722199999999 44.115) | 4 | RFC | ReliabilityFirst Corporation (RFC) |
7859 | 59689 | FL | 30.515448 | -86.518407 | FRCC | POINT (-86.518407 30.515448) | 5 | SERC | SERC Reliability Corporation (SERC) |
7859 | 59689 | FL | 30.515448 | -86.518407 | FRCC | POINT (-86.518407 30.515448) | 1 | FRCC | Florida Reliability Coordinating Council (FRCC) |
facility_df.head()
f | fuel | month | plant id | total fuel (mmbtu) | year | generation (MWh) | elec fuel (mmbtu) | geography | last_updated | lat | lon | prime mover | datetime | quarter | all fuel fossil CO2 (kg) | elec fuel fossil CO2 (kg) | all fuel total CO2 (kg) | elec fuel total CO2 (kg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | M | SUB | 6 | 10360 | 0.0 | 2017 | 0.0 | 0.0 | USA-WI | 2017-08-24T11:46:12-04:00 | 44.4936 | -88.0303 | ALL | 2017-06-01 | 2 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | M | SUB | 5 | 10360 | 0.0 | 2017 | 0.0 | 0.0 | USA-WI | 2017-08-24T11:46:12-04:00 | 44.4936 | -88.0303 | ALL | 2017-05-01 | 2 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | M | SUB | 4 | 10360 | 0.0 | 2017 | 0.0 | 0.0 | USA-WI | 2017-08-24T11:46:12-04:00 | 44.4936 | -88.0303 | ALL | 2017-04-01 | 2 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | M | SUB | 3 | 10360 | 0.0 | 2017 | 0.0 | 0.0 | USA-WI | 2017-08-24T11:46:12-04:00 | 44.4936 | -88.0303 | ALL | 2017-03-01 | 1 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | M | SUB | 2 | 10360 | 0.0 | 2017 | 0.0 | 0.0 | USA-WI | 2017-08-24T11:46:12-04:00 | 44.4936 | -88.0303 | ALL | 2017-02-01 | 1 | 0.0 | 0.0 | 0.0 | 0.0 |
plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon']]
plants.drop_duplicates(inplace=True)
plants.groupby(['plant id']).max().hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1165f8ef0>]], dtype=object)
path = join(data_path, 'EIA downloads', 'EIA-860 2015', '2___Plant_Y2015.xlsx')
nercs2015 = pd.read_excel(path, skiprows=0, parse_cols='C,L')
nercs2015.columns = ['plant id', 'nerc']
path = join(data_path, 'EIA downloads', 'eia8602012', 'PlantY2012.xlsx')
nercs2012 = pd.read_excel(path, skiprows=0, parse_cols='B,J')
nercs2012.columns = ['plant id', 'nerc']
nercs2012.head()
plant id | nerc | |
---|---|---|
0 | 10867 | SERC |
1 | 50903 | RFC |
2 | 10671 | SPP |
3 | 2527 | NPCC |
4 | 3305 | SERC |
nercs = pd.concat([nercs2012, nercs2015]).drop_duplicates()
len(nercs), len(nercs2012), len(nercs2015)
(9178, 7289, 8928)
df = pd.merge(plants, nercs, on=['plant id'], how='left')
plants.tail()
plant id | year | lat | lon | |
---|---|---|---|---|
1544548 | 1275 | 2006 | 38.947222 | -99.528611 |
1544560 | 1275 | 2005 | 38.947222 | -99.528611 |
1544572 | 1275 | 2004 | 38.947222 | -99.528611 |
1544584 | 1275 | 2003 | 38.947222 | -99.528611 |
1544596 | 1275 | 2002 | 38.947222 | -99.528611 |
nercs.tail()
plant id | nerc | |
---|---|---|
8923 | 60510 | SERC |
8924 | 60511 | WECC |
8925 | 60514 | SERC |
8926 | 60550 | SERC |
8927 | 60571 | WECC |
nercs.nerc.unique()
array(['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'HICC', 'ASCC', 'FRCC', nan], dtype=object)
df.tail()
plant id | year | lat | lon | nerc | |
---|---|---|---|---|---|
89242 | 1275 | 2006 | 38.947222 | -99.528611 | SPP |
89243 | 1275 | 2005 | 38.947222 | -99.528611 | SPP |
89244 | 1275 | 2004 | 38.947222 | -99.528611 | SPP |
89245 | 1275 | 2003 | 38.947222 | -99.528611 | SPP |
89246 | 1275 | 2002 | 38.947222 | -99.528611 | SPP |
len(df.loc[df.isnull().any(axis=1), 'plant id'].unique())
88
from sklearn import neighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
nercs = ['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'FRCC']
df_slim = df.loc[df.nerc.isin(nercs), ['plant id', 'lat', 'lon', 'nerc']].dropna(subset=['lon']).drop_duplicates()
unknown = df_slim.loc[df_slim.nerc.isnull()]
n_neighbors = 10
X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']
X_scale = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
y.unique()
array(['MRO', 'WECC', 'NPCC', 'SERC', 'SPP', 'RFC', 'FRCC', 'TRE'], dtype=object)
knn = neighbors.KNeighborsClassifier()
params = {'weights': ['uniform', 'distance'],
'n_neighbors': [10, 15, 20, 30],
'leaf_size': [3, 5, 10, 30],
'p': [1, 2]}
clf = GridSearchCV(knn, params, n_jobs=-1)
clf.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise', estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'), fit_params={}, iid=True, n_jobs=-1, param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [10, 15, 20, 30], 'leaf_size': [3, 5, 10, 30], 'p': [1, 2]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring=None, verbose=0)
clf.best_estimator_, clf.best_params_, clf.best_score_
(KNeighborsClassifier(algorithm='auto', leaf_size=3, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=15, p=1, weights='distance'), {'leaf_size': 3, 'n_neighbors': 15, 'p': 1, 'weights': 'distance'}, 0.96590695046148045)
No difference in score when X is preprocessed with StandardScaler
clf.score(X_scale, y)
0.98498422712933753
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights='uniform')
clf.score(X_test, y_test)
0.95693087224031848
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights='distance')
clf.score(X_test, y_test)
0.95946435034382915
h = .02 # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
# we create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = %i, weights = '%s')"
% (n_neighbors, weights))
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~/anaconda/envs/py36/lib/python3.6/site-packages/numpy/ma/core.py in min(obj, axis, out, fill_value, keepdims) 6441 try: -> 6442 return obj.min(axis=axis, fill_value=fill_value, out=out, **kwargs) 6443 except (AttributeError, TypeError): TypeError: _amin() got an unexpected keyword argument 'fill_value' During handling of the above exception, another exception occurred: AttributeError Traceback (most recent call last) <ipython-input-95-cf8de1f4ef1e> in <module>() 21 Z = Z.reshape(xx.shape) 22 plt.figure() ---> 23 plt.pcolormesh(xx, yy, Z, cmap=cmap_light) 24 25 # Plot also the training points ~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/pyplot.py in pcolormesh(*args, **kwargs) 3242 mplDeprecation) 3243 try: -> 3244 ret = ax.pcolormesh(*args, **kwargs) 3245 finally: 3246 ax._hold = washold ~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs) 1896 warnings.warn(msg % (label_namer, func.__name__), 1897 RuntimeWarning, stacklevel=2) -> 1898 return func(ax, *args, **kwargs) 1899 pre_doc = inner.__doc__ 1900 if pre_doc is None: ~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/axes/_axes.py in pcolormesh(self, *args, **kwargs) 5601 collection.set_norm(norm) 5602 collection.set_clim(vmin, vmax) -> 5603 collection.autoscale_None() 5604 5605 self.grid(False) ~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/cm.py in autoscale_None(self) 343 if self._A is None: 344 raise TypeError('You must first set_array for mappable') --> 345 self.norm.autoscale_None(self._A) 346 self.changed() 347 ~/anaconda/envs/py36/lib/python3.6/site-packages/matplotlib/colors.py in autoscale_None(self, A) 887 ' autoscale only None-valued vmin or vmax' 888 if self.vmin is None and np.size(A) > 0: --> 889 self.vmin = ma.min(A) 890 if self.vmax is None and np.size(A) > 0: 891 self.vmax = ma.max(A) ~/anaconda/envs/py36/lib/python3.6/site-packages/numpy/ma/core.py in min(obj, axis, out, fill_value, keepdims) 6445 # fill_value argument 6446 return asanyarray(obj).min(axis=axis, fill_value=fill_value, -> 6447 out=out, **kwargs) 6448 min.__doc__ = MaskedArray.min.__doc__ 6449 ~/anaconda/envs/py36/lib/python3.6/site-packages/numpy/ma/core.py in min(self, axis, out, fill_value, keepdims) 5592 if out is None: 5593 result = self.filled(fill_value).min( -> 5594 axis=axis, out=out, **kwargs).view(type(self)) 5595 if result.ndim: 5596 # Set the mask AttributeError: 'str' object has no attribute 'view'