#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import geopandas as gpd
# needed below for building point GeoDataFrames
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
from os.path import join
import pandas as pd
import fiona
# from cartopy import crs as ccrs
# from urllib.request import urlopen
# from zipfile import ZipFile
# from io import BytesIO
# from pathlib import Path

sns.set(style='white')

cwd = os.getcwd()
data_path = join(cwd, '..', '..', 'Data storage')
figure_path = join(cwd, '..', '..', 'Figures')

file_date = '2018-03-06'


# In[3]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-v -p pandas,geopandas,shapely')


# ## Read NERC regions shapefile
# [Link to the shapefile](https://github.com/gschivley/Index-variability/raw/master/Data%20storage/NERC_Regions_EIA) for local download

# In[3]:

# EIA NERC region shapefile, which has an "Indeterminate" region
# path = os.path.join(data_path, 'NERC_Regions_EIA', 'NercRegions_201610.shp')
# regions = gpd.read_file(path)
# regions.crs


# In[149]:

path = os.path.join(data_path, 'nercregions', 'NERCregions.shp')
regions_nerc = gpd.read_file(path)
regions_nerc['nerc'] = regions_nerc['NERCregion']


# In[150]:

regions_nerc


# In[151]:

regions_nerc.boundary.plot()


# Now using NERC region shapefiles [created by DHS](https://hifld-dhs-gii.opendata.arcgis.com/datasets/12582592aeb842f5a7aab556a57a10d1_0)

# In[133]:

path = os.path.join(data_path, 'NERC_Regions', 'NERC_Regions.shp')
regions = gpd.read_file(path)
regions.columns = regions.columns.str.lower()
# Region names look like "... (WECC)"; pull the abbreviation out of the parentheses
regions['nerc'] = regions['name'].str.split().str[-1].str.strip('()')
# regions = regions.loc[:, ['nerc', 'geometry']]
regions.head()


# In[136]:

regions.loc[regions['nerc']=='WECC'].plot(cmap='tab20', alpha=0.5)


# In[ ]:

regions.plot()


# In[132]:

regions.to_crs(epsg=2163).plot(cmap='tab20', alpha=0.5, linewidth=0.5,
                               edgecolor='0.1')
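# `dissolve` in the next cell groups rows by 'nerc' and takes the unary union
# of each group's geometries, which is why it is slow for polygons this
# detailed. A rough hand-rolled equivalent, just as a sketch (`regions_manual`
# is a hypothetical name and isn't used below):

# In[ ]:

from shapely.ops import unary_union

merged = regions.groupby('nerc')['geometry'].apply(
    lambda geoms: unary_union(list(geoms)))
regions_manual = gpd.GeoDataFrame({'nerc': merged.index,
                                   'geometry': merged.values},
                                  crs=regions.crs)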
# In[6]:

# This is slow!
regions_nerc = regions.dissolve(by='nerc', as_index=False)


# In[8]:

regions_nerc.to_crs(epsg=2163).plot()


# In[ ]:

path = join(data_path, 'NERC_Regions', 'Aggregated NERC Regions.geojson')
regions_nerc.to_file(path, driver='GeoJSON')


# In[5]:

path = join(data_path, 'NERC_Regions', 'Aggregated NERC Regions.geojson')
regions_nerc = gpd.read_file(path, driver='GeoJSON')


# In[4]:

path = os.path.join(data_path, 'cb_2016_us_state_20m',
                    'cb_2016_us_state_20m.shp')
states = gpd.read_file(path)
states.crs


# In[6]:

drop_states = ['Alaska', 'Hawaii', 'Puerto Rico']
states = states.loc[~states['NAME'].isin(drop_states)]


# In[7]:

states.to_crs(epsg=2163).plot()


# In[8]:

path = join(data_path, 'final NERC data',
            'Summary table {}.csv'.format(file_date))
index = pd.read_csv(path)
index


# In[152]:

# Attach the 2001/2017 intensities and the percent reduction from the summary
# table to each region polygon. Integer column names (2001, 2017) are used so
# they can be passed straight to the plot calls below.
for nerc in regions_nerc['nerc'].unique():
    try:
        val_2017 = index.loc[index['nerc']==nerc, '2017'].values[0]
        val_2001 = index.loc[index['nerc']==nerc, '2001'].values[0]
        reduce = index.loc[index['nerc']==nerc, 'Percent Reduction'].values[0]

        regions_nerc.loc[regions_nerc['nerc']==nerc, 2017] = val_2017
        regions_nerc.loc[regions_nerc['nerc']==nerc, 2001] = val_2001
        regions_nerc.loc[regions_nerc['nerc']==nerc,
                         'reduction'] = '{:.0%}'.format(reduce)
        regions_nerc.loc[regions_nerc['nerc']==nerc, 'reduction value'] = reduce
    except IndexError:
        # region missing from the summary table
        pass


# In[153]:

regions_nerc


# In[154]:

regions_albers = regions_nerc.to_crs(epsg=2163)
states_albers = states.to_crs(epsg=2163)


# In[155]:

regions_albers.plot(column=2001, cmap='cividis_r', edgecolor='0.1',
                    linewidth=1)
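# The `usa_2001` and `usa_2017` values used in the map titles below are never
# defined in this notebook. A minimal sketch, assuming the summary table
# includes a national 'USA' row with the same year columns:

# In[ ]:

usa_2001 = index.loc[index['nerc'] == 'USA', '2001'].values[0]
usa_2017 = index.loc[index['nerc'] == 'USA', '2017'].values[0]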
# In[169]:

def plot_nerc_annual(regions_proj, states_proj, data_col, text_col,
                     cmap='cividis_r', vmin=None, vmax=None, title=None,
                     cbar_title=None, **kwargs):
    """Choropleth of NERC regions with state outlines and region labels.

    Label offsets (degrees, converted to projected coordinates with
    conv_lon/conv_lat below) can be tweaked through **kwargs.
    """
    states_ec = kwargs.get('states_ec', '0.6')
    regions_ec = kwargs.get('regions_ec', '0.2')
    regions_lw = kwargs.get('regions_lw', 0.75)
    font_size = kwargs.get('font_size', 9)
    bbox_alpha = kwargs.get('bbox_alpha', 0.7)
    FRCC_x = kwargs.get('FRCC_x', 4.75)
    SERC_x = kwargs.get('SERC_x', 2)
    SERC_y = kwargs.get('SERC_y', -2)
    SPP_y = kwargs.get('SPP_y', 1.75)
    RFC_y = kwargs.get('RFC_y', -0.5)

    fig, ax = plt.subplots(figsize=(8, 3.5))
    # set aspect to equal. This is done automatically
    # when using *geopandas* plot on its own, but not when
    # working with pyplot directly.
    ax.set_aspect('equal')

    regions_proj.plot(ax=ax, column=data_col, cmap=cmap, legend=True,
                      vmin=vmin, vmax=vmax)
    states_proj.plot(ax=ax, color='none', edgecolor=states_ec)
    regions_proj.plot(ax=ax, color='none', edgecolor=regions_ec,
                      linewidth=regions_lw)
    # plt.text(x=1.1, y=1.01, s=cbar_title, transform=ax.transAxes,
    #          ha='center', va='bottom', fontdict={'size':font_size})
    plt.title(title)

    for point, nerc in zip(regions_proj.centroid, regions_proj['nerc'].values):
        text = regions_proj.loc[regions_proj['nerc']==nerc, text_col].values[0]
        x = point.x
        y = point.y
        if nerc == 'FRCC':
            x = x + conv_lon(FRCC_x)
            y = y - conv_lat(1)
            rot = -67
            plt.text(x, y, text, ha='center', va='center', rotation=rot,
                     fontdict={'size':font_size})
        elif nerc == 'NPCC':
            x = x - conv_lon(1.5)
            y = y + conv_lat(2.1)
            plt.text(x, y, text, ha='center', fontdict={'size':font_size})
        elif nerc == 'SERC':
            x = x + conv_lon(SERC_x)
            y = y + conv_lat(SERC_y)
            plt.text(x, y, text, ha='center', va='center',
                     bbox=dict(facecolor='white', alpha=bbox_alpha,
                               boxstyle="square"),
                     fontdict={'size':font_size})
        elif nerc == 'RFC':
            # x = x + conv_lon(RFC_x)
            y = y + conv_lat(RFC_y)
            plt.text(x, y, text, ha='center', va='center',
                     bbox=dict(facecolor='white', alpha=bbox_alpha,
                               boxstyle="square"),
                     fontdict={'size':font_size})
        elif nerc == 'SPP':
            # x = x + 2
            y = y + conv_lat(SPP_y)
            plt.text(x, y, text, ha='center', va='center',
                     bbox=dict(facecolor='white', alpha=bbox_alpha,
                               boxstyle="square"),
                     fontdict={'size':font_size})
        else:
            plt.text(x, y, text, ha='center', va='center',
                     bbox=dict(facecolor='white', alpha=bbox_alpha,
                               boxstyle="square"),
                     fontdict={'size':font_size})

    sns.despine(left=True, bottom=True)
    ax.set_yticklabels(labels=[])
    ax.set_xticklabels(labels=[])
    cax = fig.get_axes()[-1]
    cax.set_title(cbar_title, fontdict={'size':font_size})


# In[16]:

# https://gist.github.com/springmeyer/871897
# Approximate degree offsets as projected distances for nudging labels.
def conv_lon(x):
    newx = x * 20037508.34 / 180
    return newx

def conv_lat(y):
    newy = np.log(np.tan((90 + y) * np.pi / 360)) / (np.pi / 180)
    newy *= 20037508.34 / 180
    return newy


# ### Maps of 2001 and 2017 annual values

# In[170]:

title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
kwargs = dict(
    regions_lw = 1,
    regions_ec = '0.1',
    SERC_y = -1.5,
    SPP_y = 2.25
)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()

plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers,
                 data_col=2001, text_col='nerc', vmin=vmin, vmax=vmax,
                 title=title, cbar_title='g CO$_2$/kWh', **kwargs)
path = join(figure_path, 'NERC map_cividis_2001.png')
plt.savefig(path, bbox_inches='tight', dpi=350)


# In[171]:

title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
kwargs = dict(
    regions_lw = 1,
    regions_ec = '0.1',
    SERC_y = -1.5,
    SPP_y = 2.25
)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']

plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers,
                 data_col=2017, text_col='arrow reduction', vmin=vmin,
                 vmax=vmax, title=title, cbar_title='g CO$_2$/kWh', **kwargs)
path = join(figure_path, 'NERC map_cividis_2017_change.png')
plt.savefig(path, bbox_inches='tight', dpi=350)


# In[443]:

title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()

plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers,
                 data_col=2001, text_col='nerc', vmin=vmin, vmax=vmax,
                 cmap='Reds', title=title, cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_reds_2001.png')
plt.savefig(path, bbox_inches='tight', dpi=350)


# In[444]:

title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmin = regions_albers.loc[:, [2001, 2017]].min().min()
vmax = regions_albers.loc[:, [2001, 2017]].max().max()
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']

plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers,
                 data_col=2017, text_col='arrow reduction', vmin=vmin,
                 vmax=vmax, cmap='Reds', title=title,
                 cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_reds_2017_change.png')
plt.savefig(path, bbox_inches='tight', dpi=350)


# ### Maps of difference from national average
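# The '2001 diff' and '2017 diff' columns mapped below aren't created anywhere
# in this notebook. A sketch of the presumed calculation: each region's annual
# intensity minus the national average (using the `usa_2001`/`usa_2017` values
# assumed above):

# In[ ]:

regions_albers['2001 diff'] = regions_albers[2001] - usa_2001
regions_albers['2017 diff'] = regions_albers[2017] - usa_2017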
# In[448]:

title = '2001 U.S. Average\n{} g CO$_2$/kWh'.format(usa_2001)
# use the 2017 diff range so the 2001 and 2017 maps share color bounds
vmax = max(abs(regions_albers['2017 diff'].min()),
           regions_albers['2017 diff'].max())
vmin = -vmax

plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers,
                 data_col='2001 diff', text_col='nerc', cmap='PRGn_r',
                 vmin=vmin, vmax=vmax, title=title,
                 cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_2001_2017_bounds.png')
plt.savefig(path, bbox_inches='tight', dpi=350)


# In[446]:

title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmax = max(abs(regions_albers['2017 diff'].min()),
           regions_albers['2017 diff'].max())
vmin = -vmax
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']

plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers,
                 data_col='2017 diff', text_col='arrow reduction',
                 cmap='PRGn_r', vmin=vmin, vmax=vmax, title=title,
                 cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_2017.png')
plt.savefig(path, bbox_inches='tight', dpi=350)


# In[447]:

title = '2017 U.S. Average\n{} g CO$_2$/kWh (↓ 30%)'.format(usa_2017)
vmax = max(abs(regions_albers['2017 diff'].min()),
           regions_albers['2017 diff'].max())
vmin = -vmax
regions_albers['arrow reduction'] = '↓ ' + regions_albers['reduction']

plot_nerc_annual(regions_proj=regions_albers, states_proj=states_albers,
                 data_col='2017 diff', text_col='arrow reduction',
                 cmap='BrBG_r', vmin=vmin, vmax=vmax, title=title,
                 cbar_title='g CO$_2$/kWh')
path = join(figure_path, 'NERC map_diverging_BrBG_2017.png')
plt.savefig(path, bbox_inches='tight', dpi=350)


# ## Read file with facility location, generation, and fuel data

# In[12]:

path = os.path.join(data_path, 'Facility gen fuels and CO2 2017-08-31.zip')
facility_df = pd.read_csv(path)


# ### Drop plants without lat/lon data
# From 2001 through mid-2017, 9 facilities (out of 8,435) don't have valid
# lat/lon data. All of them are from 2010 or earlier. Drop these rows or the
# spatial join fails. The amount of generation from these plants is tiny -
# well under 0.05% in all years.
# In[7]:

print(len(facility_df['plant id'].unique()), 'total plants')
print(len(facility_df.loc[facility_df['lat'].isnull(), 'plant id'].unique()),
      'plants without lat/lon')


# In[8]:

years = facility_df.loc[facility_df['lat'].isnull(), 'year'].unique()
for year in years:
    total_gen = facility_df.loc[facility_df['year'] == year,
                                'generation (MWh)'].sum()

    # Plant ids with no 'lat' in year
    no_loc_plants = facility_df.loc[(facility_df['lat'].isnull()) &
                                    (facility_df['year'] == year),
                                    'plant id'].unique()

    no_loc_gen = facility_df.loc[(facility_df['year'] == year) &
                                 (facility_df['plant id'].isin(no_loc_plants)),
                                 'generation (MWh)'].sum()

    percent_dropped = no_loc_gen / total_gen * 100
    print('In {}, {:.3f}% of generation is from plants without lat/lon'
          .format(year, percent_dropped))


# In[9]:

facility_df.dropna(inplace=True, subset=['lat', 'lon'])


# In[10]:

facility_df.columns


# Because I have monthly data for every facility from 2001-2017, there are
# lots of duplicate rows. No need to do a spatial join on every row. Just keep
# one instance of each facility in each year.

# In[11]:

cols = ['lat', 'lon', 'plant id', 'year']
small_facility = facility_df.loc[:, cols].drop_duplicates()


# Use `Point` from Shapely to create the `geometry` list of facility
# locations. `crs` is the coordinate reference system that translates lat/lon
# into a specific map projection.

# In[12]:

geometry = [Point(xy) for xy in zip(small_facility.lon, small_facility.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(small_facility, crs=crs, geometry=geometry)


# In[13]:

geo_df.head()


# In[14]:

len(geo_df)


# ## Spatial join of NERC dataframe with polygons and facility dataframe with points
# Joining the 9 regions (NERC) with 100,810 records takes much longer with
# facilities as the left dataframe in the join (2 min vs 12 seconds). Not
# quite sure why this is. The faster method leaves me with polygons rather
# than points, though. There might be a better way to rectify this, but I
# just create a new geodataframe with the geometry set as the lat/lon points.
#
# **EDIT**
# Although the GeoPandas documentation says that the `op` parameter doesn't
# matter for point-in-polygon operations, using 'within' made the operation
# much faster (12 seconds). The first method below is probably preferable
# since it keeps the point geometry.
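# A quick way to check those timings locally (a hypothetical cell; numbers
# will vary by machine and GeoPandas version):

# In[ ]:

get_ipython().run_line_magic(
    'timeit', "gpd.sjoin(geo_df, regions, how='inner', op='within')")
get_ipython().run_line_magic(
    'timeit', "gpd.sjoin(regions, geo_df, how='inner')")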
# **Method 1** (slow unless you use `op='within'`)

# In[15]:

facility_nerc = gpd.sjoin(geo_df, regions, how='inner', op='within')


# In[16]:

facility_nerc.head()


# **Method 2** (faster when using the default operation)

# In[10]:

# polygons on the left, points on the right
facility_nerc = gpd.sjoin(regions, geo_df, how="inner")


# In[11]:

facility_nerc.head()


# Make the new geometry of facility locations

# In[12]:

geometry = [Point(xy) for xy in zip(facility_nerc.lon, facility_nerc.lat)]


# Create new dataframe with the data I want to keep and the new geometry

# In[13]:

crs = {'init': 'epsg:4326'}
keep_cols = ['NERC_Label', 'plant id', 'year']
facility_nerc = GeoDataFrame(facility_nerc[keep_cols], crs=crs,
                             geometry=geometry)


# In[14]:

facility_nerc.head()


# ## Changing the crs to see what happens
# Just making sure that it does something

# In[31]:

df_test = facility_nerc.to_crs({'init': 'epsg:3395'})


# In[32]:

facility_nerc.plot()


# In[33]:

df_test.plot()


# ## NEW - read file with only plant id, state, and lat/lon
# [Link to data file](https://github.com/gschivley/Index-variability/raw/master/Data%20storage/Facility%20labels/Facility%20locations.csv)
# If you need to load the shapefile, [download it here](https://github.com/gschivley/Index-variability/raw/master/Data%20storage/NERC_Regions_EIA).

# In[ ]:

# If loading the shapefile
shape_path = ''
regions = gpd.read_file(shape_path)


# In[3]:

path = join(data_path, 'Facility labels', 'Facility locations.csv')
location = pd.read_csv(path)


# In[20]:

len(location)


# The `nerc` column is labels from a spatial join using this same shapefile
# and lat/lon data in QGIS.

# In[21]:

location.head()


# In[22]:

location.loc[location['lat'].isnull()]


# In[4]:

geometry = [Point(xy) for xy in zip(location.lon, location.lat)]
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(location, crs=crs, geometry=geometry)


# In[5]:

geo_df.head()


# **Every plant id is unique**

# In[24]:

len(geo_df)


# In[25]:

len(geo_df['plant id'].unique())


# ### Method 1 - use defaults

# In[26]:

df1 = gpd.sjoin(regions, geo_df)


# In[27]:

df1.head()


# We can already see that there are more rows in this dataframe than there
# were power plants.

# In[28]:

len(df1)


# ### Method 2 - use 'within'

# In[6]:

df2 = gpd.sjoin(geo_df, regions, op='within')


# In[31]:

df2.head()


# Same size dataframe as from Method 1. Yet again, there appear to be extra
# results.

# In[26]:

len(df2)


# ### Method 3 - use 'contains'

# In[39]:

df3 = gpd.sjoin(regions, geo_df, op='contains')


# In[40]:

df3.head()


# Still too many results

# In[41]:

len(df3)


# ### What facilities are missing?

# In[32]:

regions


# In[23]:

frcc = regions.loc[1, 'geometry']
serc = regions.loc[5, 'geometry']


# I figured out that plant id 641 is one that shows up in both FRCC and SERC

# In[27]:

plant_641 = geo_df.loc[geo_df['plant id'] == 641, 'geometry'].values[0]


# In[28]:

frcc.contains(plant_641)


# In[29]:

serc.contains(plant_641)


# So plant 641 definitely is in both NERC regions. And apparently the regions
# overlap? But they aren't supposed to. And I didn't have any plants show up
# in multiple regions when doing the spatial join in QGIS.

# In[37]:

frcc.intersects(serc)
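# To quantify the apparent overlap between the two polygons (a quick sketch
# using the `frcc` and `serc` geometries above; areas are in squared degrees,
# so only the relative share is meaningful):

# In[ ]:

overlap = frcc.intersection(serc)
print('Overlap as share of FRCC area: {:.2%}'.format(overlap.area / frcc.area))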
# There are 36 plants with duplicate NERC regions.

# In[38]:

len(df2.loc[df2['plant id'].duplicated()].sort_values('plant id'))


# In[10]:

pd.set_option('display.max_rows', 200)


# In[11]:

df2.loc[df2['plant id'].duplicated(keep=False)].sort_values('plant id')


# In[17]:

facility_df.head()


# In[49]:

plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon']]
plants.drop_duplicates(inplace=True)


# In[24]:

plants.groupby(['plant id']).max().hist()


# In[116]:

path = join(data_path, 'EIA downloads', 'EIA-860 2015', '2___Plant_Y2015.xlsx')
nercs2015 = pd.read_excel(path, skiprows=0, usecols='C,L')
nercs2015.columns = ['plant id', 'nerc']

path = join(data_path, 'EIA downloads', 'eia8602012', 'PlantY2012.xlsx')
nercs2012 = pd.read_excel(path, skiprows=0, usecols='B,J')
nercs2012.columns = ['plant id', 'nerc']


# In[117]:

nercs2012.head()


# In[118]:

nercs = pd.concat([nercs2012, nercs2015]).drop_duplicates()


# In[119]:

len(nercs), len(nercs2012), len(nercs2015)


# In[120]:

df = pd.merge(plants, nercs, on=['plant id'], how='left')


# In[121]:

plants.tail()


# In[122]:

nercs.tail()


# In[123]:

nercs.nerc.unique()


# In[67]:

df.tail()


# In[63]:

len(df.loc[df.isnull().any(axis=1), 'plant id'].unique())


# In[148]:

from sklearn import neighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np


# In[124]:

# NB: this reuses the name `nercs`, shadowing the dataframe above
nercs = ['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'FRCC']


# In[125]:

# Keep plants with a known region and those missing one; the null-nerc rows
# are what the classifier below is meant to label.
df_slim = (df.loc[df.nerc.isin(nercs) | df.nerc.isnull(),
                  ['plant id', 'lat', 'lon', 'nerc']]
             .dropna(subset=['lon'])
             .drop_duplicates())


# In[126]:

unknown = df_slim.loc[df_slim.nerc.isnull()]


# In[69]:

n_neighbors = 10


# In[155]:

X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']

X_scale = StandardScaler().fit_transform(X)


# In[164]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)


# In[128]:

y.unique()


# In[165]:

knn = neighbors.KNeighborsClassifier()


# In[ ]:

params = {'weights': ['uniform', 'distance'],
          'n_neighbors': [10, 15, 20, 30],
          'leaf_size': [3, 5, 10, 30],
          'p': [1, 2]}
clf = GridSearchCV(knn, params, n_jobs=-1)


# In[160]:

clf.fit(X_train, y_train)


# In[161]:

clf.best_estimator_, clf.best_params_, clf.best_score_


# No difference in score when X is preprocessed with `StandardScaler`

# In[163]:

clf.score(X_scale, y)


# In[102]:

clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf.fit(X_train, y_train)


# In[103]:

clf.score(X_test, y_test)


# In[104]:

clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf.fit(X_train, y_train)


# In[105]:

clf.score(X_test, y_test)
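# Accuracy alone hides which regions get confused with each other. A quick
# per-region confusion matrix for the last fitted classifier (a sketch; not
# in the original notebook):

# In[ ]:

from sklearn.metrics import confusion_matrix

preds = clf.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, preds, labels=clf.classes_),
             index=clf.classes_, columns=clf.classes_)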
# In[95]:

h = .02  # step size in the mesh

# Create color maps (from the sklearn example this cell is adapted from;
# only 3 colors, so the 8 NERC regions will share colors)
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# The mesh/scatter calls need numeric arrays, so pull out the values and
# encode the string region labels as integer codes.
X_arr = X.values
y_cat = pd.Categorical(y)

for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X_arr[:, 0].min() - 1, X_arr[:, 0].max() + 1
    y_min, y_max = X_arr[:, 1].min() - 1, X_arr[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = pd.Categorical(Z, categories=y_cat.categories).codes

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X_arr[:, 0], X_arr[:, 1], c=y_cat.codes, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("NERC region classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
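# The `unknown` dataframe created above (plants with no NERC label) is never
# actually classified in the notebook. A sketch of the presumably intended
# final step, using the last fitted classifier (the grid-searched model could
# be substituted):

# In[ ]:

if len(unknown) > 0:
    unknown = unknown.copy()
    unknown['nerc'] = clf.predict(unknown[['lat', 'lon']])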