This setup downloads all the zip files, extracts the contents, and identifies the correct header row in the correct file. I'm only getting 2 columns of data (plant id and NERC region), but it can be modified for other data.
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib import urlretrieve
import zipfile
import fnmatch
# Scrape the EIA-923 landing page and collect one (filename, url) pair
# per yearly zip archive.
url = 'https://www.eia.gov/electricity/data/eia923'
r = requests.get(url)
r.raise_for_status()  # fail loudly on a bad HTTP response instead of parsing an error page
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find_all('table', attrs={'class': 'simpletable'})[0]
fns = []
links = []
for row in table.find_all('td', attrs={'align': 'center'}):
    href = row.a.get('href')
    fns.append(href.split('/')[-1])  # e.g. 'f923_2017.zip'
    links.append(url + '/' + href)
fns
['f923_2017.zip', 'f923_2016.zip', 'f923_2015.zip', 'f923_2014.zip', 'f923_2013.zip', 'f923_2012.zip', 'f923_2011.zip', 'f923_2010.zip', 'f923_2009.zip', 'f923_2008.zip', 'f906920_2007.zip', 'f906920_2006.zip', 'f906920_2005.zip', 'f906920_2004.zip', 'f906920_2003.zip', 'f906920_2002.zip', 'f906920_2001.zip']
# Create the raw-data directory (including any missing parents). The
# original `os.mkdir` crashed if 'Data storage' did not exist or if the
# script was re-run; guarding with isdir makes the setup idempotent.
base_path = os.path.join('Data storage', '923 raw data')
if not os.path.isdir(base_path):
    os.makedirs(base_path)
# Download each yearly zip archive into the raw-data directory.
# `urlretrieve` moved to urllib.request in Python 3; the fallback keeps
# the script runnable on both interpreters (the file-level import is
# Python-2-only).
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2
for fn, link in zip(fns, links):
    path = os.path.join(base_path, fn)
    urlretrieve(link, filename=path)
('Data storage/923 raw data/f923_2017.zip', <httplib.HTTPMessage instance at 0x10ecbb9e0>)
('Data storage/923 raw data/f923_2016.zip', <httplib.HTTPMessage instance at 0x10ecc07e8>)
('Data storage/923 raw data/f923_2015.zip', <httplib.HTTPMessage instance at 0x10ecc0b00>)
('Data storage/923 raw data/f923_2014.zip', <httplib.HTTPMessage instance at 0x10ecc0e18>)
('Data storage/923 raw data/f923_2013.zip', <httplib.HTTPMessage instance at 0x10ecbb998>)
('Data storage/923 raw data/f923_2012.zip', <httplib.HTTPMessage instance at 0x10ecc0950>)
('Data storage/923 raw data/f923_2011.zip', <httplib.HTTPMessage instance at 0x10ecc0c68>)
('Data storage/923 raw data/f923_2010.zip', <httplib.HTTPMessage instance at 0x10ecc0ab8>)
('Data storage/923 raw data/f923_2009.zip', <httplib.HTTPMessage instance at 0x10ecc0dd0>)
('Data storage/923 raw data/f923_2008.zip', <httplib.HTTPMessage instance at 0x10ecbb6c8>)
('Data storage/923 raw data/f906920_2007.zip', <httplib.HTTPMessage instance at 0x10ecc08c0>)
('Data storage/923 raw data/f906920_2006.zip', <httplib.HTTPMessage instance at 0x10ecc09e0>)
('Data storage/923 raw data/f906920_2005.zip', <httplib.HTTPMessage instance at 0x10ecc3248>)
('Data storage/923 raw data/f906920_2004.zip', <httplib.HTTPMessage instance at 0x10ecc3368>)
('Data storage/923 raw data/f906920_2003.zip', <httplib.HTTPMessage instance at 0x10ecc3488>)
('Data storage/923 raw data/f906920_2002.zip', <httplib.HTTPMessage instance at 0x10ecc3a28>)
('Data storage/923 raw data/f906920_2001.zip', <httplib.HTTPMessage instance at 0x10ecc3b48>)
# Unzip every downloaded archive into a folder named after the zip file
# (minus its extension) under the raw-data directory.
base_path = os.path.join('Data storage', '923 raw data')
for archive_name in fns:
    archive_path = os.path.join(base_path, archive_name)
    extract_dir = os.path.join(base_path, archive_name.split('.')[0])
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_dir)
# Walk the extracted folders and collect the one data workbook per year.
# The three patterns cover the naming schemes used across 2001-2017.
# A single filename can satisfy more than one pattern (fnmatch is
# case-insensitive on Windows, so 'EIA923_Schedules_2_3...' would match
# both 'eia923*' and '*2_3*'); checking each file once against all
# patterns prevents duplicate entries.
patterns = ['*2_3*', 'eia923*', '*906920*.xls']
matches = []
for root, dirnames, filenames in os.walk(base_path):
    for filename in filenames:
        if any(fnmatch.fnmatch(filename, pattern) for pattern in patterns):
            full_path = os.path.join(root, filename)
            if full_path not in matches:
                matches.append(full_path)
matches
['Data storage/923 raw data/f906920_2001/f906920y2001.xls', 'Data storage/923 raw data/f906920_2002/f906920y2002.xls', 'Data storage/923 raw data/f906920_2003/f906920_2003.xls', 'Data storage/923 raw data/f906920_2004/f906920_2004.xls', 'Data storage/923 raw data/f906920_2005/f906920_2005.xls', 'Data storage/923 raw data/f906920_2006/f906920_2006.xls', 'Data storage/923 raw data/f906920_2007/f906920_2007.xls', 'Data storage/923 raw data/f923_2008/eia923December2008.xls', 'Data storage/923 raw data/f923_2009/EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS', 'Data storage/923 raw data/f923_2010/EIA923 SCHEDULES 2_3_4_5 Final 2010.xls', 'Data storage/923 raw data/f923_2011/EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx', 'Data storage/923 raw data/f923_2012/EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx', 'Data storage/923 raw data/f923_2013/EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx', 'Data storage/923 raw data/f923_2014/EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx', 'Data storage/923 raw data/f923_2015/EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx', 'Data storage/923 raw data/f923_2016/EIA923_Schedules_2_3_4_5_M_12_2016.xlsx', 'Data storage/923 raw data/f923_2017/EIA923_Schedules_2_3_4_5_M_04_2017_22JUN2017.xlsx']
def clip_at_header(df, year):
    """Locate the header row in a raw EIA-923 sheet and return a tidy frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet as read by ``pd.read_excel`` with no header applied, so
        the real header row is somewhere inside the data.
    year : int
        Data year; added as a constant ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Plant Id``, ``NERC Region``, ``Year``; rows above the
        header and rows containing NaN are dropped.
    """
    # The header row is the first row whose 9th column mentions 'NERC'.
    # na=False makes non-string / NaN cells count as non-matches, which is
    # the idiomatic form of the old `.replace(np.nan, False)` dance.
    nerc_col = df.iloc[:, 8]
    header = df.loc[nerc_col.str.contains('NERC', na=False)].index[0]
    # Drop the header row and everything above it.
    df = df.loc[header + 1:, :]
    # Keep only column 0 (plant id) and column 8 (NERC region).
    df = df.iloc[:, [0, 8]]
    df.columns = ['Plant Id', 'NERC Region']
    df.reset_index(inplace=True, drop=True)
    df.dropna(inplace=True)
    df['Plant Id'] = pd.to_numeric(df['Plant Id'])
    df['Year'] = year
    return df
# Read each workbook, clip it at its header row, and stack the results
# into one (Plant Id, NERC Region, Year) frame.
df_list = []
for fn in matches:
    # The year is the trailing token of the containing folder name,
    # e.g. '.../f923_2014/...' -> 2014. The original split on '/' which
    # breaks on Windows path separators; os.path handles both.
    folder = os.path.basename(os.path.dirname(fn))
    year = int(folder.split('_')[-1])
    df = pd.read_excel(fn)
    df_list.append(clip_at_header(df, year))
nerc_assignment = pd.concat(df_list)
nerc_assignment.reset_index(inplace=True, drop=True)
nerc_assignment.drop_duplicates(inplace=True)
nerc_assignment['Year'] = pd.to_numeric(nerc_assignment['Year'])
# For every NERC region, list the data years it appears in. Retired
# regions (e.g. ECAR, MAPP, MAIN, MAAC) stop showing up mid-decade.
# `nerc_region` and `nerc_year` are reused further down.
nerc_region = nerc_assignment['NERC Region']
nerc_year = nerc_assignment['Year']
unique_regions = nerc_assignment['NERC Region'].unique()
for reg in unique_regions:
    reg_years = nerc_assignment.loc[nerc_region == reg, 'Year'].unique()
    print (reg, list(reg_years))
(u'SERC', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'ECAR', [2001, 2002, 2003, 2004, 2005]) (u'MAPP', [2001, 2002, 2003, 2004]) (u'NPCC', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'SPP', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'ERCOT', [2001, 2002, 2003, 2004, 2005, 2006]) (u'FRCC', [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'MAIN', [2001, 2002, 2003, 2004, 2005]) (u'WECC', [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'ASCC', [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'MAAC', [2002, 2003, 2004, 2005]) (u'HICC', [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'WSCC', [2002]) (u'MRO', [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'RFC', [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]) (u'TRE', [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])
# Persist the plant/NERC assignments for downstream use.
out_path = os.path.join('Data storage', 'Plant NERC regions.csv')
nerc_assignment.to_csv(out_path, index=False)
I had trouble getting this next step to work. The goal: for plants that reported under the retired NERC regions (ECAR, MAPP, MAIN, MAAC) before 2006, fill a "Corrected Region" column with the region they reported under in 2006.
# Map plants that reported under retired NERC regions before 2006 to the
# region they reported under in 2006.
region_dict = dict(nerc_assignment.loc[nerc_assignment['Year'] == 2006,
                                       ['Plant Id', 'NERC Region']].values)
regions = ['ECAR', 'MAPP', 'MAIN', 'MAAC']
years = range(2001, 2006)
# Build the row mask once instead of repeating the identical boolean
# expression on both sides of the assignment.
retired_mask = (nerc_region.isin(regions)) & (nerc_assignment['Year'].isin(years))
nerc_assignment.loc[retired_mask, 'Corrected Region'] = (
    nerc_assignment.loc[retired_mask, 'Plant Id'].map(region_dict))
# Inspect the first rows; 'Corrected Region' is NaN for plants whose
# original region is not one of the retired ones.
nerc_assignment.head()
Plant Id | NERC Region | Year | Corrected Region | |
---|---|---|---|---|
0 | 2 | SERC | 2001 | NaN |
1 | 3 | SERC | 2001 | NaN |
4 | 4 | SERC | 2001 | NaN |
5 | 5 | SERC | 2001 | NaN |
7 | 7 | SERC | 2001 | NaN |
# Sanity check: look up plant 3's region in 2006 directly.
nerc_assignment.loc[(nerc_assignment['Year'] == 2006) &
                    (nerc_assignment['Plant Id'] == 3), 'NERC Region'].values[0]
u'SERC'
# Manual scalar assignment through .loc works fine: set plant 3's
# pre-2006 corrected region, then read it back to confirm.
nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) &
                    (nerc_assignment['Year'].isin(years)), 'Corrected Region'] = 'SERC'
nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) &
                    (nerc_assignment['Year'].isin(years)), 'Corrected Region']
1 SERC 1583 SERC 9921 SERC 19264 SERC 28525 SERC Name: Corrected Region, dtype: object
# Spot-check the 2002 rows: plant 3 now carries a corrected region.
nerc_assignment.loc[nerc_assignment['Year'] == 2002].head()
Plant Id | NERC Region | Year | Corrected Region | |
---|---|---|---|---|
1582 | 2 | SERC | 2002 | NaN |
1583 | 3 | SERC | 2002 | SERC |
1586 | 4 | SERC | 2002 | NaN |
1587 | 7 | SERC | 2002 | NaN |
1590 | 8 | SERC | 2002 | NaN |
# Re-index on (Year, Plant Id) so year- and plant-level slices are easy;
# the columns themselves are kept unchanged.
nerc_assignment.index = pd.MultiIndex.from_arrays([nerc_assignment['Year'],
                                                   nerc_assignment['Plant Id']])
nerc_assignment.head()
Plant Id | NERC Region | Year | Corrected Region | |
---|---|---|---|---|
0 | 2 | SERC | 2001 | NaN |
1 | 3 | SERC | 2001 | NaN |
4 | 4 | SERC | 2001 | NaN |
5 | 5 | SERC | 2001 | NaN |
7 | 7 | SERC | 2001 | NaN |
idx = pd.IndexSlice
# Series of 2006 NERC regions keyed by Plant Id (.xs drops the Year
# level). The earlier .loc/IndexSlice version of this line was dead
# code -- its result was immediately overwritten -- so it is removed.
regions_2006 = nerc_assignment.xs(2006, level='Year')['NERC Region']
regions_2006
Plant Id 2 SERC 3 SERC 4 SERC 7 SERC 8 SERC 9 WECC 10 SERC 11 SERC 12 SERC 13 SERC 14 SERC 15 SERC 16 SERC 17 SERC 18 SERC 19 SERC 20 SERC 21 SERC 26 SERC 30 MRO 34 WECC 38 SERC 46 SERC 47 SERC 48 SERC 49 SERC 50 SERC 51 SPP 53 SERC 54 SERC ... 56544 MRO 56545 SERC 56554 WECC 56557 SPP 56558 SPP 56559 SPP 56561 SPP 56562 SPP 56570 WECC 56571 RFC 56577 MRO 56578 MRO 56579 MRO 56580 MRO 56581 MRO 56582 MRO 56583 MRO 56584 MRO 56585 MRO 56586 MRO 56587 MRO 56588 MRO 56589 MRO 56590 MRO 56591 WECC 56595 MRO 56597 SERC 56598 SERC 56599 SERC 56600 SERC Name: NERC Region, dtype: object
# BUG FIX: the original wrote `nerc_assignment.xs(year, ...)['Corrected
# NERC'] = regions_2006`, but .xs returns a copy, so the assignment never
# reached the original frame (the displayed result below is all NaN).
# Assign through .loc on the frame itself, aligning the 2006 regions to
# each year's plant ids explicitly; plants absent in 2006 stay NaN.
for year in range(2001, 2006):
    year_plants = nerc_assignment.loc[idx[year, :], :].index.get_level_values('Plant Id')
    nerc_assignment.loc[idx[year, :], 'Corrected NERC'] = (
        regions_2006.reindex(year_plants).values)
nerc_assignment
Plant Id | NERC Region | Year | Corrected NERC | ||
---|---|---|---|---|---|
Year | Plant Id | ||||
2001 | 2 | 2 | SERC | 2001 | NaN |
3 | 3 | SERC | 2001 | NaN | |
4 | 4 | SERC | 2001 | NaN | |
5 | 5 | SERC | 2001 | NaN | |
7 | 7 | SERC | 2001 | NaN | |
8 | 8 | SERC | 2001 | NaN | |
10 | 10 | SERC | 2001 | NaN | |
11 | 11 | SERC | 2001 | NaN | |
12 | 12 | SERC | 2001 | NaN | |
13 | 13 | SERC | 2001 | NaN | |
14 | 14 | SERC | 2001 | NaN | |
15 | 15 | SERC | 2001 | NaN | |
16 | 16 | SERC | 2001 | NaN | |
17 | 17 | SERC | 2001 | NaN | |
18 | 18 | SERC | 2001 | NaN | |
19 | 19 | SERC | 2001 | NaN | |
20 | 20 | SERC | 2001 | NaN | |
21 | 21 | SERC | 2001 | NaN | |
26 | 26 | SERC | 2001 | NaN | |
38 | 38 | SERC | 2001 | NaN | |
46 | 46 | SERC | 2001 | NaN | |
47 | 47 | SERC | 2001 | NaN | |
48 | 48 | SERC | 2001 | NaN | |
49 | 49 | SERC | 2001 | NaN | |
50 | 50 | SERC | 2001 | NaN | |
53 | 53 | SERC | 2001 | NaN | |
54 | 54 | ECAR | 2001 | NaN | |
55 | 55 | SERC | 2001 | NaN | |
56 | 56 | SERC | 2001 | NaN | |
60 | 60 | MAPP | 2001 | NaN | |
... | ... | ... | ... | ... | ... |
2017 | 60145 | 60145 | WECC | 2017 | NaN |
60149 | 60149 | SERC | 2017 | NaN | |
60152 | 60152 | WECC | 2017 | NaN | |
60185 | 60185 | WECC | 2017 | NaN | |
60186 | 60186 | WECC | 2017 | NaN | |
60210 | 60210 | TRE | 2017 | NaN | |
60218 | 60218 | SPP | 2017 | NaN | |
60256 | 60256 | SPP | 2017 | NaN | |
60258 | 60258 | WECC | 2017 | NaN | |
60263 | 60263 | WECC | 2017 | NaN | |
60278 | 60278 | NPCC | 2017 | NaN | |
60304 | 60304 | WECC | 2017 | NaN | |
60307 | 60307 | WECC | 2017 | NaN | |
60308 | 60308 | WECC | 2017 | NaN | |
60329 | 60329 | RFC | 2017 | NaN | |
60349 | 60349 | WECC | 2017 | NaN | |
60352 | 60352 | WECC | 2017 | NaN | |
60354 | 60354 | MRO | 2017 | NaN | |
60366 | 60366 | TRE | 2017 | NaN | |
60372 | 60372 | TRE | 2017 | NaN | |
60383 | 60383 | SERC | 2017 | NaN | |
60390 | 60390 | RFC | 2017 | NaN | |
60414 | 60414 | SPP | 2017 | NaN | |
60432 | 60432 | WECC | 2017 | NaN | |
60436 | 60436 | TRE | 2017 | NaN | |
60441 | 60441 | WECC | 2017 | NaN | |
60467 | 60467 | WECC | 2017 | NaN | |
60470 | 60470 | RFC | 2017 | NaN | |
60502 | 60502 | TRE | 2017 | NaN | |
60506 | 60506 | TRE | 2017 | NaN |
85768 rows × 4 columns