#!/usr/bin/env python
# coding: utf-8

# # Download and get info from all EIA-923 Excel files
# This notebook downloads all of the EIA-923 zip files, extracts their contents, finds the right data file inside each archive, and identifies the correct header row in that file. I'm only keeping two columns of data (Plant Id and NERC Region), but the code can be modified to pull other columns.

# In[14]:

get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import zipfile
import fnmatch


# In[3]:

# Scrape the EIA-923 page for the table of annual zip file links
url = 'https://www.eia.gov/electricity/data/eia923'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find_all('table', attrs={'class': 'simpletable'})[0]


# In[5]:

fns = []
links = []
for row in table.find_all('td', attrs={'align': 'center'}):
    href = row.a.get('href')
    fns.append(href.split('/')[-1])
    links.append(url + '/' + href)


# In[6]:

fns


# In[7]:

path = os.path.join('Data storage', '923 raw data')
os.makedirs(path, exist_ok=True)


# In[9]:

# Download each zip file
base_path = os.path.join('Data storage', '923 raw data')
for fn, link in zip(fns, links):
    path = os.path.join(base_path, fn)
    urlretrieve(link, filename=path)


# In[13]:

# Extract each zip into a folder named after the file (e.g. f923_2016)
base_path = os.path.join('Data storage', '923 raw data')
for fn in fns:
    zip_path = os.path.join(base_path, fn)
    target_folder = os.path.join(base_path, fn.split('.')[0])
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_folder)


# In[70]:

# The data file is named differently across years, so match several patterns
matches = []
for root, dirnames, filenames in os.walk(base_path):
    for filename in fnmatch.filter(filenames, '*2_3*'):
        matches.append(os.path.join(root, filename))
    for filename in fnmatch.filter(filenames, 'eia923*'):
        matches.append(os.path.join(root, filename))
    for filename in fnmatch.filter(filenames, '*906920*.xls'):
        matches.append(os.path.join(root, filename))


# In[71]:

matches


# In[84]:

def clip_at_header(df, year):
    """Find the appropriate header row, only keep the Plant Id and NERC Region
    columns, and add a column with the year."""
    # The header is the first row whose ninth column mentions 'NERC'
    header = df.loc[df.iloc[:, 8].str.contains('NERC', na=False)].index[0]

    # Drop rows above the header
    df = df.loc[header + 1:, :]

    # Only keep columns 0 (Plant Id) and 8 (NERC Region)
    df = df.iloc[:, [0, 8]]
    df.columns = ['Plant Id', 'NERC Region']
    df.reset_index(inplace=True, drop=True)
    df.dropna(inplace=True)
    df['Plant Id'] = pd.to_numeric(df['Plant Id'])
    df['Year'] = year
    return df
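
# A quick sanity check of the header-detection logic, using a tiny made-up DataFrame (not real EIA-923 data) shaped like a 923 sheet: the first row whose ninth column mentions 'NERC' is treated as the header, and everything above it is dropped. The plant ids and regions below are illustrative only.

# In[ ]:

toy = pd.DataFrame([
    ['EIA-923 title text'] + [None] * 8,                      # junk above the header
    ['Plant Id'] + [None] * 7 + ['NERC Region'],              # header row to be found
    [3, None, None, None, None, None, None, None, 'SERC'],    # data rows
    [10, None, None, None, None, None, None, None, 'WECC'],
])
clip_at_header(toy, year=2016)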

# In[85]:

# Read each matched file; the folder name ends with the year (e.g. f923_2016)
df_list = []
for fn in matches:
    year = int(fn.split('/')[-2].split('_')[-1])
    df = pd.read_excel(fn)
    df_list.append(clip_at_header(df, year))


# In[186]:

nerc_assignment = pd.concat(df_list)
nerc_assignment.reset_index(inplace=True, drop=True)
nerc_assignment.drop_duplicates(inplace=True)


# In[187]:

nerc_assignment['Year'] = pd.to_numeric(nerc_assignment['Year'])


# In[188]:

nerc_region = nerc_assignment['NERC Region']
nerc_year = nerc_assignment['Year']


# In[189]:

# Which years each NERC region shows up in
for region in nerc_assignment['NERC Region'].unique():
    years = nerc_assignment.loc[nerc_region == region, 'Year'].unique()
    print(region, list(years))


# ### Export original data

# In[190]:

path = os.path.join('Data storage', 'Plant NERC regions.csv')
nerc_assignment.to_csv(path, index=False)


# ## Assign NERC region to pre-2005/6 facilities based on where they ended up
#
# Somehow I'm having trouble doing this.

# In[183]:

# Map from Plant Id to the NERC region reported in 2006
region_dict = dict(nerc_assignment.loc[nerc_assignment['Year'] == 2006,
                                       ['Plant Id', 'NERC Region']].values)


# In[184]:

# For plants in the old ECAR/MAPP/MAIN/MAAC regions, use the 2006 region
regions = ['ECAR', 'MAPP', 'MAIN', 'MAAC']
years = range(2001, 2006)
mask = nerc_region.isin(regions) & nerc_assignment['Year'].isin(years)
nerc_assignment.loc[mask, 'Corrected Region'] = (
    nerc_assignment.loc[mask, 'Plant Id'].map(region_dict))


# In[185]:

nerc_assignment.head()


# In[162]:

nerc_assignment.loc[(nerc_assignment['Year'] == 2006) &
                    (nerc_assignment['Plant Id'] == 3), 'NERC Region'].values[0]


# In[167]:

nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) &
                    (nerc_assignment['Year'].isin(years)), 'Corrected Region'] = 'SERC'


# In[168]:

nerc_assignment.loc[(nerc_assignment['Plant Id'] == 3) &
                    (nerc_assignment['Year'].isin(years)), 'Corrected Region']


# In[170]:

nerc_assignment.loc[nerc_assignment['Year'] == 2002].head()


# In[115]:

nerc_assignment.index = pd.MultiIndex.from_arrays([nerc_assignment['Year'],
                                                   nerc_assignment['Plant Id']])


# In[165]:

nerc_assignment.head()


# In[136]:

idx = pd.IndexSlice


# In[137]:

regions_2006 = nerc_assignment.loc[idx[2006, :], 'NERC Region'].copy()


# In[145]:

regions_2006 = nerc_assignment.xs(2006, level='Year')['NERC Region']


# In[146]:

regions_2006


# In[151]:

# Note: .xs() returns a copy, so this assignment never reaches nerc_assignment
# (and 'Corrected NERC' differs from the 'Corrected Region' column used above)
for year in range(2001, 2006):
    nerc_assignment.xs(year, level='Year')['Corrected NERC'] = regions_2006


# In[152]:

nerc_assignment
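
# Since the `.xs()` loop above never actually updates `nerc_assignment`, here is one way the backfill could be done instead. This is just a sketch, not tested against the real files: it reuses the `region_dict` / `map` idea from the ECAR/MAPP/MAIN/MAAC cell, applied to every pre-2006 row, and writes into the existing 'Corrected Region' column. The `early` and `missing` masks are names introduced here, not ones used elsewhere in this notebook.

# In[ ]:

# Give every pre-2006 plant the NERC region it reports in 2006
early = nerc_assignment['Year'] < 2006
nerc_assignment.loc[early, 'Corrected Region'] = (
    nerc_assignment.loc[early, 'Plant Id'].map(region_dict).values)

# Plants that never show up in 2006 keep their original region
missing = early & nerc_assignment['Corrected Region'].isnull()
nerc_assignment.loc[missing, 'Corrected Region'] = (
    nerc_assignment.loc[missing, 'NERC Region'].values)

nerc_assignment.loc[nerc_assignment['Year'] == 2002].head()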