This script downlads and extracts the original data of renewable power plant lists from the data sources, processes and merges them. It subsequently adds the geolocation for each power plant. Finally it saves the DataFrames as pickle-files. Make sure you run the download and process Notebook before the validation and output Notebook.
version = '2019-04-05'
import logging
import os
import posixpath
import urllib.parse
import urllib.request
import re
import zipfile
import pickle
import urllib
import shutil
import numpy as np
import pandas as pd
import utm # for transforming geoinformation in the utm format
import requests
import fake_useragent
from string import Template
from IPython.display import display
import xlrd
import bs4
import bng_to_latlon
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# for visualizing locations on maps
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.io import shapereader
import geopandas
import shapely
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%d %b %Y %H:%M:%S'
)
logger = logging.getLogger()
# Create input, intermediate and output folders if they don't exist
# If the paths are relative, the correspoding folders will be created
# inside the current working directory.
input_directory_path = os.path.join('input', 'original_data')
raw_input_directory_path = os.path.join('input', 'original_data', 'raw')
intermediate_directory_path = 'intermediate'
output_directory_path = os.path.join('output', 'renewable_power_plants')
os.makedirs(input_directory_path, exist_ok=True)
os.makedirs(raw_input_directory_path, exist_ok=True)
os.makedirs(intermediate_directory_path, exist_ok=True)
os.makedirs(output_directory_path, exist_ok=True)
The original data can either be downloaded from the original data sources as specified below or from the opsd-Server. Default option is to download from the original sources as the aim of the project is to stay as close to original sources as possible. However, if problems with downloads e.g. due to changing urls occur, you can still run the script with the original data from the opsd_server.
download_from = 'original_sources'
#download_from = 'opsd_server'
def get_opsd_download_url(filename):
global version
url_opsd = 'https://data.open-power-system-data.org/renewable_power_plants'
folder = 'original_data/raw'
opsd_download_url = "/".join([url_opsd, version, folder, filename])
return opsd_download_url
def download_and_cache(url, session=None, filename=None):
"""This function downloads a file into a folder called
original_data/raw and returns the local filepath.
If filename is specified, the local file will be named so."""
if 'user_agent' not in download_and_cache.__dict__:
download_and_cache.user_agent = fake_useragent.UserAgent()
user_agent = download_and_cache.user_agent
global raw_input_directory_path
if filename is None:
path = urllib.parse.urlsplit(url).path
filename = posixpath.basename(path)
filepath = os.path.join(raw_input_directory_path, filename)
print(url)
# check if file exists, if not download it
print(filepath)
if not os.path.exists(filepath):
if not session:
print('No session')
session = requests.session()
print("Downloading file: ", filename)
headers = {'User-Agent' : user_agent.random}
r = session.get(url, headers=headers, stream=True)
chuncksize = 1024
with open(filepath, 'wb') as file:
for chunck in r.iter_content(chuncksize):
file.write(chunck)
else:
print("Using local file from", filepath)
filepath = '' + filepath
print('Downloading: done.')
return filepath
This function saves each original data file in the appropriate country's directory, adding the structure to the input directory.
def save_to_country(country, source_name, filepath):
global input_directory_path
filename = filepath.split(os.sep)[-1]
new_directory_path = os.path.join(input_directory_path, country, source_name)
os.makedirs(new_directory_path, exist_ok=True)
if os.path.splitext(filepath)[1] != '.xlsx' and zipfile.is_zipfile(filepath):
zip_ref = zipfile.ZipFile(filepath, 'r')
zip_ref.extractall(new_directory_path)
zip_ref.close()
else:
new_filepath = os.path.join(new_directory_path, filename)
shutil.copy(filepath, new_filepath)
The next function saves the processed dataframes to the intermediate directory.
def save_to_intermediate(country, df):
global intermediate_directory_path
name = '{}_renewables.pickle'.format(country)
path = os.path.join(intermediate_directory_path, name)
df.to_pickle(path)
Column and value names of the original data sources will be translated to English and standardized across different sources. Standardized column names, e.g. "electrical_capacity" are required to merge data in one DataFrame.
The column and the value translation lists are provided in the input folder of the Data Package.
# Get column translation list
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
columnnames.head(2)
# Get value translation list
valuenames = pd.read_csv(os.path.join('input', 'value_translation_list.csv'))
valuenames.head(2)
Read the list of data sources per country, accompanied by their download links.
source_df = pd.read_csv(os.path.join('input', 'sources.csv'))
source_df
We will use function get_download_urls
to get the urls as a dictionary {'data':{source:url}, 'geo':url}
for a given country from the source list. If the highest-level key is 'data'
, the corresponding dictionary refers to the files containing actual data. If it is 'geo'
, the url points to the file with postcode geodata for the chosen country, provided that there is such an entry in the file. Otherwise, the 'geo'
url will be empty. The choice whether the links are going to lead to the original sources or the OPSD server is made by passing 'original_sources'
or 'opsd_server'
as the function's argument download_from
.
Note that, as of April 5, 2019, the following sources are available only from the OPSD server and the data will be downloaded from it even if download_from
is set to 'original_sources'
:
Also, the BNetzA PV dataset will be loaded from OPSD server. The reason is that the "April 2018" sheet’s date columns needed "date" formatting to be manually re-applied from row 5221 onwards, otherwise the some date values would have been read as integer values. Therefore: This dataset will be loaded from OPSD server as well instead of its original source:
The original links which should be downloaded from OPSD are marked as inactive in the column active
in the above dataframe.
def get_filenames_for_opsd(source_df):
filenames_by_source = source_df[source_df['file_type'] == 'data'][['source', 'filename']]
filenames_by_source = filenames_by_source.set_index('source')
filenames_by_source = filenames_by_source.to_dict()['filename']
geo_url = None
if 'geo' in source_df['file_type'].values:
geo_filename = source_df[source_df['file_type'] == 'geo'].iloc[0]['filename']
geo_url = get_opsd_download_url(geo_filename)
return filenames_by_source, geo_url
def get_download_urls(source_df, country, download_from):
source_df = source_df[source_df['country'] == country]
geo_url = None
if download_from == 'original_sources':
data_urls = {}
# check if there are inactive urls
inactive_df = source_df[source_df['active'] == 'no']
if not inactive_df.empty:
filenames_by_source, geo_url = get_filenames_for_opsd(source_df)
data_urls.update({source : get_opsd_download_url(filenames_by_source[source]) for source in filenames_by_source})
active_df = source_df[source_df['active'] == 'yes']
urls = active_df[active_df['file_type'] == 'data'][['source', 'url']]
urls = urls.set_index('source')
data_urls.update(urls.to_dict()['url'])
if 'geo' in active_df['file_type'].values:
geo_url = source_df[source_df['file_type'] == 'geo'].iloc[0]['url']
elif download_from == 'opsd_server':
filenames_by_source, geo_url = get_filenames_for_opsd(source_df)
data_urls = {source : get_opsd_download_url(filenames_by_source[source]) for source in filenames_by_source}
else:
raise ValueError('download_from must be "original_sources" or "opsd_server".')
return {'data': data_urls, 'geo': geo_url}
Since many sources in the dataset come with the data on their geographical locations, having a way to visualize them on a map can be very useful. The next function serves exactly that purpose.
def visualize(latitudes, longitudes, country, categories=None, eps=0.03):
# Remove the locations not in Europe
european_latitude_mask = np.logical_and(latitudes >= 34, latitudes <= 81)
european_longitude_mask= np.logical_and(longitudes >= -31, longitudes <= 69)
european_mask = np.logical_and(european_latitude_mask, european_longitude_mask)
latitudes = latitudes[european_mask]
longitudes = longitudes[european_mask]
if categories is not None:
categories = categories[european_mask]
# Determine the coordinates of boundary locations
max_lat = latitudes.max()
min_lat = latitudes.min()
max_lon = longitudes.max()
min_lon = longitudes.min()
# Make the area to show a bit larger
max_lat = max_lat + (max_lat - min_lat) * eps
min_lat = min_lat - (max_lat - min_lat) * eps
max_lon = max_lon + (max_lon - min_lon) * eps
min_lon = min_lon - (max_lon - min_lon) * eps
# Get the shape file for visualizing countries
shp_filename = shapereader.natural_earth('10m', 'cultural', 'admin_0_countries')
df_geo = geopandas.read_file(shp_filename)
polygon = df_geo.loc[df_geo['ADMIN'] == country]['geometry'].values[0]
# Make sure that polygon is technically multi-part
# (see https://github.com/SciTools/cartopy/issues/948)
if type(polygon) == shapely.geometry.polygon.Polygon:
polygon=[polygon]
# Make the figure
figure(num=None, figsize=(8, 6), dpi=100, facecolor='white', edgecolor='k')
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_geometries(polygon, crs=ccrs.PlateCarree(), facecolor='white', edgecolor='0.5', zorder=1)
ax.set_extent([min_lon, max_lon, min_lat, max_lat], crs=ccrs.PlateCarree())
ax.coastlines(resolution='10m', color='black')
# Plot the locations
if categories is None:
ax.scatter(longitudes, latitudes, s=1.5, zorder=2, c='#123456')
else:
labels = categories.unique()
for label in labels:
category_mask = (categories == label)
latitude_subset = latitudes[category_mask]
longitude_subset = longitudes[category_mask]
ax.scatter(longitude_subset, latitude_subset, s=1.5, zorder=2, label=label)
ax.legend()
# Show the figure
plt.show()
For one country after the other, the original data is downloaded, read, processed, translated, eventually georeferenced and saved. If respective files are already in the local folder, these will be utilized.
To process the provided data pandas DataFrame is applied.
The data which will be processed below is provided by the following data sources:
Netztransparenz.de - Official grid transparency platform from the German Transmission System Operators (TSOs): 50Hertz, Amprion, TenneT and TransnetBW.
Bundesnetzagentur (BNetzA) - German Federal Network Agency for Electricity, Gas, Telecommunications, Posts and Railway (In separate files for data for roof-mounted PV power plants and for all other renewable energy power plants.)
Data URL for BNetzA gets updated every few month. To be sure, always check if the links (url_bnetza; url_bnetza_pv) are up to date.
# Define the lists of source names and get the download links
tsos = ['50Hertz', 'Amprion', 'TenneT', 'TransnetBW']
datasets = ['50Hertz', 'Amprion', 'TenneT', 'TransnetBW','bnetza','bnetza_pv','bnetza_pv_historic']
download_urls = get_download_urls(source_df, 'DE', download_from)
urls = download_urls['data']
# Download all data sets before processing.
filenames = {}
for source in urls:
filepath = download_and_cache(urls[source])
save_to_country('DE', source, filepath)
if os.path.splitext(filepath)[1] != '.xlsx' and zipfile.is_zipfile(filepath):
%time filenames[source] = zipfile.ZipFile(filepath)
else:
%time filenames[source] = filepath
# Read TSO data from zip file
dfs = {}
basenames_by_tso = {
'50Hertz': 'Netztransparenz%20Anlagenstammdaten%202017%2050Hertz%20Transmission%20GmbH',
'Amprion': 'Netztransparenz%20Anlagenstammdaten%202017%20Amprion%20GmbH_V03',
'TenneT': 'Netztransparenz%20Anlagenstammdaten%202017%20TenneT%20TSO%20GmbH',
'TransnetBW': 'Netztransparenz%20Anlagenstammdaten%202017%20TransnetBW%20GmbH',
}
for tso in tsos:
filename = urllib.parse.unquote(basenames_by_tso[tso]+'.csv')
print('Reading '+filename)
print(filenames[tso].namelist())
dfs[tso] = pd.read_csv(
filenames[tso].open(filename),
sep=';',
thousands='.',
decimal=',',
# Header have to have the same order for all TSOs. Therefore just define headers here.
# Remove the following three lines if for next version, headers should be read out initially to then check if order is the same everywhere.
names=['EEG-Anlagenschlüssel','Netzbetreiber Betriebsnummer','Netzbetreiber Name','STRASSE_FLURSTUECK','PLZ','Ort / Gemarkung','Gemeindeschlüssel','Bundesland','Installierte Leistung','Energieträger','Spannungsebene','Leistungsmessung','Regelbarkeit','Inbetriebnahme','Außerbetriebnahme','Netzzugang','Netzabgang'],
header=None,
skiprows=1,
parse_dates=[11, 12, 13, 14],
encoding='iso-8859-1',
dayfirst=True,
low_memory=False
)
print('Done reading '+tso+'_EEG-Anlagenstammdaten_2016.csv')
for filename in filenames.values():
if(isinstance(filename,zipfile.ZipFile)):
filename.close()
# Read BNetzA register
print('Reading bnetza: '+filenames['bnetza'])
dfs['bnetza'] = pd.read_excel(filenames['bnetza'],
sheet_name='Gesamtübersicht',
header=0,
converters={'4.9 Postleit-zahl': str, 'Gemeinde-Schlüssel': str}
)
skiprows = {'bnetza_pv_historic': 10, 'bnetza_pv': 9}
for dataset in ['bnetza_pv', 'bnetza_pv_historic']:
print(dataset)
print('Reading '+dataset+': '+filenames[dataset])
xls_handle = pd.ExcelFile(filenames[dataset])
print('Concatenating all '+dataset+' sheets into one dataframe')
dfs[dataset] = pd.concat(
(xls_handle.parse(
sheet,
skiprows=skiprows[dataset],
converters={'Anlage \nPLZ': str}
) for sheet in xls_handle.sheet_names),
sort=True
)
dfs[dataset].tail(2)
dfs['bnetza_pv_historic'] = dfs['bnetza_pv_historic'].drop(['Unnamed: 7'], axis=1)
pickle.dump( dfs, open( "intermediate/temp_dfs_DE_after_reading.pickle", "wb" ) )
dfs = pickle.load( open( "intermediate/temp_dfs_DE_after_reading.pickle", "rb" ) )
To standardise the DataFrame the original column names from the German TSOs and the BNetzA wil be translated and new english column names wil be assigned to the DataFrame. The unique column names are required to merge the DataFrame.
The column_translation_list is provided here as csv in the input folder. It is loaded in 2.3 Setup of translation dictionaries.
# Choose the translation terms for Germany, create dictionary and show dictionary
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
idx_DE = columnnames[columnnames['country'] == 'DE'].index
column_dict_DE = columnnames.loc[idx_DE].set_index('original_name')['opsd_name'].to_dict()
# Start the column translation process for each original data source
print('Translation')
for dataset in datasets:
# Remove newlines and any other duplicate whitespaces in column names:
dfs[dataset] = dfs[dataset].rename(columns={col: re.sub(r"\s+", ' ', col) for col in dfs[dataset].columns})
# Do column name translations
dfs[dataset].rename(columns=column_dict_DE, inplace=True)
print('done.')
All data source names and for the BNetzA-PV data the energy source level 2 will added.
# Add data source names to the DataFrames
for tso in tsos:
dfs[tso]['data_source'] = tso
dfs[tso]['tso'] = tso
dfs['bnetza']['data_source'] = 'BNetzA'
dfs['bnetza_pv']['data_source'] = 'BNetzA_PV'
dfs['bnetza_pv_historic']['data_source'] = 'BNetzA_PV_historic'
# Add for the BNetzA PV data the energy source level 2
dfs['bnetza_pv']['energy_source_level_2'] = 'Photovoltaics'
dfs['bnetza_pv_historic']['energy_source_level_2'] = 'Photovoltaics'
# Correct datetime-format
def decom_fkt(x):
x = str(x)
if x == 'nan':
x = ''
else:
x = x[0:10]
return x
dfs['bnetza']['decommissioning_date'] = dfs['bnetza']['decommissioning_date'].apply(decom_fkt)
# Select those columns of the orignal data which are utilised further
dfs['bnetza'] = dfs['bnetza'].loc[:, ('commissioning_date', 'decommissioning_date',
'notification_reason', 'energy_source_level_2',
'electrical_capacity_kW', 'thermal_capacity_kW',
'voltage_level', 'dso', 'eeg_id', 'bnetza_id',
'federal_state', 'postcode', 'municipality_code',
'municipality', 'address', 'address_number',
'utm_zone', 'utm_east', 'utm_north',
'data_source')]
for dataset in datasets: print(dataset+':'); display(dfs[dataset].tail(2))
The individual DataFrames from the TSOs (Netztransparenz.de) and BNetzA are merged.
# Merge DataFrames of each original source into a common DataFrame DE_renewables
dfs_list = []
for dataset in datasets:
dfs_list.append(dfs[dataset])
DE_renewables = pd.concat(dfs_list, sort=True)
DE_renewables.head(2)
# Make sure the decommissioning_column has the right dtype
%time DE_renewables['decommissioning_date'] = DE_renewables['decommissioning_date'].apply(pd.to_datetime)
DE_renewables['decommissioning_date'] = pd.to_datetime(DE_renewables['decommissioning_date'])
DE_renewables.reset_index(drop=True, inplace=True)
DE_renewables.head(2)
Different German terms for energy source level 2, energy source level 3, technology and voltage levels are translated and harmonized across the individual data sources. The value_translation_list is provided here as csv in the input folder. It is loaded in 2.3 Setup of translation dictionaries.
# Choose the translation terms for Germany, create dictionary and show dictionary
valuenames = pd.read_csv(os.path.join('input', 'value_translation_list.csv'))
idx_DE = valuenames[valuenames['country'] == 'DE'].index
value_dict_DE = valuenames.loc[idx_DE].set_index('original_name')['opsd_name'].to_dict()
print('replacing..')
# Replace all original value names by the OPSD value names.
# Running time: some minutes. %time prints the time your computer required for this step
%time DE_renewables.replace(value_dict_DE, inplace=True)
DE_renewables['postcode'] = DE_renewables['postcode'].apply(pd.to_numeric, errors='ignore')
Separate and assign energy source level 1 - 3 and technology
# Create dictionary in order to assign energy_source to its subtype
energy_source_dict_DE = valuenames.loc[idx_DE].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Column energy_source partly contains energy source level 3 and technology information,
# thus this column is copied to new column technology...
DE_renewables['technology'] = DE_renewables['energy_source_level_2']
# ...and the energy source level 2 values are replaced by the higher level classification
DE_renewables['energy_source_level_2'].replace(
energy_source_dict_DE, inplace=True)
# Choose energy source level 2 entries where energy_source is "Bioenergy" in order to
# seperate Bioenergy subtypes to "energy_source_level_3" and subtypes for the rest to "technology"
idx_DE_Bioenergy = DE_renewables[DE_renewables['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to energy source level 3 for all entries where energy source level 2 is
# Bioenergy and delete those entries from technology
DE_renewables[['energy_source_level_3']] = DE_renewables.iloc[idx_DE_Bioenergy][['technology']]
DE_renewables.loc[idx_DE_Bioenergy]['technology'] = np.nan
# Assign energy source level 1 to the dataframe
DE_renewables['energy_source_level_1'] = 'Renewable energy'
Summary of DataFrame
# Electrical capacity per energy source level 2 (in MW)
DE_renewables.groupby(['energy_source_level_2'])['electrical_capacity_kW'].sum() / 1000
# kW to MW
DE_renewables[['electrical_capacity_kW', 'thermal_capacity_kW']] /= 1000
# adapt column name
DE_renewables.rename(columns={'electrical_capacity_kW': 'electrical_capacity',
'thermal_capacity_kW': 'thermal_capacity'}, inplace=True)
(for data with no existing geocoordinates)
The available post code in the original data provides a first approximation for the geocoordinates of the RE power plants.
The BNetzA data provides the full zip code whereas due to data privacy the TSOs only report the first three digits of the power plant's post code (e.g. 024xx) and no address. Subsequently a centroid of the post code region polygon is used to find the coordinates.
With data from
a CSV-file for all existing German post codes with matching geocoordinates has been compiled. The latitude and longitude coordinates were generated by running a PostgreSQL + PostGIS database. Additionally the respective TSO has been added to each post code. (A Link to the SQL script will follow here later)
(License: http://www.suche-postleitzahl.org/downloads, Open Database Licence for free use. Source of data: © OpenStreetMap contributors)
# Read generated postcode/location file
postcode = pd.read_csv(os.path.join('input', 'de_tso_postcode_full.csv'))
# Drop possible duplicates in postcodes
postcode.drop_duplicates('postcode', keep='last', inplace=True)
# Show first entries
postcode.head(2)
Merge geometry information by using the postcode
# Take postcode and longitude/latitude information
postcode = postcode[['postcode', 'lon', 'lat']]
# Cast DE_renewables['postcode'] to int64 in order to do the natural join of the dataframes
DE_renewables['postcode'] = pd.to_numeric(DE_renewables['postcode'], errors='coerce')
# Join two dataframes
DE_renewables = DE_renewables.merge(postcode, on=['postcode'], how='left')
(for data with already existing geoinformation)
In this section the existing geoinformation (in UTM-format) will be transformed into latidude and longitude coordiates as a uniform standard for geoinformation.
The BNetzA data set offers UTM Geoinformation with the columns utm_zone (UTM-Zonenwert), utm_east and utm_north. Most of utm_east-values include the utm_zone-value 32 at the beginning of the number. In order to properly standardize and transform this geoinformation into latitude and longitude it is necessary to remove this utm_zone value. For all UTM entries the utm_zone 32 is used by the BNetzA.
utm_zone | utm_east | utm_north | comment |
---|---|---|---|
32 | 413151.72 | 6027467.73 | proper coordinates |
32 | 32912159.6008 | 5692423.9664 | caused error by 32 |
How many different utm_zone values are in the data set?
DE_renewables.groupby(['utm_zone'])['utm_zone'].count()
Remove the utm_zone "32" from the utm_east value
# Find entries with 32 value at the beginning
idx_32 = (DE_renewables['utm_east'].astype(str).str[:2] == '32')
idx_notnull = DE_renewables['utm_east'].notnull()
# Remove 32 from utm_east entries
DE_renewables.loc[idx_32, 'utm_east'] = DE_renewables.loc[idx_32,
'utm_east'].astype(str).str[2:].astype(float)
def convert_to_latlon(utm_east, utm_north, utm_zone):
try:
return utm.to_latlon(utm_east, utm_north, utm_zone, 'U')
except:
return ''
DE_renewables['latlon'] = DE_renewables.loc[idx_notnull, ['utm_east', 'utm_north', 'utm_zone']].apply(
lambda x: convert_to_latlon(x[0], x[1], x[2]), axis=1).astype(str)
Conversion UTM to latitude and longitude
lat = []
lon = []
for row in DE_renewables['latlon']:
try:
# Split tuple format into the column lat and lon
row = row.lstrip('(').rstrip(')')
parts = row.split(',')
if(len(parts)<2):
raise Exception('This is not a proper tuple. So go to exception block.')
lat.append(parts[0])
lon.append(parts[1])
except:
# set NaN
lat.append(np.NaN)
lon.append(np.NaN)
DE_renewables['latitude'] = pd.to_numeric(lat)
DE_renewables['longitude'] = pd.to_numeric(lon)
# Add new values to DataFrame lon and lat
DE_renewables['lat'] = DE_renewables[['lat', 'latitude']].apply(
lambda x: x[1] if pd.isnull(x[0]) else x[0],
axis=1)
DE_renewables['lon'] = DE_renewables[['lon', 'longitude']].apply(
lambda x: x[1] if pd.isnull(x[0]) else x[0],
axis=1)
Check: missing coordinates by data source and type
#DE_renewables[DE_renewables['data_source'] == '50Hertz'].to_excel('test.xlsx')
print('Missing coordinates ', DE_renewables.lat.isnull().sum())
display(
DE_renewables[DE_renewables.lat.isnull()].groupby(
['energy_source_level_2','data_source']
)['data_source'].count()
)
print('Share of missing coordinates (note that NaN can mean it\'s all fine):')
DE_renewables[DE_renewables.lat.isnull()].groupby(
['energy_source_level_2','data_source']
)['data_source'].count() / DE_renewables.groupby(
['energy_source_level_2','data_source']
)['data_source'].count()
Remove temporary columns
# drop lonlat column that contains both, latitute and longitude
DE_renewables.drop(['latlon', 'longitude', 'latitude'], axis=1, inplace=True)
display(
DE_renewables[DE_renewables['data_source'] == 'TenneT'].head(3)[['lon','lat','data_source']]
)
display(
DE_renewables[DE_renewables['data_source'] == 'BNetzA'].head(3)[['lon','lat','data_source']]
)
Save temporary Pickle (to have a point to quickly return to if things break after this point):
pickle.dump( DE_renewables, open( "intermediate/temp_dfs_DE_before_cleaning.pickle", "wb" ) )
DE_renewables = pickle.load( open( "intermediate/temp_dfs_DE_before_cleaning.pickle", "rb" ) )
# Remove out-of-range dates
# Keep only values between 1900 and 2100 to rule out outliers / wrong values. Also, Excel doesn't support < 1900 entries.
DE_renewables = DE_renewables[
(DE_renewables['commissioning_date']>pd.Timestamp('1900')) &
(DE_renewables['commissioning_date']<pd.Timestamp('2107'))
]
DE_renewables['municipality_code'] = DE_renewables['municipality_code'].astype(str)
# Remove spaces from municipality code
DE_renewables['municipality_code'] = DE_renewables['municipality_code'].str.replace(' ', '', regex=False)
DE_renewables['municipality_code'] = pd.to_numeric(DE_renewables['municipality_code'], errors='coerce', downcast='integer')
# Merge address and address_number
to_string = lambda x: str(x) if not pd.isnull(x) else ''
DE_renewables['address'] = DE_renewables['address'].map(to_string) + ' ' + DE_renewables['address_number'].map(to_string)
# Make sure that the column has no whitespaces at the beginning and the end
DE_renewables['address'] = DE_renewables['address'].str.strip()
# Remove the column with address numbers as it is not needed anymore
del DE_renewables['address_number']
visualize(DE_renewables['lat'], DE_renewables['lon'], 'Germany', categories=DE_renewables['energy_source_level_2'])
The merged, translated, cleaned, DataFrame will be saved temporily as a pickle file, which stores a Python object fast.
DE_renewables.to_pickle('intermediate/DE_renewables.pickle')
del DE_renewables
The data which will be processed below is provided by the following data sources:
Energistyrelsen (ens) / Danish Energy Agency - The wind turbines register is released by the Danish Energy Agency.
Energinet.dk - The data of solar power plants are released by the leading transmission network operator Denmark.
# Get the download links
download_urls = get_download_urls(source_df, 'DK', download_from)
url_DK_ens = download_urls['data']['Energistyrelsen']
url_DK_energinet = download_urls['data']['Energinet']
url_DK_geo = download_urls['geo']
The function for reading the data on the wind turbines.
def read_dk_wind_turbines(filepath, sheet_name):
# Reads the data on Danish wind turbines
# from the sheet of the given name
# in the file with the path.
# Returns the data as a Pandas dataframe.
book = xlrd.open_workbook(filepath)
sheet = book.sheet_by_name(sheet_name)
# Since the column names are in two rows, not one,
# collect them in two parts. The first part is
# fixed and contains string column-names.
header = []
for i in range(0, 16):
# Make sure that strings do not contain the newline sign
# and have no blank spaces at their beginning or end.
column_name = sheet.cell_value(17, i).replace("\n", "").strip()
header = header + [column_name]
# The second part is variable. It consists of two subparts:
# 1) previous years (type float)
# 2) the past months of the current year (type date)
# Reading the previous years as column names
i = 16
cell = sheet.cell(16, i)
while cell.ctype == xlrd.XL_CELL_NUMBER:
column_name = str(int(cell.value))
header = header + [column_name]
i = i + 1
cell = sheet.cell(16, i)
# Reading the months of the current year as column names
while cell.ctype == xlrd.XL_CELL_DATE:
year, month, _, _, _, _ = xlrd.xldate_as_tuple(cell.value, book.datemode)
column_name = str("{}-{}".format(year, month))
header = header + [column_name]
i = i + 1
cell = sheet.cell(16, i)
# Skip the first 17 rows in the sheet. The rest contains the data.
df = pd.read_excel(filepath,
sheet_name=sheet_name,
skiprows=17,
skipfooter=3
)
#
df.drop(df.columns[len(df.columns)-1], axis=1, inplace=True)
# Set the column names.
df.columns = header
return df
# Get wind turbines data
wind_turbines_sheet_name = 'IkkeAfmeldte-Existing turbines'
DK_wind_filepath = download_and_cache(url_DK_ens)
save_to_country('DK', 'Energistyrelsen', DK_wind_filepath)
DK_wind_df = read_dk_wind_turbines(DK_wind_filepath,
wind_turbines_sheet_name
)
# Get photovoltaic data
DK_solar_filepath = download_and_cache(url_DK_energinet)
save_to_country('DK', 'Energinet', DK_solar_filepath)
DK_solar_df = pd.read_excel(DK_solar_filepath,
sheet_name='Data',
skiprows=[0],
converters={'Postnr': str}
)
# Choose the translation terms for Denmark, create dictionary and show dictionary
idx_DK = columnnames[columnnames['country'] == 'DK'].index
column_dict_DK = columnnames.loc[idx_DK].set_index('original_name')['opsd_name'].to_dict()
# Windows has problems reading the csv entry for east and north (DK).
# The reason might be the difference when opening the csv between linux and
# windows.
column_dict_DK_temp = {}
for k, v in column_dict_DK.items():
column_dict_DK_temp[k] = v
if v == 'utm_east' or v == 'utm_north':
# merge 2 lines to 1
new_key = ''.join(k.splitlines())
column_dict_DK_temp[new_key] = v
column_dict_DK = column_dict_DK_temp
column_dict_DK
# Replace column names based on column_dict_DK
DK_wind_df.rename(columns=column_dict_DK, inplace=True)
DK_solar_df.rename(columns=column_dict_DK, inplace=True)
# Add names of the data sources to the DataFrames
DK_wind_df['data_source'] = 'Energistyrelsen'
DK_solar_df['data_source'] = 'Energinet.dk'
# Add energy source level 2 and technology for each of the two DataFrames
DK_wind_df['energy_source_level_2'] = 'Wind'
DK_solar_df['energy_source_level_2'] = 'Solar'
DK_solar_df['technology'] = 'Photovoltaics'
# Choose the translation terms for Denmark, create dictionary and show dictionary
idx_DK = valuenames[valuenames['country'] == 'DK'].index
value_dict_DK = valuenames.loc[idx_DK].set_index('original_name')['opsd_name'].to_dict()
# Replace all original value names by the OPSD value names
DK_wind_df.replace(value_dict_DK, inplace=True)
DK_solar_df.replace(value_dict_DK, inplace=True)
UTM32 to latitude and longitude (Data from Energistyrelsen)
The Energistyrelsen data set offers UTM Geoinformation with the columns utm_east and utm_north belonging to the UTM zone 32. In this section the existing geoinformation (in UTM-format) will be transformed into latidude and longitude coordiates as a uniform standard for geoinformation.
# Index for all values with utm information
idx_notnull = DK_wind_df['utm_east'].notnull()
# Convert from UTM values to latitude and longitude coordinates
DK_wind_df['lonlat'] = DK_wind_df.loc[idx_notnull, ['utm_east', 'utm_north']
].apply(lambda x: utm.to_latlon(x[0],
x[1],
32,
'U'), axis=1).astype(str)
# Split latitude and longitude in two columns
lat = []
lon = []
for row in DK_wind_df['lonlat']:
try:
# Split tuple format
# into the column lat and lon
row = row.lstrip('(').rstrip(')')
lat.append(row.split(',')[0])
lon.append(row.split(',')[1])
except:
# set NAN
lat.append(np.NaN)
lon.append(np.NaN)
DK_wind_df['lat'] = pd.to_numeric(lat)
DK_wind_df['lon'] = pd.to_numeric(lon)
# drop lonlat column that contains both, latitute and longitude
DK_wind_df.drop('lonlat', axis=1, inplace=True)
Postcode to lat/lon (WGS84) (for data from Energinet.dk)
The available post code in the original data provides an approximation for the geocoordinates of the solar power plants.
The postcode will be assigned to latitude and longitude coordinates with the help of the postcode table.
** geonames.org** The postcode data from Denmark is provided by Geonames and licensed under a Creative Commons Attribution 3.0 license.
# Get geo-information
zip_DK_geo = zipfile.ZipFile(download_and_cache(url_DK_geo))
# Read generated postcode/location file
DK_geo = pd.read_csv(zip_DK_geo.open('DK.txt'), sep='\t', header=-1)
# add column names as defined in associated readme file
DK_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
DK_geo.drop_duplicates('postcode', keep='last', inplace=True)
DK_geo['postcode'] = DK_geo['postcode'].astype(str)
# Add longitude/latitude infomation assigned by postcode (for Energinet.dk data)
DK_solar_df = DK_solar_df.merge(DK_geo[['postcode', 'lon', 'lat']],
on=['postcode'],
how='left')
# Show number of units with missing coordinates separated by wind and solar
print('Missing Coordinates DK_wind', DK_wind_df.lat.isnull().sum(), 'out of', len(DK_wind_df.index))
print('Missing Coordinates DK_solar', DK_solar_df.lat.isnull().sum(), 'out of', len(DK_solar_df.index))
zip_DK_geo.close()
# Merge DataFrames for wind and solar into DK_renewables
dataframes = [DK_wind_df, DK_solar_df]
DK_renewables = pd.concat(dataframes, sort=False)
DK_renewables = DK_renewables.reset_index()
# Assign energy source level 1 to the dataframe
DK_renewables['energy_source_level_1'] = 'Renewable energy'
# Merge the address and address-number columns into one
to_string = lambda x: str(x) if not pd.isnull(x) else ""
DK_renewables['address'] = DK_renewables['address'].map(to_string) + " " + DK_renewables['address_number'].map(to_string)
# Make sure that the column has no whitespaces at the beginning and the end
DK_renewables['address'] = DK_renewables['address'].str.strip()
# Select those columns of the orignal data which are utilised further
column_interest = ['commissioning_date', 'energy_source_level_1', 'energy_source_level_2',
'technology', 'electrical_capacity_kW', 'dso', 'gsrn_id', 'postcode',
'municipality_code', 'municipality', 'address',
'utm_east', 'utm_north', 'lon', 'lat', 'hub_height',
'rotor_diameter', 'manufacturer', 'model', 'data_source']
# Clean DataFrame from columns other than specified above
DK_renewables = DK_renewables.loc[:, column_interest]
DK_renewables.reset_index(drop=True, inplace=True)
# kW to MW
DK_renewables['electrical_capacity_kW'] /= 1000
# adapt column name
DK_renewables.rename(columns={'electrical_capacity_kW': 'electrical_capacity'},
inplace=True)
visualize(DK_renewables['lat'], DK_renewables['lon'], 'Denmark', categories=DK_renewables['energy_source_level_2'])
DK_renewables.to_pickle('intermediate/DK_renewables.pickle')
The data which will be processed below is provided by the following data source:
Ministry for the Ecological and Inclusive Transition - Number of installations and installed capacity of the different renewable source for every municipality in France. Service of observation and statistics, survey, date of last update: 31/01/2019. Data until 31/12/2017.
# Get the download links
download_urls = get_download_urls(source_df, 'FR', download_from)
url_FR_gouv = download_urls['data']['gouv.fr']
url_FR_geo = download_urls['geo']
# Get data of renewables per municipality
FR_re_filepath = download_and_cache(url_FR_gouv)
save_to_country('FR', 'Ministry for the Ecological and Inclusive Transition', FR_re_filepath)
FR_re_df = pd.read_excel(FR_re_filepath,
sheet_name='Commune',
encoding='UTF8',
thousands='.',
decimals=',',
header=[3, 4],
skipfooter=8, # contains summarized values
index_col=[0, 1], # required for MultiIndex
converters={'Code officiel géographique': str})
The French data source contains number of installations and sum of installed capacity per energy source per municipality. The structure is adapted to the power plant list of other countries. The list is limited to the plants which are covered by article 10 of february 2000 by an agreement to a purchase commitment.
# Rearrange data
FR_re_df.index.rename(['insee_com', 'municipality'], inplace=True)
FR_re_df.columns.rename(['energy_source_level_2', None], inplace=True)
FR_re_df = (FR_re_df
.stack(level='energy_source_level_2', dropna=False)
.reset_index(drop=False))
# Choose the translation terms for France, create dictionary and show dictionary
idx_FR = columnnames[columnnames['country'] == 'FR'].index
column_dict_FR = columnnames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
column_dict_FR
# Translate columnnames
FR_re_df.rename(columns=column_dict_FR, inplace=True)
# Drop all rows that contain NA
FR_re_df = FR_re_df.dropna()
FR_re_df.head(10)
FR_re_df['data_source'] = 'Ministry for the Ecological and Inclusive Transition'
FR_re_df['as_of_year'] = 2017 # Year for which the dataset has been compiled by the data source
Kept secret if number of installations < 3
If the number of installations is less than 3, it is marked with an s instead of the number 1 or 2 due to statistical confidentiality (further explanation by the data provider). Here, the s is changed to < 3. This is done in the same step as the other value translations of the energy sources.
# Choose the translation terms for France, create dictionary and show dictionary
idx_FR = valuenames[valuenames['country'] == 'FR'].index
value_dict_FR = valuenames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
value_dict_FR
# Replace all original value names by the OPSD value names
FR_re_df.replace(value_dict_FR, inplace=True)
Separate and assign energy source level 1-3 and technology
# Create dictionnary in order to assign energy_source to its subtype
energy_source_dict_FR = valuenames.loc[idx_FR].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Column energy_source partly contains subtype information, thus this column is copied
# to new column for energy_source_subtype...
FR_re_df['technology'] = FR_re_df['energy_source_level_2']
# ...and the energy source subtype values in the energy_source column are replaced by
# the higher level classification
FR_re_df['energy_source_level_2'].replace(energy_source_dict_FR, inplace=True)
# Assign energy_source_level_1 to the dataframe
FR_re_df['energy_source_level_1'] = 'Renewable energy'
FR_re_df.reset_index(drop=True, inplace=True)
# Choose energy source level 2 entries where energy source level 2 is Bioenergy in order to
# seperate Bioenergy subtypes to energy source level 3 and subtypes for the rest to technology
idx_FR_Bioenergy = FR_re_df[FR_re_df['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to energy source level 3 for all entries where energy source level 2 is
# Bioenergy and delete those entries from technology
FR_re_df[['energy_source_level_3']] = FR_re_df.iloc[idx_FR_Bioenergy][['technology']]
FR_re_df.loc[idx_FR_Bioenergy,'technology'] = np.nan
The available INSEE code in the original data provides a first approximation for the geocoordinates of the renewable power plants. The following data source is utilized for assigning INSEE code to coordinates of the municipalities:
OpenDataSoft publishes a list of French INSEE codes and corresponding coordinates is published under the Licence Ouverte (Etalab).
FR_geo_filepath = download_and_cache(url_FR_geo, filename='code-postal-insee-2015.csv')
# Read INSEE Code Data
FR_geo = pd.read_csv(FR_geo_filepath,
sep=';',
header=0,
converters={'Code_postal': str})
# Drop possible duplicates of the same INSEE code
FR_geo.drop_duplicates('INSEE_COM', keep='last', inplace=True)
# create columns for latitude/longitude
lat = []
lon = []
# split in latitude/longitude
for row in FR_geo['Geo Point']:
try:
# Split tuple format
# into the column lat and lon
row = row.lstrip('(').rstrip(')')
lat.append(row.split(',')[0])
lon.append(row.split(',')[1])
except:
# set NAN
lat.append(np.NaN)
lon.append(np.NaN)
# add these columns to the INSEE DataFrame
FR_geo['lat'] = pd.to_numeric(lat)
FR_geo['lon'] = pd.to_numeric(lon)
# Column names of merge key have to be named identically
FR_re_df.rename(columns={'municipality_code': 'INSEE_COM'}, inplace=True)
# Merge longitude and latitude columns by the Code INSEE
FR_re_df = FR_re_df.merge(FR_geo[['INSEE_COM', 'lat', 'lon']],
on=['INSEE_COM'],
how='left')
# Translate Code INSEE column back to municipality_code
FR_re_df.rename(columns={'INSEE_COM': 'municipality_code'}, inplace=True)
visualize(FR_re_df['lat'], FR_re_df['lon'], 'France', categories=FR_re_df['energy_source_level_2'])
FR_re_df.to_pickle('intermediate/FR_renewables.pickle')
The data which will be processed below was provided by the following data source:
Urzad Regulacji Energetyki (URE) / Energy Regulatory Office - Number of installations and installed capacity per energy source of renewable energy. Summed per powiat (districts).
However, the data are no longer available from the original source, so they can be downloaded only from the OPSD server.
# Available only from the OPSD server
download_urls = get_download_urls(source_df, 'PL', download_from)
url_PL_url = download_urls['data']['Urzad Regulacji Energetyki']
PL_filepath = download_and_cache(url_PL_url)
save_to_country('PL', 'Urzad Regulacji Energetyki', PL_filepath)
# read rtf-file to string with the correct encoding
with open(PL_filepath, 'r') as rtf:
file_content = rtf.read()
file_content = file_content.encode('utf-8').decode('iso-8859-2')
The rtf file has one table for each district in the rtf-file which needs to be separated from each and other and restructured to get all plants in one DataFrame with the information: district, energy_source, number_of_installations, installed_capacity. Thus in the following, the separating items are defined, the district tables split in parts, all put in one list and afterwards transferred to a pandas DataFrame.
# a new line is separating all parts
sep_split_into_parts = r'{\fs12 \f1 \line }'
# separates the table rows of each table
sep_data_parts = r'\trql'
reg_exp_district = r'(?<=Powiat:).*(?=})'
reg_exp_installation_type = (
r'(?<=\\fs12 \\f1 \\pard \\intbl \\ql \\cbpat[2|3|4] \{\\fs12 \\f1 ).*(?=\})')
reg_exp_installation_value = (
r'(?<=\\fs12 \\f1 \\pard \\intbl \\qr \\cbpat[3|4] \{\\fs12 \\f1 ).*(?=})')
# split file into parts
parts = file_content.split(sep_split_into_parts)
# list containing the data
data_set = []
for part in parts:
# match district
district = re.findall(reg_exp_district, part)
if len(district) == 0:
pass
else:
district = district[0].lstrip()
# separate each part
data_parts = part.split(sep_data_parts)
# data structure:
# data_row = {'district': '', 'install_type': '', 'quantity': '', 'power': ''}
for data_rows in data_parts:
if "SUM" in data_rows:
# The summary follows now and we should omit it
break
wrapper_list = []
# match each installation type
installation_type = re.findall(reg_exp_installation_type, data_rows)
for inst_type in installation_type:
wrapper_list.append({'district': district, 'technology': inst_type})
# match data - contains twice as many entries as
# installation type (quantity, power vs. install type)
data_values = re.findall(reg_exp_installation_value, data_rows)
if len(data_values) == 0:
#log.debug('data values empty')
pass
else:
# connect data
for i, _ in enumerate(wrapper_list):
wrapper_list[i]['number_of_installations'] = data_values[(
i * 2)]
wrapper_list[i]['electrical_capacity'] = data_values[(
i * 2) + 1]
# prepare to write to file
for data in wrapper_list:
data_set.append(data)
# mapping of malformed unicode which appear in the Polish district names
polish_truncated_unicode_map = {
r'\uc0\u322': 'ł',
r'\uc0\u380': 'ż',
r'\uc0\u243': 'ó',
r'\uc0\u347': 'ś',
r'\uc0\u324': 'ń',
r'\uc0\u261': 'ą',
r'\uc0\u281': 'ę',
r'\uc0\u263': 'ć',
r'\uc0\u321': 'Ł',
r'\uc0\u378': 'ź',
r'\uc0\u346': 'Ś',
r'\uc0\u379': 'Ż'
}
# changing malformed unicode
for entry in data_set:
while r'\u' in entry['district']:
index = entry['district'].index(r'\u')
offset = index + 9
to_be_replaced = entry['district'][index:offset]
if to_be_replaced in polish_truncated_unicode_map.keys():
# offset + 1 because there is a trailing whitespace
entry['district'] = entry['district'].replace(entry['district'][index:offset + 1],
polish_truncated_unicode_map[to_be_replaced])
else:
break
# Create pandas DataFrame with similar structure as the other countries
PL_re_df = pd.DataFrame(data_set)
PL_re_df
PL_re_df['data_source'] = 'Urzad Regulacji Energetyki'
PL_re_df['as_of_year'] = 2016 # Year for which the dataset has been compiled by the data source
# Choose the translation terms for France, create dictionary and show dictionary
idx_PL = valuenames[valuenames['country'] == 'PL'].index
value_dict_PL = valuenames.loc[idx_PL].set_index('original_name')['opsd_name'].to_dict()
value_dict_PL
# Replace all original value names by the OPSD value names
PL_re_df.technology.replace(value_dict_PL, inplace=True)
Assign energy source level 1-3 and technology
# Assign energy_source_level_1 to the dataframe
PL_re_df['energy_source_level_1'] = 'Renewable energy'
# Create dictionnary in order to assign energy_source to its subtype
energy_source_dict_PL = valuenames.loc[idx_PL].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Create new column for energy_source
PL_re_df['energy_source_level_2'] = PL_re_df.technology
# Fill this with the energy source instead of subtype information
PL_re_df.energy_source_level_2.replace(energy_source_dict_PL, inplace=True)
# Choose energy_source entries where energy_source is "Bioenergy" in order to
# seperate Bioenergy subtypes to "energy_source_level_3" and subtypes for the rest to "technology"
idx_PL_Bioenergy = PL_re_df[PL_re_df['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to "energy_source_level_3" for all entries where energy_source_level_2 is
# "Bioenergy" and delete those entries from "technology"
PL_re_df[['energy_source_level_3']] = PL_re_df.iloc[idx_PL_Bioenergy][['technology']]
PL_re_df.loc[idx_PL_Bioenergy, 'technology'] = np.nan
Adjust datatype of numeric columns
# change type to numeric
PL_re_df['electrical_capacity'] = pd.to_numeric(PL_re_df['electrical_capacity'])
# Additionally commas are deleted
PL_re_df['number_of_installations'] = pd.to_numeric(
PL_re_df['number_of_installations'].str.replace(',', ''))
Aggregate and assign energy source level 1
For entries/rows of the same district and energy source level 2, electrical capacity and number of installations are aggregated.
# Skip this step.
#PL_re_df = PL_re_df.groupby(['district', 'energy_source_level_2', 'technology'],
# as_index=False
# ).agg({'electrical_capacity': sum,
# 'number_of_installations': sum,
# 'data_source': 'first'})
PL_re_df.to_pickle('intermediate/PL_renewables.pickle')
PL_re_df
The data which will be processed below is provided by the following data source:
Swiss Federal Office of Energy - Data of all renewable power plants receiving "Kostendeckende Einspeisevergütung" (KEV) which is the Swiss feed in tarif for renewable power plants. Geodata is based on municipality codes.
# point URLs to original data or the original source depending on the chosen download option
download_urls = get_download_urls(source_df, 'CH', download_from)
url_CH_KEV = download_urls['data']['BFE']
url_CH_geo = download_urls['geo']
# Get data of renewables per municipality
CH_re_filepath = download_and_cache(url_CH_KEV, filename='CH_original_file.xlsx')
save_to_country('CH', 'BFE', CH_re_filepath)
CH_re_df = pd.read_excel(CH_re_filepath,
sheet_name='KEV Bezüger 2017',
encoding='UTF8',
thousands='.',
decimals=','
#header=[0]
#skipfooter=9, # contains summarized values
#index_col=[0, 1], # required for MultiIndex
#converters={'Code officiel géographique':str}
)
# Choose the translation terms for Switzerland, create dictionary and show dictionary
idx_CH = columnnames[columnnames['country'] == 'CH'].index
column_dict_CH = columnnames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
column_dict_CH
# Translate columnnames
CH_re_df.columns = [column_name.replace("\n", "") for column_name in CH_re_df.columns]
CH_re_df.rename(columns=column_dict_CH, inplace=True)
CH_re_df['data_source'] = 'BFE'
# Choose the translation terms for Switzerland, create dictionary
idx_CH = valuenames[valuenames['country'] == 'CH'].index
value_dict_CH = valuenames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
Separate and assign energy source level 1-3 and technology
# Assign energy_source_level_1 to the dataframe
CH_re_df['energy_source_level_1'] = 'Renewable energy'
# Create dictionnary in order to assign energy_source to its subtype
#energy_source_dict_CH = valuenames.loc[idx_CH].set_index('opsd_name')['energy_source_level_2'].to_dict()
#
# ...and the energy source subtype values in the energy_source column are replaced by
# the higher level classification
#CH_re_df['energy_source_level_2'].replace(energy_source_dict_CH, inplace=True)
CH_re_df['energy_source_level_3'] = CH_re_df['technology']
# Create dictionnary in order to assign energy_source_level_2 to its subtype
idx_CH = valuenames[valuenames['country'] == 'CH'].index
energy_source_dict_CH = valuenames.loc[idx_CH].set_index('original_name')['energy_source_level_2'].to_dict()
# Add energy_source_level_2
CH_re_df['energy_source_level_2'] = CH_re_df['energy_source_level_2'].replace(energy_source_dict_CH)
# Translate values in order to standardize energy_source_level_3
value_dict_CH = valuenames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
CH_re_df['energy_source_level_3'].replace(value_dict_CH, inplace=True)
# Standardize the values for technology
# 1. np.nan means that technology should not be specified for the respective kind of sources
# according to the hierarchy (http://open-power-system-data.org/2016-10-25-opsd_tree.svg)
# 2. 'Other or unspecified technology' means that technology should be specified
# but it was unclear or missing in the original dataset.
technology_translation_dictionary = {
'Klärgasanlage': np.nan,
'übrige Biomasse - WKK-Anlage': 'Other or unspecified technology',
'übrige Biomasse - Dampfprozess': 'Steam turbine',
'Schlammverbrennungsanlage': 'Combustion engine',
'WKK-Prozess': 'Other or unspecified technology',
'Kehrrichtverbrennungsanlage': 'Combustion engine',
'Integrierte Anlage': 'Photovoltaics',
'Angebaute Anlage': 'Photovoltaics',
'Freistehende Anlage': 'Photovoltaics',
'Trinkwasserkraftwerk': 'Other or unspecified technology',
'Durchlaufkraftwerk': 'Run-of-river',
'Dotierwasserkraftwerk': 'Other or unspecified technology',
'Ausleitkraftwerk': 'Other or unspecified technology',
'Wind Offshore': 'Other or unspecified technology',
'Abwasserkraftwerk': 'Other or unspecified technology',
'Unbekannt': 'Other or unspecified technology',
np.nan: 'Onshore',
None: 'Onshore'
}
CH_re_df['technology'].replace(technology_translation_dictionary, inplace=True)
# Add energy_source_level_1
CH_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy of sources present in the dataset
CH_re_df[['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']].drop_duplicates().sort_values(by='energy_source_level_2')
CH_re_df.reset_index(drop=True, inplace=True)
Replace the rest of the original terms with their OPSD equivalents
CH_re_df.replace(value_dict_CH, inplace=True)
Postcode to lat/lon (WGS84)
The available municipality code in the original data provides an approximation for the geocoordinates of the renewable power plants.
The postcode will be assigned to latitude and longitude coordinates with the help of the postcode table.
geonames.org The postcode data from Switzerland is provided by Geonames and licensed under a Creative Commons Attribution 3.0 license.
# Get geo-information
zip_CH_geo = zipfile.ZipFile(download_and_cache(url_CH_geo))
# Read generated postcode/location file
CH_geo = pd.read_csv(zip_CH_geo.open('CH.txt'), sep='\t', header=-1)
# add column names as defined in associated readme file
CH_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
CH_geo.drop_duplicates('postcode', keep='last', inplace=True)
CH_geo['postcode'] = CH_geo['postcode'].astype(str)
# harmonise data class
CH_geo.postcode = CH_geo.postcode.astype(int)
# Add longitude/latitude infomation assigned by municipality code
CH_re_df = pd.merge(CH_re_df,
CH_geo[['lat', 'lon', 'postcode']],
left_on='municipality_code',
right_on='postcode',
how='left'
)
zip_CH_geo.close()
# kW to MW
CH_re_df['electrical_capacity'] /= 1000
# kWh to MWh
CH_re_df['production'] /= 1000
columns_to_keep = ['project_name', 'energy_source_level_2','energy_source_level_3', 'technology', 'electrical_capacity',
'production', 'tariff', 'commissioning_date', 'contract_period_end',
'street', 'municipality_code', 'municipality', 'canton',
'company', 'title', 'surname', 'first_name', 'data_source',
'energy_source_level_1', 'lat', 'lon', 'postcode']
CH_re_df = CH_re_df.loc[:, columns_to_keep]
CH_re_df.reset_index(drop=True, inplace=True)
visualize(CH_re_df['lat'], CH_re_df['lon'], 'Switzerland', categories=CH_re_df['energy_source_level_2'])
save_to_intermediate('CH', CH_re_df)
Check and validation of the renewable power plants list as well as the creation of CSV/XLSX/SQLite files can be found in Part 2 of this script. It also generates a daily time series of cumulated installed capacities by energy source.
The data for the UK is provided by the following source:
UK Government Department of Business, Energy & Industrial Strategy (BEIS) - the data contains information on the UK renewable energy sources and are updated at the end of each quarter.
# point URLs to original data depending on the chosen download option
download_urls = get_download_urls(source_df, 'UK', download_from)
url_UK_geo = download_urls['geo']
if download_from == 'original_sources':
UK_page_url = download_urls['data']['BEIS']
# Get the url of the latest update
UK_link_selector = '.download strong'
response = requests.get(UK_page_url)
html = response.content
soup = bs4.BeautifulSoup(html, 'html.parser')
span = soup.find('span', class_='download')
link = span.find('a')
url_UK_beis = link['href']
else:
url_UK_beis = download_urls['data']['UK Government Department of Business, Energy & Industrial Strategy']
# Download the data and read them into a dataframe
UK_re_filepath = download_and_cache(url_UK_beis)
save_to_country('UK', 'UK Government Department of Business, Energy & Industrial Strategy', UK_re_filepath)
UK_re_df = pd.read_csv(UK_re_filepath,
header=6,
encoding='latin1',
parse_dates=['Record Last Updated (dd/mm/yyyy)', 'Operational'],
infer_datetime_format=True,
thousands=',',
)
# Drop empty columns and rows
UK_re_df.dropna(axis='index', how='all', inplace=True)
UK_re_df.dropna(axis='columns', how='all', inplace=True)
The downloaded dataset has to be cleaned:
CHP Enabled
contains five different strings: "No
", "Yes
", "no
", "yes
", and "No
" with a trailing white space, even though they represent only two distinct values. So, we have to ensure a 1-to-1 mapping between the true values of a feature and their representations for all the features present in the set.Battery
and Flywheels
are of no interest, so the facilities using them should be omitted.# Keep only operational facilities in the dataset
UK_re_df = UK_re_df.loc[UK_re_df["Development Status"] == "Operational"]
UK_re_df.reset_index(inplace=True, drop=True)
# Define functions for standardizing strings
def standardize_string(x, lower=False):
if pd.isnull(x):
return np.nan
if type(x) is str:
x = x.strip()
if lower:
x = x.lower()
return x
def standardize_column(df, column, lower=False):
df.loc[:, column]= df.apply(lambda row: standardize_string(row[column], lower=lower),
axis=1
)
# Apply them where needed
strip_and_lower = ['CHP Enabled']
strip_only = ['Country', 'County', 'Operator (or Applicant)', 'Mounting Type for Solar']
for column in strip_and_lower:
standardize_column(UK_re_df, column, lower=True)
for column in strip_only:
standardize_column(UK_re_df, column, lower=False)
# Drop Flywheels and Battery
UK_re_df = UK_re_df[~UK_re_df['Technology Type'].isin(['Flywheels', 'Battery'])]
UK_re_df.reset_index(drop=True, inplace=True)
# Copy the column "Technology Type" to a new column named "technology"
UK_re_df['technology'] = UK_re_df['Technology Type']
# Choose the translation terms for the UK and create the translation dictionary
idx_UK = columnnames[columnnames['country'] == 'UK'].index
column_dict_UK = columnnames.loc[idx_UK].set_index('original_name')['opsd_name'].to_dict()
# Show the dictionary
column_dict_UK
# Translate column names
UK_re_df.rename(columns=column_dict_UK, inplace=True)
UK_re_df['data_source'] = 'BEIS'
# Create dictionnary in order to assign energy_source_level_2 to its subtype
idx_UK = valuenames[valuenames['country'] == 'UK'].index
energy_source_dict_UK = valuenames.loc[idx_UK].set_index('original_name')['energy_source_level_2'].to_dict()
# Add energy_source_level_2
UK_re_df['energy_source_level_2'] = UK_re_df['energy_source_level_3'].replace(energy_source_dict_UK)
# Translate values in order to standardize energy_source_level_3
value_dict_UK = valuenames.loc[idx_UK].set_index('original_name')['opsd_name'].to_dict()
UK_re_df['energy_source_level_3'].replace(value_dict_UK, inplace=True)
# Standardize the values for technology
# 1. np.nan means that technology should not be specified for the respective kind of sources
# according to the hierarchy (http://open-power-system-data.org/2016-10-25-opsd_tree.svg)
# 2. 'Other or unspecified technology' means that technology should be specified
# but it was unclear or missing in the original dataset.
technology_translation_dictionary = {
'Biomass (co-firing)': 'Other or unspecified technology',
'Biomass (dedicated)': 'Other or unspecified technology',
'Advanced Conversion Technologies': 'Other or unspecified technology',
'Anaerobic Digestion': 'Other or unspecified technology',
'EfW Incineration': np.nan,
'Large Hydro': 'Other or unspecified technology',
'Small Hydro': 'Other or unspecified technology',
'Landfill Gas': np.nan,
'Solar Photovoltaics': 'Photovoltaics',
'Sewage Sludge Digestion': np.nan,
'Tidal Barrage and Tidal Stream': np.nan,
'Shoreline Wave': np.nan,
'Wind Offshore': 'Offshore',
'Wind Onshore': 'Onshore',
'Pumped Storage Hydroelectricity': 'Pumped storage'
}
UK_re_df['technology'].replace(technology_translation_dictionary, inplace=True)
# Add energy_source_level_1
UK_re_df['energy_source_level_1'] = 'Renewable energy'
# Show the hierarchy of sources present in the dataset
UK_re_df[['energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology']].drop_duplicates()
The facilities' location details comprise of the information on the address, county, region, country (England, Scotland, Wales, Northern Ireland), post code, and Easting (X) and Northing (Y) coordinates of each facility in the OSGB georeferencing system. To convert the easting and northing cordinates to standard WG84 latitude and longitude, we use package bng_latlon
.
# Define a wrapper for bng_to_latlon for handling None values
def to_lat_lon(easting, northing):
if pd.isnull(easting) or pd.isnull(northing):
return (None, None)
else:
return bng_to_latlon.OSGB36toWGS84(easting, northing)
# Convert easting and northing columns to numbers
UK_re_df['X-coordinate'] = pd.to_numeric(
UK_re_df['X-coordinate'].astype(str).str.replace(',', ''),
errors='coerce'
)
UK_re_df['Y-coordinate'] = pd.to_numeric(
UK_re_df['Y-coordinate'].astype(str).str.replace(',', ''),
errors='coerce'
)
# Convert easting and northing coordinates to standard latitude and longitude
latlon = UK_re_df.apply(lambda row: to_lat_lon(row["X-coordinate"], row["Y-coordinate"]),
axis=1
)
# Split a column of (latitude, longitude) pairs into two separate coordinate columns
latitude = latlon.apply(lambda x: x[0])
longitude = latlon.apply(lambda x: x[1])
# Add them to the dataframe
UK_re_df['latitude'] = latitude
UK_re_df['longitude'] = longitude
If the Easting and Northing coordinates of a facility are not provided, its latitude and longitude cannot be determined. For such sources, we look up the WGS84 coordinates in the geodataset provided by geonames.org, where the UK postcodes are paired with their latitudes and longitudes.
# Get geo-information
zip_UK_geo = zipfile.ZipFile(download_and_cache(url_UK_geo))
# Read generated postcode/location file
UK_geo = pd.read_csv(zip_UK_geo.open('GB_full.txt'), sep='\t', header=-1)
# add column names as defined in associated readme file
UK_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
UK_geo.drop_duplicates('postcode', keep='last', inplace=True)
UK_geo['postcode'] = UK_geo['postcode'].astype(str)
# Find the rows where latitude and longitude are unknown
missing_latlon_mask = UK_re_df['latitude'].isna() | UK_re_df['longitude'].isna()
missing_latlon = UK_re_df[missing_latlon_mask]
# Add longitude/latitude infomation assigned by post code
updated_latlon = pd.merge(missing_latlon,
UK_geo[['lat', 'lon', 'postcode']],
left_on='postcode',
right_on='postcode',
how='left'
)
# Return the updated rows to the original frame
UK_re_df = pd.merge(UK_re_df,
updated_latlon[['uk_beis_id', 'lat', 'lon']],
on='uk_beis_id',
how='left'
)
# Use the bng_to_latlon coordinates (columns: 'latitude' and 'longitude') if present,
# otherwise, use those obtained with UK_geo (columns: 'lat' and 'lon').
UK_re_df['longitude'] = UK_re_df.apply(lambda row: row['longitude'] if not pd.isnull(row['longitude'])
else row['lon'],
axis=1
)
UK_re_df['latitude'] = UK_re_df.apply(lambda row: row['latitude'] if not pd.isnull(row['latitude'])
else row['lat'],
axis=1
)
# Drop the UK_geo columns (lat/lon)
# as the information was moved to the 'latitude' and 'longitude' columns.
UK_re_df.drop(['lat', 'lon'], axis='columns', inplace=True)
zip_UK_geo.close()
In the cases where the full post code was not present in geonames.org, use its prefix to find the latitude / longitude pairs of locations covered by that prefix. Then, approximate those facilities' locations by the centroids of their prefix areas.
# Find the rows where latitude and longitude are unknown
missing_latlon_mask = UK_re_df['latitude'].isna() | UK_re_df['longitude'].isna()
missing_latlon = UK_re_df[missing_latlon_mask].copy()
missing_latlon = missing_latlon.reset_index()
# Determine their post code prefixes
prefixes = missing_latlon.apply(lambda row: str(row['postcode']).split(' ')[0],
axis=1
)
missing_latlon['Prefix'] = prefixes
# Determine the centroids of the areas covered by the prefixes
grouped_UK_geo=UK_geo.groupby(by=lambda i: str(UK_geo['postcode'].loc[i]).split(' ')[0])
# Assing the centroid coordinates to the facilities with unknown coordinates
updated_latlon = pd.merge(missing_latlon,
grouped_UK_geo.mean(),
left_on="Prefix",
right_index=True,
how="left"
)
# Return the updated rows to the original frame
UK_re_df = pd.merge(UK_re_df,
updated_latlon[['uk_beis_id', 'lat', 'lon']],
on='uk_beis_id',
how='left'
)
# Keep the already known coordinates (columns: 'latitude' and 'longitude') if present,
# otherwise, use those obtained by approximation (columns: 'lat' and 'lon').
UK_re_df['longitude'] = UK_re_df.apply(lambda row: row['longitude'] if not pd.isnull(row['longitude'])
else row['lon'],
axis=1
)
UK_re_df['latitude'] = UK_re_df.apply(lambda row: row['latitude'] if not pd.isnull(row['latitude'])
else row['lat'],
axis=1
)
# Drop the UK_geo columns (lat/lon)
# as the information was moved to the 'latitude' and 'longitude' columns.
UK_re_df.drop(['lat', 'lon'], axis='columns', inplace=True)
visualize(UK_re_df['latitude'], UK_re_df['longitude'], 'United Kingdom', categories=UK_re_df['energy_source_level_2'])
We see that some facilities appear to be located in the sea. Let us plot the original OSGB coordinates to see if translation to the standard longitude and latitude coordinates failed for some locations.
max_X = UK_re_df['X-coordinate'].max()
min_X = UK_re_df['X-coordinate'].min()
max_Y = UK_re_df['Y-coordinate'].max()
min_Y = UK_re_df['Y-coordinate'].min()
figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
ax = plt.axes(projection=ccrs.OSGB())
ax.coastlines('10m')
ax.scatter(UK_re_df['X-coordinate'], UK_re_df['Y-coordinate'],s=0.5)
plt.show()
As we can see, the maps are basically the same, which confirms that translation to the longitude and latitude coordinates is done correctly and that they reflect the positions specified by the original X and Y OSGB coordinates.
# Rename 'longitude' and 'latitude' to 'lon' and 'lat' to conform to the naming convention
# used for other countries.
UK_re_df.rename(columns={'longitude': 'lon', 'latitude': 'lat'}, inplace=True)
# Define the columns to keep
columns_of_interest = ['commissioning_date', 'uk_beis_id', 'operator', 'site_name',
'energy_source_level_1', 'energy_source_level_2', 'energy_source_level_3', 'technology',
'electrical_capacity', 'chp', 'support_robranding', 'support_fit', 'support_cfd',
'capacity_individual_turbine', 'number_of_turbines', 'solar_mounting_type',
'status', 'address', 'municipality', 'region', 'country', 'postcode', 'lon',
'lat', 'data_source'
]
# Clean the dataframe from columns other than those specified above
UK_re_df = UK_re_df.loc[:, columns_of_interest]
UK_re_df.reset_index(drop=True, inplace=True)
UK_re_df.to_pickle('intermediate/UK_renewables.pickle')
raw_input_directory_path
zip_archive = zipfile.ZipFile(raw_input_directory_path + '.zip', 'w', zipfile.ZIP_DEFLATED)
print("Zipping the raw files...")
for filename in os.listdir(raw_input_directory_path):
print("Adding", filename, "to the zip.")
filepath = os.path.join(raw_input_directory_path, filename)
zip_archive.write(filepath)
zip_archive.close()
print("Done!")
shutil.rmtree(raw_input_directory_path)