#!/usr/bin/env python
# coding: utf-8
#
# This script downlads and extracts the original data of renewable power plant lists from the data sources, processes and merges them. It subsequently adds the geolocation for each power plant. Finally it saves the DataFrames as pickle-files. Make sure you run the download and process Notebook before the validation and output Notebook.
# Table of Contents
#
# # Script setup
# In[ ]:
import logging
import os
import posixpath
import urllib.parse
import urllib.request
import re
import zipfile
import pickle
import numpy as np
import pandas as pd
import utm # for transforming geoinformation in the utm format
import requests
from string import Template
from IPython.display import display
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%d %b %Y %H:%M:%S'
)
logger = logging.getLogger()
# Create input and output folders if they don't exist
os.makedirs(os.path.join('input', 'original_data'), exist_ok=True)
os.makedirs('output', exist_ok=True)
os.makedirs(os.path.join('output', 'renewable_power_plants'), exist_ok=True)
# # Settings
# ## Choose download option
# The original data can either be downloaded from the original data sources as specified below or from the opsd-Server. Default option is to download from the original sources as the aim of the project is to stay as close to original sources as possible. However, if problems with downloads e.g. due to changing urls occur, you can still run the script with the original data from the opsd_server.
# In[ ]:
download_from = 'original_sources'
#download_from = 'opsd_server'
# ## Download function
# In[ ]:
def download_and_cache(url, session=None):
"""This function downloads a file into a folder called
original_data and returns the local filepath."""
path = urllib.parse.urlsplit(url).path
filename = posixpath.basename(path)
filepath = os.path.join('input', 'original_data', filename)
print(url)
# check if file exists, if not download it
print(filepath)
if not os.path.exists(filepath):
if not session:
print('No session')
session = requests.session()
print("Downloading file: ", filename)
r = session.get(url, stream=True)
chuncksize = 1024
with open(filepath, 'wb') as file:
for chunck in r.iter_content(chuncksize):
file.write(chunck)
else:
print("Using local file from", filepath)
filepath = '' + filepath
return filepath
# ## Setup translation dictionaries
#
# Column and value names of the original data sources will be translated to English and standardized across different sources. Standardized column names, e.g. "electrical_capacity" are required to merge data in one DataFrame.
# The column and the value translation lists are provided in the input folder of the Data Package.
# In[ ]:
# Get column translation list
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
columnnames.head(2)
# In[ ]:
# Get value translation list
valuenames = pd.read_csv(os.path.join('input', 'value_translation_list.csv'))
valuenames.head(2)
# # Download and process per country
#
# For one country after the other, the original data is downloaded, read, processed, translated, eventually georeferenced and saved. If respective files are already in the local folder, these will be utilized.
# To process the provided data [pandas DataFrame](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe) is applied.
# ## Germany DE
# ### Download and read
# The data which will be processed below is provided by the following data sources:
#
# **[Netztransparenz.de](https://www.netztransparenz.de/de/Anlagenstammdaten.htm)** - Official grid transparency platform from the German TSOs (50Hertz, Amprion, TenneT and TransnetBW).
#
# **[Bundesnetzagentur (BNetzA)](https://www.bundesnetzagentur.de/DE/Sachgebiete/ElektrizitaetundGas/Unternehmen_Institutionen/ErneuerbareEnergien/ZahlenDatenInformationen/EEG_Registerdaten/EEG_Registerdaten_node.html)** - German Federal Network Agency for Electricity, Gas, Telecommunications, Posts and Railway (In separate files for **data for roof-mounted PV power plants** and for **all other renewable energy power plants**.)
#
# Data URL for BNetzA gets updated every few month. To be sure, always check if the links (url_bnetza; url_bnetza_pv) are up to date.
# In[ ]:
# point URLs to original data depending on the chosen download option
if download_from == 'original_sources':
tsos = ['50Hertz', 'Amprion', 'TenneT', 'TransnetBW']
datasets = ['50Hertz', 'Amprion', 'TenneT', 'TransnetBW','bnetza','bnetza_pv','bnetza_pv_historic']
url_pattern_netztransparenz = 'https://www.netztransparenz.de/portals/1/Content/Erneuerbare-Energien-Gesetz/EEG-Anlagestammdaten/${tso}_EEG-Anlagenstammdaten_2016.zip'
urls = {
'bnetza': 'https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/ErneuerbareEnergien/ZahlenDatenInformationen/VOeFF_Registerdaten/2017_12_Veroeff_RegDaten.xlsx?__blob=publicationFile&v=2',
'bnetza_pv': 'https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/ErneuerbareEnergien/ZahlenDatenInformationen/VOeFF_Registerdaten/PV_Datenmeldungen/Meldungen_Juli-Dez2017.xlsx?__blob=publicationFile&v=2',
'bnetza_pv_historic': 'https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/ErneuerbareEnergien/ZahlenDatenInformationen/VOeFF_Registerdaten/PV_Datenmeldungen/Meldungen_Aug-Juni2017.xlsx?__blob=publicationFile&v=2'
}
elif download_from == 'opsd_server':
print('Download from opsd_server is currently not implemented.')
# In[ ]:
## Note: Some BNetzA files fail to download automatically, so put them manually to the input/original_data/ folder
# Download all data sets before processing.
if download_from == 'original_sources':
filenames = {}
for tso in tsos:
url = Template(url_pattern_netztransparenz).substitute(tso=tso)
get_ipython().run_line_magic('time', 'filenames[tso] = zipfile.ZipFile(download_and_cache(url))')
for dataset in ['bnetza','bnetza_pv','bnetza_pv_historic']:
get_ipython().run_line_magic('time', 'filenames[dataset] = download_and_cache(urls[dataset])')
elif download_from == 'opsd_server':
print('Download from opsd_server is currently not implemented.')
# In[ ]:
# Read TSO data from zip file
dfs = {}
for tso in tsos:
print('Reading '+tso+'_EEG-Anlagenstammdaten_2016.csv')
dfs[tso] = pd.read_csv(
filenames[tso].open(tso+'_EEG-Anlagenstammdaten_2016.csv'),
sep=';',
thousands='.',
decimal=',',
header=0,
parse_dates=[11, 12, 13, 14],
encoding='iso-8859-1',
dayfirst=True,
low_memory=False
)
print('Done reading '+tso+'_EEG-Anlagenstammdaten_2016.csv')
# In[ ]:
key = 'Installierte Leistung'
dfs['Amprion'][key] = [x.replace('.', '').replace(',', '.') for x in dfs['Amprion'][key]]
dfs['Amprion'][key] = dfs['Amprion'][key].astype(float)
# In[ ]:
# Read BNetzA register
print('Reading bnetza: '+filenames['bnetza'])
dfs['bnetza'] = pd.read_excel(filenames['bnetza'],
sheetname='Gesamtübersicht',
header=0,
converters={'4.9 Postleit-zahl': str, 'Gemeinde-Schlüssel': str}
)
skiprows = {'bnetza_pv_historic': 10, 'bnetza_pv': 9}
for dataset in ['bnetza_pv', 'bnetza_pv_historic']:
print('Reading '+dataset+': '+filenames[dataset])
xls_handle = pd.ExcelFile(filenames[dataset])
print('Concatenating all '+dataset+' sheets into one dataframe')
dfs[dataset] = pd.concat(
xls_handle.parse(
sheet,
skiprows=skiprows[dataset],
converters={'Anlage \nPLZ': str}
) for sheet in xls_handle.sheet_names
)
dfs[dataset].tail(2)
# In[ ]:
# Drop empty column called "Unnamed: 4"
dfs['TransnetBW'] = dfs['TransnetBW'].drop(['Unnamed: 4'], axis=1)
dfs['bnetza_pv_historic'] = dfs['bnetza_pv_historic'].drop(['Unnamed: 7'], axis=1)
# In[ ]:
pickle.dump( dfs, open( "intermediate/temp_dfs_1.p", "wb" ) )
# In[ ]:
dfs = pickle.load( open( "intermediate/temp_dfs_1.p", "rb" ) )
# ### Translate column names
# To standardise the DataFrame the original column names from the German TSOs and the BNetzA wil be translated and new english column names wil be assigned to the DataFrame. The unique column names are required to merge the DataFrame.
# The column_translation_list is provided here as csv in the input folder. It is loaded in _2.3 Setup of translation dictionaries_.
# In[ ]:
# Choose the translation terms for Germany, create dictionary and show dictionary
columnnames = pd.read_csv(os.path.join('input', 'column_translation_list.csv'))
idx_DE = columnnames[columnnames['country'] == 'DE'].index
column_dict_DE = columnnames.loc[idx_DE].set_index('original_name')['opsd_name'].to_dict()
# In[ ]:
# Start the column translation process for each original data source
print('Translation')
for dataset in datasets:
# Remove newlines and any other duplicate whitespaces in column names:
dfs[dataset] = dfs[dataset].rename(columns={col: re.sub(r"\s+", ' ', col) for col in dfs[dataset].columns})
# Do column name translations
dfs[dataset].rename(columns=column_dict_DE, inplace=True)
print('done.')
# ### Add information and choose columns
# All data source names and for the BNetzA-PV data the energy source level 2 will added.
# In[ ]:
# Add data source names to the DataFrames
for tso in tsos:
dfs[tso]['data_source'] = tso
dfs['bnetza']['data_source'] = 'BNetzA'
dfs['bnetza_pv']['data_source'] = 'BNetzA_PV'
dfs['bnetza_pv_historic']['data_source'] = 'BNetzA_PV_historic'
# Add for the BNetzA PV data the energy source level 2
dfs['bnetza_pv']['energy_source_level_2'] = 'Photovoltaics'
dfs['bnetza_pv_historic']['energy_source_level_2'] = 'Photovoltaics'
# Correct datetime-format
def decom_fkt(x):
x = str(x)
if x == 'nan':
x = ''
else:
x = x[0:10]
return x
dfs['bnetza']['decommissioning_date'] = dfs['bnetza']['decommissioning_date'].apply(decom_fkt)
# In[ ]:
# Select those columns of the orignal data which are utilised further
dfs['bnetza'] = dfs['bnetza'].loc[:, ('commissioning_date', 'decommissioning_date',
'notification_reason', 'energy_source_level_2',
'electrical_capacity_kW', 'thermal_capacity_kW',
'voltage_level', 'dso', 'eeg_id', 'bnetza_id',
'federal_state', 'postcode', 'municipality_code',
'municipality', 'address', 'address_number',
'utm_zone', 'utm_east', 'utm_north',
'data_source')]
# In[ ]:
for dataset in datasets: print(dataset+':'); display(dfs[dataset].tail(2))
# ### Merge DataFrames
# The individual DataFrames from the TSOs (Netztransparenz.de) and BNetzA are merged.
# In[ ]:
# Merge DataFrames of each original source into a common DataFrame DE_renewables
dfs_list = []
for dataset in datasets:
dfs_list.append(dfs[dataset])
DE_renewables = pd.concat(dfs_list)
DE_renewables.head(2)
# In[ ]:
# Make sure the decommissioning_column has the right dtype
get_ipython().run_line_magic('time', "DE_renewables['decommissioning_date'] = DE_renewables['decommissioning_date'].apply(pd.to_datetime)")
DE_renewables['decommissioning_date'] = pd.to_datetime(DE_renewables['decommissioning_date'])
# In[ ]:
DE_renewables.reset_index(drop=True, inplace=True)
DE_renewables.head(2)
# ### Translate values and harmonize energy source level 2
# Different German terms for energy source level 2, energy source level 3, technology and voltage levels are translated and harmonized across the individual data sources. The value_translation_list is provided here as csv in the input folder. It is loaded in _2.3 Setup of translation dictionaries_.
# In[ ]:
# Choose the translation terms for Germany, create dictionary and show dictionary
valuenames = pd.read_csv(os.path.join('input', 'value_translation_list.csv'))
idx_DE = valuenames[valuenames['country'] == 'DE'].index
value_dict_DE = valuenames.loc[idx_DE].set_index('original_name')['opsd_name'].to_dict()
# In[ ]:
print('replacing..')
# Replace all original value names by the OPSD value names.
# Running time: some minutes. %time prints the time your computer required for this step
get_ipython().run_line_magic('time', 'DE_renewables.replace(value_dict_DE, inplace=True)')
# In[ ]:
DE_renewables['postcode'] = DE_renewables['postcode'].apply(pd.to_numeric, errors='ignore')
# **Separate and assign energy source level 1 - 3 and technology **
# In[ ]:
# Create dictionary in order to assign energy_source to its subtype
energy_source_dict_DE = valuenames.loc[idx_DE].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Column energy_source partly contains energy source level 3 and technology information,
# thus this column is copied to new column technology...
DE_renewables['technology'] = DE_renewables['energy_source_level_2']
# In[ ]:
# ...and the energy source level 2 values are replaced by the higher level classification
DE_renewables['energy_source_level_2'].replace(
energy_source_dict_DE, inplace=True)
# In[ ]:
# Choose energy source level 2 entries where energy_source is "Bioenergy" in order to
# seperate Bioenergy subtypes to "energy_source_level_3" and subtypes for the rest to "technology"
idx_DE_Bioenergy = DE_renewables[DE_renewables['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to energy source level 3 for all entries where energy source level 2 is
# Bioenergy and delete those entries from technology
DE_renewables[['energy_source_level_3']] = DE_renewables.iloc[idx_DE_Bioenergy][['technology']]
DE_renewables.loc[idx_DE_Bioenergy]['technology'] = np.nan
# In[ ]:
# Assign energy source level 1 to the dataframe
DE_renewables['energy_source_level_1'] = 'Renewable energy'
# **Summary of DataFrame**
# In[ ]:
# Electrical capacity per energy source level 2 (in MW)
DE_renewables.groupby(['energy_source_level_2'])['electrical_capacity_kW'].sum() / 1000
# ### Transform electrical capacity from kW to MW
# In[ ]:
# kW to MW
DE_renewables[['electrical_capacity_kW', 'thermal_capacity_kW']] /= 1000
# adapt column name
DE_renewables.rename(columns={'electrical_capacity_kW': 'electrical_capacity',
'thermal_capacity_kW': 'thermal_capacity'}, inplace=True)
# ### Georeferencing
# #### Get coordinates by postcode
# *(for data with no existing geocoordinates)*
#
# The available post code in the original data provides a first approximation for the geocoordinates of the RE power plants.
# The BNetzA data provides the full zip code whereas due to data privacy the TSOs only report the first three digits of the power plant's post code (e.g. 024xx) and no address. Subsequently a centroid of the post code region polygon is used to find the coordinates.
#
# With data from
# * http://www.suche-postleitzahl.org/downloads?download=plz-gebiete.shp.zip
# * http://www.suche-postleitzahl.org/downloads?download_file=plz-3stellig.shp.zip
# * http://www.suche-postleitzahl.org/downloads
#
# a CSV-file for all existing German post codes with matching geocoordinates has been compiled. The latitude and longitude coordinates were generated by running a PostgreSQL + PostGIS database. Additionally the respective TSO has been added to each post code. *(A Link to the SQL script will follow here later)*
#
# *(License: http://www.suche-postleitzahl.org/downloads, Open Database Licence for free use. Source of data: © OpenStreetMap contributors)*
# In[ ]:
# Read generated postcode/location file
postcode = pd.read_csv(os.path.join('input', 'de_tso_postcode_full.csv'))
# Drop possible duplicates in postcodes
postcode.drop_duplicates('postcode', keep='last', inplace=True)
# Show first entries
postcode.head(2)
# ** Merge geometry information by using the postcode**
# In[ ]:
# Take postcode and longitude/latitude information
postcode = postcode[['postcode', 'lon', 'lat']]
DE_renewables = DE_renewables.merge(postcode, on=['postcode'], how='left')
# #### Transform geoinformation
# *(for data with already existing geoinformation)*
#
# In this section the existing geoinformation (in UTM-format) will be transformed into latidude and longitude coordiates as a uniform standard for geoinformation.
#
# The BNetzA data set offers UTM Geoinformation with the columns *utm_zone (UTM-Zonenwert)*, *utm_east* and *utm_north*. Most of utm_east-values include the utm_zone-value **32** at the beginning of the number. In order to properly standardize and transform this geoinformation into latitude and longitude it is necessary to remove this utm_zone value. For all UTM entries the utm_zone 32 is used by the BNetzA.
#
#
# |utm_zone| utm_east| utm_north| comment|
# |---|---|---| ----|
# |32| 413151.72| 6027467.73| proper coordinates|
# |32| **32**912159.6008| 5692423.9664| caused error by 32|
#
# **How many different utm_zone values are in the data set?**
# In[ ]:
DE_renewables.groupby(['utm_zone'])['utm_zone'].count()
# **Remove the utm_zone "32" from the utm_east value**
# In[ ]:
# Find entries with 32 value at the beginning
idx_32 = (DE_renewables['utm_east'].astype(str).str[:2] == '32')
idx_notnull = DE_renewables['utm_east'].notnull()
# Remove 32 from utm_east entries
DE_renewables.loc[idx_32, 'utm_east'] = DE_renewables.loc[idx_32,
'utm_east'].astype(str).str[2:].astype(float)
# In[ ]:
def convert_to_latlon(utm_east, utm_north, utm_zone):
try:
return utm.to_latlon(utm_east, utm_north, utm_zone, 'U')
except:
return ''
DE_renewables['latlon'] = DE_renewables.loc[idx_notnull, ['utm_east', 'utm_north', 'utm_zone']].apply(
lambda x: convert_to_latlon(x[0], x[1], x[2]), axis=1).astype(str)
# **Conversion UTM to latitude and longitude**
# In[ ]:
lat = []
lon = []
for row in DE_renewables['latlon']:
try:
# Split tuple format into the column lat and lon
row = row.lstrip('(').rstrip(')')
parts = row.split(',')
if(len(parts)<2):
raise Exception('This is not a proper tuple. So go to exception block.')
lat.append(parts[0])
lon.append(parts[1])
except:
# set NaN
lat.append(np.NaN)
lon.append(np.NaN)
DE_renewables['latitude'] = pd.to_numeric(lat)
DE_renewables['longitude'] = pd.to_numeric(lon)
# Add new values to DataFrame lon and lat
DE_renewables['lat'] = DE_renewables[['lat', 'latitude']].apply(
lambda x: x[1] if pd.isnull(x[0]) else x[0],
axis=1)
DE_renewables['lon'] = DE_renewables[['lon', 'longitude']].apply(
lambda x: x[1] if pd.isnull(x[0]) else x[0],
axis=1)
# **Check: missing coordinates by data source and type**
# In[ ]:
#DE_renewables[DE_renewables['data_source'] == '50Hertz'].to_excel('test.xlsx')
# In[ ]:
print('Missing coordinates ', DE_renewables.lat.isnull().sum())
display(
DE_renewables[DE_renewables.lat.isnull()].groupby(
['energy_source_level_2','data_source']
)['data_source'].count()
)
print('Share of missing coordinates (note that NaN can mean it\'s all fine):')
DE_renewables[DE_renewables.lat.isnull()].groupby(
['energy_source_level_2','data_source']
)['data_source'].count() / DE_renewables.groupby(
['energy_source_level_2','data_source']
)['data_source'].count()
# **Remove temporary columns**
# In[ ]:
# drop lonlat column that contains both, latitute and longitude
DE_renewables.drop(['latlon', 'longitude', 'latitude'], axis=1, inplace=True)
# In[ ]:
display(
DE_renewables[DE_renewables['data_source'] == 'TenneT'].head(3)[['lon','lat','data_source']]
)
display(
DE_renewables[DE_renewables['data_source'] == 'BNetzA'].head(3)[['lon','lat','data_source']]
)
# ### Save
#
# The merged, translated, cleaned, DataFrame will be saved temporily as a pickle file, which stores a Python object fast.
# In[ ]:
DE_renewables.to_pickle('intermediate/DE_renewables.pickle')
# ## Denmark DK
# ### Download and read
# The data which will be processed below is provided by the following data sources:
#
# ** [Energistyrelsen (ens) / Danish Energy Agency](http://www.ens.dk/info/tal-kort/statistik-noegletal/oversigt-energisektoren/stamdataregister-vindmoller)** - The wind turbines register is released by the Danish Energy Agency.
#
# ** [Energinet.dk](http://www.energinet.dk/DA/El/Engrosmarked/Udtraek-af-markedsdata/Sider/Statistik.aspx)** - The data of solar power plants are released by the leading transmission network operator Denmark.
# In[ ]:
# point URLs to original data depending on the chosen download option
if download_from == 'original_sources':
# The link from ens.dk is currently unavailable. We are in contact with ens in order to be able
# to provide an alternative source. Until then choose download from 'opsd-server' where the
# last available data can be cached.
url_DK_ens = 'https://ens.dk/sites/ens.dk/files/Statistik/anlaegprodtilnettet.xls'
url_DK_energinet = 'https://www.energinet.dk/-/media/Energinet/El-CSI/Dokumenter/Data/SolcellerGraf-2016-11.xlsx'
url_DK_geo = 'http://download.geonames.org/export/zip/DK.zip'
elif download_from == 'opsd_server':
url_DK_ens = (url_opsd + version + folder + '/DK/anlaegprodtilnettet.xls')
url_DK_energinet = (url_opsd + version + folder + '/DK/SolcellerGraf%202016-11.xlsx')
url_DK_geo = (url_opsd + version + folder + 'DK/DK.zip')
# In[ ]:
# Get wind turbines data
DK_wind_df = pd.read_excel(download_and_cache(url_DK_ens),
sheetname='IkkeAfmeldte-Existing turbines',
thousands='.',
header=17,
skipfooter=3,
parse_cols=16,
converters={'Møllenummer (GSRN)': str,
'Kommune-nr': str,
'Postnr': str}
)
# Get photovoltaic data
DK_solar_df = pd.read_excel(download_and_cache(url_DK_energinet),
sheetname='Data',
converters={'Postnr': str}
)
# ### Translate column names
# In[ ]:
# Choose the translation terms for Denmark, create dictionary and show dictionary
idx_DK = columnnames[columnnames['country'] == 'DK'].index
column_dict_DK = columnnames.loc[idx_DK].set_index('original_name')['opsd_name'].to_dict()
# Windows has problems reading the csv entry for east and north (DK).
# The reason might be the difference when opening the csv between linux and
# windows.
column_dict_DK_temp = {}
for k, v in column_dict_DK.items():
column_dict_DK_temp[k] = v
if v == 'utm_east' or v == 'utm_north':
# merge 2 lines to 1
new_key = ''.join(k.splitlines())
column_dict_DK_temp[new_key] = v
column_dict_DK = column_dict_DK_temp
column_dict_DK
# In[ ]:
# Translate columns by list
DK_wind_df['X (øst) koordinat UTM 32 Euref89'] = DK_wind_df['X (øst) koordinat \nUTM 32 Euref89']
DK_wind_df['Y (nord) koordinat UTM 32 Euref89'] = DK_wind_df['Y (nord) koordinat \nUTM 32 Euref89']
#and 13 are the keys that make problems
DK_wind_df.drop(DK_wind_df.columns[[12, 13]], axis=1, inplace=True)
# Replace column names based on column_dict_DK
DK_wind_df.rename(columns=column_dict_DK, inplace=True)
DK_solar_df.rename(columns=column_dict_DK, inplace=True)
# ### Add data source and missing information
# In[ ]:
# Add names of the data sources to the DataFrames
DK_wind_df['data_source'] = 'Energistyrelsen'
DK_solar_df['data_source'] = 'Energinet.dk'
# Add energy source level 2 and technology for each of the two DataFrames
DK_wind_df['energy_source_level_2'] = 'Wind'
DK_solar_df['energy_source_level_2'] = 'Solar'
DK_solar_df['technology'] = 'Photovoltaics'
# ### Translate values and harmonize energy source level 2
# In[ ]:
# Choose the translation terms for Denmark, create dictionary and show dictionary
idx_DK = valuenames[valuenames['country'] == 'DK'].index
value_dict_DK = valuenames.loc[idx_DK].set_index('original_name')['opsd_name'].to_dict()
value_dict_DK
# In[ ]:
# Replace all original value names by the OPSD value names
DK_wind_df.replace(value_dict_DK, inplace=True)
# ### Georeferencing
# **UTM32 to latitude and longitude** *(Data from Energistyrelsen)*
#
# The Energistyrelsen data set offers UTM Geoinformation with the columns utm_east and utm_north belonging to the UTM zone 32. In this section the existing geoinformation (in UTM-format) will be transformed into latidude and longitude coordiates as a uniform standard for geoinformation.
# In[ ]:
# Index for all values with utm information
idx_notnull = DK_wind_df['utm_east'].notnull()
# In[ ]:
# Convert from UTM values to latitude and longitude coordinates
DK_wind_df['lonlat'] = DK_wind_df.loc[idx_notnull, ['utm_east', 'utm_north']
].apply(lambda x: utm.to_latlon(x[0],
x[1],
32,
'U'), axis=1).astype(str)
# In[ ]:
# Split latitude and longitude in two columns
lat = []
lon = []
for row in DK_wind_df['lonlat']:
try:
# Split tuple format
# into the column lat and lon
row = row.lstrip('(').rstrip(')')
lat.append(row.split(',')[0])
lon.append(row.split(',')[1])
except:
# set NAN
lat.append(np.NaN)
lon.append(np.NaN)
DK_wind_df['lat'] = pd.to_numeric(lat)
DK_wind_df['lon'] = pd.to_numeric(lon)
# drop lonlat column that contains both, latitute and longitude
DK_wind_df.drop('lonlat', axis=1, inplace=True)
# **Postcode to lat/lon (WGS84)**
# *(for data from Energinet.dk)*
#
# The available post code in the original data provides an approximation for the geocoordinates of the solar power plants.
# The postcode will be assigned to latitude and longitude coordinates with the help of the postcode table.
#
# ** [geonames.org](http://download.geonames.org/export/zip/?C=N;O=D)** The postcode data from Denmark is provided by Geonames and licensed under a [Creative Commons Attribution 3.0 license](http://creativecommons.org/licenses/by/3.0/).
# In[ ]:
# Get geo-information
zip_DK_geo = zipfile.ZipFile(download_and_cache(url_DK_geo))
# Read generated postcode/location file
DK_geo = pd.read_csv(zip_DK_geo.open('DK.txt'), sep='\t', header=-1)
# add column names as defined in associated readme file
DK_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
DK_geo.drop_duplicates('postcode', keep='last', inplace=True)
DK_geo['postcode'] = DK_geo['postcode'].astype(str)
# In[ ]:
# Add longitude/latitude infomation assigned by postcode (for Energinet.dk data)
DK_solar_df = DK_solar_df.merge(DK_geo[['postcode', 'lon', 'lat']],
on=['postcode'],
how='left')
# In[ ]:
# Show number of units with missing coordinates seperated by wind and solar
print('Missing Coordinates DK_wind ', DK_wind_df.lat.isnull().sum())
print('Missing Coordinates DK_solar ', DK_solar_df.lat.isnull().sum())
# ### Merge DataFrames and choose columns
# In[ ]:
# Merge DataFrames for wind and solar into DK_renewables
dataframes = [DK_wind_df, DK_solar_df]
DK_renewables = pd.concat(dataframes)
DK_renewables = DK_renewables.reset_index()
# In[ ]:
# Assign energy source level 1 to the dataframe
DK_renewables['energy_source_level_1'] = 'Renewable energy'
# In[ ]:
# Select those columns of the orignal data which are utilised further
column_interest = ['commissioning_date', 'energy_source_level_1', 'energy_source_level_2',
'technology', 'electrical_capacity_kW', 'dso', 'gsrn_id', 'postcode',
'municipality_code', 'municipality', 'address', 'address_number',
'utm_east', 'utm_north', 'lon', 'lat', 'hub_height',
'rotor_diameter', 'manufacturer', 'model', 'data_source']
# In[ ]:
# Clean DataFrame from columns other than specified above
DK_renewables = DK_renewables.loc[:, column_interest]
DK_renewables.reset_index(drop=True, inplace=True)
# ### Transform electrical_capacity from kW to MW
# In[ ]:
# kW to MW
DK_renewables['electrical_capacity_kW'] /= 1000
# adapt column name
DK_renewables.rename(columns={'electrical_capacity_kW': 'electrical_capacity'},
inplace=True)
# ### Save
# In[ ]:
DK_renewables.to_pickle('intermediate/DK_renewables.pickle')
# ## France FR
# ### Download and read
# The data which will be processed below is provided by the following data source:
#
# ** [Ministry for the Ecological and Inclusive Transition](http://www.statistiques.developpement-durable.gouv.fr/energie-climat/r/energies-renouvelables.html?tx_ttnews%5Btt_news%5D=24638&cHash=d237bf9985fdca39d7d8c5dc84fb95f9)** - Number of installations and installed capacity of the different renewable source for every municipality in France. Service of observation and statistics, survey, date of last update: 15/12/2016. Data until 31/12/2015.
# In[ ]:
# point URLs to original data depending on the chosen download option
if download_from == 'original_sources':
url_FR_gouv = "http://www.statistiques.developpement-durable.gouv.fr/fileadmin/documents/Themes/Energies_et_climat/Les_differentes_energies/Energies_renouvelables/donnees_locales/2015/electricite-renouvelable-par-commune-2015.xls"
url_FR_geo = 'http://public.opendatasoft.com/explore/dataset/code-postal-code-insee-2015/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true'
else:
url_FR_gouv = (url_opsd + version + folder +
'/FR/electricite-renouvelable-par-commune-2015.xls')
url_FR_geo = (url_opsd + version + folder +
'FR/code-postal-code-insee-2015.csv')
# In[ ]:
# Get data of renewables per municipality
FR_re_df = pd.read_excel(download_and_cache(url_FR_gouv),
sheetname='Commune',
encoding='UTF8',
thousands='.',
decimals=',',
header=[3, 4],
skipfooter=8, # contains summarized values
index_col=[0, 1], # required for MultiIndex
converters={'Code officiel géographique': str})
# ### Rearrange columns and translate column names
# The French data source contains number of installations and sum of installed capacity per energy source per municipality. The structure is adapted to the power plant list of other countries. The list is limited to the plants which are covered by article 10 of february 2000 by an agreement to a purchase commitment.
# In[ ]:
# Rearrange data
FR_re_df.index.rename(['insee_com', 'municipality'], inplace=True)
FR_re_df.columns.rename(['energy_source_level_2', None], inplace=True)
FR_re_df = (FR_re_df
.stack(level='energy_source_level_2', dropna=False)
.reset_index(drop=False))
# In[ ]:
# Choose the translation terms for France, create dictionary and show dictionary
idx_FR = columnnames[columnnames['country'] == 'FR'].index
column_dict_FR = columnnames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
column_dict_FR
# In[ ]:
# Translate columnnames
FR_re_df.rename(columns=column_dict_FR, inplace=True)
# In[ ]:
# Drop all rows that just contain NA
FR_re_df = FR_re_df.dropna()
# ### Add data source
# In[ ]:
FR_re_df['data_source'] = 'Ministry for the Ecological and Inclusive Transition'
# ### Translate values and harmonize energy source level 2
# ** Kept secret if number of installations < 3**
#
# If the number of installations is less than 3, it is marked with an _s_ instead of the number 1 or 2 due to statistical confidentiality ([further explanation by the data provider](http://www.statistiques.developpement-durable.gouv.fr/fileadmin/documents/Themes/Energies_et_climat/Les_differentes_energies/Energies_renouvelables/donnees_locales/2014/methodo-donnees-locales-electricte-renouvelable-12-2015-b.pdf)). Here, the _s_ is changed to _< 3_. This is done in the same step as the other value translations of the energy sources.
# In[ ]:
# Choose the translation terms for France, create dictionary and show dictionary
idx_FR = valuenames[valuenames['country'] == 'FR'].index
value_dict_FR = valuenames.loc[idx_FR].set_index('original_name')['opsd_name'].to_dict()
value_dict_FR
# In[ ]:
# Replace all original value names by the OPSD value names
FR_re_df.replace(value_dict_FR, inplace=True)
# **Separate and assign energy source level 1-3 and technology**
# In[ ]:
# Create dictionnary in order to assign energy_source to its subtype
energy_source_dict_FR = valuenames.loc[idx_FR].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Column energy_source partly contains subtype information, thus this column is copied
# to new column for energy_source_subtype...
FR_re_df['technology'] = FR_re_df['energy_source_level_2']
# ...and the energy source subtype values in the energy_source column are replaced by
# the higher level classification
FR_re_df['energy_source_level_2'].replace(energy_source_dict_FR, inplace=True)
# In[ ]:
# Assign energy_source_level_1 to the dataframe
FR_re_df['energy_source_level_1'] = 'Renewable energy'
# In[ ]:
FR_re_df.reset_index(drop=True, inplace=True)
# In[ ]:
# Choose energy source level 2 entries where energy source level 2 is Bioenergy in order to
# seperate Bioenergy subtypes to energy source level 3 and subtypes for the rest to technology
idx_FR_Bioenergy = FR_re_df[FR_re_df['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to energy source level 3 for all entries where energy source level 2 is
# Bioenergy and delete those entries from technology
FR_re_df[['energy_source_level_3']] = FR_re_df.iloc[idx_FR_Bioenergy][['technology']]
FR_re_df.ix[idx_FR_Bioenergy,'technology'] = np.nan
# ### Georeferencing
# #### Municipality (INSEE) code to lon/lat
# The available INSEE code in the original data provides a first approximation for the geocoordinates of the renewable power plants. The following data source is utilized for assigning INSEE code to coordinates of the municipalities:
#
# ** [OpenDataSoft](http://public.opendatasoft.com/explore/dataset/code-postal-code-insee-2015/information/)** publishes a list of French INSEE codes and corresponding coordinates is published under the [Licence Ouverte (Etalab)](https://www.etalab.gouv.fr/licence-ouverte-open-licence).
# In[ ]:
# Downlad French geo-information. As download_and_cache_function is not working
# properly yet, thus other way of downloading
filename = 'code-postal-insee-2015.csv'
filepath = os.path.join('input', 'original_data', filename)
if not os.path.exists(filepath):
print("Downloading file: ", filename)
FR_geo_csv = urllib.request.urlretrieve(url_FR_geo, filepath)
else:
print("Using local file from", filepath)
# In[ ]:
# Read INSEE Code Data
FR_geo = pd.read_csv(filepath,
sep=';',
header=0,
converters={'Code_postal': str})
# Drop possible duplicates of the same INSEE code
FR_geo.drop_duplicates('INSEE_COM', keep='last', inplace=True)
# In[ ]:
# create columns for latitude/longitude
lat = []
lon = []
# split in latitude/longitude
for row in FR_geo['Geo Point']:
try:
# Split tuple format
# into the column lat and lon
row = row.lstrip('(').rstrip(')')
lat.append(row.split(',')[0])
lon.append(row.split(',')[1])
except:
# set NAN
lat.append(np.NaN)
lon.append(np.NaN)
# add these columns to the INSEE DataFrame
FR_geo['lat'] = pd.to_numeric(lat)
FR_geo['lon'] = pd.to_numeric(lon)
# In[ ]:
# Column names of merge key have to be named identically
FR_re_df.rename(columns={'municipality_code': 'INSEE_COM'}, inplace=True)
# Merge longitude and latitude columns by the Code INSEE
FR_re_df = FR_re_df.merge(FR_geo[['INSEE_COM', 'lat', 'lon']],
on=['INSEE_COM'],
how='left')
# Translate Code INSEE column back to municipality_code
FR_re_df.rename(columns={'INSEE_COM': 'municipality_code'}, inplace=True)
# ### Save
# In[ ]:
FR_re_df.to_pickle('intermediate/FR_renewables.pickle')
# ## Poland PL
# ### Download and read
# The data which will be processed below is provided by the following data source:
#
# ** [Urzad Regulacji Energetyki (URE) / Energy Regulatory Office](http://www.ure.gov.pl/uremapoze/mapa.html)** - Number of installations and installed capacity per energy source of renewable energy. Summed per powiat (districts) .
# #### The Polish data has to be downloaded manually
# if you have not chosen download_from = opsd_server.
# - Go to http://www.ure.gov.pl/uremapoze/mapa.html (requires Flash Player)
# - Click on the British flag in the lower right corner for Englisch version
# - Set detail to highest (to the right) in the upper right corner
# - Click on the printer symbol in the lower left corner
# - 'Generate', then the rtf-file simple.rtf will be downloaded
# - Put it in the folder input/original_data on your computer
# - If the download is temporarily not working choose download_from == 'opsd_server'
# In[ ]:
if download_from == 'opsd_server':
url_PL_ure = (url_opsd + version + folder + '/PL/simple.rtf')
download_and_cache(url_PL_ure)
# In[ ]:
# read rtf-file to string with the correct encoding
with open(os.path.join('input', 'original_data', 'simple.rtf'), 'r') as rtf:
file_content = rtf.read()
file_content = file_content.encode('utf-8').decode('iso-8859-2')
# ### Rearrange data from rft-file
# The rtf file has one table for each district in the rtf-file which needs to be separated from each and other and restructured to get all plants in one DataFrame with the information: district, energy_source, number_of_installations, installed_capacity. Thus in the following, the separating items are defined, the district tables split in parts, all put in one list and afterwards transferred to a pandas DataFrame.
# In[ ]:
# a new line is separating all parts
sep_split_into_parts = r'{\fs12 \f1 \line }'
# separates the table rows of each table
sep_data_parts = r'\trql'
reg_exp_district = r'(?<=Powiat:).*(?=})'
reg_exp_installation_type = (
r'(?<=\\fs12 \\f1 \\pard \\intbl \\ql \\cbpat[2|3|4] \{\\fs12 \\f1 ).*(?=\})')
reg_exp_installation_value = (
r'(?<=\\fs12 \\f1 \\pard \\intbl \\qr \\cbpat[3|4] \{\\fs12 \\f1 ).*(?=})')
# split file into parts
parts = file_content.split(sep_split_into_parts)
# In[ ]:
# list containing the data
data_set = []
for part in parts:
# match district
district = re.findall(reg_exp_district, part)
if len(district) == 0:
pass
else:
district = district[0].lstrip()
# separate each part
data_parts = part.split(sep_data_parts)
# data structure:
# data_row = {'district': '', 'install_type': '', 'quantity': '', 'power': ''}
for data_rows in data_parts:
wrapper_list = []
# match each installation type
installation_type = re.findall(reg_exp_installation_type, data_rows)
for inst_type in installation_type:
wrapper_list.append({'district': district, 'technology': inst_type})
# match data - contains twice as many entries as
# installation type (quantity, power vs. install type)
data_values = re.findall(reg_exp_installation_value, data_rows)
if len(data_values) == 0:
#log.debug('data values empty')
pass
else:
# connect data
for i, _ in enumerate(wrapper_list):
wrapper_list[i]['number_of_installations'] = data_values[(
i * 2)]
wrapper_list[i]['electrical_capacity'] = data_values[(
i * 2) + 1]
# prepare to write to file
for data in wrapper_list:
data_set.append(data)
# In[ ]:
# mapping of malformed unicode which appear in the Polish district names
polish_truncated_unicode_map = {
r'\uc0\u322': 'ł',
r'\uc0\u380': 'ż',
r'\uc0\u243': 'ó',
r'\uc0\u347': 'ś',
r'\uc0\u324': 'ń',
r'\uc0\u261': 'ą',
r'\uc0\u281': 'ę',
r'\uc0\u263': 'ć',
r'\uc0\u321': 'Ł',
r'\uc0\u378': 'ź',
r'\uc0\u346': 'Ś',
r'\uc0\u379': 'Ż'
}
# In[ ]:
# changing malformed unicode
for entry in data_set:
while r'\u' in entry['district']:
index = entry['district'].index(r'\u')
offset = index + 9
to_be_replaced = entry['district'][index:offset]
if to_be_replaced in polish_truncated_unicode_map.keys():
# offset + 1 because there is a trailing whitespace
entry['district'] = entry['district'].replace(entry['district'][index:offset + 1],
polish_truncated_unicode_map[to_be_replaced])
else:
break
# In[ ]:
# Create pandas DataFrame with similar structure as the other countries
PL_re_df = pd.DataFrame(data_set)
# ### Add data source
# In[ ]:
PL_re_df['data_source'] = 'Urzad Regulacji Energetyki'
# ### Translate values and harmonize energy source level 2
# In[ ]:
# Choose the translation terms for France, create dictionary and show dictionary
idx_PL = valuenames[valuenames['country'] == 'PL'].index
value_dict_PL = valuenames.loc[idx_PL].set_index('original_name')['opsd_name'].to_dict()
value_dict_PL
# In[ ]:
# Replace all original value names by the OPSD value names
PL_re_df.technology.replace(value_dict_PL, inplace=True)
# **Assign energy source level 1-3 and technology**
# In[ ]:
# Assign energy_source_level_1 to the dataframe
PL_re_df['energy_source_level_1'] = 'Renewable energy'
# In[ ]:
# Create dictionnary in order to assign energy_source to its subtype
energy_source_dict_PL = valuenames.loc[idx_PL].set_index(
'opsd_name')['energy_source_level_2'].to_dict()
# Create new column for energy_source
PL_re_df['energy_source_level_2'] = PL_re_df.technology
# Fill this with the energy source instead of subtype information
PL_re_df.energy_source_level_2.replace(energy_source_dict_PL, inplace=True)
# In[ ]:
# Choose energy_source entries where energy_source is "Bioenergy" in order to
# seperate Bioenergy subtypes to "energy_source_level_3" and subtypes for the rest to "technology"
idx_PL_Bioenergy = PL_re_df[PL_re_df['energy_source_level_2'] == 'Bioenergy'].index
# Assign technology to "energy_source_level_3" for all entries where energy_source_level_2 is
# "Bioenergy" and delete those entries from "technology"
PL_re_df[['energy_source_level_3']] = PL_re_df.iloc[idx_PL_Bioenergy][['technology']]
PL_re_df.ix[idx_PL_Bioenergy, 'technology'] = np.nan
# ** Adjust datatype of numeric columns**
# In[ ]:
# change type to numeric
PL_re_df['electrical_capacity'] = pd.to_numeric(PL_re_df['electrical_capacity'])
# Additionally commas are deleted
PL_re_df['number_of_installations'] = pd.to_numeric(
PL_re_df['number_of_installations'].str.replace(',', ''))
# **Aggregate**
#
# For entries/rows of the same district and energy source level 2, electrical capacity and number of installations are aggregaated.
# In[ ]:
PL_re_df = PL_re_df.groupby(['district', 'energy_source_level_2', 'technology'],
as_index=False
).agg({'electrical_capacity': sum,
'number_of_installations': sum,
'data_source': 'first'})
# ### Save
# In[ ]:
PL_re_df.to_pickle('intermediate/PL_renewables.pickle')
# ## Switzerland CH
# ### Download and read
# The data which will be processed below is provided by the following data source:
#
# ** [Swiss Federal Office of Energy](http://www.bfe.admin.ch/themen/00612/02073/index.html?dossier_id=02166&lang=de)** - Data of all renewable power plants receiving "Kostendeckende Einspeisevergütung" (KEV) which is the Swiss feed in tarif for renewable power plants.
# Geodata is based on municipality codes.
# In[ ]:
# point URLs to original data depending on the chosen download option
if download_from == 'original_sources':
url_CH_KEV= "http://www.bfe.admin.ch/php/modules/publikationen/stream.php?extlang=de&name=de_620208649.xlsx&endung=Liste%20aller%20KEV-Bez%FCger%20im%20Jahr%202016"
url_CH_KEV='KEV-Bezüger_2016_Publikation.xlsx'
url_CH_geo = 'http://download.geonames.org/export/zip/CH.zip'
# In[ ]:
# Get data of renewables per municipality
CH_re_df = pd.read_excel(download_and_cache(url_CH_KEV),
sheetname='KEV-Bezüger 2016',
encoding='UTF8',
thousands='.',
decimals=','
#header=[0]
#skipfooter=9, # contains summarized values
#index_col=[0, 1], # required for MultiIndex
#converters={'Code officiel géographique':str}
)
# ### Translate column names
# In[ ]:
# Choose the translation terms for Switzerland, create dictionary and show dictionary
idx_CH = columnnames[columnnames['country'] == 'CH'].index
column_dict_CH = columnnames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
column_dict_CH
# In[ ]:
# Translate columnnames
CH_re_df.rename(columns=column_dict_CH, inplace=True)
# In[ ]:
# Drop all rows that just contain NA
CH_re_df = CH_re_df.dropna()
# ### Add data source
# In[ ]:
CH_re_df['data_source'] = 'BFE'
# ### Translate values and harmonize energy source level 2
# In[ ]:
# Choose the translation terms for France, create dictionary and show dictionary
idx_CH = valuenames[valuenames['country'] == 'CH'].index
value_dict_CH = valuenames.loc[idx_CH].set_index('original_name')['opsd_name'].to_dict()
value_dict_CH
# In[ ]:
# Replace all original value names by the OPSD value names
CH_re_df.replace(value_dict_CH, inplace=True)
# **Separate and assign energy source level 1-3 and technology**
# In[ ]:
# Create dictionnary in order to assign energy_source to its subtype
energy_source_dict_CH = valuenames.loc[idx_CH].set_index('opsd_name')['energy_source_level_2'].to_dict()
# ...and the energy source subtype values in the energy_source column are replaced by
# the higher level classification
CH_re_df['energy_source_level_2'].replace(energy_source_dict_CH, inplace=True)
# In[ ]:
# Assign energy_source_level_1 to the dataframe
CH_re_df['energy_source_level_1'] = 'Renewable energy'
# In[ ]:
CH_re_df.reset_index(drop=True, inplace=True)
# ### Georeferencing
# **Postcode to lat/lon (WGS84)**
#
# The available municipality code in the original data provides an approximation for the geocoordinates of the renewable power plants.
# The postcode will be assigned to latitude and longitude coordinates with the help of the postcode table.
#
# ** [geonames.org](http://download.geonames.org/export/zip/?C=N;O=D)** The postcode data from Switzerland is provided by Geonames and licensed under a [Creative Commons Attribution 3.0 license](http://creativecommons.org/licenses/by/3.0/).
# In[ ]:
# Get geo-information
zip_CH_geo = zipfile.ZipFile(download_and_cache(url_CH_geo))
# Read generated postcode/location file
CH_geo = pd.read_csv(zip_CH_geo.open('CH.txt'), sep='\t', header=-1)
# add column names as defined in associated readme file
CH_geo.columns = ['country_code', 'postcode', 'place_name', 'admin_name1',
'admin_code1', 'admin_name2', 'admin_code2', 'admin_name3',
'admin_code3', 'lat', 'lon', 'accuracy']
# Drop rows of possible duplicate postal_code
CH_geo.drop_duplicates('postcode', keep='last', inplace=True)
CH_geo['postcode'] = CH_geo['postcode'].astype(str)
# In[ ]:
# harmonise data class
CH_geo.postcode = CH_geo.postcode.astype(int)
# In[ ]:
# Add longitude/latitude infomation assigned by municipality code
CH_re_df = pd.merge(CH_re_df,
CH_geo[['lat', 'lon', 'postcode']],
left_on='municipality_code',
right_on='postcode',
how='left'
)
# ### Transform electrical_capacity from kW to MW
# In[ ]:
# kW to MW
CH_re_df['electrical_capacity'] /= 1000
# kWh to MWh
CH_re_df['production'] /= 1000
# ### Save
# In[ ]:
CH_re_df.to_pickle('intermediate/CH_renewables.pickle')
# Check and validation of the renewable power plants list as well as the creation of CSV/XLSX/SQLite files can be found in Part 2 of this script. It also generates a daily time series of cumulated installed capacities by energy source.
# In[ ]: