Copyright (c) 2019, ETH Zurich, Computer Engineering Group
All rights reserved.
This work is licensed under the Creative Commons Attribution 4.0 International License.
To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or
send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
The file is part of the dataset entitled "Long-Term Tracing of Indoor Solar Harvesting" which complements the following publication:
L. Sigrist, A. Gomez, and L. Thiele. "Dataset: Tracing Indoor Solar Harvesting." In Proceedings of the 2nd Workshop on Data Acquisition To Analysis (DATA '19), 2019. [under submission]
The dataset is published and documented on Zenodo:
Description
This Jupyter notebook imports, merges, and filters the raw measurements of the Indoor Solar Harvesting Dataset stored in the raw/
folder. The processed dataset is stored in the HDF5 format in the processed/
folder. It allows full reproduction of the precomputed HDF5 datasets published as part of the dataset mentioned above from the raw measurement files.
Python package requirements
The following Python packages are required to run this script: numpy pandas rocketlogger tables
Remarks regarding the timestamps
By default, timestamps are calculated relative to the measurement start (DATASET_TIME_REFERENCE = 'relative').
Change this to DATASET_TIME_REFERENCE = 'local'
to directly use the Linux real-time clock timestamps, or use DATASET_TIME_REFERENCE = 'network'
for a timer base that was synced with the network (see config line below the imports). While the first exhibits the drift over time similar to the 'relative'
timestamp calculation, the latter timestamp is likely to introduce some jumps in the middle of the measurement. 'local'
and 'network'
traces may contain corrupt timestamps. However, they are reliably detected during import with this script and filtered out. For this reason, the HDF5 dataset generated using those timestamp configurations may contain a time series shorter than that in the raw dataset. The default 'relative'
configuration always includes all raw data available.For more details regarding the timestamp generation during the measurements check the documentation of the RocketLogger data format at https://gitlab.ethz.ch/tec/public/rocketlogger/wikis/data-format#channel-data.
import datetime
import os
import re
import numpy as np
import pandas as pd
from rocketlogger.data import RocketLoggerData, RocketLoggerDataWarning
# path to raw data root folder and the position names (one sub-folder per position)
TRACE_DATA_PATH = './raw'
TRACE_POSITIONS = ['pos06', 'pos13', 'pos14', 'pos16', 'pos17', 'pos18']

# available time reference options (see traces_to_dataframe() for details)
TRACE_TIME_REFERENCES = [
    'relative',  # local sample index based, start time relative timestamp
    'local',  # local monotonic real-time clock timestamp
    'network',  # NTP synched and adjusted real-time clock timestamp
]

# correction of the real sampling rate (exact configured rate at 2.048 MHz clock; RocketLogger is using 49 PWM cycles at 100 MHz)
TRACE_SAMPLE_RATE_SCALING = (100e6 / 49) / 2.048e6

# path to store the processed dataset
DATASET_PATH = './processed'

# time reference to use for the import
DATASET_TIME_REFERENCE = 'relative'  # <<<--- you may want to change this configuration (see timestamp remarks)

# map data file channel names to dataframe column names
POWER_CHANNEL_MAPPING = {
    'I1': 'I_in',
    'I2': 'I_bat',
    'V1': 'V_in',
    'V2': 'V_bat',
}

# map ambient file channel names to dataframe column names
SENSOR_CHANNEL_MAPPING = {
    'TSL4531_left': 'Ev_left',
    'TSL4531_right': 'Ev_right',
    'BME280_press': 'P_amb',
    'BME280_preas': 'P_amb',  # channel naming bug in RocketLogger prior to v1.1.6 (#128)
    'BME280_rh': 'RH_amb',
    'BME280_temp': 'T_amb',
}

# power measurement intervals to filter out by node name,
# given as (start, end) ISO 8601 timestamp string pairs
POWER_FILTER = {
    'pos06': [
        ('2018-01-06T16:00:00', '2018-01-10T00:00:00'),  # measurement file corrupted (indoor1_p27.rld)
    ],
}

# sensor measurement intervals to filter out by node name (same format as POWER_FILTER)
SENSOR_FILTER = {
    # no sensor filters defined
}
# temporary suppress console output (http://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/)
from contextlib import contextmanager
import sys, os
@contextmanager
def suppress_stdout():
    '''
    Context manager that temporarily silences all console output by
    redirecting sys.stdout to os.devnull for the duration of the block.
    '''
    saved_stdout = sys.stdout
    devnull_file = open(os.devnull, 'w')
    sys.stdout = devnull_file
    try:
        yield
    finally:
        # always restore the original stream, even if the body raised
        sys.stdout = saved_stdout
        devnull_file.close()
def import_filter_node_traces(node_trace_directory, decimation_factor=1):
    '''
    Import power and ambient traces of a given RocketLogger instance.

    :param node_trace_directory: The directory holding the recorded trace data
    :param decimation_factor: Import decimation factor for power traces [default: 1]
    :returns: tuple of the power and ambient sensor trace RocketLoggerData structures,
        both sorted by the power trace start time (sensor entries may be None when
        no accompanying ambient file exists)
    '''
    ## collect files available for the node
    node_data_file_list = os.listdir(node_trace_directory)

    ## filter data files
    node_data_files = []
    for data_file_name in node_data_file_list:
        data_file_path = os.path.join(node_trace_directory, data_file_name)
        ## skip: non-files, non-RocketLogger data, RocketLogger partitioned files, ambient trace files
        if not os.path.isfile(data_file_path):
            continue
        if not data_file_path.endswith('.rld'):
            continue
        if re.match(r'.*_p\d+(_.*)?\.rld$', data_file_path):
            continue
        if re.match(r'.*-ambient(_p\d+)?(_.*)?\.rld$', data_file_path):
            continue
        ## add remaining file paths to file list
        node_data_files.append(data_file_path)

    ## load data from collected files into a list
    node_power_traces = []
    node_sensor_traces = []
    for data_file_path in node_data_files:
        try:
            power_trace = RocketLoggerData(data_file_path, join_files=True, decimation_factor=decimation_factor)
        except Exception:
            # narrow catch: a bare `except:` would also trap KeyboardInterrupt/SystemExit
            print('Error: importing file `{}` failed'.format(data_file_path))
            raise
        ## merge current channels
        power_trace.merge_channels(keep_channels=True)

        ## import accompanying sensor data (escape the dot: replace the literal '.rld' suffix)
        sensor_file_path = re.sub(r'\.rld$', r'-ambient.rld', data_file_path)
        if os.path.isfile(sensor_file_path):
            try:
                sensor_trace = RocketLoggerData(sensor_file_path, join_files=True)
            except Exception:
                # report the sensor file that actually failed, not the power file
                print('Error: importing file `{}` failed'.format(sensor_file_path))
                raise
        else:
            print('Warning: No accompanying sensor data found ({}).'.format(data_file_path))
            sensor_trace = None
        node_power_traces.append(power_trace)
        node_sensor_traces.append(sensor_trace)

    ## sort power and sensor trace lists jointly by power trace start time
    sorted_index = sorted(range(len(node_power_traces)),
                          key=lambda i: node_power_traces[i]._header['start_time'])
    node_power_traces = [node_power_traces[i] for i in sorted_index]
    node_sensor_traces = [node_sensor_traces[i] for i in sorted_index]

    return (node_power_traces, node_sensor_traces)
def traces_to_dataframe(traces, channel_to_column_mapping=None, time_reference='network'):
    '''
    Convert and merge RocketLoggerData traces to a single time series dataframe.

    :param traces: List of RocketLogger traces to convert
    :param channel_to_column_mapping: Optional mapping of channels to columns.
        - None for one-to-one mapping of channels to columns [default]
        - List of channels for selection of channels (one-to-one channel to column mapping)
        - Dict with channels to select as key and column name as value
    :param time_reference: Optional time reference to use for the time stamp calculation
        - 'relative': timestamp is calculated from measurement start time and sample index
        - 'local': timestamp is calculated from measurement start time and the local
          monotonic real-time clock timestamps
        - 'network': timestamp is calculated from network synced real-time clock
          timestamps [default]
    :returns: time series data frame (includes a 'dt' column with the per-sample
        time delta in seconds; the first sample of each trace has dt = NaN)
    :raises ValueError: if time_reference is not one of the supported options
    '''
    ## argument parsing: normalize to list of traces and channel->column dict
    if not isinstance(traces, list):
        traces = [traces]
    if not isinstance(channel_to_column_mapping, dict):
        if channel_to_column_mapping is None:
            channel_to_column_mapping = traces[0].get_channel_names()
        if not isinstance(channel_to_column_mapping, list):
            channel_to_column_mapping = [channel_to_column_mapping]
        channel_to_column_mapping = {ch: ch for ch in channel_to_column_mapping}

    df_columns = list(channel_to_column_mapping.values())
    trace_channels = list(channel_to_column_mapping.keys())

    ## concatenate data frame from traces; deduplicate column names while
    ## preserving order (a set is unordered and rejected as a column index by
    ## recent pandas versions)
    trace_df = pd.DataFrame([], columns=list(dict.fromkeys(df_columns)))
    for trace in traces:
        ## per trace filtering of channel mappings (not all files carry all channels)
        channel_filter = [t in trace.get_channel_names() for t in trace_channels]

        ## get trace data and the available timestamp variants
        trace_data = trace.get_data([channel for (channel, exists) in zip(trace_channels, channel_filter) if exists])
        trace_start_time = np.datetime64(trace._header['start_time'].replace(tzinfo=None))
        trace_timestamp_relative = trace.get_time(absolute_time=False)
        trace_timestamp_local = trace.get_time(absolute_time=True, time_reference='local')
        trace_timestamp_network = trace.get_time(absolute_time=True, time_reference='network')
        trace_columns = [column for (column, exists) in zip(df_columns, channel_filter) if exists]

        relative_timestamp_base = trace_start_time
        local_timestamp_offset = trace_start_time - trace_timestamp_local[0]
        # absolute timestamp bug in RocketLogger prior to v1.1.3 (#113)
        absolute_timestamp_valid = np.abs(trace_timestamp_network[0] - trace_start_time) < np.timedelta64(1, 's')

        ## extract and check timestamp, skip files with warning if timestamp invalid
        ## NOTE: branch on the time_reference argument (previously the module level
        ## DATASET_TIME_REFERENCE configuration was used, silently ignoring the parameter)
        trace_timestamp = None
        if time_reference == 'network':
            if not absolute_timestamp_valid:
                print('Warning: skiping file `{}` with invalid network synced timestamp'.format(trace._filename))
                continue
            trace_timestamp = trace_timestamp_network
        elif time_reference == 'local':
            if not absolute_timestamp_valid:
                print('Warning: skiping file `{}` with invalid local timestamp'.format(trace._filename))
                continue
            trace_timestamp = local_timestamp_offset + trace_timestamp_local
        elif time_reference == 'relative':
            # scale the nominal sample spacing by the real sampling rate correction
            trace_timestamp = relative_timestamp_base + trace_timestamp_relative * (np.timedelta64(10**9, 'ns') / TRACE_SAMPLE_RATE_SCALING)
        else:
            raise ValueError('Invalid time_reference value')

        df = pd.DataFrame(trace_data, index=trace_timestamp, columns=trace_columns)
        ## calculate local delta time [s] for energy calculations; first sample of a
        ## trace has no predecessor and gets NaN (explicit assignment instead of an
        ## integer-label `.loc` slice, which is invalid on a DatetimeIndex)
        df['dt'] = np.concatenate(([np.nan], np.diff(trace_timestamp) / np.timedelta64(1, 's')))
        ## append to trace dataframe
        trace_df = pd.concat([trace_df, df], sort=True)
    return trace_df
def _filter_intervals(df, intervals, data_label):
    '''
    Drop rows of a dataframe whose index falls inside any of the given intervals.

    :param df: dataframe with a time-sortable index
    :param intervals: list of (start, end) interval boundary pairs to clear
    :param data_label: label of the data kind used in console messages
    :returns: filtered dataframe
    '''
    for (filter_start, filter_end) in intervals:
        after_start = np.flatnonzero(np.array(df.index > filter_start))
        before_end = np.flatnonzero(np.array(df.index < filter_end))
        # skip (instead of crashing on an empty-array min/max) when nothing matches
        if after_start.size == 0 or before_end.size == 0:
            print('Warning: filter interval {} to {} matches no {} data'.format(filter_start, filter_end, data_label))
            continue
        index_start = after_start.min()
        index_end = before_end.max()
        print('clearing {} data between {} and {}'.format(data_label, filter_start, filter_end))
        df = df.drop(df.iloc[index_start:index_end].index)
    return df


def import_and_backup(position, time_reference):
    '''
    Helper function doing import and backup for a given position.

    :param position: name of the position to process
    :param time_reference: time reference to use for the timestamp calculation
        ('relative', 'local', or 'network'); also part of the output file names
    '''
    node_data_path = os.path.join(TRACE_DATA_PATH, position)
    print('Loading data from `{}`'.format(node_data_path))
    # the import is verbose; silence console output of the RocketLogger library
    with suppress_stdout():
        (power_t, sensor_t) = import_filter_node_traces(node_data_path, decimation_factor=10)

    print('Generating `{}` data frames'.format(position))
    power_df = traces_to_dataframe(power_t, POWER_CHANNEL_MAPPING, time_reference)
    sensor_df = traces_to_dataframe(sensor_t, SENSOR_CHANNEL_MAPPING, time_reference)

    print('Filtering `{}` data frames'.format(position))
    if position in POWER_FILTER:
        power_df = _filter_intervals(power_df, POWER_FILTER[position], 'power')
    if position in SENSOR_FILTER:
        sensor_df = _filter_intervals(sensor_df, SENSOR_FILTER[position], 'sensor')

    print('Backing up data frames for `{}`'.format(position))
    power_df.to_hdf(os.path.join(DATASET_PATH, '{:s}_power_{:s}.h5'.format(position, time_reference)), 'dataset')
    sensor_df.to_hdf(os.path.join(DATASET_PATH, '{:s}_sensor_{:s}.h5'.format(position, time_reference)), 'dataset')
    print('Done processing and backing up for `{}`'.format(position))
# number of worker processes; <= 1 selects strictly serial processing
PARALLELIZATION_CORES = 0

## repeat import and backup for all positions
if PARALLELIZATION_CORES <= 1:
    ## serialize processing (when reading from network mounts)
    for node_name in TRACE_POSITIONS:
        import_and_backup(node_name, DATASET_TIME_REFERENCE)
else:
    ## parallelize processing (when processing locally)
    from functools import partial
    from multiprocessing import Pool
    # Pool.map supplies a single argument per call, so the time reference must be
    # bound via partial (passing the bare function would raise a TypeError)
    with Pool(processes=PARALLELIZATION_CORES) as pool:
        pool.map(partial(import_and_backup, time_reference=DATASET_TIME_REFERENCE), TRACE_POSITIONS)
Loading data from `./raw/pos06` Generating `pos06` data frames Filtering `pos06` data frames clearing power data between 2018-01-06T16:00:00 and 2018-01-10T00:00:00 Backing up data frames for `pos06` Done processing and backing up for `pos06` Loading data from `./raw/pos13` Generating `pos13` data frames Filtering `pos13` data frames Backing up data frames for `pos13` Done processing and backing up for `pos13` Loading data from `./raw/pos14` Generating `pos14` data frames Filtering `pos14` data frames Backing up data frames for `pos14` Done processing and backing up for `pos14` Loading data from `./raw/pos16` Generating `pos16` data frames Filtering `pos16` data frames Backing up data frames for `pos16` Done processing and backing up for `pos16` Loading data from `./raw/pos17` Generating `pos17` data frames Filtering `pos17` data frames Backing up data frames for `pos17` Done processing and backing up for `pos17` Loading data from `./raw/pos18` Generating `pos18` data frames Filtering `pos18` data frames Backing up data frames for `pos18` Done processing and backing up for `pos18`