In [1]:
from __future__ import print_function 
import seaborn as sns
from IPython.core.display import HTML
from IPython.display import Image
import pandas as pd
import calendar

from nilmtk.disaggregate.co_1d import CO_1d
from nilmtk.cross_validation import train_test_split
import nilmtk.preprocessing.electricity.building as prepb
import nilmtk.stats.electricity.building as bstats
import nilmtk.stats.electricity.single as sstats
from nilmtk.dataset import DataSet
from nilmtk.dataset import REDD
from nilmtk.plots import plot_series
from nilmtk.sensors.electricity import Measurement
from nilmtk.metrics import (mean_normalized_error_power, 
                            fraction_energy_assigned_correctly, 
                            f_score,
                            rms_error_power)

import warnings
warnings.filterwarnings("ignore")

sns.set(font="serif")

def pretty_print_dict(dictionary):
    html = '<ul>'
    for key, value in dictionary.iteritems():
        html += '<li><strong>{}</strong>: '.format(key)
        if isinstance(value, list):
            html += '<ul>'
            for item in value:
                html += '<li>{}</li>'.format(item)
            html += '</ul></li>'
        else:
            html += '{}</li>'.format(value)
    html += '</ul>'
    display(HTML(html))

Demo of NILMTK v0.1 for NILM Workshop 2014

Loading Data

Load raw REDD .dat data

In [2]:
from nilmtk.dataset import REDD

redd = REDD()
redd.load('/data/REDD/low_freq/')

# Note increase in system memory usage.
# NILMTK v0.2 does not eagerly load data like v0.1 does.
Loading building 1:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 20
Loading building 2:
  chans:  1 2 3 4 5 6 7 8 9 10 11
Loading building 3:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
Loading building 4:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Loading building 5:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
Loading building 6:
  chans:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17

Examine dataset metadata

In [3]:
pretty_print_dict(redd.metadata)
  • name: REDD
  • full_name: Reference Energy Disaggregation Data Set
  • citations:
    • J. Zico Kolter and Matthew J. Johnson. REDD: A public data set for energy disaggregation research. In proceedings of the SustKDD workshop on Data Mining Applications in Sustainability, 2011.
  • urls:
    • http://redd.csail.mit.edu
  • timezone: US/Eastern
  • geographic_coordinates: (42.360091, -71.09416)

Appliance names are converted to use the NILMTK controlled vocabulary

Raw REDD appliance names:
In [4]:
!cat '/data/REDD/low_freq/house_1/labels.dat'
1 mains
2 mains
3 oven
4 oven
5 refrigerator
6 dishwaser
7 kitchen_outlets
8 kitchen_outlets
9 lighting
10 washer_dryer
11 microwave
12 bathroom_gfi
13 electric_heat
14 stove
15 kitchen_outlets
16 kitchen_outlets
17 lighting
18 lighting
19 washer_dryer
20 washer_dryer
NILMTK appliance names
In [5]:
electric = redd.buildings[1].utility.electric
electric.appliances.keys()

# Note that the washer dryer has been converted to a single DualSupply appliance
Out[5]:
[ApplianceName(name='fridge', instance=1),
 ApplianceName(name='washer dryer', instance=1),
 ApplianceName(name='kitchen outlets', instance=2),
 ApplianceName(name='lighting', instance=3),
 ApplianceName(name='kitchen outlets', instance=1),
 ApplianceName(name='space heater', instance=1),
 ApplianceName(name='kitchen outlets', instance=4),
 ApplianceName(name='kitchen outlets', instance=3),
 ApplianceName(name='lighting', instance=1),
 ApplianceName(name='dishwasher', instance=1),
 ApplianceName(name='microwave', instance=1),
 ApplianceName(name='bathroom misc', instance=1),
 ApplianceName(name='lighting', instance=2),
 ApplianceName(name='oven', instance=1),
 ApplianceName(name='hob', instance=1)]

Get data for a particular appliance

In [6]:
fridge = electric.appliances['fridge', 1]

# fridge is a 2-column matrix (a Python Pandas DataFrame object)

fridge.head()
Out[6]:
(power, active)
2011-04-18 09:22:13-04:00 6
2011-04-18 09:22:16-04:00 6
2011-04-18 09:22:20-04:00 6
2011-04-18 09:22:23-04:00 6
2011-04-18 09:22:26-04:00 6

5 rows × 1 columns

In [7]:
fridge["2011-04-18":"2011-04-19"].plot()
Out[7]:
<matplotlib.axes.AxesSubplot at 0x7f2194c37690>

HDF5 file format

Export to HDF5

DATA_DIRECTORY = '/path/to/hdf5/'
redd.export(DATA_DIRECTORY)

Load from HDF5

Note that we use the generic DataSet class to load HDF5...

from nilmtk.dataset import DataSet
dataset = DataSet()
dataset.load_hdf5(DATA_DIRECTORY)

Dataset Diagnostics and Statistics

All datasets have imperfections. We need to understand the imperfections of each dataset before proceeding with data analysis. We'll continue working with REDD but please note that we're not picking on REDD! All datasets have issues!

import nilmtk.stats.electricity.building as bstats

Gaps

There are two reasons why data might not be recorded:

  1. The appliance and meter were unplugged from the mains (hence the appliance is off when the appliance monitor is off).
  2. The appliance monitor is broken (hence we have no reliable information about the state of the appliance).

In the plots below, a dark rectangle shows the presence of a gap. A gap is defined as any pair of consecutive samples further apart than 4 times the sample period for that meter.

Plot gaps using rectangles

In [8]:
bstats.plot_missing_samples_using_rectangles(electric)

The advantages of plot_missing_samples_using_rectangles are:

  • clearly shows large gaps
  • shows all data so can be zoomed in to your heart's content

The disadvantages are:

  • The choice of max_sample_period is somewhat subjective
  • Because it plots lots of rectangles, it can be slow to plot.

To overcome both of these disadvantages, we have a sister function:

Plot gaps using bitmap

In [9]:
bstats.plot_missing_samples_using_bitmap(electric)
Seconds per pixel = 3917.6
Out[9]:
<matplotlib.axes.AxesSubplot at 0x7f2191f84550>

Proportion of energy submetered for house 1

In [10]:
bstats.proportion_of_energy_submetered(electric)
Calculating proportion of energy submetered...
Masking appliances with mains... may take a little while...Mains sample period = 1.0, max_sample_period = 20.0
Getting gap starts and ends...
Found 27 gap starts and 27 gap ends.
...............done
Inserting zeros... may take a little while...done inserting zeros
Out[10]:
0.76004797563554483

Proportion of energy per appliance

In [11]:
bstats.proportion_per_appliance(electric)
Masking appliances with mains... may take a little while...Mains sample period = 1.0, max_sample_period = 20.0
Getting gap starts and ends...
Found 27 gap starts and 27 gap ends.
...............done
Inserting zeros... may take a little while...done inserting zeros
Out[11]:
(fridge, 1)             0.143596
(lighting, 1)           0.112564
(washer dryer, 1)       0.093916
(kitchen outlets, 2)    0.073981
(dishwasher, 1)         0.066409
(microwave, 1)          0.058262
(kitchen outlets, 1)    0.055435
(lighting, 2)           0.046542
(oven, 1)               0.036079
(lighting, 3)           0.034429
(bathroom misc, 1)      0.018303
(kitchen outlets, 3)    0.014565
(kitchen outlets, 4)    0.005331
(space heater, 1)       0.000334
(hob, 1)                0.000302
dtype: float64

Statistics for a single appliance

import nilmtk.stats.electricity.single as sstats
In [12]:
APPLIANCE_TYPE = 'oven'
appliance = electric.appliances[(APPLIANCE_TYPE, 1)]

Sample period

In [13]:
sample_period = sstats.get_sample_period(appliance)

print('average sample period = {:.1f} seconds'.format(sample_period))
average sample period = 3.6 seconds

Dropout rate over entire timeseries

In [14]:
dropout_rate = sstats.get_dropout_rate(data=appliance)

print('average dropout rate = {:.1%}'.format(dropout_rate))
average dropout rate = 13.5%

Dropout rate per period

In [15]:
sstats.dropout_rate_per_period(data=appliance, rule='D').plot()

ylabel('dropout rate')
title('Dropout rate per day for ' + APPLIANCE_TYPE)
Out[15]:
<matplotlib.text.Text at 0x7f2192029850>

Histogram of power consumption

In [16]:
THRESHOLD = 1
appliance_filtered = (appliance[appliance > THRESHOLD]).icol(0).dropna()

xlabel('power (watts)')
ylabel('frequency')
h = hist(appliance_filtered.values, bins=100)
Conclusions from the histogram:
  • oven spends a lot of its time consuming about 2-50 Watts
  • appears to be properly 'on' when it's consuming over 1600 watts
  • So let's use 1000 watts as the on power threshold.
In [17]:
ON_POWER_THRESHOLD = 1000

In NILMTK v0.2, on_power_threshold is stored as metadata per appliance.

Number of hours the oven was switched on for

In [18]:
hours_on = sstats.hours_on(appliance, on_power_threshold=ON_POWER_THRESHOLD)

print(APPLIANCE_TYPE + ' was on for {:.1f} hours.'.format(hours_on))
oven was on for 4.5 hours.

Total energy consumed by the oven

In [19]:
kwh = sstats.energy(appliance)

print(APPLIANCE_TYPE + ' consumed {:.1f} kWh.'.format(kwh))
oven consumed 3.8 kWh.

Explore usage behaviour of an appliance

Usage (hours on and energy used) per period

In [20]:
usage = sstats.usage_per_period(appliance,
                                freq='D', 
                                on_power_threshold=ON_POWER_THRESHOLD)
usage.head(n=7)
Out[20]:
hours_on kwh
2011-04-18 NaN NaN
2011-04-19 0.000000 0.000063
2011-04-20 0.000000 0.000010
2011-04-21 0.000000 0.000014
2011-04-22 0.000000 0.000004
2011-04-23 0.000000 0.007393
2011-04-24 0.308056 0.510096

7 rows × 2 columns

Sanity-check by looking at power data
In [21]:
plot_series(appliance[:"2011-04-24"], 
            date_format='%Y-%m-%d', 
            tz_localize=False)
title(APPLIANCE_TYPE + ' power demand')
Out[21]:
<matplotlib.text.Text at 0x7f2191b51390>

Usage of the oven, hour-by-hour, over an average day

In [22]:
dist = sstats.activity_distribution(appliance, 
                                    bin_size='H', 
                                    timespan='D', 
                                    on_power_threshold=ON_POWER_THRESHOLD)

# Graph formatting
x = np.arange(dist.size)
ylabel('frequency')
xlabel('hour of day')
title('Usage of the oven, hour-by-hour, over an average day')
xlim([0, 24])
xticks(range(0, 25, 6))
bar(x, dist.values)
Out[22]:
<Container object of 24 artists>

Usage of the oven, day-by-day, over an average week

In [23]:
dist = sstats.activity_distribution(appliance, 
                                    bin_size='D', 
                                    timespan='W', 
                                    on_power_threshold=ON_POWER_THRESHOLD)
x = np.arange(dist.size)
ylabel('frequency')
xlabel('day of week')
title('')
xticks(np.arange(7)+0.5, calendar.day_name[0:7])
bar(x, dist.values)
Out[23]:
<Container object of 7 artists>

Histogram of on-durations

In [24]:
# Get a Series of booleans indicating when the oven is on:
on_series = sstats.on(appliance, on_power_threshold=ON_POWER_THRESHOLD)

# Now get the length of every on-duration
on_durations = sstats.durations(on_series, 
                                on_or_off='on',
                                ignore_n_off_samples=10)
xlabel('minutes on')
ylabel('frequency')
title('Distribution of on-durations for oven')
h = hist(on_durations/60, bins=10)

Descriptive stats of whole dataset

redd.describe()
NUMBER OF BUILDINGS: 6

NUMBER OF APPLIANCES PER BUILDING:
  min  =  9.00
  mean = 16.50
  mode = 15.00
  max  = 23.00
  std  =  4.31
PROPORTION OF ENERGY SUBMETERED PER BUILDLING:
  min  =  0.58
  mean =  0.70
  mode =  0.58
  max  =  0.89
  std  =  0.11
DROPOUT RATE PER CHANNEL, INCLUDING LARGE GAPS:
  min  =  0.13
  mean =  0.58
  mode =  0.92
  max  =  0.92
  std  =  0.24
DROPOUT RATE PER CHANNEL, IGNORING LARGE GAPS:
  min  =  0.00
  mean =  0.10
  mode =  0.09
  max  =  0.17
  std  =  0.04
MAINS UPTIME PER BUILDING (DAYS):
  min  =  3.60
  mean = 15.12
  mode =  3.60
  max  = 19.44
  std  =  5.43
PROPORTION OF TIME SLICES WHERE > 70% ENERGY IS SUBMETERED:
  min  =  0.23
  mean =  0.64
  mode =  0.23
  max  =  0.98
  std  =  0.26

Preprocessing

First, for reference, here's the plot from above showing missing samples:

In [25]:
building = redd.buildings[1]
bstats.plot_missing_samples_using_rectangles(building.utility.electric)
import nilmtk.preprocessing.electricity.building as prepb

Select data for a specific time period

building = prepb.filter_datetime(building, '7-13-2013', '8-4-2013')

Resampling

In [26]:
building = prepb.downsample(building, rule='1T')
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Fill large gaps in appliance data with zeros and forward-fill small gaps

In [27]:
building = prepb.fill_appliance_gaps(building)

# Now plot missing samples again:
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Detect gaps in mains and mask these out from appliance data

In [28]:
building = prepb.drop_missing_mains(building)
building = prepb.make_common_index(building)
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Sum the split-phase supply in REDD

In [29]:
building.utility.electric = building.utility.electric.sum_split_supplies()

Remove appliances which contribute less than 5% energy

In [30]:
# Hack to trick NILMTK into thinking that REDD mains measures
# active power not apparent. This will be handled much better
# in NILMTK v0.2
mains = building.utility.electric.mains[(1,1)]
mains.rename(columns={Measurement('power','apparent'): 
                      Measurement('power','active')}, 
             inplace=True)

building = prepb.filter_contribution_less_than_x(building, x=5)
Common Measurement:  Measurement(physical_quantity='power', type='active')
In [31]:
electric = building.utility.electric
bstats.plot_missing_samples_using_rectangles(electric)

Disaggregation

Split a building into training and test sets

from nilmtk.cross_validation import train_test_split
In [32]:
train, test = train_test_split(building, train_size = 0.5)
# train and test are now both Building objects

Training

from nilmtk.disaggregate.co_1d import CO_1d
In [33]:
DISAGG_FEATURE = Measurement('power', 'active')

disaggregator = CO_1d()
disaggregator.train(train, disagg_features=[DISAGG_FEATURE])
# Below is the learnt model
Out[33]:
{ApplianceName(name='dishwasher', instance=1): [0, 589],
 ApplianceName(name='fridge', instance=1): [0, 185],
 ApplianceName(name='kitchen outlets', instance=1): [0, 21],
 ApplianceName(name='kitchen outlets', instance=2): [0, 28],
 ApplianceName(name='lighting', instance=1): [0, 86],
 ApplianceName(name='microwave', instance=1): [0, 947],
 ApplianceName(name='washer dryer', instance=1): [0, 1957]}

Export appliance models to JSON

In [34]:
disaggregator.export_model('model.json')
!cat model.json
{"fridge_1": [0, 185], "washer dryer_1": [0, 1957], "kitchen outlets_2": [0, 28], "kitchen outlets_1": [0, 21], "dishwasher_1": [0, 589], "microwave_1": [0, 947], "lighting_1": [0, 86]}

Import appliance models from JSON

In [35]:
disaggregator.import_model('model.json')
!rm model.json
disaggregator.model
Out[35]:
{ApplianceName(name='dishwasher', instance=1): [0, 589],
 ApplianceName(name='fridge', instance=1): [0, 185],
 ApplianceName(name='kitchen outlets', instance=1): [0, 21],
 ApplianceName(name='kitchen outlets', instance=2): [0, 28],
 ApplianceName(name='lighting', instance=1): [0, 86],
 ApplianceName(name='microwave', instance=1): [0, 947],
 ApplianceName(name='washer dryer', instance=1): [0, 1957]}

Disaggregation

In [36]:
disaggregator.disaggregate(test)
predicted_power = disaggregator.predictions
128
{ApplianceName(name='fridge', instance=1): [0, 185], ApplianceName(name='washer dryer', instance=1): [0, 1957], ApplianceName(name='kitchen outlets', instance=2): [0, 28], ApplianceName(name='kitchen outlets', instance=1): [0, 21], ApplianceName(name='lighting', instance=1): [0, 86], ApplianceName(name='dishwasher', instance=1): [0, 589], ApplianceName(name='microwave', instance=1): [0, 947]}
In [37]:
predicted_power.head()
Out[37]:
(dishwasher, 1) (fridge, 1) (kitchen outlets, 1) (kitchen outlets, 2) (lighting, 1) (microwave, 1) (washer dryer, 1)
2011-04-27 19:10:00-04:00 0 185 21 28 0 0 0
2011-04-27 19:11:00-04:00 0 185 21 28 0 0 0
2011-04-27 19:12:00-04:00 0 185 21 28 0 0 0
2011-04-27 19:13:00-04:00 0 185 21 28 0 0 0
2011-04-27 19:14:00-04:00 0 185 21 28 0 0 0

5 rows × 7 columns

In [38]:
ax = test.utility.electric.mains[1,1]["2011-04-27 20:00":"2011-04-27 23:00:00"].plot()
ax.set_ylim([150,450])
title('aggregate power demand')
Out[38]:
<matplotlib.text.Text at 0x7f2191e39850>
In [39]:
ax = predicted_power["2011-04-27 20:00":"2011-04-27 23:00:00"].plot()
ax.set_ylim([0,200])
title('predicted appliance power demand')
Out[39]:
<matplotlib.text.Text at 0x7f219110e390>
In [40]:
app_ground = test.utility.electric.appliances
ground_truth_power = pd.DataFrame(
    {appliance: app_ground[appliance][DISAGG_FEATURE] 
     for appliance in app_ground})

ax = ground_truth_power["2011-04-27 20:00":"2011-04-27 23:00:00"].plot()
ax.set_ylim([0,250])
title('ground truth appliance power demand')
Out[40]:
<matplotlib.text.Text at 0x7f2191cd37d0>

Metrics

from nilmtk.metrics import (mean_normalized_error_power, 
                            fraction_energy_assigned_correctly, 
                            f_score,
                            rms_error_power)
In [41]:
metrics = {
    'mean normalized error power':
          mean_normalized_error_power,
    'fraction energy assigned correctly': 
          fraction_energy_assigned_correctly,
    'f score': f_score,
    'RMS Error Power': rms_error_power
}
In [42]:
for metric_name, metric_func in metrics.iteritems():
    result = metric_func(predicted_power, ground_truth_power)
    display(HTML('<h5>' + metric_name + '</h5>'))
    if isinstance(result, dict):
        pretty_print_dict(result)
    else:
        print(result)
    print()
RMS Error Power
  • ApplianceName(name='fridge', instance=1): 110.208348899
  • ApplianceName(name='washer dryer', instance=1): 216.400642527
  • ApplianceName(name='kitchen outlets', instance=2): 18.8756186696
  • ApplianceName(name='kitchen outlets', instance=1): 10.5264942354
  • ApplianceName(name='lighting', instance=1): 69.2372594931
  • ApplianceName(name='dishwasher', instance=1): 221.145976367
  • ApplianceName(name='microwave', instance=1): 223.294611141

f score
  • ApplianceName(name='fridge', instance=1): 0.486390231493
  • ApplianceName(name='washer dryer', instance=1): 0.692607003891
  • ApplianceName(name='kitchen outlets', instance=2): 0.0
  • ApplianceName(name='kitchen outlets', instance=1): 0.0
  • ApplianceName(name='lighting', instance=1): 0.517505876531
  • ApplianceName(name='dishwasher', instance=1): 0.236493374108
  • ApplianceName(name='microwave', instance=1): 0.25

fraction energy assigned correctly
0.771400439401

mean normalized error power
  • ApplianceName(name='fridge', instance=1): 1.60618649799
  • ApplianceName(name='washer dryer', instance=1): 0.703450115919
  • ApplianceName(name='kitchen outlets', instance=2): 0.714609954031
  • ApplianceName(name='kitchen outlets', instance=1): 0.516469554455
  • ApplianceName(name='lighting', instance=1): 1.36871998661
  • ApplianceName(name='dishwasher', instance=1): 2.85568760678
  • ApplianceName(name='microwave', instance=1): 3.57270358602

End

nilmtk website: nilmtk.github.io

Contact: jack.kelly@imperial.ac.uk

In [1]:
# CSS styling
from IPython.core.display import display, HTML
display(HTML(open('static/styles.css', 'r').read()));