Copyright (C) 2015 Jingkun Gao ([email protected])
This file is part of PLAID code repo.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
In this IPython Notebook, I will show how to load data and extract features from PLAID. Meanwhile, I will test the performance of appliance identification using PLAID.
Please make sure Python3 and other necessary packages (using **pip3 -r requirements.txt**) are installed,
# !pip install -r requiements.txt
%matplotlib inline
import subprocess
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
plt.style.use('ggplot')
from pylab import rcParams
rcParams['figure.figsize'] = (10, 6)
import matplotlib
font = {'family' : 'sans-serif',
'weight' : 'normal',
'size' : 17}
matplotlib.rc('font', **font)
import warnings
warnings.filterwarnings("ignore")
# functions to read data and meta data
def read_data_given_id(path,ids,progress=True,last_offset=0):
'''read data given a list of ids and CSV paths'''
start = datetime.now()
n = len(ids)
if n == 0:
return {}
else:
data = {}
for (i,ist_id) in enumerate(ids, start=1):
if progress and np.mod(i,np.ceil(n/10))==0:
print('%d/%d (%2.0f%s) have been read...\t time consumed: %ds'\
%(i,n,i/n*100,'%',(datetime.now()-start).seconds))
if last_offset==0:
data[ist_id] = np.genfromtxt(path+str(ist_id)+'.csv',delimiter=',',\
names='current,voltage',dtype=(float,float))
else:
p=subprocess.Popen(['tail','-'+str(int(last_offset)),path+str(ist_id)+'.csv'],\
stdout=subprocess.PIPE)
data[ist_id] = np.genfromtxt(p.stdout,delimiter=',',names='current,voltage',dtype=(float,float))
print('%d/%d (%2.0f%s) have been read(Done!) \t time consumed: %ds'\
%(n,n,100,'%',(datetime.now()-start).seconds))
return data
def clean_meta(ist):
'''remove None elements in Meta Data '''
clean_ist = ist.copy()
for k,v in ist.items():
if len(v) == 0:
del clean_ist[k]
return clean_ist
def parse_meta(meta):
'''parse meta data for easy access'''
M = {}
for m in meta:
for app in m:
M[int(app['id'])] = clean_meta(app['meta'])
return M
# specify path to read data and meta
# Please make sure you downloaded latest dataset from plaidplug.com.
Data_path = 'data/'
csv_path = Data_path + 'CSV/';
import json
# read meta
with open(Data_path + 'meta1.json') as data_file:
meta1 = json.load(data_file)
Meta = parse_meta([meta1])
meta1 = parse_meta([meta1])
# read data
# applinace types of all instances
Types = [x['type'] for x in Meta.values()]
# unique appliance types
Unq_type = list(set(Types))
Unq_type.sort()
IDs_for_read_data = list(Meta.keys())
# households of appliances
Locs = [x['header']['collection_time']+'_'+x['location'] for x in Meta.values()]
# unique households
Unq_loc = list(set(Locs))
Unq_loc.sort()
Origianl_Unq_type = Unq_type
print('Number of households: %d\nNumber of total measurements:%d'%(len(Unq_loc),len(Locs)))
# read data
# estimated time cost: ~ 1 mins
npts = 10000
Data = read_data_given_id(csv_path,IDs_for_read_data,progress=True, last_offset=npts)
print('Total number of instances:%d'%len(Data))
We start by extracting one representative period of steady state for each instance.
fs = 30000
f0 = 60
NS = fs//f0 # number of samples per period
NP = npts//NS # number of periods for npts
# calculate the representative one period of steady state
# (mean of the aggregated signals over one cycle)
n = len(Data)
rep_I = np.empty([n,NS])
rep_V = np.empty([n,NS])
for i in range(n):
tempI = np.sum(np.reshape(Data[i+1]['current'],[NP,NS]),0)/NP
tempV = np.sum(np.reshape(Data[i+1]['voltage'],[NP,NS]),0)/NP
# align current to make all samples start from 0 and goes up
ix = np.argsort(np.abs(tempI))
j = 0
while True:
if ix[j]<499 and tempI[ix[j]+1]>tempI[ix[j]]:
real_ix = ix[j]
break
else:
j += 1
rep_I[i,] = np.hstack([tempI[real_ix:],tempI[:real_ix]])
rep_V[i,] = np.hstack([tempV[real_ix:],tempV[:real_ix]])
Extract labels for appliance type and locations(used for train/test split).
type_Ids = {}
loc_Ids = {}
n = len(Data)
type_label = np.zeros(n,dtype='int')
loc_label = np.zeros(n,dtype='int')
for (ii,t) in enumerate(Unq_type):
type_Ids[t] = [i-1 for i,j in enumerate(Types,start=1) if j == t]
type_label[type_Ids[t]] = ii+1
for (ii,t) in enumerate(Unq_loc):
loc_Ids[t] = [i-1 for i,j in enumerate(Locs,start=1) if j == t]
loc_label[loc_Ids[t]] = ii+1
print('number of different types: %d'% len(Unq_type))
print('number of different households: %d'% len(Unq_loc))
We then look at the distributions of appliance types and locations.
plt.hist(type_label,len(Unq_type))
plt.title('distirbution of instances for each type')
print(Origianl_Unq_type)
print('Similified verstion:')
Unq_type[0]='AC';
Unq_type[1]='CFL';
Unq_type[6]='Bulb';
Unq_type[10]='Washer';
print(Unq_type)
plt.savefig('type_dist.eps', format='eps')
We can see that we have an unbalanced dataset, more instances for CFL, Hairdryer, Laptop and less for Fridge, Heater, Vacuum and Washing Machine.
plt.hist(loc_label,len(Unq_loc))
plt.title('distirbution of instances for each household')
plt.savefig('house_dist.eps', format='eps')
Next, we have rep_I, rep_V, type_label, loc_label. We will start from here to extract features and evalute classification performance. To begin, we first write a function for classifiers.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
def myclassifiers(X_train,y_train,X_test,y_test,verbose=False):
knn = KNeighborsClassifier(n_neighbors=1)
gnb = GaussianNB()
logistic = LogisticRegression(C=1e5)
svc = svm.SVC(kernel='rbf',gamma=.7,C=1.0)
lda = LDA(solver='lsqr', shrinkage='auto')
qda = QDA()
dTree = tree.DecisionTreeClassifier(max_depth=10)
rForest = RandomForestClassifier(max_depth=10,n_estimators=20)
adaBoost = AdaBoostClassifier()
names = ["Nearest Neighbors(k=1)", "Naive Bayes","Logistic","RBF SVM", "LDA", "QDA",
"Decision Tree","Random Forest", "AdaBoost"]
classifiers = [knn, gnb, logistic, svc, lda, qda, dTree, rForest, adaBoost]
y_predict = []
acc = []
# print('Running',end="")
for (i,clf) in enumerate(classifiers):
if verbose:
print(' %s... '% names[i],end="")
clf.fit(X_train,y_train)
y_predict.append(clf.predict(X_test))
acc.append(clf.score(X_test,y_test))
return (acc,y_predict,names)
# test classifiers
X_train, X_test, y_train, y_test = train_test_split(rep_I[:,:10], type_label, test_size=0.1, random_state=0)
(acc,y_p,names) = myclassifiers(X_train,y_train,X_test,y_test)
acc
RawCF
RawCF = rep_I
# visualization of current features from certain type in RawCF
type_id = 4
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
plt.subplot(np.ceil(n/10),15,count)
plt.plot(RawCF[i,],'.')
plt.gca().xaxis.set_major_locator(plt.NullLocator())
plt.gca().yaxis.set_major_locator(plt.NullLocator())
# plt.title('Id: '+str(k),fontsize=10)
count += 1
fig.tight_layout()
print(Unq_type[type_id-1])
plt.show()
# plot five instances
ids = [1,8,3,4]
normalize = lambda x:x/(max(x)*1.1)
for i in ids:
plt.plot(normalize(RawCF[np.where(type_label==i)[0][0],]),'.',label=Unq_type[i-1])
plt.legend()
plt.xlabel('count')
plt.ylabel('normalized current value')
plt.title('One Period of Steady State')
PQ: Calculate real and reactive power from steady states
f0 = 60
fs = 3e4
n = len(Data)
NN = int(fs//f0)
NS = npts//NN
PQ = np.empty([n,NS,2])
for i in range(n):
for j in range(NS-1):
# extract current and voltage in two cycle
temp_I = Data[i+1]['current'][j*NN:(j+2)*NN]
temp_V = Data[i+1]['voltage'][j*NN:(j+2)*NN]
# extract abs part(apparent component), divided by the number of points
apparI = np.abs(2*np.fft.fft(temp_I))/NN
apparV = np.abs(2*np.fft.fft(temp_V))/NN
# phase difference
theta = np.angle(np.fft.fft(temp_V)) - np.angle(np.fft.fft(temp_I))
# calculate real/reactive power
tempP = 1/2*apparV*apparI*np.sin(theta)
tempQ = 1/2*apparV*apparI*np.cos(theta)
# sum the power of different orders
# PQ[i,j,0] = np.sum(tempP)
# PQ[i,j,1] = np.sum(tempQ)
# take only the fundamental active/reactive power
PQ[i,j,0] = (tempP[2])
PQ[i,j,1] = (tempQ[2])
PQ = np.delete(PQ,np.where(np.isnan(PQ))[1],1)
# choose the median value among all cycles
PQ = np.median(PQ,1)
import matplotlib.cm as cmx
import matplotlib.colors as colors
def get_cmap(N):
'''Returns a function that maps each index in 0, 1, ... N-1 to a distinct
RGB color.'''
color_norm = colors.Normalize(vmin=0, vmax=N-1)
scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
def map_index_to_rgb_color(index):
return scalar_map.to_rgba(index)
return map_index_to_rgb_color
cmap = get_cmap(11)
Legends=('AC','CFL','Fan','Fridge','Hairdryer','Heater','Bulb','Laptop','Microwave','Vaccum','Washer')
markers='ov^<>sph+Dx'
colors='bgrcmk'
minx,maxx,miny,maxy = (0,0,0,0)
fig=plt.figure(figsize=(12,6))
ax=fig.add_subplot(1,1,1)
ax.set_position([0.1,0.1,.6,0.8])
# visualization of current features from certain type in PQ
for i in range(11):
ix=np.where(type_label==i+1)[0]
# ignore appliances with high real/reactive power for better visualization
temp_PQ = np.delete(PQ[ix], np.unique(np.where(np.abs(PQ[ix]) > 3000)[0]), 0)
ix1 = np.random.randint(temp_PQ.shape[0],size=5)
tempx = temp_PQ[ix1,0]
tempy = temp_PQ[ix1,1]
plt.scatter(tempx,tempy,c=cmap(i),marker=markers[i], s=100, label=Legends[i])
minx = min(minx,min(tempx))
maxx = max(maxx,max(tempx))
miny = min(miny,min(tempy))
maxy = max(maxy,max(tempy))
plt.axis([minx*0.9,maxx*1.1,miny*0.9,maxy*1.1])
plt.xlabel('Real Power(W)')
plt.ylabel('Reactive Power (VAR)')
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.tight_layout()
plt.savefig('PQ.eps', format='eps')
HarmonicsF: We will take up to 21st order of harmonics
n = len(Data)
order = 21
p = order + 1 # number of harmonics to be extracted, include 0-th component
harmonics = np.linspace(0,order,num=p)
fs = 30000
npts = 10000
f0 = 120 # for power
HarmonicsF = np.empty([n,p])
# visualize power harmonics of one instance
ix = 100
temp_P = Data[ix]['current']*Data[ix]['voltage']
x = np.linspace(0,fs,num=npts)
y = np.abs(np.fft.fft(temp_P))
plt.plot(x[:200],y[:200],'.-')
harmonics
for i in range(n):
temp_P = Data[i+1]['current']*Data[i+1]['voltage']
y = np.abs(np.fft.fft(temp_P))
h = 40*harmonics
h = h.astype(int)
HarmonicsF[i,] =y[h]
HarmonicsF[1,]
HarmonicsF.shape
# visualization of harmonics features from certain type
from mpl_toolkits.mplot3d import Axes3D
ix1=np.where(type_label==9)[0][:4]
ix2=np.where(type_label==7)[0][:4]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=HarmonicsF[ix1,0], ys=HarmonicsF[ix1,1], zs=HarmonicsF[ix1,2],c='r',s=80)
ax.scatter(HarmonicsF[ix2,0], HarmonicsF[ix2,1], zs=HarmonicsF[ix2,2],c='b',s=80)
ax.set_xlabel('1st harmonics')
ax.set_ylabel('2nd harmonics')
ax.set_zlabel('3rd harmonics')
plt.show()
BinF : This feature is to downsample the data to reduce dimension of raw current/voltage.
num = 20 # number of bins
def get_BinF(X,num):
'''X should be nd array of size N*P, the output will be N*num'''
(N,P) = X.shape
newP = int(np.floor(P/num)*num)
newX = np.reshape(X[:,:newP],[N,num,newP//num])
BinF = np.sum(newX,2)
return BinF
BinF_I = get_BinF(rep_I,num)
BinF_V = get_BinF(rep_V,num)
BinF = np.hstack([BinF_I,BinF_V])
BinF.shape
Unq_type
BinF[1,:]
i = 100
plt.plot(BinF_I[i,:],'.')
plt.title('Down Sampled Current of ' + Unq_type[type_label[i]])
plt.xlabel('count')
plt.ylabel('value')
plt.show()
# visualization of all Bin current features from certain type in BinF
type_id = 4
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
plt.subplot(np.ceil(n/10),15,count)
plt.plot(BinF_I[i,],'.')
plt.gca().xaxis.set_major_locator(plt.NullLocator())
plt.gca().yaxis.set_major_locator(plt.NullLocator())
# plt.title('Id: '+str(k),fontsize=10)
count += 1
fig.tight_layout()
plt.show()
BinaryF: Raw 0/1 binary VI images
def center(X,w):
minX = np.amin(X)
maxX = np.amax(X)
dist = max(abs(minX),maxX)
X[X<-dist] = -dist
X[X>dist] = dist
d = (maxX-minX)/w
return (X,d)
def get_img_from_VI(V, I, width,hard_threshold=False,para=.5):
'''Get images from VI, hard_threshold, set para as threshold to cut off,5-10
soft_threshold, set para to .1-.5 to shrink the intensity'''
d = V.shape[0]
# doing interploation if number of points is less than width*2
if d<2* width:
newI = np.hstack([V, V[0]])
newV = np.hstack([I, I[0]])
oldt = np.linspace(0,d,d+1)
newt = np.linspace(0,d,2*width)
I = np.interp(newt,oldt,newI)
V = np.interp(newt,oldt,newV)
# center the current and voltage, get the size resolution of mesh given width
(I,d_c) = center(I,width)
(V,d_v) = center(V,width)
# find the index where the VI goes through in current-voltage axis
ind_c = np.floor((I-np.amin(I))/d_c).astype(int)
ind_v = np.floor((V-np.amin(V))/d_v).astype(int)
ind_c[ind_c==width] = width-1
ind_v[ind_v==width] = width-1
Img = np.zeros((width,width))
for i in range(len(I)):
Img[ind_c[i],width-ind_v[i]-1] += 1
if hard_threshold:
Img[Img<para] = 0
Img[Img!=0] = 1
return Img
else:
return (Img/np.max(Img))**para
n = len(Data)
width = 16
Imgs = np.zeros((n,width,width), dtype=np.float64)
for i in range(n):
Imgs[i,:,:] = get_img_from_VI(rep_V[i,], rep_I[i,], width,True,1)
BinaryF=np.reshape(Imgs,(n,width*width))
# visualization of all imgaes from certain type in Imgs
type_id = 1
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
plt.subplot(np.ceil(n/10),15,count)
plt.imshow(Imgs[i,:,:],cmap = cm.Greys_r,interpolation='None')
plt.gca().xaxis.set_major_locator(plt.NullLocator())
plt.gca().yaxis.set_major_locator(plt.NullLocator())
# plt.title('Id: '+str(k),fontsize=10)
count += 1
fig.tight_layout()
plt.show()
plt.style.use('classic')
# AC-1,CFL-2,hairdryer-5,laptop-8
ix=np.where(type_label==1)[0][30]
fig=plt.figure(figsize=(16,10))
ax1 = fig.add_subplot(2,3,1)
plt.plot(rep_V[ix,:],rep_I[ix,:])
plt.title('VI trajectory of Air Conditioner')
# plt.xlabel('Voltage')
# plt.ylabel('Current')
ax = fig.add_subplot(2,3,2)
plt.plot(rep_V[ix,:],rep_I[ix,:],'.')
xticks = np.linspace(min(rep_V[ix,:]),max(rep_V[ix,:]),num=16)
yticks = np.linspace(min(rep_I[ix,:]),max(rep_I[ix,:]),num=16)
plt.axis([min(rep_V[ix,:]),max(rep_V[ix,:]),min(rep_I[ix,:]),max(rep_I[ix,:])])
ax.set_xticks(xticks)
ax.set_yticks(yticks)
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
# plt.tick_params(axis='y',which='both',labelbottom='off')
plt.title('Meshed version of VI')
ax.grid(which='both')
ax3 = fig.add_subplot(2,3,3)
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.title('VI binary image of Air Conditioner')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
ax4 = fig.add_subplot(2,3,4)
ix=np.where(type_label==2)[0][13]
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
plt.title('VI binary image of CFL')
ax5 = fig.add_subplot(2,3,5)
ix=np.where(type_label==5)[0][19]
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
plt.title('VI binary image of Hairdryer')
ax6 = fig.add_subplot(2,3,6)
ix=np.where(type_label==8)[0][13]
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
plt.title('VI binary image of Laptop')
plt.tight_layout()
plt.show()
fig.savefig('VI.eps', format='eps')
plt.style.use('ggplot')
PCA_BinF, PCA_BinaryF, PCA_RawCF: Use projected components which keep more than 99% variations as the features
from sklearn.decomposition import PCA
# write a function to extract up to certain percent of projected components
def get_PCs(X,p):
# X: nd array of size sample_nums*features,
# p: percentage of variation to be taken
pca = PCA(whiten=True)
pca.fit(X)
ix=np.where(np.cumsum(pca.explained_variance_ratio_)>p)[0][0]
pca = PCA(n_components=ix,whiten=True)
return pca.fit_transform(X)
p = .99
PCA_BinF = get_PCs(BinF,p)
PCA_BinaryF = get_PCs(BinaryF,p)
PCA_RawCF = get_PCs(RawCF,p)
np.isnan(np.sum(PCA_BinF))
np.isnan(np.sum(PCA_BinaryF))
np.isnan(np.sum(PCA_RawCF))
To visualize the features, we use TSNE and PCA to project them into 2 dimensional space.
def lowDimVisualize(F,filename):
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
model = TSNE(n_components=2, random_state=0)
pca = PCA(n_components=2,whiten=True)
newHar = model.fit_transform(F)
TSNEF = newHar
PCAF = pca.fit_transform(F)
colors = cm.rainbow(np.linspace(0, 1, 11))
Legends=('AC','CFL','Fan','Fridge','Hairdryer','Heater','Bulb','Laptop','Microwave','Vaccum','Washer')
# markers='ov^<>sph+Dx'
fig = plt.figure(figsize=(14,7))
plt.subplot(1,2,1)
for i, c in zip(range(11), colors):
ix = np.where(type_label==i+1)[0]
plt.scatter(TSNEF[ix,0], TSNEF[ix,1], color=c,label=Legends[i])
plt.title('2D Visulization of TSNE')
plt.subplot(1,2,2)
for i, c in zip(range(11), colors):
ix = np.where(type_label==i+1)[0]
plt.scatter(PCAF[ix,0], PCAF[ix,1], color=c,label=Legends[i])
plt.title('2D Visulization of PCA')
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.tight_layout()
fig.savefig(filename, format='eps')
# look at harmonics
lowDimVisualize(HarmonicsF,'harmonics_TSNE_PCA.eps')
# look at binary images
lowDimVisualize(BinaryF,'Binary_TSNE_PCA.eps')
# look at raw current
lowDimVisualize(RawCF,'raw_current_TSNE_PCA.eps')
# look at one period of representative current and voltage
lowDimVisualize(np.hstack([rep_I,rep_V]),'raw_IV_1P_TSNE_PCA.eps')
# look at down sampled feature
lowDimVisualize(BinF,'down_sampled_TSNE_PCA.eps')
# look at Raw
D=np.array([np.hstack([Data[i+1]['current'],Data[i+1]['voltage']]) for i in range(1074)])
lowDimVisualize(D,'raw_TSNE_PCA.eps')