#!/usr/bin/env python
# coding: utf-8
# Copyright (C) 2015 Jingkun Gao (jingkun@cmu.edu)
#
# This file is part of PLAID code repo.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see
# In this IPython Notebook, I will show how to load data and extract features from PLAID. Meanwhile, I will test the performance of appliance identification using PLAID.
#
# Please make sure Python3 and other necessary packages (using **pip3 -r requirements.txt**) are installed,
# In[1]:
# !pip install -r requiements.txt
# In[2]:
get_ipython().run_line_magic('matplotlib', 'inline')
import subprocess
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
plt.style.use('ggplot')
from pylab import rcParams
rcParams['figure.figsize'] = (10, 6)
import matplotlib
font = {'family' : 'sans-serif',
'weight' : 'normal',
'size' : 17}
matplotlib.rc('font', **font)
import warnings
warnings.filterwarnings("ignore")
# In[3]:
# functions to read data and meta data
def read_data_given_id(path,ids,progress=True,last_offset=0):
'''read data given a list of ids and CSV paths'''
start = datetime.now()
n = len(ids)
if n == 0:
return {}
else:
data = {}
for (i,ist_id) in enumerate(ids, start=1):
if progress and np.mod(i,np.ceil(n/10))==0:
print('%d/%d (%2.0f%s) have been read...\t time consumed: %ds'\
%(i,n,i/n*100,'%',(datetime.now()-start).seconds))
if last_offset==0:
data[ist_id] = np.genfromtxt(path+str(ist_id)+'.csv',delimiter=',',\
names='current,voltage',dtype=(float,float))
else:
p=subprocess.Popen(['tail','-'+str(int(last_offset)),path+str(ist_id)+'.csv'],\
stdout=subprocess.PIPE)
data[ist_id] = np.genfromtxt(p.stdout,delimiter=',',names='current,voltage',dtype=(float,float))
print('%d/%d (%2.0f%s) have been read(Done!) \t time consumed: %ds'\
%(n,n,100,'%',(datetime.now()-start).seconds))
return data
def clean_meta(ist):
'''remove None elements in Meta Data '''
clean_ist = ist.copy()
for k,v in ist.items():
if len(v) == 0:
del clean_ist[k]
return clean_ist
def parse_meta(meta):
'''parse meta data for easy access'''
M = {}
for m in meta:
for app in m:
M[int(app['id'])] = clean_meta(app['meta'])
return M
# In[4]:
# specify path to read data and meta
# Please make sure you downloaded latest dataset from plaidplug.com.
Data_path = 'data/'
csv_path = Data_path + 'CSV/';
import json
# read meta
with open(Data_path + 'meta1.json') as data_file:
meta1 = json.load(data_file)
Meta = parse_meta([meta1])
meta1 = parse_meta([meta1])
# In[5]:
# read data
# applinace types of all instances
Types = [x['type'] for x in Meta.values()]
# unique appliance types
Unq_type = list(set(Types))
Unq_type.sort()
IDs_for_read_data = list(Meta.keys())
# households of appliances
Locs = [x['header']['collection_time']+'_'+x['location'] for x in Meta.values()]
# unique households
Unq_loc = list(set(Locs))
Unq_loc.sort()
Origianl_Unq_type = Unq_type
# In[6]:
print('Number of households: %d\nNumber of total measurements:%d'%(len(Unq_loc),len(Locs)))
# In[7]:
# read data
# estimated time cost: ~ 1 mins
npts = 10000
Data = read_data_given_id(csv_path,IDs_for_read_data,progress=True, last_offset=npts)
# In[8]:
print('Total number of instances:%d'%len(Data))
# We start by extracting one representative period of steady state for each instance.
# In[9]:
fs = 30000
f0 = 60
NS = fs//f0 # number of samples per period
NP = npts//NS # number of periods for npts
# calculate the representative one period of steady state
# (mean of the aggregated signals over one cycle)
n = len(Data)
rep_I = np.empty([n,NS])
rep_V = np.empty([n,NS])
for i in range(n):
tempI = np.sum(np.reshape(Data[i+1]['current'],[NP,NS]),0)/NP
tempV = np.sum(np.reshape(Data[i+1]['voltage'],[NP,NS]),0)/NP
# align current to make all samples start from 0 and goes up
ix = np.argsort(np.abs(tempI))
j = 0
while True:
if ix[j]<499 and tempI[ix[j]+1]>tempI[ix[j]]:
real_ix = ix[j]
break
else:
j += 1
rep_I[i,] = np.hstack([tempI[real_ix:],tempI[:real_ix]])
rep_V[i,] = np.hstack([tempV[real_ix:],tempV[:real_ix]])
# Extract labels for appliance type and locations(used for train/test split).
# In[10]:
type_Ids = {}
loc_Ids = {}
n = len(Data)
type_label = np.zeros(n,dtype='int')
loc_label = np.zeros(n,dtype='int')
for (ii,t) in enumerate(Unq_type):
type_Ids[t] = [i-1 for i,j in enumerate(Types,start=1) if j == t]
type_label[type_Ids[t]] = ii+1
for (ii,t) in enumerate(Unq_loc):
loc_Ids[t] = [i-1 for i,j in enumerate(Locs,start=1) if j == t]
loc_label[loc_Ids[t]] = ii+1
print('number of different types: %d'% len(Unq_type))
print('number of different households: %d'% len(Unq_loc))
# We then look at the distributions of appliance types and locations.
# In[11]:
plt.hist(type_label,len(Unq_type))
plt.title('distirbution of instances for each type')
print(Origianl_Unq_type)
print('Similified verstion:')
Unq_type[0]='AC';
Unq_type[1]='CFL';
Unq_type[6]='Bulb';
Unq_type[10]='Washer';
print(Unq_type)
plt.savefig('type_dist.eps', format='eps')
# We can see that we have an unbalanced dataset, more instances for CFL, Hairdryer, Laptop and less for Fridge, Heater, Vacuum and Washing Machine.
# In[12]:
plt.hist(loc_label,len(Unq_loc))
plt.title('distirbution of instances for each household')
plt.savefig('house_dist.eps', format='eps')
# Next, we have rep_I, rep_V, type_label, loc_label. We will start from here to extract features and evalute classification performance. To begin, we first write a function for classifiers.
# # Classifiers
# In[13]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
def myclassifiers(X_train,y_train,X_test,y_test,verbose=False):
knn = KNeighborsClassifier(n_neighbors=1)
gnb = GaussianNB()
logistic = LogisticRegression(C=1e5)
svc = svm.SVC(kernel='rbf',gamma=.7,C=1.0)
lda = LDA(solver='lsqr', shrinkage='auto')
qda = QDA()
dTree = tree.DecisionTreeClassifier(max_depth=10)
rForest = RandomForestClassifier(max_depth=10,n_estimators=20)
adaBoost = AdaBoostClassifier()
names = ["Nearest Neighbors(k=1)", "Naive Bayes","Logistic","RBF SVM", "LDA", "QDA",
"Decision Tree","Random Forest", "AdaBoost"]
classifiers = [knn, gnb, logistic, svc, lda, qda, dTree, rForest, adaBoost]
y_predict = []
acc = []
# print('Running',end="")
for (i,clf) in enumerate(classifiers):
if verbose:
print(' %s... '% names[i],end="")
clf.fit(X_train,y_train)
y_predict.append(clf.predict(X_test))
acc.append(clf.score(X_test,y_test))
return (acc,y_predict,names)
# In[14]:
# test classifiers
X_train, X_test, y_train, y_test = train_test_split(rep_I[:,:10], type_label, test_size=0.1, random_state=0)
(acc,y_p,names) = myclassifiers(X_train,y_train,X_test,y_test)
# In[15]:
acc
# # Features
# ## 1. Raw Current
# **RawCF**
# In[16]:
RawCF = rep_I
# visualization of current features from certain type in RawCF
type_id = 4
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
plt.subplot(np.ceil(n/10),15,count)
plt.plot(RawCF[i,],'.')
plt.gca().xaxis.set_major_locator(plt.NullLocator())
plt.gca().yaxis.set_major_locator(plt.NullLocator())
# plt.title('Id: '+str(k),fontsize=10)
count += 1
fig.tight_layout()
print(Unq_type[type_id-1])
plt.show()
# In[17]:
# plot five instances
ids = [1,8,3,4]
normalize = lambda x:x/(max(x)*1.1)
for i in ids:
plt.plot(normalize(RawCF[np.where(type_label==i)[0][0],]),'.',label=Unq_type[i-1])
plt.legend()
plt.xlabel('count')
plt.ylabel('normalized current value')
plt.title('One Period of Steady State')
# ## 2. Real/Reactive Power, PQ Plane feature
# **PQ**: Calculate real and reactive power from steady states
# In[18]:
f0 = 60
fs = 3e4
n = len(Data)
NN = int(fs//f0)
NS = npts//NN
PQ = np.empty([n,NS,2])
for i in range(n):
for j in range(NS-1):
# extract current and voltage in two cycle
temp_I = Data[i+1]['current'][j*NN:(j+2)*NN]
temp_V = Data[i+1]['voltage'][j*NN:(j+2)*NN]
# extract abs part(apparent component), divided by the number of points
apparI = np.abs(2*np.fft.fft(temp_I))/NN
apparV = np.abs(2*np.fft.fft(temp_V))/NN
# phase difference
theta = np.angle(np.fft.fft(temp_V)) - np.angle(np.fft.fft(temp_I))
# calculate real/reactive power
tempP = 1/2*apparV*apparI*np.sin(theta)
tempQ = 1/2*apparV*apparI*np.cos(theta)
# sum the power of different orders
# PQ[i,j,0] = np.sum(tempP)
# PQ[i,j,1] = np.sum(tempQ)
# take only the fundamental active/reactive power
PQ[i,j,0] = (tempP[2])
PQ[i,j,1] = (tempQ[2])
PQ = np.delete(PQ,np.where(np.isnan(PQ))[1],1)
# choose the median value among all cycles
PQ = np.median(PQ,1)
# In[19]:
import matplotlib.cm as cmx
import matplotlib.colors as colors
def get_cmap(N):
'''Returns a function that maps each index in 0, 1, ... N-1 to a distinct
RGB color.'''
color_norm = colors.Normalize(vmin=0, vmax=N-1)
scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
def map_index_to_rgb_color(index):
return scalar_map.to_rgba(index)
return map_index_to_rgb_color
cmap = get_cmap(11)
Legends=('AC','CFL','Fan','Fridge','Hairdryer','Heater','Bulb','Laptop','Microwave','Vaccum','Washer')
markers='ov^<>sph+Dx'
colors='bgrcmk'
minx,maxx,miny,maxy = (0,0,0,0)
fig=plt.figure(figsize=(12,6))
ax=fig.add_subplot(1,1,1)
ax.set_position([0.1,0.1,.6,0.8])
# visualization of current features from certain type in PQ
for i in range(11):
ix=np.where(type_label==i+1)[0]
# ignore appliances with high real/reactive power for better visualization
temp_PQ = np.delete(PQ[ix], np.unique(np.where(np.abs(PQ[ix]) > 3000)[0]), 0)
ix1 = np.random.randint(temp_PQ.shape[0],size=5)
tempx = temp_PQ[ix1,0]
tempy = temp_PQ[ix1,1]
plt.scatter(tempx,tempy,c=cmap(i),marker=markers[i], s=100, label=Legends[i])
minx = min(minx,min(tempx))
maxx = max(maxx,max(tempx))
miny = min(miny,min(tempy))
maxy = max(maxy,max(tempy))
plt.axis([minx*0.9,maxx*1.1,miny*0.9,maxy*1.1])
plt.xlabel('Real Power(W)')
plt.ylabel('Reactive Power (VAR)')
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.tight_layout()
plt.savefig('PQ.eps', format='eps')
# ## 3. Harmonics
# **HarmonicsF**: We will take up to 21st order of harmonics
# In[20]:
n = len(Data)
order = 21
p = order + 1 # number of harmonics to be extracted, include 0-th component
harmonics = np.linspace(0,order,num=p)
fs = 30000
npts = 10000
f0 = 120 # for power
HarmonicsF = np.empty([n,p])
# visualize power harmonics of one instance
ix = 100
temp_P = Data[ix]['current']*Data[ix]['voltage']
x = np.linspace(0,fs,num=npts)
y = np.abs(np.fft.fft(temp_P))
plt.plot(x[:200],y[:200],'.-')
# In[21]:
harmonics
# In[22]:
for i in range(n):
temp_P = Data[i+1]['current']*Data[i+1]['voltage']
y = np.abs(np.fft.fft(temp_P))
h = 40*harmonics
h = h.astype(int)
HarmonicsF[i,] =y[h]
HarmonicsF[1,]
# In[23]:
HarmonicsF.shape
# In[24]:
# visualization of harmonics features from certain type
from mpl_toolkits.mplot3d import Axes3D
ix1=np.where(type_label==9)[0][:4]
ix2=np.where(type_label==7)[0][:4]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=HarmonicsF[ix1,0], ys=HarmonicsF[ix1,1], zs=HarmonicsF[ix1,2],c='r',s=80)
ax.scatter(HarmonicsF[ix2,0], HarmonicsF[ix2,1], zs=HarmonicsF[ix2,2],c='b',s=80)
ax.set_xlabel('1st harmonics')
ax.set_ylabel('2nd harmonics')
ax.set_zlabel('3rd harmonics')
plt.show()
# ## 4. Down sampled current/voltage
# **BinF** : This feature is to downsample the data to reduce dimension of raw current/voltage.
# In[25]:
num = 20 # number of bins
def get_BinF(X,num):
'''X should be nd array of size N*P, the output will be N*num'''
(N,P) = X.shape
newP = int(np.floor(P/num)*num)
newX = np.reshape(X[:,:newP],[N,num,newP//num])
BinF = np.sum(newX,2)
return BinF
BinF_I = get_BinF(rep_I,num)
BinF_V = get_BinF(rep_V,num)
# In[26]:
BinF = np.hstack([BinF_I,BinF_V])
# In[27]:
BinF.shape
# In[28]:
Unq_type
# In[29]:
BinF[1,:]
# In[30]:
i = 100
plt.plot(BinF_I[i,:],'.')
plt.title('Down Sampled Current of ' + Unq_type[type_label[i]])
plt.xlabel('count')
plt.ylabel('value')
plt.show()
# In[31]:
# visualization of all Bin current features from certain type in BinF
type_id = 4
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
plt.subplot(np.ceil(n/10),15,count)
plt.plot(BinF_I[i,],'.')
plt.gca().xaxis.set_major_locator(plt.NullLocator())
plt.gca().yaxis.set_major_locator(plt.NullLocator())
# plt.title('Id: '+str(k),fontsize=10)
count += 1
fig.tight_layout()
plt.show()
# ## 5. VI binary image
# **BinaryF**: Raw 0/1 binary VI images
# In[32]:
def center(X,w):
minX = np.amin(X)
maxX = np.amax(X)
dist = max(abs(minX),maxX)
X[X<-dist] = -dist
X[X>dist] = dist
d = (maxX-minX)/w
return (X,d)
def get_img_from_VI(V, I, width,hard_threshold=False,para=.5):
'''Get images from VI, hard_threshold, set para as threshold to cut off,5-10
soft_threshold, set para to .1-.5 to shrink the intensity'''
d = V.shape[0]
# doing interploation if number of points is less than width*2
if d<2* width:
newI = np.hstack([V, V[0]])
newV = np.hstack([I, I[0]])
oldt = np.linspace(0,d,d+1)
newt = np.linspace(0,d,2*width)
I = np.interp(newt,oldt,newI)
V = np.interp(newt,oldt,newV)
# center the current and voltage, get the size resolution of mesh given width
(I,d_c) = center(I,width)
(V,d_v) = center(V,width)
# find the index where the VI goes through in current-voltage axis
ind_c = np.floor((I-np.amin(I))/d_c).astype(int)
ind_v = np.floor((V-np.amin(V))/d_v).astype(int)
ind_c[ind_c==width] = width-1
ind_v[ind_v==width] = width-1
Img = np.zeros((width,width))
for i in range(len(I)):
Img[ind_c[i],width-ind_v[i]-1] += 1
if hard_threshold:
Img[Imgp)[0][0]
pca = PCA(n_components=ix,whiten=True)
return pca.fit_transform(X)
# In[37]:
p = .99
PCA_BinF = get_PCs(BinF,p)
PCA_BinaryF = get_PCs(BinaryF,p)
PCA_RawCF = get_PCs(RawCF,p)
# In[38]:
np.isnan(np.sum(PCA_BinF))
# In[39]:
np.isnan(np.sum(PCA_BinaryF))
# In[40]:
np.isnan(np.sum(PCA_RawCF))
# # TSNE vs PCA
# To visualize the features, we use TSNE and PCA to project them into 2 dimensional space.
# In[41]:
def lowDimVisualize(F,filename):
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
model = TSNE(n_components=2, random_state=0)
pca = PCA(n_components=2,whiten=True)
newHar = model.fit_transform(F)
TSNEF = newHar
PCAF = pca.fit_transform(F)
colors = cm.rainbow(np.linspace(0, 1, 11))
Legends=('AC','CFL','Fan','Fridge','Hairdryer','Heater','Bulb','Laptop','Microwave','Vaccum','Washer')
# markers='ov^<>sph+Dx'
fig = plt.figure(figsize=(14,7))
plt.subplot(1,2,1)
for i, c in zip(range(11), colors):
ix = np.where(type_label==i+1)[0]
plt.scatter(TSNEF[ix,0], TSNEF[ix,1], color=c,label=Legends[i])
plt.title('2D Visulization of TSNE')
plt.subplot(1,2,2)
for i, c in zip(range(11), colors):
ix = np.where(type_label==i+1)[0]
plt.scatter(PCAF[ix,0], PCAF[ix,1], color=c,label=Legends[i])
plt.title('2D Visulization of PCA')
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.tight_layout()
fig.savefig(filename, format='eps')
# In[42]:
# look at harmonics
lowDimVisualize(HarmonicsF,'harmonics_TSNE_PCA.eps')
# In[43]:
# look at binary images
lowDimVisualize(BinaryF,'Binary_TSNE_PCA.eps')
# In[44]:
# look at raw current
lowDimVisualize(RawCF,'raw_current_TSNE_PCA.eps')
# In[45]:
# look at one period of representative current and voltage
lowDimVisualize(np.hstack([rep_I,rep_V]),'raw_IV_1P_TSNE_PCA.eps')
# In[46]:
# look at down sampled feature
lowDimVisualize(BinF,'down_sampled_TSNE_PCA.eps')
# In[47]:
# look at Raw
D=np.array([np.hstack([Data[i+1]['current'],Data[i+1]['voltage']]) for i in range(1074)])
lowDimVisualize(D,'raw_TSNE_PCA.eps')
# In[48]:
# look at combined
allF = np.concatenate((PQ, HarmonicsF,BinF, BinaryF),axis=1)
lowDimVisualize(allF,'combined_TSNE_PCA.eps')
# # Evaluation
# A reasonable way to evalute the features is to train and test on different appliances, to makes sure the test instances won't show up in the training set, we propose an evaluation strategy. There are 55 households, we train on some households and test on the rest. Here we use leave-one-out cross-validation, where we train on 54 households on test on the rest one household. The process is repeated 55 times and the mean accuracy is reported.
# In[49]:
# We orgainze the variables to be used
allF = np.concatenate((PQ, HarmonicsF,BinF, BinaryF),axis=1)
F = [RawCF, PQ, HarmonicsF,BinF, BinaryF, PCA_RawCF, PCA_BinF, PCA_BinaryF, allF]
namesF = [ 'Raw Current','PQ','Harmonics','Bin Current/Voltage', 'Binary Images', 'PCA on Current',
'PCA on Bin Current/Voltage','PCA on Binary Images','Combined']
print('type label:',type_label)
print('location label:', loc_label)
print('number of different types: %d\n'% len(Unq_type),Unq_type)
print('number of different households: %d\b'% len(Unq_loc))
# In[50]:
for f in F:
print(f.shape)
# In[51]:
start = datetime.now()
n = len(Unq_loc)
num_clf = 9
num_f = len(F)
acc = np.empty([num_f,num_clf])
predictedY = [[np.empty([0],dtype='int')]*num_clf]*num_f
trueY = np.empty([0],dtype='int')
# loop over each household
for i in range(n):
print('%d/%d fold...\t time consumed: %ds'%(i+1,n,(datetime.now()-start).seconds))
# split data into X_train,y_train,X_test,y_test based on households
ix_test = np.where(loc_label==i+1)[0]
ix_train = np.where(loc_label!=i+1)[0]
temp_acc = np.empty([num_f,num_clf])
# loop over each feature
for (j,f) in enumerate(F):
X_test = f[ix_test]
y_test = type_label[ix_test]
X_train = f[ix_train]
y_train = type_label[ix_train]
(temp,y_p,clf_names) = myclassifiers(X_train,y_train,X_test,y_test)
temp_acc[j] = np.array(temp)
predictedY[j] = [np.hstack([predictedY[j][ii],y_p[ii]]) for ii in range(num_clf)]
trueY = np.hstack([trueY,y_test])
acc += temp_acc
# In[52]:
real_acc_table = np.array([[accuracy_score(trueY, i) for i in j] for j in predictedY])
# In[53]:
np.set_printoptions(precision=4)
print(clf_names)
print(real_acc_table)
print(namesF)
# In[54]:
ix=[0,1,2,6,7]
temp_clf_name=[clf_names[i] for i in ix]
temp_table=real_acc_table[:,ix]
print(temp_clf_name)
print(temp_table)
print(namesF)
# In[55]:
print('The highest accuracy is: %.2f' % (1*np.max(real_acc_table)))
(ix1,ix2)=np.where(real_acc_table==np.max(real_acc_table))
print('It is from feature %s using classifier %s'%(namesF[ix1[0]],clf_names[ix2[0]]))
# In[56]:
# draw confusion matrix
plt.style.use('classic')
C = sklearn.metrics.confusion_matrix(trueY, predictedY[ix1[0]][ix2[0]])
conf_arr = C
norm_conf = []
for i in conf_arr:
a = 0
tmp_arr = []
a = sum(i)
for j in i:
tmp_arr.append(float(j)/float(a))
norm_conf.append(tmp_arr)
fig = plt.figure()
plt.clf()
ax = fig.add_subplot(111)
ax.set_aspect(1)
res = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet,
interpolation='nearest')
width = len(conf_arr)
height = len(conf_arr[0])
for x in range(width):
for y in range(height):
ax.annotate(str(conf_arr[x][y]), xy=(y, x),
horizontalalignment='center',
verticalalignment='center')
# cb = fig.colorbar(res)
# alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
alphabet=['AC(1)','CFL(2)','Fan(3)','Fridge(4)','Hairdryer(5)','Heater(6)','Bulb(7)','Laptop(8)','Microwave(9)','Vaccum(10)','Washer(11)']
xlabels=['1','2','3','4','5','6','7','8','9','10','11']
plt.xticks(range(width), xlabels[:width])
plt.yticks(range(height), alphabet[:height])
# plt.savefig('confusion_matrix.png', format='png',dpi=800)
plt.savefig('confusion_matrix.eps', format='eps')
plt.style.use('ggplot')
# In[57]:
# plot four instances
ids = [1,3,5,6,7]
normalize = lambda x:x/(max(x)*1.1)
plt.figure(figsize=(7,6))
for i in ids:
plt.plot(normalize(RawCF[np.where(type_label==i)[0][0],]),label=Unq_type[i-1])
plt.legend()
# ## Downsample test
# In[58]:
def downsample(X,newN):
newN = int(newN)
D = len(X)
seg = D//newN
newX = np.zeros([newN])
for i in range(newN):
newX[i] = X[seg*i]
return newX
# In[59]:
m = len(Data)
n = len(Unq_loc)
num_clf = 9
width = 16
fs = 3e4
f0 = 60
# from 30k Hz to 200 Hz
new_fs = [3e4,2e4]+[i*1e3 for i in range(10,0,-1)] + [i*1e2 for i in range(9,1,-1)]
# new_fs = [i*1e3 for i in range(3,0,-1)] + [i*1e2 for i in range(9,1,-1)]
n_fs = len(new_fs)
img_acc = np.empty([n_fs,num_clf])
predictedY = [[np.empty([0],dtype='int')]*num_clf]*n_fs
# loop over sampling frequency
start = datetime.now()
for ii in range(n_fs):
NN = new_fs[ii]//f0
new_Imgs = np.zeros((m,width,width), dtype=np.float64)
# extract VI image feature for corresponding sampling frequency
for j in range(m):
new_V = downsample(rep_V[j,],NN)
new_I = downsample(rep_I[j,],NN)
new_Imgs[j,:,:] = get_img_from_VI(new_V, new_I, width, True,1)
# new_Imgs[j,:,:] = get_img_from_VI(rep_V[j,], rep_I[j,], width,True,1)
newBinaryF=np.reshape(new_Imgs,(m,width*width))
print('%d/%d\t sampling frequency:%6.0f Hz'%(ii+1,n_fs,new_fs[ii]),end='')
# train/test for each household
trueY = np.empty([0],dtype='int')
for i in range(n):
# split data into X_train,y_train,X_test,y_test based on households
ix_test = np.where(loc_label==i+1)[0]
ix_train = np.where(loc_label!=i+1)[0]
X_test = newBinaryF[ix_test]
y_test = type_label[ix_test]
X_train = newBinaryF[ix_train]
y_train = type_label[ix_train]
(_,y_p,_) = myclassifiers(X_train,y_train,X_test,y_test)
predictedY[ii] = [np.hstack([predictedY[ii][jj],y_p[jj]]) for jj in range(num_clf)]
trueY = np.hstack([trueY,y_test])
# print('%d/%d -- %d/%d fold...\t time consumed: %ds'%(ii,n_fs,i+1,n,(datetime.now()-start).seconds))
img_acc[ii,]= np.array([accuracy_score(trueY, jj) for jj in predictedY[ii]])
print("\t time consumed: %4ds\t accuracy: %.2f" % ((datetime.now()-start).seconds, max(img_acc[ii,])))
# In[60]:
np.set_printoptions(precision=2)
print(img_acc*100)
# In[61]:
ix=[0,1,2,6,7]
legend_names=("kNN(1)", "GNB","LGC","DTree","RForest")
acc_plot=img_acc[:,ix]
print(legend_names)
print(acc_plot)
print(new_fs)
acc_plot
# In[62]:
fig = plt.figure()
markers='.ov^<>'
colors='bgrcmk'
legend_names=("kNN(1)", "GNB","LGC","DTree","RForest")
for i in range(5):
plt.semilogx(new_fs, acc_plot[:,i],colors[i]+markers[i]+'-',label=legend_names[i],linewidth=2.0)
# plt.title('semilogx')
plt.grid(True)
plt.gca().invert_xaxis()
# plt.legend(bbox_to_anchor=(1.05, 1),loc=1, borderaxespad=0.)
plt.legend(loc='lower left',prop={'size':19})
# fig.legend((tuple(l)),(legend_names),'upper right')
# fig.legend((l[0],l[1],l[2],l[3],l[4]),legend_names,'upper right')
plt.xlabel('Sampling Frequency')
plt.ylabel('Accuracy')
# plt.ylim([0,1])
# plt.show()
# plt.savefig('acc_fs.png', format='png',dpi=800)
# plt.savefig('acc_fs.eps', format='eps')