Copyright (C) 2015 Jingkun Gao ([email protected])

This file is part of PLAID code repo.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>

In this IPython Notebook, I will show how to load data and extract features from PLAID. Meanwhile, I will test the performance of appliance identification using PLAID.

Please make sure Python3 and other necessary packages (using pip3 to install *ipython, jinja2, jsonschema, matplotlib, numpy, scipy, pyzmq, pygments, tornado, scikit-learn*) are installed,

In [1]:
%matplotlib inline

import subprocess
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
plt.style.use('ggplot')

from pylab import rcParams
rcParams['figure.figsize'] = (10, 6)

import matplotlib
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 17}
matplotlib.rc('font', **font)

import warnings
warnings.filterwarnings("ignore")
In [2]:
# functions to read data and meta data
def read_data_given_id(path,ids,progress=True,last_offset=0):
    '''read data given a list of ids and CSV paths'''
    start = datetime.now()
    n = len(ids)
    if n == 0:
        return {}
    else:
        data = {}
        for (i,ist_id) in enumerate(ids, start=1):
            if progress and np.mod(i,np.ceil(n/10))==0:
                print('%d/%d (%2.0f%s) have been read...\t time consumed: %ds'\
                      %(i,n,i/n*100,'%',(datetime.now()-start).seconds))
            if last_offset==0:
                data[ist_id] = np.genfromtxt(path+str(ist_id)+'.csv',delimiter=',',\
                                         names='current,voltage',dtype=(float,float))
            else:
                p=subprocess.Popen(['tail','-'+str(int(last_offset)),path+str(ist_id)+'.csv'],\
                                   stdout=subprocess.PIPE)
                data[ist_id] = np.genfromtxt(p.stdout,delimiter=',',names='current,voltage',dtype=(float,float))
        print('%d/%d (%2.0f%s) have been read(Done!) \t time consumed: %ds'\
            %(n,n,100,'%',(datetime.now()-start).seconds)) 
        return data


def clean_meta(ist):
    '''remove None elements in Meta Data ''' 
    clean_ist = ist.copy()
    for k,v in ist.items():
        if len(v) == 0:
            del clean_ist[k]
    return clean_ist
                
def parse_meta(meta):
    '''parse meta data for easy access'''
    M = {}
    for m in meta:
        for app in m:
            M[int(app['id'])] = clean_meta(app['meta'])
    return M
In [3]:
# specify path to read data and meta
# Please make sure you downloaded latest dataset from plaidplug.com.
Data_path = 'PLAID/'
csv_path = Data_path + 'CSV/';

import json

# read meta
with open(Data_path + 'meta1.json') as data_file:    
    meta1 = json.load(data_file)

    
Meta = parse_meta([meta1])
meta1 = parse_meta([meta1])
In [4]:
# read data
# applinace types of all instances
Types = [x['type'] for x in Meta.values()]
# unique appliance types
Unq_type = list(set(Types)) 
Unq_type.sort()
IDs_for_read_data = list(Meta.keys())
# households of appliances
Locs = [x['header']['collection_time']+'_'+x['location'] for x in Meta.values()]
# unique households
Unq_loc = list(set(Locs))
Unq_loc.sort()
Origianl_Unq_type = Unq_type
In [5]:
print('Number of households: %d\nNumber of total measurements:%d'%(len(Unq_loc),len(Locs)))
Number of households: 55
Number of total measurements:1074
In [6]:
# read data
# estimated time cost:  ~ 1 mins
npts = 10000
Data = read_data_given_id(csv_path,IDs_for_read_data,progress=True, last_offset=npts)
108/1074 (10%) have been read...	 time consumed: 8s
216/1074 (20%) have been read...	 time consumed: 20s
324/1074 (30%) have been read...	 time consumed: 32s
432/1074 (40%) have been read...	 time consumed: 44s
540/1074 (50%) have been read...	 time consumed: 55s
648/1074 (60%) have been read...	 time consumed: 64s
756/1074 (70%) have been read...	 time consumed: 76s
864/1074 (80%) have been read...	 time consumed: 88s
972/1074 (91%) have been read...	 time consumed: 98s
1074/1074 (100%) have been read(Done!) 	 time consumed: 107s
In [7]:
print('Total number of instances:%d'%len(Data))
Total number of instances:1074

We start by extracting one representative period of steady state for each instance.

In [8]:
fs = 30000
f0 = 60
NS = fs/f0 # number of samples per period
NP = npts/NS # number of periods for npts

# calculate the representative one period of steady state 
# (mean of the aggregated signals over one cycle)
n = len(Data)
rep_I = np.empty([n,NS])
rep_V = np.empty([n,NS])
for i in range(n):
    tempI = np.sum(np.reshape(Data[i+1]['current'],[NP,NS]),0)/NP
    tempV = np.sum(np.reshape(Data[i+1]['voltage'],[NP,NS]),0)/NP
    # align current to make all samples start from 0 and goes up
    ix = np.argsort(np.abs(tempI))
    j = 0
    while True:
        if ix[j]<499 and tempI[ix[j]+1]>tempI[ix[j]]:
            real_ix = ix[j]
            break
        else:
            j += 1
    rep_I[i,] = np.hstack([tempI[real_ix:],tempI[:real_ix]])
    rep_V[i,] = np.hstack([tempV[real_ix:],tempV[:real_ix]])

Extract labels for appliance type and locations(used for train/test split).

In [9]:
type_Ids = {}
loc_Ids = {}
n = len(Data)
type_label = np.zeros(n,dtype='int')
loc_label = np.zeros(n,dtype='int')
for (ii,t) in enumerate(Unq_type):
    type_Ids[t] = [i-1 for i,j in enumerate(Types,start=1) if j == t]
    type_label[type_Ids[t]] = ii+1
for (ii,t) in enumerate(Unq_loc):
    loc_Ids[t] = [i-1 for i,j in enumerate(Locs,start=1) if j == t]
    loc_label[loc_Ids[t]] = ii+1
print('number of different types: %d'% len(Unq_type))
print('number of different households: %d'% len(Unq_loc))
number of different types: 11
number of different households: 55

We then look at the distributions of appliance types and locations.

In [10]:
plt.hist(type_label,len(Unq_type))
plt.title('distirbution of instances for each type')
print(Origianl_Unq_type)
print('Similified verstion:')
Unq_type[0]='AC';
Unq_type[1]='CFL';
Unq_type[6]='Bulb';
Unq_type[10]='Washer';
print(Unq_type)
plt.savefig('type_dist.eps', format='eps')
['Air Conditioner', 'Compact Fluorescent Lamp', 'Fan', 'Fridge', 'Hairdryer', 'Heater', 'Incandescent Light Bulb', 'Laptop', 'Microwave', 'Vacuum', 'Washing Machine']
Similified verstion:
['AC', 'CFL', 'Fan', 'Fridge', 'Hairdryer', 'Heater', 'Bulb', 'Laptop', 'Microwave', 'Vacuum', 'Washer']

We can see that we have an unbalanced dataset, more instances for CFL, Hairdryer, Laptop and less for Fridge, Heater, Vacuum and Washing Machine.

In [11]:
plt.hist(loc_label,len(Unq_loc))
plt.title('distirbution of instances for each household')
plt.savefig('house_dist.eps', format='eps')

Next, we have rep_I, rep_V, type_label, loc_label. We will start from here to extract features and evalute classification performance. To begin, we first write a function for classifiers.

Classifiers

In [12]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

def myclassifiers(X_train,y_train,X_test,y_test,verbose=False):
    
    knn = KNeighborsClassifier(n_neighbors=1)
    gnb = GaussianNB()
    logistic = LogisticRegression(C=1e5)
    svc = svm.SVC(kernel='rbf',gamma=.7,C=1.0)
    lda = LDA(solver='lsqr', shrinkage='auto')
    qda = QDA()
    dTree = tree.DecisionTreeClassifier(max_depth=10)
    rForest = RandomForestClassifier(max_depth=10,n_estimators=20)
    adaBoost = AdaBoostClassifier()
    
    names = ["Nearest Neighbors(k=1)", "Naive Bayes","Logistic","RBF SVM", "LDA", "QDA",
             "Decision Tree","Random Forest", "AdaBoost"]
    classifiers = [knn, gnb, logistic, svc, lda, qda, dTree, rForest, adaBoost]
    y_predict = []
    acc = []
#     print('Running',end="")
    for (i,clf) in enumerate(classifiers):
        if verbose:
            print(' %s... '% names[i],end="")
        clf.fit(X_train,y_train)
        y_predict.append(clf.predict(X_test))
        acc.append(clf.score(X_test,y_test))
    return (acc,y_predict,names)
In [13]:
# test classifiers

X_train, X_test, y_train, y_test = train_test_split(rep_I[:,:10], type_label, test_size=0.1, random_state=0)
(acc,y_p,names) = myclassifiers(X_train,y_train,X_test,y_test)
In [14]:
acc
Out[14]:
[0.62037037037037035,
 0.46296296296296297,
 0.40740740740740738,
 0.47222222222222221,
 0.41666666666666669,
 0.47222222222222221,
 0.54629629629629628,
 0.62962962962962965,
 0.21296296296296297]

Features

1. Raw Current

RawCF

In [15]:
RawCF = rep_I
# visualization of current features from certain type in RawCF
type_id = 4
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
    plt.subplot(np.ceil(n/10),15,count)
    plt.plot(RawCF[i,],'.')
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
#     plt.title('Id: '+str(k),fontsize=10)
    count += 1
fig.tight_layout()
print(Unq_type[type_id-1])
plt.show()  
Fridge
In [16]:
# plot five instances
ids = [1,8,3,4]
normalize = lambda x:x/(max(x)*1.1)
for i in ids:
    plt.plot(normalize(RawCF[np.where(type_label==i)[0][0],]),'.',label=Unq_type[i-1])
    plt.legend()
plt.xlabel('count')
plt.ylabel('normalized current value')
plt.title('One Period of Steady State')
Out[16]:
<matplotlib.text.Text at 0x11256a6a0>

2. Real/Reactive Power, PQ Plane feature

PQ: Calculate real and reactive power from steady states

In [17]:
f0 = 60
fs = 3e4
n = len(Data)
NN = int(fs//f0)
NS = npts//NN
PQ = np.empty([n,NS,2])

for i in range(n):
    for j in range(NS-1):
        # extract current and voltage in two cycle
        temp_I = Data[i+1]['current'][j*NN:(j+2)*NN]
        temp_V = Data[i+1]['voltage'][j*NN:(j+2)*NN]
        # extract abs part(apparent component), divided by the number of points
        apparI = np.abs(2*np.fft.fft(temp_I))/NN
        apparV = np.abs(2*np.fft.fft(temp_V))/NN
        # phase difference
        theta = np.angle(np.fft.fft(temp_V)) - np.angle(np.fft.fft(temp_I))
        # calculate real/reactive power
        tempP = 1/2*apparV*apparI*np.sin(theta)
        tempQ = 1/2*apparV*apparI*np.cos(theta)
        # sum the power of different orders
#         PQ[i,j,0] = np.sum(tempP)
#         PQ[i,j,1] = np.sum(tempQ)
        # take only the fundamental active/reactive power 
        PQ[i,j,0] = (tempP[2])
        PQ[i,j,1] = (tempQ[2])
PQ = np.delete(PQ,np.where(np.isnan(PQ))[1],1)
# choose the median value among all cycles
PQ = np.median(PQ,1)
In [18]:
import matplotlib.cm as cmx
import matplotlib.colors as colors

def get_cmap(N):
    '''Returns a function that maps each index in 0, 1, ... N-1 to a distinct 
    RGB color.'''
    color_norm  = colors.Normalize(vmin=0, vmax=N-1)
    scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv') 
    def map_index_to_rgb_color(index):
        return scalar_map.to_rgba(index)
    return map_index_to_rgb_color 

cmap = get_cmap(11)
Legends=('AC','CFL','Fan','Fridge','Hairdryer','Heater','Bulb','Laptop','Microwave','Vaccum','Washer')
markers='ov^<>sph+Dx'
colors='bgrcmk'
minx,maxx,miny,maxy = (0,0,0,0)
fig=plt.figure(figsize=(12,6))
ax=fig.add_subplot(1,1,1)
ax.set_position([0.1,0.1,.6,0.8])
# visualization of current features from certain type in PQ
for i in range(11):
    ix=np.where(type_label==i+1)[0]
    # ignore appliances with high real/reactive power for better visualization
    temp_PQ = np.delete(PQ[ix], np.unique(np.where(np.abs(PQ[ix]) > 3000)[0]), 0)
    ix1 = np.random.randint(temp_PQ.shape[0],size=5)
    tempx = temp_PQ[ix1,0]
    tempy = temp_PQ[ix1,1]
    plt.scatter(tempx,tempy,c=cmap(i),marker=markers[i], s=100, label=Legends[i])
    minx = min(minx,min(tempx))
    maxx = max(maxx,max(tempx))
    miny = min(miny,min(tempy))
    maxy = max(maxy,max(tempy))    
plt.axis([minx*0.9,maxx*1.1,miny*0.9,maxy*1.1])
plt.xlabel('Real Power(W)')
plt.ylabel('Reactive Power (VAR)')
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.tight_layout()
plt.savefig('PQ.eps', format='eps')

3. Harmonics

HarmonicsF: We will take up to 21st order of harmonics

In [19]:
n = len(Data)
order = 21
p = order + 1 # number of harmonics to be extracted, include 0-th component
harmonics = np.linspace(0,order,num=p)
fs = 30000
npts = 10000
f0 = 120 # for power

HarmonicsF = np.empty([n,p])
# visualize power harmonics of one instance
ix = 100
temp_P = Data[ix]['current']*Data[ix]['voltage']
x = np.linspace(0,fs,num=npts)
y = np.abs(np.fft.fft(temp_P))
plt.plot(x[:200],y[:200],'.-')
Out[19]:
[<matplotlib.lines.Line2D at 0x112e15fd0>]
In [20]:
harmonics
Out[20]:
array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.])
In [21]:
for i in range(n):
    temp_P = Data[i+1]['current']*Data[i+1]['voltage']
    y = np.abs(np.fft.fft(temp_P))
    h = 40*harmonics
    h = h.astype(int)
    HarmonicsF[i,] =y[h]
HarmonicsF[1,]
Out[21]:
array([ 241954.15403439,  245235.14034795,  149656.45694216,
         60735.40515085,   38803.81561883,   24499.81562646,
         21877.30176295,   34387.33122276,   26462.95516803,
         19350.51102955,   21857.66557608,   14818.50723871,
         12411.08847785,   16376.94359118,   14223.17238561,
         13936.37074914,   13906.62194871,    9392.90600686,
          6516.84211193,    5567.63471569,    5381.62618865,
          7376.58529023])
In [22]:
HarmonicsF.shape
Out[22]:
(1074, 22)
In [23]:
# visualization of harmonics features from certain type
from mpl_toolkits.mplot3d import Axes3D

ix1=np.where(type_label==9)[0][:4]
ix2=np.where(type_label==7)[0][:4]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=HarmonicsF[ix1,0], ys=HarmonicsF[ix1,1], zs=HarmonicsF[ix1,2],c='r',s=80)
ax.scatter(HarmonicsF[ix2,0], HarmonicsF[ix2,1], zs=HarmonicsF[ix2,2],c='b',s=80)
ax.set_xlabel('1st harmonics')
ax.set_ylabel('2nd harmonics')
ax.set_zlabel('3rd harmonics')

plt.show()

4. Down sampled current/voltage

BinF : This feature is to downsample the data to reduce dimension of raw current/voltage.

In [24]:
num = 20 # number of bins

def get_BinF(X,num):
    '''X should be nd array of size N*P, the output will be N*num'''
    (N,P) = X.shape
    newP = np.floor(P/num)*num
    newX = np.reshape(X[:,:newP],[N,num,newP/num])
    BinF = np.sum(newX,2)
    return BinF
BinF_I = get_BinF(rep_I,num)    
BinF_V = get_BinF(rep_V,num)    
In [25]:
BinF = np.hstack([BinF_I,BinF_V])
In [26]:
BinF.shape
Out[26]:
(1074, 40)
In [27]:
Unq_type
Out[27]:
['AC',
 'CFL',
 'Fan',
 'Fridge',
 'Hairdryer',
 'Heater',
 'Bulb',
 'Laptop',
 'Microwave',
 'Vacuum',
 'Washer']
In [28]:
BinF[1,:]
Out[28]:
array([  1.71650000e+00,   1.75350000e+00,   1.56700000e+00,
         1.41700000e+00,   1.22300000e+00,   1.03950000e+00,
         1.12575000e+01,   1.89370000e+01,   1.20715000e+01,
         3.95400000e+00,  -1.60750000e+00,  -1.62750000e+00,
        -1.38850000e+00,  -1.23150000e+00,  -1.04950000e+00,
        -8.66500000e-01,  -1.11200000e+01,  -1.86765000e+01,
        -1.18595000e+01,  -3.74000000e+00,  -3.99555650e+03,
        -3.16559200e+03,  -2.13829055e+03,  -9.16609555e+02,
         4.70130156e+02,   1.70416765e+03,   2.81547730e+03,
         3.53513050e+03,   4.05019950e+03,   4.20040500e+03,
         3.96450650e+03,   3.13474400e+03,   2.10894995e+03,
         8.88079580e+02,  -5.06118388e+02,  -1.74206265e+03,
        -2.85133050e+03,  -3.57194350e+03,  -4.08391200e+03,
        -4.23233850e+03])
In [29]:
i = 100
plt.plot(BinF_I[i,:],'.')
plt.title('Down Sampled Current of ' + Unq_type[type_label[i]])
plt.xlabel('count')
plt.ylabel('value')
plt.show()
In [30]:
# visualization of all Bin current features from certain type in BinF
type_id = 4
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
    plt.subplot(np.ceil(n/10),15,count)
    plt.plot(BinF_I[i,],'.')
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
#     plt.title('Id: '+str(k),fontsize=10)
    count += 1
fig.tight_layout()
plt.show()  

5. VI binary image

BinaryF: Raw 0/1 binary VI images

In [31]:
def center(X,w):
    minX = np.amin(X)
    maxX = np.amax(X)
    dist = max(abs(minX),maxX)
    X[X<-dist] = -dist
    X[X>dist] = dist
    d = (maxX-minX)/w
    return (X,d)
    
def get_img_from_VI(V, I, width,hard_threshold=False,para=.5):
    '''Get images from VI, hard_threshold, set para as threshold to cut off,5-10
    soft_threshold, set para to .1-.5 to shrink the intensity'''
    
    d = V.shape[0]
    # doing interploation if number of points is less than width*2
    if d<2* width:
        newI = np.hstack([V, V[0]])
        newV = np.hstack([I, I[0]])
        oldt = np.linspace(0,d,d+1)
        newt = np.linspace(0,d,2*width)
        I = np.interp(newt,oldt,newI)
        V = np.interp(newt,oldt,newV)
    # center the current and voltage, get the size resolution of mesh given width    
    (I,d_c)  = center(I,width)
    (V,d_v)  = center(V,width)
    
    #  find the index where the VI goes through in current-voltage axis
    ind_c = np.floor((I-np.amin(I))/d_c)
    ind_v = np.floor((V-np.amin(V))/d_v)
    ind_c[ind_c==width] = width-1
    ind_v[ind_v==width] = width-1
    
    Img = np.zeros((width,width))
    
    for i in range(len(I)):
        Img[ind_c[i],width-ind_v[i]-1] += 1
    
    if hard_threshold:
        Img[Img<para] = 0
        Img[Img!=0] = 1
        return Img
    else:
        return (Img/np.max(Img))**para
    
In [32]:
n = len(Data)
width = 16

Imgs = np.zeros((n,width,width), dtype=np.float64)
for i in range(n):
    Imgs[i,:,:] = get_img_from_VI(rep_V[i,], rep_I[i,], width,True,1)
BinaryF=np.reshape(Imgs,(n,width*width))
In [33]:
# visualization of all imgaes from certain type in Imgs
type_id = 1
n = sum(type_label==type_id)
fig = plt.figure(figsize=(14,np.ceil(n/15)))
count = 1
for i in np.where(type_label==type_id)[0]:
    plt.subplot(np.ceil(n/10),15,count)
    plt.imshow(Imgs[i,:,:],cmap = cm.Greys_r,interpolation='None')
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
#     plt.title('Id: '+str(k),fontsize=10)
    count += 1
fig.tight_layout()
plt.show()  
In [34]:
plt.style.use('classic')
# AC-1,CFL-2,hairdryer-5,laptop-8
ix=np.where(type_label==1)[0][30]
fig=plt.figure(figsize=(16,10))
ax1 = fig.add_subplot(2,3,1)
plt.plot(rep_V[ix,:],rep_I[ix,:])
plt.title('VI trajectory of Air Conditioner')
# plt.xlabel('Voltage')
# plt.ylabel('Current')

ax = fig.add_subplot(2,3,2)
plt.plot(rep_V[ix,:],rep_I[ix,:],'.')
xticks = np.linspace(min(rep_V[ix,:]),max(rep_V[ix,:]),num=16)
yticks = np.linspace(min(rep_I[ix,:]),max(rep_I[ix,:]),num=16)
plt.axis([min(rep_V[ix,:]),max(rep_V[ix,:]),min(rep_I[ix,:]),max(rep_I[ix,:])])
ax.set_xticks(xticks)
ax.set_yticks(yticks)
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
# plt.tick_params(axis='y',which='both',labelbottom='off')
plt.title('Meshed version of VI')
ax.grid(which='both')

ax3 = fig.add_subplot(2,3,3)
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.title('VI binary image of Air Conditioner')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')

ax4 = fig.add_subplot(2,3,4)
ix=np.where(type_label==2)[0][13]
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
plt.title('VI binary image of CFL')


ax5 = fig.add_subplot(2,3,5)
ix=np.where(type_label==5)[0][19]
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
plt.title('VI binary image of Hairdryer')

ax6 = fig.add_subplot(2,3,6)
ix=np.where(type_label==8)[0][13]
plt.imshow(Imgs[ix,:,:],cmap = cm.Greys_r,interpolation='None')
plt.tick_params(axis='both',which='both',labelbottom='off',labelleft='off')
plt.title('VI binary image of Laptop')


plt.tight_layout()
plt.show()
fig.savefig('VI.eps', format='eps')
plt.style.use('ggplot')

6. PCA on abve three

PCA_BinF, PCA_BinaryF, PCA_RawCF: Use projected components which keep more than 99% variations as the features

In [35]:
from sklearn.decomposition import PCA

# write a function to extract up to certain percent of projected components
def get_PCs(X,p):
    # X: nd array of size sample_nums*features, 
    # p: percentage of variation to be taken
    pca = PCA(whiten=True)
    pca.fit(X)
    ix=np.where(np.cumsum(pca.explained_variance_ratio_)>p)[0][0]
    pca = PCA(n_components=ix,whiten=True)
    return pca.fit_transform(X)
    
In [36]:
p = .99
PCA_BinF = get_PCs(BinF,p)
PCA_BinaryF = get_PCs(BinaryF,p)
PCA_RawCF = get_PCs(RawCF,p)
In [37]:
np.isnan(np.sum(PCA_BinF))
Out[37]:
False
In [38]:
np.isnan(np.sum(PCA_BinaryF))
Out[38]:
False
In [39]:
np.isnan(np.sum(PCA_RawCF))
Out[39]:
False

TSNE vs PCA

To visualize the features, we use TSNE and PCA to project them into 2 dimensional space.

In [40]:
def lowDimVisualize(F,filename):

    from sklearn.manifold import TSNE
    from sklearn.decomposition import PCA
    model = TSNE(n_components=2, random_state=0)
    pca = PCA(n_components=2,whiten=True)
    newHar = model.fit_transform(F)

    TSNEF = newHar
    PCAF = pca.fit_transform(F)


    colors = cm.rainbow(np.linspace(0, 1, 11))
    Legends=('AC','CFL','Fan','Fridge','Hairdryer','Heater','Bulb','Laptop','Microwave','Vaccum','Washer')
    # markers='ov^<>sph+Dx'
    fig = plt.figure(figsize=(14,7))
    plt.subplot(1,2,1)
    for i, c in zip(range(11), colors):
        ix = np.where(type_label==i+1)[0]
        plt.scatter(TSNEF[ix,0], TSNEF[ix,1], color=c,label=Legends[i])
    plt.title('2D Visulization of TSNE')
    plt.subplot(1,2,2)
    for i, c in zip(range(11), colors):
        ix = np.where(type_label==i+1)[0]
        plt.scatter(PCAF[ix,0], PCAF[ix,1], color=c,label=Legends[i])
    plt.title('2D Visulization of PCA')

    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    # plt.tight_layout()
    fig.savefig(filename, format='eps')
In [41]:
# look at harmonics
lowDimVisualize(HarmonicsF,'harmonics_TSNE_PCA.eps')
In [42]:
# look at binary images
lowDimVisualize(BinaryF,'Binary_TSNE_PCA.eps')
In [43]:
# look at raw current
lowDimVisualize(RawCF,'raw_current_TSNE_PCA.eps')