This notebook composed by Justin Elszasz.
Exploratory data analysis is conducted for a dataset consisting of hourly apartment electricity consumption in kilowatt-hours. The data are obtained directly from the utility company Baltimore Gas and Electric via the Green Button smart meter data protocol. Additionally, hourly weather data are obtained using the open API at Weather Underground (weatherunderground.com).
Data are missing from May 1 to May 23. There are two apartments shown. A description of the differences between the two apartments can be found here: http://jtelszasz.github.io/blog_project/articles/The-New-Pad/
import pandas as pd
import numpy as np
import import_funcs
#reload(import_funcs)
I wrote a couple of functions to cleanly import the electricity-use data and the weather data (the latter via the Weather Underground API) and put them in a module of their own (import_funcs.py). I did this (as opposed to importing directly here) because I intend to use those functions elsewhere. Sorry I don't have them nicely embedded here.
# Load the BGE electricity data for each apartment and the weather data
# through the helpers in import_funcs.py.
old_apt = import_funcs.BGEdata("old apt")
new_apt = import_funcs.BGEdata("new apt")
weather = import_funcs.weather()

# Combine weather with each apartment's usage by joining on the index of both
# sides (merge defaults to an inner join).
old_apt_and_weather = weather.merge(old_apt, left_index=True, right_index=True)
new_apt_and_weather = weather.merge(new_apt, left_index=True, right_index=True)
#new_apt_and_weather = new_apt

# Save the new-apartment usage data to CSV.
new_apt.to_csv('New_Apt_thru_Jan_2015.csv')
# Pretty colors
# Hex colour strings shared by the plots in this notebook.
gray_light = '#bbbbbb'  # used for the old-apartment series
gray_med = '#737373'
red_orange = '#ff3700'  # used for the new-apartment series
# Time series of electricity usage for both apartments on one set of axes.
fig = plt.figure(figsize=[10, 3])
plot_south = old_apt_and_weather['USAGE'].plot(color=gray_light, linewidth=.6)
# grid takes a boolean; the string 'off' is truthy and is only understood as
# "no grid" by old matplotlib releases.
plot_north = new_apt_and_weather['USAGE'].plot(color=red_orange, grid=False, linewidth=.6)
# Span from the first old-apartment reading to the last new-apartment reading.
plt.xlim([old_apt_and_weather.index.min(), new_apt_and_weather.index.max()])
plt.title('Time Series, Hourly Electricity Consumption')
plt.legend(['Old Apt', 'New Apt'], loc='upper left')
plt.ylabel('Usage (kWh)')
plt.xlabel('')
fig.savefig('plots/New-Old-TS.png')
# Time series of electricity usage for the new apartment only.
fig = plt.figure(figsize=[14, 3])
# grid takes a boolean; the string 'off' is truthy and is only understood as
# "no grid" by old matplotlib releases.
plot_north = new_apt_and_weather['USAGE'].plot(color=red_orange, grid=False, linewidth=.6)
plt.title('Time Series, Hourly Electricity Consumption')
plt.ylabel('Usage (kWh)')
plt.xlabel('')
#fig.savefig('plots/New-TS.png')
<matplotlib.text.Text at 0xa90b750>
# Scatter of electricity usage against outdoor temperature, one marker colour
# per apartment.
fig = plt.figure(figsize=[8, 6])
plot_old = plt.plot(
    old_apt_and_weather['tempF'],
    old_apt_and_weather['USAGE'],
    'x',
    color=gray_light,
)
plot_new = plt.plot(
    new_apt_and_weather['tempF'],
    new_apt_and_weather['USAGE'],
    'x',
    color=red_orange,
)
plt.legend(['Old Apt', 'New Apt'], loc='upper left')
plt.title('Electricity Usage and Outdoor Temperature')
plt.ylabel('Usage (kWh)')
plt.xlabel('Outdoor Temperature (deg F)')
fig.savefig('plots/New-Old-Elec-Temp.png')
Define functions to produce a histogram and a cumulative distribution.
# Histogram
def elec_hist(data, date_range, hist_title):
    """Draw a histogram of usage for one date range on a new figure.

    Bar heights are proportions of the selected observations (each
    observation is weighted by 1/N), not raw counts.

    Parameters
    ----------
    data : pandas.Series
        Usage values; callers in this notebook pass the 'USAGE' column.
    date_range : str
        Label passed to ``data.loc[...]`` to select the period, e.g.
        'feb 2014'.
    hist_title : str
        Title for the plot.
    """
    # Select once and reuse; the selection feeds the bars, the weights and
    # the sample-size annotation.
    selected = data.loc[date_range]
    n_hours = selected.size

    plt.figure()
    # Float weights so the 1/N scaling cannot floor-divide to zero for
    # integer-valued data.
    weights = np.ones(n_hours, dtype=float) / n_hours
    axes = selected.hist(weights=weights, bins=20, grid=False)
    plt.ylabel('Proportion')
    plt.xlabel('Usage (kWh)')
    plt.title(hist_title)
    # Annotate the sample size; the position is in data coordinates.
    axes.text(0.1, .14, 'N = %d hours' % n_hours)
# Cumulative Distribution Function
def elec_cdf(data, date_range, cdf_title):
    """Draw the empirical cumulative distribution of usage on a new figure.

    Parameters
    ----------
    data : pandas.Series
        Usage values; presumably the 'USAGE' column, as with elec_hist.
    date_range : str
        Label passed to ``data.loc[...]`` to select the period.
    cdf_title : str
        Title for the plot.
    """
    selected = data.loc[date_range]

    plt.figure()
    # A normalised, cumulative step histogram with many bins approximates
    # the CDF.
    # NOTE(review): later matplotlib releases replace `normed` with
    # `density`; switch the keyword if the plotting stack is upgraded.
    selected.hist(normed=1, cumulative=True, histtype='step', bins=200, grid=False)
    plt.ylabel('Proportion of Hours (x <= X)')
    plt.xlabel('Usage (kWh)')
    plt.title(cdf_title)
# Histogram of old-apartment usage for February 2014.
elec_hist(
    data=old_apt_and_weather['USAGE'],
    date_range='feb 2014',
    hist_title='Old Apartment \n February 2014',
)
The following histogram is for the new apartment.
# Histogram of new-apartment usage for June 2014.
elec_hist(
    data=new_apt_and_weather['USAGE'],
    date_range='june 2014',
    hist_title='New Apartment \n June 2014',
)
Below are the cumulative distributions of hourly usage for the old and new apartments, which give the proportion of hours in which usage is a given quantity or less.
# Empirical CDFs of usage for both apartments on one set of axes.
fig = plt.figure()
# NOTE(review): later matplotlib releases replace `normed` with `density`;
# switch the keyword if the plotting stack is upgraded.
CDF_old = old_apt_and_weather['USAGE'].hist(normed=1, cumulative=True, histtype='step', bins=200, grid=False)
CDF_new = new_apt_and_weather['USAGE'].hist(normed=1, cumulative=True, histtype='step', bins=200, grid=False)
# Right-hand limit is the largest reading seen in either apartment.
plt.xlim(xmax=max(old_apt_and_weather['USAGE'].max(), new_apt_and_weather['USAGE'].max()))
plt.ylim(ymax=1)
plt.ylabel('Proportion of Hours (x <= X)')
plt.xlabel('Usage (kWh)')
plt.title('Elec. Usage Cumulative Distributions')
# Pass loc by keyword, as the other legends in this notebook do; a second
# positional argument is read as the label list by newer matplotlib.
plt.legend(['Old Apt', 'New Apt'], loc='lower right')
#fig.savefig('ElecCDF - Feb_March2014.png')
<matplotlib.legend.Legend at 0xace6410>
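The function below computes an average weekday and an average weekend day (mean electricity usage and outdoor temperature for each hour of the day) over a given date range.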
# Define a function to calculate average weekday and weekend
def avg_day(df, start_date, end_date):
    """Return the average weekday and the average weekend day for a date range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'USAGE' and 'tempF' columns and a pandas DatetimeIndex.
    start_date, end_date : str or datetime-like
        Bounds of the label slice ``df.loc[start_date:end_date]``.

    Returns
    -------
    (average_weekday, average_weekend) : tuple of pandas.DataFrame
        Each frame has the rows 'USAGE' and 'tempF' and one column per hour
        of day found in the data; each cell is the mean over the selected
        days. If the range holds no weekday (or no weekend) rows, the
        corresponding frame simply has no columns.
    """
    df_dates = df.loc[start_date:end_date]

    # dayofweek counts Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    is_weekend = np.asarray(df_dates.index.dayofweek >= 5)

    def hourly_means(frame):
        # Mean of each field by hour of day, transposed so that the fields
        # are rows and the hours are columns.
        return frame.groupby(frame.index.hour)[['USAGE', 'tempF']].mean().T

    # Split before grouping so a range lacking one day type yields an empty
    # frame instead of a failed lookup.
    average_weekday = hourly_means(df_dates[~is_weekend])
    average_weekend = hourly_means(df_dates[is_weekend])
    return average_weekday, average_weekend
This plots the average weekday and average weekend day for the old apartment in March 2014 and for the new apartment in June 2014. Subsequent plots separate February and March.
# Average weekday / weekend profiles: March 2014 at the old apartment and
# June 2014 at the new one.
(weekday_oldapt_march,
 weekend_oldapt_march) = avg_day(old_apt_and_weather, '1-mar-2014', '31-mar-2014')
(weekday_newapt_june,
 weekend_newapt_june) = avg_day(new_apt_and_weather, '1-jun-2014', '30-jun-2014')
# Average-day usage profiles: dashed lines are weekends, solid lines are
# weekdays; gray is the old apartment, red-orange the new one.
fig = plt.figure(figsize=[12, 6])
weekend_linestyle = 'dashed'
linewidth = 2
plot1 = weekend_oldapt_march.loc['USAGE'].plot(linestyle=weekend_linestyle, marker='', color=gray_light, linewidth=linewidth)
plot2 = weekday_oldapt_march.loc['USAGE'].plot(marker='', color=gray_light, linewidth=linewidth)
plot3 = weekend_newapt_june.loc['USAGE'].plot(linestyle=weekend_linestyle, marker='', color=red_orange, linewidth=linewidth)
# grid takes a boolean; the string 'off' is truthy and is only understood as
# "no grid" by old matplotlib releases.
plot4 = weekday_newapt_june.loc['USAGE'].plot(marker='', color=red_orange, grid=False, linewidth=linewidth)
plt.ylim(ymin=0, ymax=3)
plt.xlim(xmax=23)
plt.xticks(np.arange(0, 23, 2))
plt.xlabel('Hour of Day')
plt.ylabel('Electricity Usage (kWh)')
plt.title('Average Day of Electricity Consumption')
# Daily totals are the sum of the hourly means, shown to one decimal place;
# formatting a value already rounded to one decimal with %.2f printed a
# meaningless trailing zero.
# loc is passed by keyword, as the other legends in this notebook do; a second
# positional argument is read as the label list by newer matplotlib.
plt.legend(
    [
        'Old Apt, Weekend in March, (Daily Total = %.1f kWh)' % weekend_oldapt_march.loc['USAGE'].sum(),
        'Old Apt, Weekday in March, (Daily Total = %.1f kWh)' % weekday_oldapt_march.loc['USAGE'].sum(),
        'New Apt, Weekend in June, (Daily Total = %.1f kWh)' % weekend_newapt_june.loc['USAGE'].sum(),
        'New Apt, Weekday in June, (Daily Total = %.1f kWh)' % weekday_newapt_june.loc['USAGE'].sum(),
    ],
    loc='upper right',
)
fig.savefig('plots/Average_Days_New_and_Old_Apts.png')
# Print the daily total (sum of the hourly means) for each average-day
# profile. Each line is pre-formatted into a single string so the output is
# the same whether print is a statement (Python 2) or a function (Python 3).
print('Total Daily Usage')
for label, profile in [
    ('Weekday @ Old Apartment in March:', weekday_oldapt_march),
    ('Weekend @ Old Apartment in March:', weekend_oldapt_march),
    ('Weekday @ New Apartment in June:', weekday_newapt_june),
    ('Weekend @ New Apartment in June:', weekend_newapt_june),
]:
    print('%s %s' % (label, round(profile.loc['USAGE'].sum(), 1)))
Total Daily Usage Weekday @ Old Apartment in March: 22.0 Weekend @ Old Apartment in March: 18.0 Weekday @ New Apartment in June: 19.8 Weekend @ New Apartment in June: 24.5
# Mean daily electricity total per calendar month at the old apartment.
usage = old_apt_and_weather['USAGE']
# One total per calendar day.
daily_totals = usage.groupby(usage.index.date).sum()
# Convert the per-day labels to a DatetimeIndex in place. Re-indexing the
# daily sums against the hourly date index, as before, has one entry per
# hourly reading rather than one per day.
daily_totals.index = pd.to_datetime(daily_totals.index)
a = pd.DataFrame(daily_totals)
# NOTE(review): grouping is by month number only, so the same month from
# different years would be averaged together.
a = a.groupby(a.index.month).mean().plot(marker='o')