import numpy as np
import pandas as pd
from datetime import datetime
import time
import calendar
import matplotlib.dates as dates
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import seaborn as sns
# Color palette used for the per-year plot series (8 hues, dark and saturated).
colors = sns.hls_palette(8, l=.3, s=.8)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
# Suppress warnings (notably pandas chained-assignment warnings triggered
# by the in-place edits further down).
warnings.filterwarnings('ignore')
Get the data and put it into a pandas DataFrame (nb = northbound crossings, sb = southbound crossings)
# Download the Fremont Bridge hourly bicycle-counter data from the Seattle
# open-data portal and load it into a DataFrame.
fileUrl = 'https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD'
df = pd.read_csv(fileUrl)
# Shorter column names: nb = northbound, sb = southbound crossings.
df.columns = ['timestamp', 'nb', 'sb']
# Combined crossings in both directions for each hourly reading.
df['tot'] = df.nb + df.sb
Parse the timestamp column
# Normalize the timestamp column to a consistent "MM/DD/YYYY HH:MM:SS AM/PM" string.
# pd.datetools.parse was removed from pandas; pd.to_datetime is the supported
# replacement and is vectorized, so it is also far faster than row-wise apply().
df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime("%m/%d/%Y %I:%M:%S %p")
Add columns for year, weekday name, weekday number, hour, month, day of month, and day of year
#Note: This is slow. There must be a better way
# Derive all calendar components in one vectorized pass with the .dt accessor.
# The original parsed every row with the removed pd.datetools.parse (and set
# 'year' twice); this produces the same integer/string values much faster.
_ts = pd.to_datetime(df['timestamp'])
df['year'] = _ts.dt.year % 100               # two-digit year as int (matches '%y')
df['weekdayname'] = _ts.dt.strftime('%A')    # full weekday name, e.g. 'Tuesday'
df['weekday'] = (_ts.dt.dayofweek + 1) % 7   # '%w' convention: Sunday=0 .. Saturday=6
df['hour'] = _ts.dt.hour
df['month'] = _ts.dt.month
df['day'] = _ts.dt.day
df['dayofyear'] = _ts.dt.dayofyear           # 1..366 as int (matches '%j')
# Fractional day of year: whole day plus the hour as a fraction of 24.
df['dayofyear_float'] = df['dayofyear'] + df['hour'] / 24.0
Take a look at the first 20 rows
# Inspect the first 20 rows to sanity-check the parsing and derived columns.
df.head(20)
timestamp | nb | sb | tot | year | weekdayname | weekday | hour | month | day | dayofyear | dayofyear_float | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10/02/2012 12:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 0 | 10 | 2 | 276 | 276.000000 |
1 | 10/02/2012 01:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 1 | 10 | 2 | 276 | 276.041667 |
2 | 10/02/2012 02:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 2 | 10 | 2 | 276 | 276.083333 |
3 | 10/02/2012 03:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 3 | 10 | 2 | 276 | 276.125000 |
4 | 10/02/2012 04:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 4 | 10 | 2 | 276 | 276.166667 |
5 | 10/02/2012 05:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 5 | 10 | 2 | 276 | 276.208333 |
6 | 10/02/2012 06:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 6 | 10 | 2 | 276 | 276.250000 |
7 | 10/02/2012 07:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 7 | 10 | 2 | 276 | 276.291667 |
8 | 10/02/2012 08:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 8 | 10 | 2 | 276 | 276.333333 |
9 | 10/02/2012 09:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 9 | 10 | 2 | 276 | 276.375000 |
10 | 10/02/2012 10:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 10 | 10 | 2 | 276 | 276.416667 |
11 | 10/02/2012 11:00:00 AM | 0 | 0 | 0 | 12 | Tuesday | 2 | 11 | 10 | 2 | 276 | 276.458333 |
12 | 10/02/2012 12:00:00 PM | 0 | 0 | 0 | 12 | Tuesday | 2 | 12 | 10 | 2 | 276 | 276.500000 |
13 | 10/02/2012 01:00:00 PM | 7 | 48 | 55 | 12 | Tuesday | 2 | 13 | 10 | 2 | 276 | 276.541667 |
14 | 10/02/2012 02:00:00 PM | 55 | 75 | 130 | 12 | Tuesday | 2 | 14 | 10 | 2 | 276 | 276.583333 |
15 | 10/02/2012 03:00:00 PM | 81 | 71 | 152 | 12 | Tuesday | 2 | 15 | 10 | 2 | 276 | 276.625000 |
16 | 10/02/2012 04:00:00 PM | 167 | 111 | 278 | 12 | Tuesday | 2 | 16 | 10 | 2 | 276 | 276.666667 |
17 | 10/02/2012 05:00:00 PM | 393 | 170 | 563 | 12 | Tuesday | 2 | 17 | 10 | 2 | 276 | 276.708333 |
18 | 10/02/2012 06:00:00 PM | 236 | 145 | 381 | 12 | Tuesday | 2 | 18 | 10 | 2 | 276 | 276.750000 |
19 | 10/02/2012 07:00:00 PM | 104 | 71 | 175 | 12 | Tuesday | 2 | 19 | 10 | 2 | 276 | 276.791667 |
Take a look at the last 20 rows
# Inspect the last 20 rows to confirm the full date range loaded correctly.
df.tail(20)
timestamp | nb | sb | tot | year | weekdayname | weekday | hour | month | day | dayofyear | dayofyear_float | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
18220 | 10/31/2014 04:00:00 AM | 1 | 3 | 4 | 14 | Friday | 5 | 4 | 10 | 31 | 304 | 304.166667 |
18221 | 10/31/2014 05:00:00 AM | 7 | 22 | 29 | 14 | Friday | 5 | 5 | 10 | 31 | 304 | 304.208333 |
18222 | 10/31/2014 06:00:00 AM | 18 | 57 | 75 | 14 | Friday | 5 | 6 | 10 | 31 | 304 | 304.250000 |
18223 | 10/31/2014 07:00:00 AM | 52 | 128 | 180 | 14 | Friday | 5 | 7 | 10 | 31 | 304 | 304.291667 |
18224 | 10/31/2014 08:00:00 AM | 66 | 203 | 269 | 14 | Friday | 5 | 8 | 10 | 31 | 304 | 304.333333 |
18225 | 10/31/2014 09:00:00 AM | 42 | 94 | 136 | 14 | Friday | 5 | 9 | 10 | 31 | 304 | 304.375000 |
18226 | 10/31/2014 10:00:00 AM | 18 | 34 | 52 | 14 | Friday | 5 | 10 | 10 | 31 | 304 | 304.416667 |
18227 | 10/31/2014 11:00:00 AM | 14 | 22 | 36 | 14 | Friday | 5 | 11 | 10 | 31 | 304 | 304.458333 |
18228 | 10/31/2014 12:00:00 PM | 17 | 16 | 33 | 14 | Friday | 5 | 12 | 10 | 31 | 304 | 304.500000 |
18229 | 10/31/2014 01:00:00 PM | 16 | 25 | 41 | 14 | Friday | 5 | 13 | 10 | 31 | 304 | 304.541667 |
18230 | 10/31/2014 02:00:00 PM | 25 | 27 | 52 | 14 | Friday | 5 | 14 | 10 | 31 | 304 | 304.583333 |
18231 | 10/31/2014 03:00:00 PM | 76 | 44 | 120 | 14 | Friday | 5 | 15 | 10 | 31 | 304 | 304.625000 |
18232 | 10/31/2014 04:00:00 PM | 134 | 81 | 215 | 14 | Friday | 5 | 16 | 10 | 31 | 304 | 304.666667 |
18233 | 10/31/2014 05:00:00 PM | 228 | 100 | 328 | 14 | Friday | 5 | 17 | 10 | 31 | 304 | 304.708333 |
18234 | 10/31/2014 06:00:00 PM | 106 | 75 | 181 | 14 | Friday | 5 | 18 | 10 | 31 | 304 | 304.750000 |
18235 | 10/31/2014 07:00:00 PM | 44 | 21 | 65 | 14 | Friday | 5 | 19 | 10 | 31 | 304 | 304.791667 |
18236 | 10/31/2014 08:00:00 PM | 22 | 9 | 31 | 14 | Friday | 5 | 20 | 10 | 31 | 304 | 304.833333 |
18237 | 10/31/2014 09:00:00 PM | 20 | 12 | 32 | 14 | Friday | 5 | 21 | 10 | 31 | 304 | 304.875000 |
18238 | 10/31/2014 10:00:00 PM | 13 | 11 | 24 | 14 | Friday | 5 | 22 | 10 | 31 | 304 | 304.916667 |
18239 | 10/31/2014 11:00:00 PM | 6 | 12 | 18 | 14 | Friday | 5 | 23 | 10 | 31 | 304 | 304.958333 |
View the raw data
# Quick look at the raw hourly totals over the full time span.
fig, ax = plt.subplots(figsize=(20, 5))
df.plot(ax=ax, x='timestamp', y='tot')
<matplotlib.axes._subplots.AxesSubplot at 0x10c3631d0>
The big spikes in early 2014 were reported by SDOT as faulty sensor data, and were supposed to have been removed from the raw data, but apparently weren't. See here: http://www.seattlebikeblog.com/2014/04/29/monday-appears-to-smash-fremont-bridge-bike-counter-record/
Show the spurious data points:
# Rows with implausibly large northbound counts: the faulty-sensor spikes.
df[df.nb>1000]
timestamp | nb | sb | tot | year | weekdayname | weekday | hour | month | day | dayofyear | dayofyear_float | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
13641 | 04/23/2014 09:00:00 AM | 1217 | 178 | 1395 | 14 | Wednesday | 3 | 9 | 4 | 23 | 113 | 113.375000 |
13689 | 04/25/2014 09:00:00 AM | 1186 | 167 | 1353 | 14 | Friday | 5 | 9 | 4 | 25 | 115 | 115.375000 |
13762 | 04/28/2014 10:00:00 AM | 2621 | 58 | 2679 | 14 | Monday | 1 | 10 | 4 | 28 | 118 | 118.416667 |
13785 | 04/29/2014 09:00:00 AM | 1795 | 222 | 2017 | 14 | Tuesday | 2 | 9 | 4 | 29 | 119 | 119.375000 |
Replace the spurious points with the average of the points before and after each
# Replace each spurious reading with the average of its immediate neighbors.
# Writes go through .loc: chained indexing (df.nb[i] = ...) is not guaranteed
# to modify df (SettingWithCopy; a silent no-op under pandas copy-on-write).
for i in df[df.nb > 1000].index:
    # // keeps the integer floor-average the original Python-2 '/' produced
    df.loc[i, 'nb'] = (df.loc[i - 1, 'nb'] + df.loc[i + 1, 'nb']) // 2
# Recalculate the total column from the corrected northbound counts.
df['tot'] = df['nb'] + df['sb']
Plot the raw data again to confirm that the big spikes in data are gone
# Re-plot the hourly totals to verify the faulty-sensor spikes are gone.
fig, ax = plt.subplots(figsize=(20, 5))
df.plot(ax=ax, x='timestamp', y='tot')
<matplotlib.axes._subplots.AxesSubplot at 0x10b72de50>
Make a new dataframe with average crossings for each day of the week
plt.rcParams['figure.figsize'] = (10.0, 5)
# daydf: one row per (weekday, year) holding the mean and standard deviation
# of the daily crossing totals (overall, northbound, southbound).
daydf = pd.DataFrame(columns=('day', 'year', 'm_tot','standev_tot','m_nb','standev_nb','m_sb','standev_sb'))
week = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
count = 0
for day in np.sort(df.weekday.unique()):
    for year in df.year.unique():
        tot = []
        nb = []
        sb = []
        # Each dayofyear in this (year, weekday) group identifies one calendar
        # day; sum its 24 hourly rows to get that day's total crossings.
        for d in df[(df.year == year) & (df.weekday == day)].dayofyear.unique():
            tot.append(int(np.sum(df[(df.year == year) & (df.dayofyear == d)].tot)))
            nb.append(int(np.sum(df[(df.year == year) & (df.dayofyear == d)].nb)))
            sb.append(int(np.sum(df[(df.year == year) & (df.dayofyear == d)].sb)))
        # NOTE(review): np.mean/np.std of an empty list yield nan with a runtime
        # warning (suppressed above) — can occur if a weekday never appears in a year.
        daydf.loc[count] = [week[day],year,np.mean(tot),np.std(tot),np.mean(nb),np.std(nb),np.mean(sb),np.std(sb)]
        count += 1
Plot average crossings by day of week for each year:
fig = plt.figure()
fig.set_size_inches(10,5)
ax = fig.add_subplot(111)
# Grouped bar chart: one 0.85-wide group per weekday, one bar per year.
bar_width = 0.85/len(df.year.unique())
for i in range(len(df.year.unique())):
    means = np.array(daydf[daydf.year == df.year.unique()[i]].m_tot)
    standev = np.array(daydf[daydf.year == df.year.unique()[i]].standev_tot)
    # Shift each year's bars by i*bar_width so the years sit side by side.
    plt.bar(np.sort(df.weekday.unique())+i*bar_width,means,width=bar_width,yerr=standev,color=colors[i],error_kw=dict(ecolor='gray'))
ax.set_xticklabels(['Sun','Mon','Tue','Wed','Thu','Fri','Sat'])
# Center the tick under each group of bars (half the total group width).
ax.set_xticks(np.sort(df.weekday.unique())+0.85/2)
ax.legend(['2012','2013','2014'],loc=2)
ax.set_xlabel('Day of Week',fontsize =14)
ax.set_ylabel('Count',fontsize=14)
ax.set_title('Total crossings by day of week',fontsize=16)
fig = plt.figure()
fig.set_size_inches(10,5)
ax = fig.add_subplot(111)
bar_width = 0.85/len(df.year.unique())
for i in range(len(df.year.unique())):
    means_nb = np.array(daydf[daydf.year == df.year.unique()[i]].m_nb)
    standev_nb = np.array(daydf[daydf.year == df.year.unique()[i]].standev_nb)
    means_sb = np.array(daydf[daydf.year == df.year.unique()[i]].m_sb)
    standev_sb = np.array(daydf[daydf.year == df.year.unique()[i]].standev_sb)
    # (standev_nb / standev_sb are computed but not used in this plot.)
    # Southbound is drawn first in a darker shade (each RGB channel halved),
    # then northbound is stacked on top in the year's full color.
    plt.bar(np.sort(df.weekday.unique())+i*bar_width,means_sb,width=bar_width,color=tuple(x/2 for x in colors[i]))
    plt.bar(np.sort(df.weekday.unique())+i*bar_width,means_nb,bottom=means_sb,width=bar_width,color=colors[i])
ax.set_xticklabels(['Sun','Mon','Tue','Wed','Thu','Fri','Sat'])
# Center the tick under each group of bars.
ax.set_xticks(np.sort(df.weekday.unique())+0.85/2)
ax.legend(['sb 2012','nb 2012','sb 2013','nb 2013','sb 2014','nb 2014'],loc=2)
ax.set_xlabel('Day of Week',fontsize =14)
ax.set_ylabel('Count',fontsize=14)
ax.set_title('Total crossings by day of week, separated by northbound/southbound',fontsize=16)
<matplotlib.text.Text at 0x10cd51fd0>
Plot by month
plt.rcParams['figure.figsize'] = (10.0, 5)
# monthdf: one row per (month, year) with the mean/std of daily totals and the
# monthly sums, for total / northbound / southbound crossings.
monthdf = pd.DataFrame(columns=('month', 'year',
                                'm_tot','standev_tot',
                                'm_nb','standev_nb',
                                'm_sb','standev_sb',
                                'sum_tot','sum_nb','sum_sb'))
months = range(1,13)
count = 0
for month in months:
    for year in df.year.unique():
        tot = []
        nb = []
        sb = []
        # Sum the 24 hourly rows of each calendar day in this (year, month).
        for d in df[(df.year == year) & (df.month == month)].dayofyear.unique():
            tot.append(int(np.sum(df[(df.year == year) & (df.dayofyear == d)].tot)))
            nb.append(int(np.sum(df[(df.year == year) & (df.dayofyear == d)].nb)))
            sb.append(int(np.sum(df[(df.year == year) & (df.dayofyear == d)].sb)))
        # NOTE(review): months absent from a year (e.g. Jan-Sep 2012) produce
        # empty lists, so the means become nan (warnings suppressed above).
        monthdf.loc[count] = [month,year,np.mean(tot),np.std(tot),np.mean(nb),np.std(nb),np.mean(sb),np.std(sb),
                              np.sum(tot),np.sum(nb),np.sum(sb)]
        count += 1
fig = plt.figure()
fig.set_size_inches(20,8)
ax = fig.add_subplot(111)
# Grouped bar chart: one group per month, one bar per year within each group,
# offset by i*bar_width so the years sit side by side.
bar_width = 0.85 / len(df.year.unique())
for i in range(len(df.year.unique())):
    plt.bar(np.sort(df.month.unique()) + i * bar_width,
            monthdf[monthdf.year == df.year.unique()[i]].sum_tot,
            width=bar_width, color=colors[i])
month_names = [calendar.month_name[i] for i in range(1, 13)]
# (Removed a redundant ax.set_xticklabels(range(1,13)) call that was
# immediately overwritten by the month-name labels below.)
ax.set_xticklabels(month_names, rotation=30)
# Center each tick under its 0.85-wide group of bars.
ax.set_xticks(np.sort(df.month.unique()) + 0.85/2)
ax.legend(['2012','2013','2014'], loc=2, fontsize=14)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Monthly Crossings', fontsize=14)
ax.set_title('Total Crossings by Month', fontsize=16)
ax.set_xlim([1,13])
(1, 13)
Calculate the cumulative sum for each calendar year and rolling 12 month period
# Calendar-year cumulative sums and rolling 12-month totals.
# All row writes use .loc: the original chained assignments
# (df.col[i] = ...) are unreliable (SettingWithCopy; silent no-ops under
# pandas copy-on-write).
df['yearCumulativeSum'] = 0
df['yearCumulativeSum_PercentIncrease'] = np.nan
df['RollingYearlyTotal'] = np.nan
df['RollingYearlyTotal_PercentIncrease'] = np.nan
for i in range(len(df)):
    # Sum of this calendar year's totals in all rows strictly before i.
    # (The original indexed df[:i] with a full-length boolean mask, which
    # only worked via index alignment; slice the mask explicitly instead.)
    df.loc[i, 'yearCumulativeSum'] = np.sum(df.tot[:i][df.year[:i] == df.year[i]])
idx = 0
for i in range(len(df)):
    # Row exactly one year earlier (same fractional day-of-year, previous year).
    lastyear = df[(df.year == (df.year[i] - 1)) & (df.dayofyear_float == df.dayofyear_float[i])].index
    if len(lastyear) > 0:
        # [0] because at least one datetime appears twice in the data.
        idx = lastyear[0]
    # else: no datapoint existed at this time last year — reuse the last idx.
    df.loc[i, 'RollingYearlyTotal'] = np.sum(df.tot[idx:i])
    df.loc[i, 'RollingYearlyTotal_PercentIncrease'] = 100*(df.RollingYearlyTotal[i] - df.RollingYearlyTotal[idx])/(1.0*df.RollingYearlyTotal[idx])
    if df.year[i] >= 14:  # YoY increase on the calendar-year total only makes sense for 2014+
        df.loc[i, 'yearCumulativeSum_PercentIncrease'] = 100*(df.yearCumulativeSum[i] - df.yearCumulativeSum[idx])/(1.0*df.yearCumulativeSum[idx])
df.tail()
timestamp | nb | sb | tot | year | weekdayname | weekday | hour | month | day | dayofyear | dayofyear_float | yearCumulativeSum | yearCumulativeSum_PercentIncrease | RollingYearlyTotal | RollingYearlyTotal_PercentIncrease | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18235 | 10/31/2014 07:00:00 PM | 44 | 21 | 65 | 14 | Friday | 5 | 19 | 10 | 31 | 304 | 304.791667 | 901985 | 9.607194 | 1005085.5 | 10.231400 |
18236 | 10/31/2014 08:00:00 PM | 22 | 9 | 31 | 14 | Friday | 5 | 20 | 10 | 31 | 304 | 304.833333 | 902050 | 9.599377 | 1005032.5 | 10.219181 |
18237 | 10/31/2014 09:00:00 PM | 20 | 12 | 32 | 14 | Friday | 5 | 21 | 10 | 31 | 304 | 304.875000 | 902081 | 9.595553 | 1005006.5 | 10.213671 |
18238 | 10/31/2014 10:00:00 PM | 13 | 11 | 24 | 14 | Friday | 5 | 22 | 10 | 31 | 304 | 304.916667 | 902113 | 9.594648 | 1005002.5 | 10.210815 |
18239 | 10/31/2014 11:00:00 PM | 6 | 12 | 18 | 14 | Friday | 5 | 23 | 10 | 31 | 304 | 304.958333 | 902137 | 9.592637 | 1004989.5 | 10.206851 |
Plot cumulative sums (the dashed line represents the 1 million crossing pace)
fig = plt.figure()
fig.set_size_inches(20,8)
ax = fig.add_subplot(111)
# One line per year, colored consistently with the bar charts above.
# NOTE(review): df.plot is called without ax=ax — it appears to draw on the
# current axes (ax, just created); confirm this holds for the pandas in use.
for year in df.year.unique():
    df[df.year==year].plot(x='dayofyear_float',y='yearCumulativeSum',color=colors[np.where(df.year.unique()==year)[0][0]],linewidth=3)
# Dashed reference line: the pace needed for 1 million crossings in a year.
ax.plot([0,365],[0,1E6],'--k')
ax.legend(['20'+str(year) for year in df.year.unique()],loc=2,frameon=True,shadow=True,fontsize=14)
# Unlabeled major ticks at month boundaries; minor ticks mid-month carry the
# month-name labels so each label sits centered within its month.
ax.xaxis.set_major_locator(dates.MonthLocator())
ax.xaxis.set_minor_locator(dates.MonthLocator(bymonthday=15))
ax.xaxis.set_major_formatter(ticker.NullFormatter())
ax.xaxis.set_minor_formatter(dates.DateFormatter('%b'))
for tick in ax.xaxis.get_minor_ticks():
    # Hide the minor tick marks themselves; only their labels are wanted.
    tick.tick1line.set_markersize(0)
    tick.tick2line.set_markersize(0)
    tick.label1.set_horizontalalignment('center')
ax.set_xlabel('Month',fontsize=14)
ax.set_ylabel('Cumulative total crossings',fontsize=14)
ax.set_title('Cumulative crossings by calendar year',fontsize=16)
<matplotlib.text.Text at 0x10bad3510>
Plot the year over year increase in the cumulative total (this only makes sense for 2014 and beyond):
fig = plt.figure()
fig.set_size_inches(20,8)
ax = fig.add_subplot(111)
# Year-over-year comparison requires a previous full year in the data, so
# only years >= 2014 are plotted.
for year in df.year.unique()[df.year.unique() >= 14]:
    df[df.year == year].plot(x='dayofyear_float', y='yearCumulativeSum_PercentIncrease',
                             color=colors[np.where(df.year.unique() == year)[0][0]], linewidth=3)
ax.legend(['20'+str(year)+' increase over 20'+str(year-1) for year in df.year.unique()[df.year.unique() >= 14]],
          loc=2, frameon=True, shadow=True, fontsize=14)
# Unlabeled major ticks at month boundaries; minor ticks mid-month carry the
# month-name labels so each label sits centered within its month.
ax.xaxis.set_major_locator(dates.MonthLocator())
ax.xaxis.set_minor_locator(dates.MonthLocator(bymonthday=15))
ax.xaxis.set_major_formatter(ticker.NullFormatter())
ax.xaxis.set_minor_formatter(dates.DateFormatter('%b'))
for tick in ax.xaxis.get_minor_ticks():
    tick.tick1line.set_markersize(0)
    tick.tick2line.set_markersize(0)
    tick.label1.set_horizontalalignment('center')
ax.set_xlabel('Month', fontsize=14)
# Fixed copy-pasted label: this axis shows a percentage, not a cumulative count.
ax.set_ylabel('Percent increase (%)', fontsize=14)
# Fixed title typo: "inumulative" -> "in cumulative".
ax.set_title('Percent increase in cumulative crossings over previous calendar year', fontsize=16)
ax.set_ylim([0,40])
(0, 40)
Plot a rolling 12 month total
fig = plt.figure()
fig.set_size_inches(20,6)
ax = fig.add_subplot(111)
df.plot(x='timestamp', y='RollingYearlyTotal', ax=ax, linewidth=2, color='k')
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Crossings in past 12 months', fontsize=14)
ax.set_title('Rolling 12 month total crossings', fontsize=16)
ax.set_ylim([0,1050000])
# Major ticks at the first row of each month: the month value either increases
# or wraps from December (12) to January (12 - 11 = 1).
xticks = [0]
month = df.month[0]
for i in range(len(df)):
    if df.month[i] > month or df.month[i] == (month - 11):
        month = df.month[i]
        xticks.append(i)
xticks.append(len(df))
ax.set_xticks(xticks)
# "month/year" label for each month span (last tick is just the right edge).
xtl = [str(df.month[xt]) + '/' + str(df.year[xt]) for xt in xticks[:-1]]
ax.set_xticklabels(xtl)
# Hide the major labels; show the month/year labels on minor ticks placed at
# the midpoint of each month span so they appear centered.
ax.xaxis.set_major_formatter(ticker.NullFormatter())
# // keeps the integer midpoints the original Python-2 '/' produced.
xtickminor = [(xticks[i] + xticks[i+1]) // 2 for i in range(len(xticks) - 1)]
ax.xaxis.set_minor_locator(ticker.FixedLocator(xtickminor))
ax.xaxis.set_minor_formatter(ticker.FixedFormatter(xtl))
# print() function: the original Python-2 print statement is a syntax error
# on any Python 3 interpreter (required by the modern pandas used above).
print('The millionth crossing in a one year period first occurred on',
      df.timestamp[df[df.RollingYearlyTotal > 1E6].index[0]])
The millionth crossing in a one year period first occurred on 09/29/2014 09:00:00 AM