import numpy as np
import pandas as pd
from datetime import timedelta, datetime
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default every figure to a wide, short canvas (12 x 2 inches).
plt.rcParams['figure.figsize'] = (12, 2)
import seaborn as sns
# Apply seaborn's styling with all font sizes scaled to 60%.
sns.set(font_scale=0.6)
# NOTE(review): `outlier` and `utilities` are presumably project-local
# modules sitting next to this notebook -- confirm.
import outlier
import utilities as util
Use scipy logsumexp().
# Load the two pickled daily data sets. Judging by the file names, the second
# is a cleaned version of the first (the cleaning step is not shown here).
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files from a trusted source.
df_daily = pd.read_pickle("TrainValidationData/df_daily.pkl")
df_daily_cleaned = pd.read_pickle("TrainValidationData/df_daily_cleaned.pkl")
Use the Bayesian method to look for changes in the data.
# Install the bayesian_changepoint_detection package directly from its GitHub
# repository (shell escape; needs network access and git).
!pip install git+https://github.com/hildensia/bayesian_changepoint_detection.git
Apply Bayesian change point detection to each station/night. Note that this takes about 30 minutes to process.
# Run util.find_changes over the cleaned data, one (Station, Night) group at a
# time, and save the result next to the input data.
group_keys = ['Station', 'Night']
# Append the grouping columns to the existing index so that transform() only
# operates on the remaining columns.
df_daily_cleaned_changes = df_daily_cleaned.set_index(group_keys, append=True)
yy = (
    df_daily_cleaned_changes
    .groupby(level=group_keys)
    .transform(util.find_changes)
)
# Keep the transformed Entry/Exit values alongside the original columns.
df_daily_cleaned_changes[['ChangeEntry', 'ChangeExit']] = yy[['Entry', 'Exit']]
# Turn Station and Night back into ordinary columns before persisting.
df_daily_cleaned_changes = df_daily_cleaned_changes.reset_index(level=group_keys)
df_daily_cleaned_changes.to_pickle("TrainValidationData/df_daily_cleaned_changes.pkl")
# Run util.find_changes over the uncleaned daily data, one (Station, Night)
# group at a time, and save the result next to the input data.
group_keys = ['Station', 'Night']
# Append the grouping columns to the existing index so that transform() only
# operates on the remaining columns.
df_daily_changes = df_daily.set_index(group_keys, append=True)
yy = (
    df_daily_changes
    .groupby(level=group_keys)
    .transform(util.find_changes)
)
# Keep the transformed Entry/Exit values alongside the original columns.
df_daily_changes[['ChangeEntry', 'ChangeExit']] = yy[['Entry', 'Exit']]
# Turn Station and Night back into ordinary columns before persisting.
df_daily_changes = df_daily_changes.reset_index(level=group_keys)
df_daily_changes.to_pickle("TrainValidationData/df_daily_changes.pkl")
# Reload the saved result so the plots below can be produced without
# re-running the change point computation.
df_daily_changes = pd.read_pickle("TrainValidationData/df_daily_changes.pkl")
# NOTE(review): plot_bcp and mark are not defined or imported in the visible
# part of this notebook -- confirm they come from an earlier cell.
ts, axes = plot_bcp(df_daily_changes, "Kings Cross Station", "Friday")
# Presumably highlights the suspected change date on the axes.
mark(['19-Sep-2014'], axes)
# Run outlier.plot on the Exit series separately for the two periods, drawing
# both on the first axes. The date-string slices assume ts has a date index.
# NOTE(review): the second slice starts on 16-Sep-2014 and therefore includes
# the marked date (19-Sep-2014) -- confirm this is intended.
outlier.plot(ts[:'12-Sep-2014'].Exit, ax=axes[0])
outlier.plot(ts['16-Sep-2014':].Exit, ax=axes[0])
axes[0].legend(loc='best', ncol = 3)
<matplotlib.legend.Legend at 0x7fcd26f95748>
This indicates a change during Sep/Oct 2014, probably on 19 Sep.
%matplotlib inline
# Same plot for Saturdays at Kings Cross; no date is marked here, so
# outlier.plot is run over the whole Exit series.
# NOTE(review): plot_bcp is not defined in the visible part of this notebook.
ts, axes = plot_bcp(df_daily_changes, "Kings Cross Station", "Saturday")
# Trailing semicolon suppresses the cell's output repr.
outlier.plot(ts.Exit, ax=axes[0]);
These probabilities are very low and probably don't indicate a change. It's more likely that the outliers are triggering this. Investigate why there are outliers.
%matplotlib inline
# NOTE(review): myplot and mark are not defined or imported in the visible
# part of this notebook; unlike plot_bcp above, myplot is called without a
# DataFrame argument -- confirm where it is defined and what data it reads.
df, axes = myplot("Newtown Station", "Friday")
# Three suspected change dates, presumably highlighted on the axes.
mark(['10-May-2013', '21-Feb-2014', '22-Aug-2014'], axes)
# Run outlier.plot on the Exit series for each period between the marked
# dates; every slice excludes the marked dates themselves.
outlier.plot(df['17-May-2013':'14-Feb-2014'].Exit, ax=axes[0])
outlier.plot(df['28-Feb-2014':'15-Aug-2014'].Exit, ax=axes[0])
outlier.plot(df['29-Aug-2014':].Exit, ax=axes[0])
<matplotlib.axes._subplots.AxesSubplot at 0x7f552f3ad2b0>
%matplotlib inline
# Saturday plot for Newtown; no dates are marked and outlier.plot is not run.
# NOTE(review): myplot is not defined in the visible part of this notebook.
df, axes = myplot("Newtown Station", "Saturday")
%matplotlib inline
# NOTE(review): myplot and mark are not defined in the visible part of this
# notebook -- confirm they come from an earlier cell.
df, axes = myplot("Parramatta Station", "Friday")
# One suspected change date, presumably highlighted on the axes.
mark(['1-Aug-2014'], axes)
# Run outlier.plot on the Exit series before and after the marked date; both
# slices exclude the marked date itself.
outlier.plot(df[:'25-July-2014'].Exit, ax=axes[0])
outlier.plot(df['8-Aug-2014':].Exit, ax=axes[0])
<matplotlib.axes._subplots.AxesSubplot at 0x7f552eb56b70>
%matplotlib inline
# NOTE(review): myplot and mark are not defined in the visible part of this
# notebook -- confirm they come from an earlier cell.
df, axes = myplot("Parramatta Station", "Saturday")
# One suspected change date, presumably highlighted on the axes.
mark(['16-Aug-2014'], axes)
# Run outlier.plot on the Exit series before and after the marked date; both
# slices exclude the marked date itself.
outlier.plot(df[:'9-Aug-2014'].Exit, ax=axes[0])
outlier.plot(df['23-Aug-2014':].Exit, ax=axes[0])
<matplotlib.axes._subplots.AxesSubplot at 0x7f552e926ef0>