#!/usr/bin/env python
# coding: utf-8

# # Predictive Analytics (QBUS2820)
#
# ## Tutorial 14: Seasonal ARIMA
# This notebook brings all the forecasting material together in a case study. Our objective is to forecast monthly ridership on Sydney trains based on data provided by the [NSW Bureau of Transport Statistics](https://opendata.transport.nsw.gov.au/search/type/dataset). This is a fundamental policy issue for the city, as population growth puts [increasing pressure](http://www.smh.com.au/nsw/revealed-new-metro-between-sydney-cbd-and-parramatta-20160831-gr5d6m.html) on public transport, with some rail lines being notorious for overcrowding.
#
# #####Content:
#
# 1. Data: Sydney monthly train journeys
# 2. Exploratory data analysis
# 3. ARIMA
#     (a) Identification
#     (b) Automatic selection
#     (c) Estimation
#     (d) Diagnostics
# 4. Exponential smoothing
# 5. Model validation
# 6. Forecast
# This notebook assumes the following packages and settings.

# In[1]:

# Packages
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# In[2]:

# Plot settings
sns.set_context('notebook')
sns.set_style('ticks')
red = '#D62728'
blue = '#1F77B4'
get_ipython().run_line_magic('matplotlib', 'inline')

# In[3]:

# Predefined plots

# Time series plot
def ts_plot(y, color=red):
    fig, ax = plt.subplots(figsize=(9, 6))
    y.plot(color=color, ax=ax)
    ax.set_xlabel('')
    ax.set_xticks([], minor=True)
    sns.despine()
    return fig, ax

# ACF and PACF plots
def acf_pacf_plot(y):
    fig, ax = plt.subplots(1, 2, figsize=(12, 5))
    sm.graphics.tsa.plot_acf(y, lags=40, ax=ax[0])
    sm.graphics.tsa.plot_pacf(y, lags=40, ax=ax[1])
    sns.despine()
    fig.tight_layout()
    return fig, ax

# Histogram
def hist(y):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.distplot(y, ax=ax, hist_kws={'alpha': 0.8, 'edgecolor': 'black', 'color': blue},
                 kde_kws={'color': 'black', 'alpha': 0.7})
    sns.despine()
    return fig, ax

# ##Data: Sydney monthly train journeys

# In[4]:

data = pd.read_csv('journeys.csv', index_col='Month', parse_dates=True, dayfirst=True)
data.index = data.index.to_period()
y = data['Journeys']
data.tail()

# We rescale the series to be in millions of rides, to facilitate the interpretation and avoid possible numerical problems.

# In[5]:

y = y/(10**6)

# As a general rule, you should avoid working with unnecessarily large numbers: floating point arithmetic is less precise at large magnitudes, so rounding errors accumulate. The next cell illustrates this problem.

# In[6]:

x = 10**8
for i in range(1000):
    x += 0.001
print(x - 10**8)  # not exactly 1.0: each addition to the large total loses precision

x = 0
for i in range(1000):
    x += 0.001
print(x)  # much closer to 1.0 when the running total stays small

# ##Exploratory Data Analysis
#
# The next figure shows the time series.

# In[7]:

fig, ax = ts_plot(y)
ax.set_ylabel('Journeys')
ax.set_title('Monthly journeys in Sydney trains (in millions of rides)', fontsize=13)
plt.show()

# We conduct an X13-ARIMA-SEATS decomposition of the data. The results show that the number of journeys on Sydney trains had a slightly negative trend until 2006, but has increased steadily with a roughly linear trend since then: the number of users grew by 22% in the last ten years of the data. The series has a clear seasonal pattern whose amplitude seems to grow slightly with the trend.

# In[8]:

ts = y.copy()
ts.index = ts.index.to_timestamp()  # remember that the decomposition functions only work with a datetime index
decomposition = sm.tsa.x13_arima_analysis(ts)
seasonal = decomposition.observed/decomposition.seasadj

from forecast import plot_components_x13
plot_components_x13(decomposition)
plt.show()

# To better understand the seasonal pattern, the next cell draws a seasonal plot. The number of journeys peaks in March, May and August, and is lowest during the summer months.

# In[37]:

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(seasonal.groupby(seasonal.index.month).mean(), color=red, marker='.', markersize=15)
ax.set_xlim(0.5, 12.5)
ax.set_xticks(np.arange(1, 13))
ax.set_xticklabels(months, rotation=-90)
ax.set_ylabel('Seasonal index')
ax.set_title('Seasonal plot', fontsize=13)
sns.despine()
plt.show()
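# As a numerical complement to the seasonal plot, we can print the average seasonal index for each calendar month, sorted from highest to lowest. This is a small sketch reusing the seasonal series and month labels defined above.

# In[ ]:

idx = seasonal.groupby(seasonal.index.month).mean()
idx.index = months
print(idx.sort_values(ascending=False).round(3))  # makes the peak and trough months explicit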
# It is useful to save the full X13-ARIMA output for reference.

# In[10]:

with open('x13results.txt', mode='w+') as f:
    f.write(decomposition.results)

# ##ARIMA
#
# ###Identification
#
# To identify an appropriate ARIMA specification for the series, we start by creating a data frame to store the first differenced, seasonally differenced, and first and seasonally differenced series.

# In[11]:

df = pd.DataFrame(y.copy())
df.columns.values[0] = 'original'
df['first_diff'] = y - y.shift(1)
df['seasonal_diff'] = y - y.shift(12)
df['diff'] = df['first_diff'] - df['first_diff'].shift(12)
df = df.dropna()
df.tail().round(3)

# Taking the first difference alone leaves a slow decay in the ACF at the seasonal lags, suggesting that a seasonal difference is also needed to achieve stationarity.

# In[12]:

ts_plot(df['first_diff'])
plt.title('First difference')
plt.show()

# In[13]:

acf_pacf_plot(df['first_diff'])
plt.show()

# Taking only the seasonal difference leads to a series that is still clearly nonstationary, confirming the need for both transformations.

# In[14]:

ts_plot(df['seasonal_diff'])
plt.title('Seasonal difference')
plt.show()

# In[15]:

acf_pacf_plot(df['seasonal_diff'])
plt.show()

# The next figure shows the first and seasonally differenced series, followed by the ACF and PACF plots. The ACF and PACF plots suggest an AR(2) model: the autocorrelations decrease gradually, while the partial autocorrelations display a clear cut-off after lag 2.

# In[16]:

ts_plot(df['diff'])  # first and seasonal difference
plt.title('First and seasonal difference')
plt.show()

# In[17]:

acf_pacf_plot(df['diff'])
plt.show()

# No seasonal AR or MA pattern is easily discernible from the ACF and PACF plots above. Hence, we fit an AR(2) model to the first and seasonally differenced series and examine the residuals for remaining autocorrelation. The ACF and PACF plots for the residual series are consistent with a seasonal MA(1) model, due to a significant autocorrelation at lag 12 and significant partial autocorrelations at lags 12 and 24.

# In[18]:

arima = sm.tsa.ARIMA(df['diff'], order=(2, 0, 0)).fit()
acf_pacf_plot(arima.resid)
plt.show()

# ###Automatic selection
#
# We can also conduct automatic ARIMA order selection based on the X13-ARIMA decomposition software, which selects the orders using the AIC. The automatic selection procedure confirms our choice based on the analysis of the ACF and PACF plots. In practice, it is important to use both the visual and automatic approaches and compare the results.

# In[19]:

results = sm.tsa.x13_arima_select_order(y)
print(results.order)   # non-seasonal part
print(results.sorder)  # seasonal part

# ###Estimation

# In[20]:

sarima = sm.tsa.statespace.SARIMAX(y, order=(2, 1, 0), seasonal_order=(0, 1, 1, 12)).fit()
print(sarima.summary())

# ###Diagnostics

# In[21]:

resid = sarima.resid[13:]  # the first 13 residuals are initialisation ones

# Below are the residual plot, ACF and PACF for the ARIMA(2,1,0)(0,1,1) model. There are no apparent patterns in these plots, so the model seems to adequately capture the time series dependence in the data.

# In[22]:

ts_plot(resid, color=blue)
plt.title('Residual plot')
plt.show()

# In[23]:

acf_pacf_plot(resid)
plt.show()

# The histogram of the residuals is consistent with a normal or near-normal distribution.

# In[24]:

hist(resid)
plt.show()
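# Before turning to the summary output, we can also compute the Ljung-Box test for no residual autocorrelation directly. This is a minimal sketch using statsmodels' acorr_ljungbox function; testing at lags 12 and 24 is our own choice here, not part of the original output.

# In[ ]:

from statsmodels.stats.diagnostic import acorr_ljungbox

# Large p-values are consistent with white noise residuals
print(acorr_ljungbox(resid, lags=[12, 24]))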
# This part of the model output gives further information. The distribution of the residuals is approximately symmetric and displays near-Gaussian sample kurtosis, so we do not reject the null hypothesis of normality.

# In[25]:

sarima.summary().tables[2]

# The output also shows hypothesis tests for no serial correlation and for constant variance (neither of which we reject).

# ##Exponential Smoothing
#
# ###Estimation
#
# We estimate several Holt-Winters specifications, allowing for additive or multiplicative seasonality, a damped trend, and a log transformation. When modelling the level of the series, the model selection criteria and the overall fit suggest that a multiplicative model without damping is the preferred specification for the data. Later, we will use a validation set to select between the multiplicative and log-additive models.

# In[26]:

import forecast

ahw = forecast.holtwinters(y, additive=True, damped=False, m=12)
ahw.fit()
ahw.summary()

# In[27]:

ahwd = forecast.holtwinters(y, additive=True, damped=True, m=12)
ahwd.fit()
ahwd.summary()

# In[28]:

mhw = forecast.holtwinters(y, additive=False, damped=False, m=12)
mhw.fit()
mhw.summary()

# In[29]:

mhwd = forecast.holtwinters(y, additive=False, damped=True, m=12)
mhwd.fit()
mhwd.summary()

# In[30]:

ahw_log = forecast.holtwinters(np.log(y), additive=True, damped=False, m=12)
ahw_log.fit()
ahw_log.summary()

# ###Diagnostics
#
# The figures below show the residual plot, ACF and PACF for the multiplicative Holt-Winters model. We find that this exponential smoothing method is not a good fit for the data, due to substantial remaining autocorrelation in the residuals. This autocorrelation reflects short-run dynamics that are captured by the AR(2) part of the ARIMA model above.

# In[31]:

resid = pd.Series(mhw.resid, index=y.index).iloc[13:]
ts_plot(resid, color=blue)
plt.title('Residual plot')
plt.show()

# In[32]:

acf_pacf_plot(resid)
plt.show()

# ##Model validation
#
# We now use real-time forecasting (2010 onwards, 63 observations) to confirm whether we should prefer the ARIMA model over the exponential smoothing methods, and to estimate how it may perform for short- and medium-run forecasts. We consider forecasts one month and one year ahead.
#
# The results clearly show that the ARIMA(2,1,0)(0,1,1) model outperforms exponential smoothing for forecasting.
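# The log Holt-Winters forecasts in the validation loops below are converted back to the original scale as np.exp(forecast + variance/2). This is the lognormal mean correction: if log(Y) is Gaussian with mean mu and variance sigma2, then E[Y] = exp(mu + sigma2/2), not exp(mu). The short simulation below is our own sketch of this fact, with arbitrary illustrative values of mu and sigma2.

# In[ ]:

mu, sigma2 = 2.0, 0.04
draws = np.exp(np.random.normal(mu, np.sqrt(sigma2), size=10**6))
print(draws.mean())           # simulated E[Y]
print(np.exp(mu + sigma2/2))  # lognormal mean: matches the simulation
print(np.exp(mu))             # naive back-transform: biased downwards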
# In[33]:

# Real time forecasting (one month ahead)
validation = y['2010':].index
start = y.index.get_loc('2010-01')

results = pd.DataFrame(0.0, index=y.iloc[start:].index,
                       columns=['Seasonal RW', 'Holt-Winters', 'Log Holt-Winters', 'Seasonal ARIMA', 'Actual'])
results['Actual'] = y.iloc[start:]

for i in range(start, len(y)):
    j = i - start

    # seasonal random walk forecast
    results.iloc[j, 0] = y.iloc[i-12]

    # multiplicative holt winters
    model = forecast.holtwinters(y.iloc[:i], additive=False, damped=False, m=12)
    model.fit()
    results.iloc[j, 1] = model.forecast(1)[0]

    # log holt winters (with lognormal mean correction)
    model = forecast.holtwinters(np.log(y.iloc[:i]), additive=True, damped=False, m=12)
    model.fit()
    results.iloc[j, 2] = np.exp(model.forecast(h=1)[0] + model.forecastvariance(h=1)[0]/2)

    # seasonal ARIMA
    model = sm.tsa.statespace.SARIMAX(y.iloc[:i], order=(2, 1, 0), seasonal_order=(0, 1, 1, 12)).fit()
    results.iloc[j, 3] = model.forecast()[0]

# In[34]:

from statlearning import rmse_jack

table = pd.DataFrame(0.0, index=results.columns[:-1], columns=['RMSE', 'SE'])
for i in range(len(results.columns)-1):
    table.iloc[i, 0], table.iloc[i, 1] = rmse_jack(results.iloc[:, i], results.iloc[:, -1])
table.round(3)

# In[35]:

# Real time forecasting (one year ahead)
validation = y['2010':].index
start = y.index.get_loc('2010-01')

results = pd.DataFrame(0.0, index=y.iloc[start:].index,
                       columns=['Seasonal RW', 'Holt-Winters', 'Log Holt-Winters', 'Seasonal ARIMA', 'Actual'])
results['Actual'] = y.iloc[start:]

for i in range(start, len(y)):
    j = i - start

    # seasonal random walk forecast
    results.iloc[j, 0] = y.iloc[i-12]

    # multiplicative holt winters
    model = forecast.holtwinters(y.iloc[:i-12+1], additive=False, damped=False, m=12)
    model.fit()
    results.iloc[j, 1] = model.forecast(h=12)[-1]

    # log holt winters (with lognormal mean correction)
    model = forecast.holtwinters(np.log(y.iloc[:i-12+1]), additive=True, damped=False, m=12)
    model.fit()
    results.iloc[j, 2] = np.exp(model.forecast(h=12)[-1] + model.forecastvariance(h=12)[-1]/2)

    # seasonal ARIMA
    model = sm.tsa.statespace.SARIMAX(y.iloc[:i-12+1], order=(2, 1, 0), seasonal_order=(0, 1, 1, 12)).fit()
    results.iloc[j, 3] = model.forecast(steps=12)[-1]

# The jackknife standard error is not valid in this case because the overlapping one year ahead forecast errors are serially correlated, so we report only the RMSE
table = pd.DataFrame(0.0, index=results.columns[:-1], columns=['RMSE'])
for i in range(len(results.columns)-1):
    table.iloc[i, 0], _ = rmse_jack(results.iloc[:, i], results.iloc[:, -1])
table.round(3)

# ##Forecast
#
# Here are the final forecasts based on the ARIMA(2,1,0)(0,1,1) model. We predict a continuing increase in the number of train rides.

# In[36]:

h = 24
test = pd.period_range(start=y.index[-1]+1, periods=h, freq='M')

y_pred = sarima.forecast(steps=h)
y_pred = pd.Series(y_pred, index=test)

intv1 = sarima.get_forecast(h).conf_int(alpha=0.2)
intv2 = sarima.get_forecast(h).conf_int(alpha=0.1)
intv3 = sarima.get_forecast(h).conf_int(alpha=0.01)

intv1 = pd.DataFrame(intv1, index=test)
intv2 = pd.DataFrame(intv2, index=test)
intv3 = pd.DataFrame(intv3, index=test)

fig, ax = forecast.fanchart(y['2012':], y_pred, intv1, intv2, intv3)
ax.set_xlabel('')
ax.set_xticks([], minor=True)
ax.set_ylabel('Journeys')
ax.set_title('ARIMA(2,1,0)(0,1,1) Sydney train journeys forecast', fontsize=13)
sns.despine()
plt.show()
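# To complement the fan chart, we can tabulate the point forecasts together with the 90% interval bounds computed above. This is a convenience view of objects that already exist; the column labels are our own.

# In[ ]:

forecast_table = pd.DataFrame({'Forecast': y_pred,
                               'Lower 90%': intv2.iloc[:, 0],
                               'Upper 90%': intv2.iloc[:, 1]})
forecast_table.round(3).head(12)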