%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pdp
import statsmodels.api as sm
from plotly import tools
from plotly.graph_objs import Bar, Figure, Layout, Scatter
from plotly.offline import init_notebook_mode, iplot
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA, ARMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
init_notebook_mode(connected=True)
save_image = None  # set to 'png' to save images
Two pitfalls to be aware of before fitting. If pv is left as int64, differencing inside the model raises

TypeError: Cannot cast ufunc subtract output from dtype('float64') to dtype('int64') with casting rule 'same_kind'

and if the index carries no frequency, forecasting by date raises

ValueError: There is no frequency for these dates and date 2018-04-01 00:00:00 is not in dates index. Try giving a date that is in the dates index or use an integer

The cast to float64 and the reindex onto a daily date_range below avoid both.
data_file = '../data/processed/data.csv'
df = pd.read_csv(data_file)
df.date = pd.to_datetime(df.date)
df = df.set_index('date')
date_index = pd.date_range('2017-10-01', end='2018-04-30', freq='D')
df = df.reindex(date_index)
df.pv = df.pv.astype(np.float64)
df['log_pv'] = np.log(df.pv)
print(df.pv.head())
2017-10-01     2.0
2017-10-02     9.0
2017-10-03     3.0
2017-10-04    21.0
2017-10-05    13.0
Freq: D, Name: pv, dtype: float64
df.head()
| date | pv | entry | pv_per_entry | log_pv |
|---|---|---|---|---|
| 2017-10-01 | 2.0 | 8 | 0.250 | 0.693147 |
| 2017-10-02 | 9.0 | 8 | 1.125 | 2.197225 |
| 2017-10-03 | 3.0 | 8 | 0.375 | 1.098612 |
| 2017-10-04 | 21.0 | 8 | 2.625 | 3.044522 |
| 2017-10-05 | 13.0 | 8 | 1.625 | 2.564949 |
df.dtypes
pv              float64
entry             int64
pv_per_entry    float64
log_pv          float64
dtype: object
df_train = df[df.index < '2018-04-01']
df_test = df[df.index >= '2018-04-01']
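A quick sanity check on the split sizes (the profile below reports 212 rows in total, and the model summaries further down show 182 training observations):

# Expect 182 training days (2017-10-01 through 2018-03-31) and 30 test days (April 2018).
print(len(df_train), len(df_test))  # 182 30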
pdp.ProfileReport(df)
Dataset info

| Statistic | Value |
|---|---|
| Number of variables | 5 |
| Number of observations | 212 |
| Total Missing (%) | 0.0% |
| Total size in memory | 8.4 KiB |
| Average record size in memory | 40.4 B |

Variable types

| Type | Count |
|---|---|
| Numeric | 3 |
| Categorical | 0 |
| Boolean | 0 |
| Date | 1 |
| Text (Unique) | 0 |
| Rejected | 1 |
| Unsupported | 0 |
Warnings

pv_per_entry is highly correlated with pv (ρ = 0.98508): Rejected

entry
Numeric
| Statistic | Value |
|---|---|
| Distinct count | 29 |
| Unique (%) | 13.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 25.599 |
| Minimum | 8 |
| Maximum | 36 |
| Zeros (%) | 0.0% |

Quantile statistics

| Quantile | Value |
|---|---|
| Minimum | 8 |
| 5-th percentile | 10 |
| Q1 | 17 |
| Median | 29.5 |
| Q3 | 33 |
| 95-th percentile | 35 |
| Maximum | 36 |
| Range | 28 |
| Interquartile range | 16 |

Descriptive statistics

| Statistic | Value |
|---|---|
| Standard deviation | 8.963 |
| Coef of variation | 0.35013 |
| Kurtosis | -1.1647 |
| Mean | 25.599 |
| MAD | 8.0162 |
| Skewness | -0.56399 |
| Sum | 5427 |
| Variance | 80.336 |
| Memory size | 1.7 KiB |

Common values

| Value | Count | Frequency (%) |
|---|---|---|
| 33 | 40 | 18.9% |
| 35 | 24 | 11.3% |
| 32 | 10 | 4.7% |
| 31 | 9 | 4.2% |
| 36 | 8 | 3.8% |
| 34 | 8 | 3.8% |
| 11 | 8 | 3.8% |
| 25 | 8 | 3.8% |
| 13 | 7 | 3.3% |
| 16 | 7 | 3.3% |
| Other values (19) | 83 | 39.2% |

Minimum 5 values

| Value | Count | Frequency (%) |
|---|---|---|
| 8 | 6 | 2.8% |
| 9 | 4 | 1.9% |
| 10 | 3 | 1.4% |
| 11 | 8 | 3.8% |
| 12 | 6 | 2.8% |

Maximum 5 values

| Value | Count | Frequency (%) |
|---|---|---|
| 32 | 10 | 4.7% |
| 33 | 40 | 18.9% |
| 34 | 8 | 3.8% |
| 35 | 24 | 11.3% |
| 36 | 8 | 3.8% |
index
Date
| Statistic | Value |
|---|---|
| Distinct count | 212 |
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Minimum | 2017-10-01 00:00:00 |
| Maximum | 2018-04-30 00:00:00 |
log_pv
Numeric
| Statistic | Value |
|---|---|
| Distinct count | 164 |
| Unique (%) | 77.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.5803 |
| Minimum | 0 |
| Maximum | 6.9412 |
| Zeros (%) | 0.9% |

Quantile statistics

| Quantile | Value |
|---|---|
| Minimum | 0 |
| 5-th percentile | 2.4849 |
| Q1 | 3.801 |
| Median | 4.9452 |
| Q3 | 5.5225 |
| 95-th percentile | 6.0012 |
| Maximum | 6.9412 |
| Range | 6.9412 |
| Interquartile range | 1.7214 |

Descriptive statistics

| Statistic | Value |
|---|---|
| Standard deviation | 1.2433 |
| Coef of variation | 0.27144 |
| Kurtosis | 1.2054 |
| Mean | 4.5803 |
| MAD | 0.99438 |
| Skewness | -1.0761 |
| Sum | 971.02 |
| Variance | 1.5457 |
| Memory size | 1.7 KiB |

Common values

| Value | Count | Frequency (%) |
|---|---|---|
| 2.5649493574615367 | 4 | 1.9% |
| 4.61512051684126 | 3 | 1.4% |
| 2.4849066497880004 | 3 | 1.4% |
| 3.1354942159291497 | 3 | 1.4% |
| 3.4339872044851463 | 3 | 1.4% |
| 3.5263605246161616 | 3 | 1.4% |
| 5.869296913133774 | 3 | 1.4% |
| 0.0 | 2 | 0.9% |
| 5.135798437050262 | 2 | 0.9% |
| 4.48863636973214 | 2 | 0.9% |
| Other values (154) | 184 | 86.8% |

Minimum 5 values

| Value | Count | Frequency (%) |
|---|---|---|
| 0.0 | 2 | 0.9% |
| 0.6931471805599453 | 1 | 0.5% |
| 1.0986122886681098 | 2 | 0.9% |
| 1.3862943611198906 | 1 | 0.5% |
| 2.1972245773362196 | 2 | 0.9% |

Maximum 5 values

| Value | Count | Frequency (%) |
|---|---|---|
| 6.100318952020064 | 1 | 0.5% |
| 6.124683390894205 | 1 | 0.5% |
| 6.272877006546167 | 1 | 0.5% |
| 6.428105272684596 | 1 | 0.5% |
| 6.9411900550683745 | 1 | 0.5% |
pv
Numeric
| Statistic | Value |
|---|---|
| Distinct count | 164 |
| Unique (%) | 77.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 165 |
| Minimum | 1 |
| Maximum | 1034 |
| Zeros (%) | 0.0% |

Quantile statistics

| Quantile | Value |
|---|---|
| Minimum | 1 |
| 5-th percentile | 12 |
| Q1 | 44.75 |
| Median | 140.5 |
| Q3 | 250.25 |
| 95-th percentile | 403.9 |
| Maximum | 1034 |
| Range | 1033 |
| Interquartile range | 205.5 |

Descriptive statistics

| Statistic | Value |
|---|---|
| Standard deviation | 143.5 |
| Coef of variation | 0.86966 |
| Kurtosis | 5.4718 |
| Mean | 165 |
| MAD | 111.47 |
| Skewness | 1.5815 |
| Sum | 34981 |
| Variance | 20592 |
| Memory size | 1.7 KiB |

Common values

| Value | Count | Frequency (%) |
|---|---|---|
| 13.0 | 4 | 1.9% |
| 23.0 | 3 | 1.4% |
| 354.0 | 3 | 1.4% |
| 101.0 | 3 | 1.4% |
| 12.0 | 3 | 1.4% |
| 31.0 | 3 | 1.4% |
| 34.0 | 3 | 1.4% |
| 43.0 | 2 | 0.9% |
| 181.0 | 2 | 0.9% |
| 67.0 | 2 | 0.9% |
| Other values (154) | 184 | 86.8% |

Minimum 5 values

| Value | Count | Frequency (%) |
|---|---|---|
| 1.0 | 2 | 0.9% |
| 2.0 | 1 | 0.5% |
| 3.0 | 2 | 0.9% |
| 4.0 | 1 | 0.5% |
| 9.0 | 2 | 0.9% |

Maximum 5 values

| Value | Count | Frequency (%) |
|---|---|---|
| 446.0 | 1 | 0.5% |
| 457.0 | 1 | 0.5% |
| 530.0 | 1 | 0.5% |
| 619.0 | 1 | 0.5% |
| 1034.0 | 1 | 0.5% |
pv_per_entry

Highly correlated: this variable is highly correlated with pv (correlation 0.98508) and should be ignored for analysis.
data = [Scatter(x=df.index, y=df.pv, name='pv')]
fig = Figure(data=data, layout=Layout(title='PV数'))
iplot(fig, image=save_image, filename='PV数')
fig = tools.make_subplots(rows=3, cols=1)
fig.append_trace(Scatter(x=df.index, y=df.pv, name='pv'), 1, 1)
fig.append_trace(Scatter(x=df.index, y=df.log_pv, name='pv(log)'), 2, 1)
fig.append_trace(Scatter(x=df.index, y=df.pv.diff(), name='pv(diff)'), 3, 1)
fig['layout'].update(title='PV数の推移')
iplot(fig, image=save_image, filename='PV数の推移')
df.pv.index[:-1]
DatetimeIndex(['2017-10-01', '2017-10-02', '2017-10-03', '2017-10-04',
               '2017-10-05', '2017-10-06', '2017-10-07', '2017-10-08',
               '2017-10-09', '2017-10-10',
               ...
               '2018-04-20', '2018-04-21', '2018-04-22', '2018-04-23',
               '2018-04-24', '2018-04-25', '2018-04-26', '2018-04-27',
               '2018-04-28', '2018-04-29'],
              dtype='datetime64[ns]', length=211, freq='D')
data = [Scatter(x=df.pv[:-1], y=df.pv[1:], mode='markers')]
corr = np.corrcoef(df.pv[:-1], df.pv[1:])[0][1]
fig = Figure(data=data, layout=Layout(title='ラグ1の散布図(corr={:.2})'.format(corr)))
iplot(fig, image=save_image, filename='ラグ1の散布図')
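pandas offers the same lag-1 correlation directly via Series.autocorr; a one-line cross-check on the np.corrcoef value used in the title:

print('lag-1 autocorrelation = {:.2}'.format(df.pv.autocorr(lag=1)))  # should match corr above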
n = 40
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
sm.graphics.tsa.plot_acf(df.pv, lags=n, ax=ax1)
sm.graphics.tsa.plot_pacf(df.pv, lags=n, ax=ax2)
if save_image:
fig.savefig('コレログラム1.png')
acf = sm.tsa.stattools.acf(df.pv, nlags=n)
pacf = sm.tsa.stattools.pacf(df.pv, nlags=n, method='ywm')
fig = tools.make_subplots(rows=2, cols=1)
fig.append_trace(Bar(y=acf, name='自己相関'), 1, 1)
fig.append_trace(Bar(y=pacf, name='偏自己相関'), 2, 1)
fig['layout'].update(
title='コレログラム',
shapes=[{
'xref': 'x1',
'yref': 'y1',
'type': 'rect',
'x0': 0,
'y0': -1.96 / np.sqrt(len(df)),
'x1': n,
'y1': 1.96 / np.sqrt(len(df)),
'fillcolor': '#808080',
'opacity': 0.5,
'line': {
'width': 0
}
},{
'xref': 'x2',
'yref': 'y2',
'type': 'rect',
'x0': 0,
'y0': -1.96 / np.sqrt(len(df)),
'x1': n,
'y1': 1.96 / np.sqrt(len(df)),
'fillcolor': '#808080',
'opacity': 0.5,
'line': {
'width': 0
}
}])
iplot(fig, image=save_image, filename='コレログラム2')
res = sm.tsa.stattools.adfuller(df.pv)
print('p-value = {:.4}'.format(res[1]))
p-value = 0.746
res = sm.tsa.stattools.adfuller(df.log_pv)
print('p-value = {:.4}'.format(res[1]))
p-value = 0.02229
res = sm.tsa.stattools.adfuller(df.pv.diff().dropna())
print('p-value = {:.4}'.format(res[1]))
p-value = 2.166e-06
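The three tests above can also be run in one loop; a small convenience sketch over the same transforms:

# ADF null hypothesis: a unit root is present. pv fails to reject it,
# while log_pv and the first difference reject it at the 5% level.
for name, series in [('pv', df.pv),
                     ('log_pv', df.log_pv),
                     ('pv.diff()', df.pv.diff().dropna())]:
    print('{}: p-value = {:.4}'.format(name, sm.tsa.stattools.adfuller(series)[1]))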
Use seasonal_decompose to split the series into trend, seasonal, and residual components via moving averages (freq=7 captures the weekly cycle).
res = sm.tsa.seasonal_decompose(df.pv, freq=7)
fig = tools.make_subplots(rows=4, cols=1)
fig.append_trace(Scatter(x=res.observed.index, y=res.observed, name='Original'), 1, 1)
fig.append_trace(Scatter(x=res.trend.index, y=res.trend, name='Trend'), 2, 1)
fig.append_trace(Scatter(x=res.seasonal.index, y=res.seasonal, name='Seasonal'), 3, 1)
fig.append_trace(Scatter(x=res.resid.index, y=res.resid, name='Resid'), 4, 1)
fig['layout'].update(title='成分分解')
iplot(fig, image=save_image, filename='成分分解')
res = sm.tsa.seasonal_decompose(df.log_pv, freq=7)
fig = tools.make_subplots(rows=4, cols=1)
fig.append_trace(Scatter(x=res.observed.index, y=res.observed, name='Original'), 1, 1)
fig.append_trace(Scatter(x=res.trend.index, y=res.trend, name='Trend'), 2, 1)
fig.append_trace(Scatter(x=res.seasonal.index, y=res.seasonal, name='Seasonal'), 3, 1)
fig.append_trace(Scatter(x=res.resid.index, y=res.resid, name='Resid'), 4, 1)
fig['layout'].update(title='成分分解(対数)')
iplot(fig, image=save_image, filename='成分分解(対数)')
data = [Scatter(x=df.index, y=df.pv.diff(), name='pv(diff)')]
fig = Figure(data=data, layout=Layout(title='差分データ'))
iplot(fig, image=save_image, filename='差分データ')
res = sm.tsa.seasonal_decompose(df.pv, freq=7)
s = df.pv - res.seasonal
data = [Scatter(x=s.index, y=s, name='pv(seasonal)')]
fig = Figure(data=data, layout=Layout(title='季節調整済みデータ'))
iplot(fig, image=save_image, filename='季節調整済みデータ')
res = sm.tsa.seasonal_decompose(df.pv, freq=7)
s_diff = (df.pv - res.seasonal).diff().dropna()
data = [Scatter(x=s_diff.index, y=s_diff, name='pv(seasonal+diff)')]
fig = Figure(data=data, layout=Layout(title='差分+季節調整済みデータ'))
iplot(fig, image=save_image, filename='差分+季節調整済みデータ')
res = sm.tsa.stattools.adfuller(s_diff)
print('p-value = {:.4}'.format(res[1]))
p-value = 3.963e-10
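As a complementary check, the KPSS test takes stationarity as its null (the opposite of ADF), so failing to reject there would agree with the tiny ADF p-value above. A sketch, assuming the default regression='c':

# KPSS null: the series is level-stationary; a large p-value here is
# consistent with the ADF rejection of a unit root above.
stat, pvalue, nlags, crit = sm.tsa.stattools.kpss(s_diff)
print('KPSS p-value = {:.4}'.format(pvalue))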
n = 40
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
sm.graphics.tsa.plot_acf(s_diff, lags=n, ax=ax1)
sm.graphics.tsa.plot_pacf(s_diff, lags=n, ax=ax2)
if save_image:
fig.savefig('コレログラム(s_diff).png')
data = [
Scatter(x=df.index, y=df.pv_per_entry, name='pv / entry'),
Scatter(x=df.index, y=df.entry, name='entry')
]
fig = Figure(data=data, layout=Layout(title='1エントリあたりのPV数'))
iplot(fig, image=save_image, filename='1エントリあたりのPV数')
res = sm.tsa.seasonal_decompose(df.pv_per_entry, freq=7)
fig = tools.make_subplots(rows=4, cols=1)
fig.append_trace(Scatter(x=res.observed.index, y=res.observed, name='Original'), 1, 1)
fig.append_trace(Scatter(x=res.trend.index, y=res.trend, name='Trend'), 2, 1)
fig.append_trace(Scatter(x=res.seasonal.index, y=res.seasonal, name='Seasonal'), 3, 1)
fig.append_trace(Scatter(x=res.resid.index, y=res.resid, name='Resid'), 4, 1)
fig['layout'].update(title='成分分解(PV / エントリ数)')
iplot(fig, image=save_image, filename='成分分解(PV / エントリ数)')
ts = df.pv
ts_train = df_train.pv
ts_test = df_test.pv
def fit_model(ts_train, order, seasonal_order):
model = SARIMAX(
ts_train,
order=order,
seasonal_order=seasonal_order,
enforce_stationarity=False,
enforce_invertibility=False)
result = model.fit()
return result
def eval_model(ts_train, ts_test, result):
train_pred = result.predict()
test_pred = result.forecast(len(ts_test))
test_pred_ci = result.get_forecast(len(ts_test)).conf_int()
train_rmse = np.sqrt(mean_squared_error(ts_train, train_pred))
test_rmse = np.sqrt(mean_squared_error(ts_test, test_pred))
print('RMSE(train):\t{:.5}\nRMSE(test):\t{:.5}'.format(
train_rmse, test_rmse))
return train_pred, test_pred, test_pred_ci
def make_fig(ts, train_pred, test_pred, test_pred_ci, result, title='SARIMA'):
data = [
Scatter(x=ts.index, y=ts, name='Original', line=dict(dash='dot')),
Scatter(x=train_pred.index, y=train_pred, name='Predict(train)'),
Scatter(x=test_pred.index, y=test_pred, name='Predict(test)'),
# Scatter(x=result.resid.index, y=result.resid, name='Resid')
]
fig1 = Figure(data=data, layout=Layout(title=title))
fig2, ax = plt.subplots()
ts.plot(ax=ax, label='Original', linestyle="dashed")
train_pred.plot(ax=ax, label='Predict(train)')
test_pred.plot(ax=ax, label='Predict(test)')
ax.fill_between(
test_pred_ci.index,
test_pred_ci.iloc[:, 0],
test_pred_ci.iloc[:, 1],
color='k',
alpha=.2)
ax.legend()
ax.set_title(title)
fig3, ax = plt.subplots()
ts['2018-04-01':].plot(ax=ax, label='Original', linestyle="dashed")
test_pred.plot(ax=ax, label='Predict(test)', color='g')
ax.fill_between(
test_pred_ci.index,
test_pred_ci.iloc[:, 0],
test_pred_ci.iloc[:, 1],
color='k',
alpha=.2)
ax.legend()
ax.set_title(title)
return fig1, fig2, fig3
warnings.filterwarnings('ignore')
s_diff_train = s_diff[s_diff.index < '2018-04-01']
res = sm.tsa.arma_order_select_ic(s_diff_train, max_ar=5, max_ma=7, ic='aic')
res.aic_min_order
(2, 6)
# s_diff is the once-differenced, seasonally adjusted series, so d = 1 with (p, q) = (2, 6)
order = (2, 1, 6)
seasonal_order = (1, 1, 1, 7)  # weekly (period 7) seasonality
result = fit_model(ts_train, order, seasonal_order)
train_pred, test_pred, test_pred_ci = eval_model(ts_train, ts_test, result)
result.summary()
RMSE(train):	58.447
RMSE(test):	150.03
| Dep. Variable: | pv | No. Observations: | 182 |
|---|---|---|---|
| Model: | SARIMAX(2, 1, 6)x(1, 1, 1, 7) | Log Likelihood | -890.102 |
| Date: | Sun, 06 May 2018 | AIC | 1802.205 |
| Time: | 21:06:17 | BIC | 1837.449 |
| Sample: | 10-01-2017 | HQIC | 1816.492 |
| | - 03-31-2018 | | |
| Covariance Type: | opg | | |
| | coef | std err | z | P>\|z\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| ar.L1 | -0.6489 | 4.479 | -0.145 | 0.885 | -9.427 | 8.129 |
| ar.L2 | 0.3797 | 4.560 | 0.083 | 0.934 | -8.558 | 9.317 |
| ma.L1 | 0.1962 | 13.725 | 0.014 | 0.989 | -26.704 | 27.096 |
| ma.L2 | -0.9070 | 14.113 | -0.064 | 0.949 | -28.568 | 26.754 |
| ma.L3 | -0.1296 | 2.655 | -0.049 | 0.961 | -5.332 | 5.073 |
| ma.L4 | -0.0245 | 1.562 | -0.016 | 0.987 | -3.085 | 3.036 |
| ma.L5 | -0.0177 | 0.970 | -0.018 | 0.985 | -1.918 | 1.883 |
| ma.L6 | -0.0197 | 0.420 | -0.047 | 0.963 | -0.844 | 0.804 |
| ar.S.L7 | -0.2623 | 0.231 | -1.135 | 0.256 | -0.715 | 0.190 |
| ma.S.L7 | -0.6646 | 0.232 | -2.868 | 0.004 | -1.119 | -0.210 |
| sigma2 | 3837.1275 | 5.99e+04 | 0.064 | 0.949 | -1.14e+05 | 1.21e+05 |
| Ljung-Box (Q): | 18.96 | Jarque-Bera (JB): | 24679.04 |
|---|---|---|---|
| Prob(Q): | 1.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 18.05 | Skew: | 6.31 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 62.52 |
title = 'SARIMA order={}, seasonal_order={}'.format(order, seasonal_order)
fig1, fig2, fig3 = make_fig(ts, train_pred, test_pred, test_pred_ci, result, title)
iplot(fig1, image=save_image, filename='SARIMA1-1')
if save_image:
fig2.savefig('SARIMA1-2')
fig3.savefig('SARIMA1-3')
n = 40
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
sm.graphics.tsa.plot_acf(result.resid, lags=n, ax=ax1)
sm.graphics.tsa.plot_pacf(result.resid, lags=n, ax=ax2)
if save_image:
fig.savefig('残差1')
p_range = range(0, 6)
d_range = range(0, 4)
q_range = range(0, 8)
sp_range = range(0, 2)
sd_range = range(0, 2)
sq_range = range(0, 2)
history = pd.DataFrame()
for p, d, q, sp, sd, sq in itertools.product(p_range, d_range, q_range, sp_range, sd_range, sq_range):
if p == 0 and q == 0:
continue
model = SARIMAX(
ts_train,
order=(p, d, q),
seasonal_order=(sp, sd, sq, 7),
enforce_stationarity=False,
enforce_invertibility=False)
result = model.fit()
train_pred = result.predict()
test_pred = result.forecast(len(df_test))
history = history.append(
pd.Series({
'p': p,
'd': d,
'q': q,
'sp': sp,
'sd': sd,
'sq': sq,
'aic': result.aic,
'bic': result.bic,
'train-rmse': np.sqrt(mean_squared_error(ts_train, train_pred)),
'test-rmse': np.sqrt(mean_squared_error(ts_test, test_pred))
}),
ignore_index=True)
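With roughly 1,500 parameter combinations, some fits may fail to converge or raise numerical errors; a guarded variant of the fitting step inside the loop, as a sketch:

# Guarded fit (sketch): skip combinations that error out instead of aborting the search.
try:
    result = model.fit(disp=False)  # disp=False silences optimizer output
except Exception:
    continue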
history.sort_values(by='aic').head()
| | aic | bic | d | p | q | sd | sp | sq | test-rmse | train-rmse |
|---|---|---|---|---|---|---|---|---|---|---|
| 607 | 1788.577241 | 1827.025322 | 1.0 | 2.0 | 7.0 | 1.0 | 1.0 | 1.0 | 213.233196 | 56.551452 |
| 859 | 1788.745876 | 1827.193957 | 1.0 | 3.0 | 7.0 | 1.0 | 0.0 | 1.0 | 113.702525 | 56.803243 |
| 1115 | 1789.378411 | 1831.030497 | 1.0 | 4.0 | 7.0 | 1.0 | 0.0 | 1.0 | 135.805266 | 56.125318 |
| 1119 | 1789.942875 | 1834.798969 | 1.0 | 4.0 | 7.0 | 1.0 | 1.0 | 1.0 | 157.814221 | 56.008527 |
| 111 | 1790.310009 | 1822.350076 | 1.0 | 0.0 | 7.0 | 1.0 | 1.0 | 1.0 | 156.144135 | 58.798796 |
history.sort_values(by='train-rmse').head()
| | aic | bic | d | p | q | sd | sp | sq | test-rmse | train-rmse |
|---|---|---|---|---|---|---|---|---|---|---|
| 1113 | 1859.642120 | 1901.294207 | 1.0 | 4.0 | 7.0 | 0.0 | 0.0 | 1.0 | 113.475712 | 53.923458 |
| 1369 | 1856.598062 | 1901.454155 | 1.0 | 5.0 | 7.0 | 0.0 | 0.0 | 1.0 | 112.341062 | 53.937692 |
| 1373 | 1858.913330 | 1906.973430 | 1.0 | 5.0 | 7.0 | 0.0 | 1.0 | 1.0 | 115.611538 | 53.990284 |
| 1361 | 1869.243645 | 1910.895732 | 1.0 | 5.0 | 6.0 | 0.0 | 0.0 | 1.0 | 112.119264 | 53.996537 |
| 1372 | 1886.285779 | 1931.141873 | 1.0 | 5.0 | 7.0 | 0.0 | 1.0 | 0.0 | 126.945872 | 54.306539 |
history.sort_values(by='test-rmse').head()
| | aic | bic | d | p | q | sd | sp | sq | test-rmse | train-rmse |
|---|---|---|---|---|---|---|---|---|---|---|
| 226 | 2033.558278 | 2039.966291 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 59.381694 | 80.715324 |
| 234 | 2024.857090 | 2034.469110 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 59.520844 | 80.706696 |
| 482 | 2024.849634 | 2034.461654 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 59.561665 | 80.705050 |
| 554 | 2018.883701 | 2031.699728 | 1.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.536567 | 80.813581 |
| 306 | 2008.047838 | 2020.863864 | 1.0 | 1.0 | 2.0 | 1.0 | 0.0 | 0.0 | 62.031098 | 80.816687 |
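The best-AIC and best-test-RMSE configurations differ sharply. As a sketch reusing the helpers defined above, the top row here, order (1, 0, 0) with seasonal_order (0, 1, 0, 7), can be refit the same way; keep in mind that selecting hyperparameters by test RMSE leaks test information into the model choice:

# Sketch: refit the configuration with the lowest test RMSE from the grid search.
result_alt = fit_model(ts_train, (1, 0, 0), (0, 1, 0, 7))
_ = eval_model(ts_train, ts_test, result_alt)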
# Refit with the lowest-AIC combination from the grid search
order = (2, 1, 7)
seasonal_order = (1, 1, 1, 7)
result = fit_model(ts_train, order, seasonal_order)
train_pred, test_pred, test_pred_ci = eval_model(ts_train, ts_test, result)
result.summary()
RMSE(train):	56.551
RMSE(test):	213.23
| Dep. Variable: | pv | No. Observations: | 182 |
|---|---|---|---|
| Model: | SARIMAX(2, 1, 7)x(1, 1, 1, 7) | Log Likelihood | -882.289 |
| Date: | Sun, 06 May 2018 | AIC | 1788.577 |
| Time: | 21:31:03 | BIC | 1827.025 |
| Sample: | 10-01-2017 | HQIC | 1804.164 |
| | - 03-31-2018 | | |
| Covariance Type: | opg | | |
| | coef | std err | z | P>\|z\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| ar.L1 | -0.7341 | 0.602 | -1.220 | 0.223 | -1.913 | 0.445 |
| ar.L2 | -0.6721 | 0.353 | -1.902 | 0.057 | -1.365 | 0.021 |
| ma.L1 | 0.3005 | 0.646 | 0.465 | 0.642 | -0.965 | 1.566 |
| ma.L2 | 0.1380 | 0.287 | 0.482 | 0.630 | -0.424 | 0.699 |
| ma.L3 | -0.7480 | 0.381 | -1.961 | 0.050 | -1.496 | -0.000 |
| ma.L4 | -0.2445 | 0.425 | -0.575 | 0.565 | -1.077 | 0.588 |
| ma.L5 | -0.1649 | 0.322 | -0.512 | 0.609 | -0.796 | 0.466 |
| ma.L6 | -0.1059 | 0.260 | -0.408 | 0.684 | -0.615 | 0.404 |
| ma.L7 | 0.1592 | 0.357 | 0.446 | 0.656 | -0.540 | 0.859 |
| ar.S.L7 | -0.6176 | 0.249 | -2.484 | 0.013 | -1.105 | -0.130 |
| ma.S.L7 | -0.6251 | 0.271 | -2.309 | 0.021 | -1.156 | -0.095 |
| sigma2 | 4736.6494 | 822.241 | 5.761 | 0.000 | 3125.087 | 6348.212 |
| Ljung-Box (Q): | 17.50 | Jarque-Bera (JB): | 14827.61 |
|---|---|---|---|
| Prob(Q): | 1.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 14.90 | Skew: | 5.27 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 49.12 |
title = 'SARIMA order={}, seasonal_order={}'.format(order, seasonal_order)
fig1, fig2, fig3 = make_fig(ts, train_pred, test_pred, test_pred_ci, result, title)
iplot(fig1, image=save_image, filename='SARIMA2-1')
if save_image:
fig2.savefig('SARIMA2-2')
fig3.savefig('SARIMA2-3')
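The residual correlogram check applied after the first fit is worth repeating for this model; the same block, with a hypothetical output name mirroring 残差1:

n = 40
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
sm.graphics.tsa.plot_acf(result.resid, lags=n, ax=ax1)
sm.graphics.tsa.plot_pacf(result.resid, lags=n, ax=ax2)
if save_image:
    fig.savefig('残差2')  # hypothetical filename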