import os
import pandas as pd
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
import requests
import StringIO  # NOTE(review): Python 2 only; on Python 3 use `from io import StringIO`
import pandas as pd  # NOTE(review): duplicate of the pandas import above

# Plotly offline mode requires version 1.9.0 or greater.
# The parenthesised print works on both Python 2 and 3 (the original used the
# Python-2 print statement).
print(__version__)
init_notebook_mode(connected=True)
def plotly_df(df, title=''):
    """Render every column of *df* as its own line on one interactive plot.

    The DataFrame index is used as the x axis; each column becomes a named
    'lines' trace. The figure is shown inline via plotly's iplot.
    """
    traces = [
        go.Scatter(x=df.index, y=df[col], mode='lines', name=col)
        for col in df.columns
    ]
    iplot(dict(data=traces, layout=dict(title=title)), show_link=False)
%pylab inline
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
def invboxcox(y, lmbda):
    """Invert the Box-Cox transform that was applied with parameter *lmbda*."""
    if lmbda == 0:
        # lambda == 0 corresponds to a plain log transform, so invert with exp.
        return np.exp(y)
    # General case: (lmbda*y + 1) ** (1/lmbda), expressed through exp/log.
    return np.exp(np.log(lmbda * y + 1) / lmbda)
2.0.1
Populating the interactive namespace from numpy and matplotlib
/Users/miptgirl/anaconda/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['plot', '__version__'] `%matplotlib` prevents importing * from pylab and numpy
# Load the Habrahabr posts dataset and build a daily time series of post counts.
habr_df = pd.read_csv('howpop_train.csv')
habr_df['published'] = pd.to_datetime(habr_df.published)
habr_df = habr_df[['published', 'url']]
habr_df = habr_df.drop_duplicates()

# Count posts per publication timestamp, then resample to daily totals.
aggr_habr_df = habr_df.groupby('published')[['url']].count()
aggr_habr_df.columns = ['posts']
# .sum() is the idiomatic (and faster) equivalent of .apply(sum) on a resampler.
aggr_habr_df = aggr_habr_df.resample('D').sum()

# Weekly totals give a smoother overview plot.
plotly_df(aggr_habr_df.resample('W').sum(), title = 'Опубликованные посты на Хабрахабре')
# Fit a Prophet model on all but the last `predictions` days and forecast them.
from fbprophet import Prophet
predictions = 30
# Hold out the last `predictions` daily observations for validation.
train_df = aggr_habr_df[:-predictions]
train_df.reset_index(inplace=True)
# Prophet expects exactly two columns: 'ds' (date) and 'y' (value).
train_df.columns = ['ds', 'y']
train_df.tail()
# Python
m = Prophet()
m.fit(train_df);
# Extend the frame `predictions` days past the training range and predict.
future = m.make_future_dataframe(periods=predictions)
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
ds | yhat | yhat_lower | yhat_upper | |
---|---|---|---|---|
3221 | 2016-10-27 | 67.331440 | 52.639240 | 82.479057 |
3222 | 2016-10-28 | 63.528170 | 48.193334 | 78.255027 |
3223 | 2016-10-29 | 35.546823 | 20.856616 | 50.085801 |
3224 | 2016-10-30 | 37.965443 | 24.419706 | 52.640538 |
3225 | 2016-10-31 | 68.420563 | 54.149134 | 83.448969 |
# The Python-2 print statement fails on Python 3; the function form works on both.
print(', '.join(forecast.columns))
ds, t, trend, seasonal_lower, seasonal_upper, trend_lower, trend_upper, yhat_lower, yhat_upper, weekly, weekly_lower, weekly_upper, yearly, yearly_lower, yearly_upper, seasonal, yhat
# Plot the forecast, then its trend/weekly/yearly components.
m.plot(forecast);
m.plot_components(forecast);
# Join the predictions with the actual values to measure forecast quality.
# NOTE(review): `df` is not defined anywhere in this file — it is presumably the
# full (not truncated) series with 'ds'/'y' columns; verify before running.
cmp_df = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds'))
import numpy as np
# e: signed error, p: percentage error.
cmp_df['e'] = cmp_df['y'] - cmp_df['yhat']
cmp_df['p'] = 100*cmp_df['e']/cmp_df['y']
# MAPE and MAE over the held-out horizon.
np.mean(abs(cmp_df[-predictions:]['p'])), np.mean(abs(cmp_df[-predictions:]['e']))
(37.34607096816031, 10.623727053544432)
# Repeat the Prophet experiment on a Box-Cox-transformed series.
train_df2 = train_df.copy().fillna(14)
train_df2 = train_df2.set_index('ds')
# stats.boxcox returns the transformed values and the fitted lambda.
train_df2['y'], lmbda_prophet = stats.boxcox(train_df2['y'])
train_df2.reset_index(inplace=True)
m2 = Prophet()
m2.fit(train_df2)
future2 = m2.make_future_dataframe(periods=30)
forecast2 = m2.predict(future2)
# Map the predictions and their bounds back to the original scale.
forecast2['yhat'] = invboxcox(forecast2.yhat, lmbda_prophet)
forecast2['yhat_lower'] = invboxcox(forecast2.yhat_lower, lmbda_prophet)
forecast2['yhat_upper'] = invboxcox(forecast2.yhat_upper, lmbda_prophet)
# NOTE(review): `df` is undefined here as well — same issue as the first comparison.
cmp_df2 = forecast2.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds'))
cmp_df2['e'] = cmp_df2['y'] - cmp_df2['yhat']
cmp_df2['p'] = 100*cmp_df2['e']/cmp_df2['y']
# MAPE and MAE after the Box-Cox round trip.
np.mean(abs(cmp_df2[-predictions:]['p'])), np.mean(abs(cmp_df2[-predictions:]['e']))
(26.789153086481026, 8.485639747730659)
def show_forecast(cmp_df, num_predictions, num_values):
    """Plot the last *num_values* actuals with the last *num_predictions*
    forecast points and their confidence band.

    Parameters
    ----------
    cmp_df : DataFrame indexed by date with columns y, yhat, yhat_lower, yhat_upper.
    num_predictions : int — number of forecast points to show.
    num_values : int — number of actual points to show.
    """
    upper_bound = go.Scatter(
        name='Upper Bound',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat_upper,
        mode='lines',
        marker=dict(color="444"),
        line=dict(width=0),
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty')
    forecast = go.Scatter(
        name='Prediction',
        # BUG FIX: the original used the global `predictions` here instead of
        # the `num_predictions` parameter, silently ignoring the argument.
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat,
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
    )
    lower_bound = go.Scatter(
        name='Lower Bound',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat_lower,
        marker=dict(color="444"),
        line=dict(width=0),
        mode='lines')
    fact = go.Scatter(
        name='Fact',
        x=cmp_df.tail(num_values).index,
        y=cmp_df.tail(num_values).y,
        marker=dict(color="red"),
        mode='lines',
    )
    # Trace order matters for continuous error bars: the lower bound goes first
    # so the upper bound's 'tonexty' fill shades the band between them.
    data = [lower_bound, upper_bound, forecast, fact]
    layout = go.Layout(
        yaxis=dict(title='Посты'),
        title='Опубликованные посты на Хабрахабре',
        showlegend=False)
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, show_link=False)
# Show the last 200 actual points with the 30-day forecast overlay.
show_forecast(cmp_df, predictions, 200)
%pylab inline
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
def invboxcox(y,lmbda):
    """Invert a Box-Cox transform applied with parameter lmbda."""
    # NOTE(review): exact redefinition of the invboxcox defined earlier in this file.
    if lmbda == 0:
        # lambda == 0 corresponds to a log transform, so invert with exp.
        return(np.exp(y))
    else:
        return(np.exp(np.log(lmbda*y+1)/lmbda))
Populating the interactive namespace from numpy and matplotlib
# Seasonal decomposition and stationarity check of the raw daily series.
train_df = train_df.fillna(14).set_index('ds')
# `figsize(...)` was a deprecated %pylab helper; pass the size as a keyword instead.
plt.figure(figsize=(15, 10))
sm.tsa.seasonal_decompose(train_df['y'].values, freq=7).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1])
Критерий Дики-Фуллера: p=0.040528
<matplotlib.figure.Figure at 0x113937510>
train_df.index = pd.to_datetime(train_df.index)
# Replace zeros with 1 so Box-Cox (which needs strictly positive input) is defined.
# A list comprehension replaces map() so the code also works on Python 3,
# where map() returns a lazy iterator rather than a list.
train_df['y_box'], lmbda = stats.boxcox([1 if x == 0 else x for x in train_df['y']])
plt.figure(figsize=(15, 7))
train_df.y.plot()
plt.ylabel(u'Transformed wine sales')  # NOTE(review): label looks left over from another dataset
print("Оптимальный параметр преобразования Бокса-Кокса: %f" % lmbda)
# NOTE(review): this tests the *untransformed* 'y'; testing 'y_box' was probably intended.
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1])
Оптимальный параметр преобразования Бокса-Кокса: 0.415948 Критерий Дики-Фуллера: p=0.040528
# Remove weekly seasonality with a 7-day seasonal difference.
train_df['y_box_diff'] = train_df.y_box - train_df.y_box.shift(7)
plt.figure(figsize=(15, 10))
# Skip the leading values that are NaN after the 7-lag difference.
sm.tsa.seasonal_decompose(train_df.y_box_diff[12:].values, freq=7).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df.y_box_diff[8:])[1])
Критерий Дики-Фуллера: p=0.000000
<matplotlib.figure.Figure at 0x1139372d0>
# ACF/PACF of the differenced series — used to pick initial SARIMA orders.
plt.figure(figsize=(15, 8))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)
plt.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)
plt.show()
Начальные приближения Q = 1, q = 4, P = 5, p = 3
# Grid of SARIMA hyper-parameters around the ACF/PACF-based initial guesses.
ps = range(0, 4)
d=1
qs = range(0, 5)
# NOTE(review): the note above says the initial guess is P = 5, yet this range
# allows P up to 6 (and the reported grid size of 280 matches range(0, 7)).
Ps = range(0, 7)
D=1
Qs = range(0, 2)
from itertools import product
# Every combination of (p, q, P, Q); d and D stay fixed at 1.
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
280
# Exhaustive SARIMA grid search; keeps the model with the lowest AIC.
# (The original cell used %%time/%time IPython magics, which are not valid
# plain Python; they have been dropped.)
import warnings  # the original used `warnings` without importing it

results = []
best_aic = float("inf")
warnings.filterwarnings('ignore')
for param in parameters_list:
    print(param)
    # try/except is needed because the model fails to fit on some parameter sets
    try:
        model = sm.tsa.statespace.SARIMAX(
            train_df.y_box,
            order=(param[0], d, param[1]),
            seasonal_order=(param[2], D, param[3], 7),
        ).fit(disp=-1)
    # report the parameter sets the model does not fit on and move to the next one
    except ValueError:
        print('wrong parameters:', param)
        continue
    aic = model.aic
    # remember the best model, its AIC and its parameters
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, model.aic])
warnings.filterwarnings('default')
# Rank all successfully fitted parameter sets by AIC (lower is better).
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']
print(result_table.sort_values(by = 'aic', ascending=True).head())
parameters aic 212 (3, 4, 1, 1) 9597.537226 214 (3, 4, 2, 1) 9598.586890 216 (3, 4, 3, 1) 9599.168322 218 (3, 4, 4, 1) 9600.863045 220 (3, 4, 5, 1) 9601.172349
# Detailed statistics of the best (lowest-AIC) model.
print(best_model.summary())
Statespace Model Results ========================================================================================= Dep. Variable: y_box No. Observations: 3197 Model: SARIMAX(3, 1, 4)x(1, 1, 1, 7) Log Likelihood -4788.769 Date: Wed, 22 Mar 2017 AIC 9597.537 Time: 23:58:32 BIC 9658.237 Sample: 01-01-2008 HQIC 9619.300 - 10-01-2016 Covariance Type: opg ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ ar.L1 -0.2350 0.218 -1.079 0.280 -0.662 0.192 ar.L2 0.1784 0.145 1.231 0.218 -0.106 0.463 ar.L3 0.5557 0.118 4.697 0.000 0.324 0.788 ma.L1 -0.5165 0.219 -2.360 0.018 -0.946 -0.087 ma.L2 -0.4611 0.268 -1.721 0.085 -0.986 0.064 ma.L3 -0.4182 0.177 -2.365 0.018 -0.765 -0.072 ma.L4 0.4091 0.097 4.220 0.000 0.219 0.599 ar.S.L7 0.0685 0.021 3.296 0.001 0.028 0.109 ma.S.L7 -0.9492 0.007 -139.187 0.000 -0.963 -0.936 sigma2 1.1726 0.021 57.043 0.000 1.132 1.213 =================================================================================== Ljung-Box (Q): 73.39 Jarque-Bera (JB): 1200.17 Prob(Q): 0.00 Prob(JB): 0.00 Heteroskedasticity (H): 1.25 Skew: -0.69 Prob(H) (two-sided): 0.00 Kurtosis: 5.67 =================================================================================== Warnings: [1] Covariance matrix calculated using the outer product of gradients (complex-step).
# Residual diagnostics: the residuals should look like unbiased stationary noise.
plt.figure(figsize=(15, 8))
plt.subplot(211)
best_model.resid[13:].plot()
plt.ylabel(u'Residuals')
ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(best_model.resid[13:].values.squeeze(), lags=48, ax=ax)
# t-test for zero mean and Dickey-Fuller test for stationarity of the residuals.
print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(best_model.resid[13:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[13:])[1])
Критерий Стьюдента: p=0.050717 Критерий Дики-Фуллера: p=0.000000
# Back-transform the in-sample fit to the original scale and compare with actuals.
train_df['arima_model'] = invboxcox(best_model.fittedvalues, lmbda)
plt.figure(figsize=(15, 7))
train_df.y.tail(200).plot()
train_df.arima_model[13:].tail(200).plot(color='r')
plt.ylabel('Posts on Habr')
plt.show()
import datetime  # the original used `datetime` without importing it

# BUG FIX: 'ds' is already the index after the earlier set_index('ds') call, so
# calling .set_index('ds') again (as the original did) would raise a KeyError.
arima_df = train_df[['y']]
# Future dates covering the out-of-sample forecast horizon.
date_list = [datetime.datetime.strptime("2016-10-01", "%Y-%m-%d") + datetime.timedelta(x) for x in range(0, predictions + 1)]
future = pd.DataFrame(index=date_list, columns=arima_df.columns)
arima_df = pd.concat([arima_df, future])
# Predict the held-out horizon and map back from the Box-Cox scale.
arima_df['forecast'] = invboxcox(best_model.predict(start=train_df.shape[0], end=train_df.shape[0] + predictions - 1), lmbda)
plt.figure(figsize=(15, 7))
arima_df.y.tail(200).plot()
arima_df.forecast.tail(200).plot(color='r')
plt.ylabel('Habr posts')
plt.show()
# Add the ARIMA forecast to the Prophet comparison frame and score it.
cmp_df = cmp_df.join(arima_df[['forecast']])
import numpy as np
# Signed and percentage errors of the ARIMA forecast.
cmp_df['e_arima'] = cmp_df['y'] - cmp_df['forecast']
cmp_df['p_arima'] = 100*cmp_df['e_arima']/cmp_df['y']
# MAPE and MAE of ARIMA over the held-out horizon.
np.mean(abs(cmp_df[-predictions:]['p_arima'])), np.mean(abs(cmp_df[-predictions:]['e_arima']))
(16.535559722987166, 7.278655704774915)
# Final comparison plot: Prophet forecast vs ARIMA forecast vs actual values.
num_values = 200
# NOTE(review): this rebinds the name `forecast`, which earlier held the
# Prophet forecast DataFrame.
forecast = go.Scatter(
    name='Prophet',
    x=cmp_df.tail(predictions).index,
    y=cmp_df.tail(predictions).yhat,
    mode='lines',
    line=dict(color='rgb(31, 119, 180)'),
)
fact = go.Scatter(
    name='Fact',
    x=cmp_df.tail(num_values).index,
    y=cmp_df.tail(num_values).y,
    marker=dict(color="red"),
    mode='lines',
)
arima = go.Scatter(
    name='ARIMA',
    x=cmp_df.tail(predictions).index,
    y=cmp_df.tail(predictions).forecast,
    mode='lines'
)
# Trace order can be important
# with continuous error bars
data = [forecast, fact, arima]
layout = go.Layout(
    yaxis=dict(title='Посты'),
    title='Опубликованные посты на Хабрахабре',
    showlegend = True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)