In [1]:
import os
import pandas as pd

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go

print __version__ # need 1.9.0 or greater

init_notebook_mode(connected = True)




def plotly_df(df, title = ''):
    """Plot each column of a dataframe as a separate Plotly line."""
    data = []
    
    for column in df.columns:
        trace = go.Scatter(
            x = df.index,
            y = df[column],
            mode = 'lines',
            name = column
        )
        data.append(trace)
    
    layout = dict(title = title)
    fig = dict(data = data, layout = layout)
    iplot(fig, show_link=False)
    
%pylab inline
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm

def invboxcox(y, lmbda):
    # inverse Box-Cox transform: maps a transformed series back to the original scale
    if lmbda == 0:
        return(np.exp(y))
    else:
        return(np.exp(np.log(lmbda*y+1)/lmbda))
2.0.1
Populating the interactive namespace from numpy and matplotlib
/Users/miptgirl/anaconda/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning:

pylab import has clobbered these variables: ['plot', '__version__']
`%matplotlib` prevents importing * from pylab and numpy
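
For reference, invboxcox above inverts the standard Box-Cox transform (a textbook identity, not specific to this dataset):

$$y^{(\lambda)} = \begin{cases} \dfrac{y^{\lambda} - 1}{\lambda}, & \lambda \neq 0,\\ \ln y, & \lambda = 0 \end{cases} \quad\Longrightarrow\quad y = \begin{cases} \exp\!\left(\dfrac{\ln(\lambda y^{(\lambda)} + 1)}{\lambda}\right), & \lambda \neq 0,\\ \exp\!\left(y^{(\lambda)}\right), & \lambda = 0. \end{cases}$$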

Loading and preprocessing the data

In [2]:
habr_df = pd.read_csv('howpop_train.csv')
In [3]:
habr_df['published'] = pd.to_datetime(habr_df.published)
habr_df = habr_df[['published', 'url']]
habr_df = habr_df.drop_duplicates()
In [4]:
aggr_habr_df = habr_df.groupby('published')[['url']].count()
aggr_habr_df.columns = ['posts']
In [5]:
aggr_habr_df = aggr_habr_df.resample('D').apply(sum)
plotly_df(aggr_habr_df.resample('W').apply(sum), title = 'Posts published on Habrahabr')
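
Since 'published' holds full timestamps, the groupby above yields at most one row per distinct timestamp; resample('D') then collapses these onto a regular daily grid. A toy illustration (a hypothetical two-row frame, not from the dataset):

toy = pd.DataFrame({'posts': [1, 2]},
                   index=pd.to_datetime(['2016-01-01 10:00', '2016-01-01 12:00']))
print toy.resample('D').apply(sum)  # one row: 2016-01-01, posts = 3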

Building a forecast with Prophet

In [6]:
from fbprophet import Prophet
In [12]:
predictions = 30

train_df = aggr_habr_df[:-predictions].copy()  # hold out the last 30 days
train_df.reset_index(inplace=True)
train_df.columns = ['ds', 'y']
train_df.tail()

# Python
m = Prophet()
m.fit(train_df);

future = m.make_future_dataframe(periods=predictions)

forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[12]:
              ds       yhat  yhat_lower  yhat_upper
3221  2016-10-27  67.331440   52.639240   82.479057
3222  2016-10-28  63.528170   48.193334   78.255027
3223  2016-10-29  35.546823   20.856616   50.085801
3224  2016-10-30  37.965443   24.419706   52.640538
3225  2016-10-31  68.420563   54.149134   83.448969
In [13]:
print ', '.join(forecast.columns)
ds, t, trend, seasonal_lower, seasonal_upper, trend_lower, trend_upper, yhat_lower, yhat_upper, weekly, weekly_lower, weekly_upper, yearly, yearly_lower, yearly_upper, seasonal, yhat
In [16]:
m.plot(forecast);
In [17]:
m.plot_components(forecast);

Evaluating the quality of the Prophet forecast

In [18]:
# df holds the full series (train + the 30 held-out days) for comparison with the forecast
df = aggr_habr_df.reset_index()
df.columns = ['ds', 'y']

cmp_df = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds'))
In [19]:
import numpy as np
cmp_df['e'] = cmp_df['y'] - cmp_df['yhat']
cmp_df['p'] = 100*cmp_df['e']/cmp_df['y']
np.mean(abs(cmp_df[-predictions:]['p'])), np.mean(abs(cmp_df[-predictions:]['e']))
Out[19]:
(37.34607096816031, 10.623727053544432)
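
These two numbers are MAPE and MAE over the 30 held-out days. A small helper makes the same computation reusable for the Box-Cox variant below (a sketch; calculate_forecast_errors is a name introduced here, not taken from the code above):

def calculate_forecast_errors(cmp_df, predictions):
    # MAPE and MAE over the last `predictions` points
    e = cmp_df['y'] - cmp_df['yhat']
    p = 100 * e / cmp_df['y']
    return np.mean(abs(p[-predictions:])), np.mean(abs(e[-predictions:]))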

Forecasting with a Box-Cox transformation

In [20]:
train_df2 = train_df.copy().fillna(14)  # fill gaps with a small positive constant: Box-Cox requires positive values
train_df2 = train_df2.set_index('ds')
train_df2['y'], lmbda_prophet = stats.boxcox(train_df2['y'])

train_df2.reset_index(inplace=True)

m2 = Prophet()
m2.fit(train_df2)
future2 = m2.make_future_dataframe(periods=predictions)

forecast2 = m2.predict(future2)
forecast2['yhat'] = invboxcox(forecast2.yhat, lmbda_prophet)
forecast2['yhat_lower'] = invboxcox(forecast2.yhat_lower, lmbda_prophet)
forecast2['yhat_upper'] = invboxcox(forecast2.yhat_upper, lmbda_prophet)

cmp_df2 = forecast2.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds'))

cmp_df2['e'] = cmp_df2['y'] - cmp_df2['yhat']
cmp_df2['p'] = 100*cmp_df2['e']/cmp_df2['y']
np.mean(abs(cmp_df2[-predictions:]['p'])), np.mean(abs(cmp_df2[-predictions:]['e']))
Out[20]:
(26.789153086481026, 8.485639747730659)

Visualizing the results

In [28]:
def show_forecast(cmp_df, num_predictions, num_values):
    upper_bound = go.Scatter(
        name='Upper Bound',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat_upper,
        mode='lines',
        marker=dict(color="444"),
        line=dict(width=0),
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty')

    forecast = go.Scatter(
        name='Prediction',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat,
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
    )

    lower_bound = go.Scatter(
        name='Lower Bound',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat_lower,
        marker=dict(color="444"),
        line=dict(width=0),
        mode='lines')

    fact = go.Scatter(
        name='Fact',
        x=cmp_df.tail(num_values).index,
        y=cmp_df.tail(num_values).y,
        marker=dict(color="red"),
        mode='lines',
    )

    # Trace order can be important
    # with continuous error bars
    data = [lower_bound, upper_bound, forecast, fact]

    layout = go.Layout(
        yaxis=dict(title='Posts'),
        title='Posts published on Habrahabr',
        showlegend = False)

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, show_link=False)
In [29]:
show_forecast(cmp_df, predictions, 200)
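
The same call works for the Box-Cox model, e.g. show_forecast(cmp_df2, predictions, 200).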

Comparison with an ARIMA model

In [22]:
train_df = train_df.fillna(14).set_index('ds')
In [23]:
plt.figure(figsize=(15,10))
sm.tsa.seasonal_decompose(train_df['y'].values, freq=7).plot()
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1])
Dickey-Fuller test: p=0.040528
In [24]:
train_df.index = pd.to_datetime(train_df.index)
In [25]:
train_df['y_box'], lmbda = stats.boxcox(map(lambda x: 1 if x == 0 else x, train_df['y']))  # replace zeros: Box-Cox needs positive values
plt.figure(figsize=(15,7))
train_df.y_box.plot()
plt.ylabel(u'Transformed posts')
print("Optimal Box-Cox transformation parameter: %f" % lmbda)
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1])
Optimal Box-Cox transformation parameter: 0.415948
Dickey-Fuller test: p=0.040528
In [26]:
train_df['y_box_diff'] = train_df.y_box - train_df.y_box.shift(7)
plt.figure(figsize=(15,10))
sm.tsa.seasonal_decompose(train_df.y_box_diff[12:].values, freq=7).plot()
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df.y_box_diff[8:])[1])
Dickey-Fuller test: p=0.000000
In [27]:
plt.figure(figsize=(15,8))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)
pylab.show()

Initial approximations based on the ACF/PACF plots above: Q = 1, q = 4, P = 5, p = 3

In [42]:
ps = range(0, 4)
d=1
qs = range(0, 5)
Ps = range(0, 7)
D=1
Qs = range(0, 2)
In [43]:
from itertools import product

parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
Out[43]:
280
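
(280 is simply the product of the range sizes: 4 values of p × 5 of q × 7 of P × 2 of Q.)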
In [1]:
%%time
results = []
best_aic = float("inf")
import warnings
warnings.filterwarnings('ignore')


for param in parameters_list:
    print param
    # try/except is needed because the model fails to fit on some parameter sets
    try:
        %time model=sm.tsa.statespace.SARIMAX(train_df.y_box, order=(param[0], d, param[1]), seasonal_order=(param[2], D, param[3], 7)).fit(disp=-1)
    # report the parameters the model cannot be fit with and move on to the next set
    except ValueError:
        print('wrong parameters:', param)
        continue
    aic = model.aic
    # keep the best model, its AIC and its parameters
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, model.aic])
    
warnings.filterwarnings('default')
In [45]:
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']
print(result_table.sort_values(by = 'aic', ascending=True).head())
       parameters          aic
212  (3, 4, 1, 1)  9597.537226
214  (3, 4, 2, 1)  9598.586890
216  (3, 4, 3, 1)  9599.168322
218  (3, 4, 4, 1)  9600.863045
220  (3, 4, 5, 1)  9601.172349
In [46]:
print(best_model.summary())
                                 Statespace Model Results                                
=========================================================================================
Dep. Variable:                             y_box   No. Observations:                 3197
Model:             SARIMAX(3, 1, 4)x(1, 1, 1, 7)   Log Likelihood               -4788.769
Date:                           Wed, 22 Mar 2017   AIC                           9597.537
Time:                                   23:58:32   BIC                           9658.237
Sample:                               01-01-2008   HQIC                          9619.300
                                    - 10-01-2016                                         
Covariance Type:                             opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.2350      0.218     -1.079      0.280      -0.662       0.192
ar.L2          0.1784      0.145      1.231      0.218      -0.106       0.463
ar.L3          0.5557      0.118      4.697      0.000       0.324       0.788
ma.L1         -0.5165      0.219     -2.360      0.018      -0.946      -0.087
ma.L2         -0.4611      0.268     -1.721      0.085      -0.986       0.064
ma.L3         -0.4182      0.177     -2.365      0.018      -0.765      -0.072
ma.L4          0.4091      0.097      4.220      0.000       0.219       0.599
ar.S.L7        0.0685      0.021      3.296      0.001       0.028       0.109
ma.S.L7       -0.9492      0.007   -139.187      0.000      -0.963      -0.936
sigma2         1.1726      0.021     57.043      0.000       1.132       1.213
===================================================================================
Ljung-Box (Q):                       73.39   Jarque-Bera (JB):              1200.17
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               1.25   Skew:                            -0.69
Prob(H) (two-sided):                  0.00   Kurtosis:                         5.67
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [47]:
plt.figure(figsize=(15,8))
plt.subplot(211)
best_model.resid[13:].plot()
plt.ylabel(u'Residuals')

ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(best_model.resid[13:].values.squeeze(), lags=48, ax=ax)

print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(best_model.resid[13:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[13:])[1])
Критерий Стьюдента: p=0.050717
Критерий Дики-Фуллера: p=0.000000
In [55]:
train_df['arima_model'] = invboxcox(best_model.fittedvalues, lmbda)
plt.figure(figsize=(15,7))
train_df.y.tail(200).plot()
train_df.arima_model[13:].tail(200).plot(color='r')
plt.ylabel('Posts on Habr')
pylab.show()
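
The fitted values above are in-sample. To compare the ARIMA model with Prophet on the same footing, one can forecast the 30 held-out days and compute the same MAPE/MAE (a sketch, assuming aggr_habr_df still holds the full daily series; forecast() is the standard SARIMAXResults method):

# out-of-sample forecast for the held-out days, mapped back to the original scale
arima_pred = invboxcox(best_model.forecast(steps=predictions), lmbda)
actual = aggr_habr_df['posts'][-predictions:].values

e = actual - arima_pred
p = 100 * e / actual
print np.mean(abs(p)), np.mean(abs(e))  # MAPE, MAE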