In [1]:
import os
import pandas as pd

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go

print __version__ # need 1.9.0 or greater

init_notebook_mode(connected = True)




def plotly_df(df, title = ''):
    """Plot each column of a dataframe as a separate Plotly line."""
    data = []
    
    for column in df.columns:
        trace = go.Scatter(
            x = df.index,
            y = df[column],
            mode = 'lines',
            name = column
        )
        data.append(trace)
    
    layout = dict(title = title)
    fig = dict(data = data, layout = layout)
    iplot(fig, show_link=False)
    
%pylab inline
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm

def invboxcox(y, lmbda):
    # inverse Box-Cox transform: maps a transformed series back to the original scale
    if lmbda == 0:
        return(np.exp(y))
    else:
        return(np.exp(np.log(lmbda*y+1)/lmbda))
2.0.1
Populating the interactive namespace from numpy and matplotlib
/Users/miptgirl/anaconda/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning:

pylab import has clobbered these variables: ['plot', '__version__']
`%matplotlib` prevents importing * from pylab and numpy
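
For reference, invboxcox above inverts the standard Box-Cox transform (a textbook identity, not specific to this dataset):

$$y^{(\lambda)} = \begin{cases} \dfrac{y^{\lambda} - 1}{\lambda}, & \lambda \neq 0,\\ \ln y, & \lambda = 0 \end{cases} \quad\Longrightarrow\quad y = \begin{cases} \exp\!\left(\dfrac{\ln(\lambda y^{(\lambda)} + 1)}{\lambda}\right), & \lambda \neq 0,\\ \exp\!\left(y^{(\lambda)}\right), & \lambda = 0. \end{cases}$$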

Loading and preprocessing the data

In [2]:
habr_df = pd.read_csv('howpop_train.csv')
In [3]:
habr_df['published'] = pd.to_datetime(habr_df.published)
habr_df = habr_df[['published', 'url']]
habr_df = habr_df.drop_duplicates()
In [4]:
aggr_habr_df = habr_df.groupby('published')[['url']].count()
aggr_habr_df.columns = ['posts']
In [5]:
aggr_habr_df = aggr_habr_df.resample('D').apply(sum)
plotly_df(aggr_habr_df.resample('W').apply(sum), title = 'Posts published on Habrahabr')
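
Since 'published' holds full timestamps, the groupby above yields at most one row per distinct timestamp; resample('D') then collapses these onto a regular daily grid. A toy illustration (a hypothetical two-row frame, not from the dataset):

toy = pd.DataFrame({'posts': [1, 2]},
                   index=pd.to_datetime(['2016-01-01 10:00', '2016-01-01 12:00']))
print toy.resample('D').apply(sum)  # one row: 2016-01-01, posts = 3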

Building a forecast with Prophet

In [6]:
from fbprophet import Prophet
In [12]:
predictions = 30

train_df = aggr_habr_df[:-predictions].copy()  # hold out the last 30 days
train_df.reset_index(inplace=True)
train_df.columns = ['ds', 'y']
train_df.tail()

# Python
m = Prophet()
m.fit(train_df);

future = m.make_future_dataframe(periods=predictions)

forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[12]:
              ds       yhat  yhat_lower  yhat_upper
3221  2016-10-27  67.331440   52.639240   82.479057
3222  2016-10-28  63.528170   48.193334   78.255027
3223  2016-10-29  35.546823   20.856616   50.085801
3224  2016-10-30  37.965443   24.419706   52.640538
3225  2016-10-31  68.420563   54.149134   83.448969
In [13]:
print ', '.join(forecast.columns)
ds, t, trend, seasonal_lower, seasonal_upper, trend_lower, trend_upper, yhat_lower, yhat_upper, weekly, weekly_lower, weekly_upper, yearly, yearly_lower, yearly_upper, seasonal, yhat
In [16]:
m.plot(forecast);
In [17]:
m.plot_components(forecast);

Evaluating the quality of the Prophet forecast

In [18]:
# df holds the full series (train + the 30 held-out days) for comparison with the forecast
df = aggr_habr_df.reset_index()
df.columns = ['ds', 'y']

cmp_df = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds'))
In [19]:
import numpy as np
cmp_df['e'] = cmp_df['y'] - cmp_df['yhat']
cmp_df['p'] = 100*cmp_df['e']/cmp_df['y']
np.mean(abs(cmp_df[-predictions:]['p'])), np.mean(abs(cmp_df[-predictions:]['e']))
Out[19]:
(37.34607096816031, 10.623727053544432)
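
These two numbers are MAPE and MAE over the 30 held-out days. A small helper makes the same computation reusable for the Box-Cox variant below (a sketch; calculate_forecast_errors is a name introduced here, not taken from the code above):

def calculate_forecast_errors(cmp_df, predictions):
    # MAPE and MAE over the last `predictions` points
    e = cmp_df['y'] - cmp_df['yhat']
    p = 100 * e / cmp_df['y']
    return np.mean(abs(p[-predictions:])), np.mean(abs(e[-predictions:]))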

Forecasting with a Box-Cox transformation

In [20]:
train_df2 = train_df.copy().fillna(14)  # fill gaps with a small positive constant: Box-Cox requires positive values
train_df2 = train_df2.set_index('ds')
train_df2['y'], lmbda_prophet = stats.boxcox(train_df2['y'])

train_df2.reset_index(inplace=True)

m2 = Prophet()
m2.fit(train_df2)
future2 = m2.make_future_dataframe(periods=predictions)

forecast2 = m2.predict(future2)
forecast2['yhat'] = invboxcox(forecast2.yhat, lmbda_prophet)
forecast2['yhat_lower'] = invboxcox(forecast2.yhat_lower, lmbda_prophet)
forecast2['yhat_upper'] = invboxcox(forecast2.yhat_upper, lmbda_prophet)

cmp_df2 = forecast2.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds'))

cmp_df2['e'] = cmp_df2['y'] - cmp_df2['yhat']
cmp_df2['p'] = 100*cmp_df2['e']/cmp_df2['y']
np.mean(abs(cmp_df2[-predictions:]['p'])), np.mean(abs(cmp_df2[-predictions:]['e']))
Out[20]:
(26.789153086481026, 8.485639747730659)

Visualizing the results

In [28]:
def show_forecast(cmp_df, num_predictions, num_values):
    upper_bound = go.Scatter(
        name='Upper Bound',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat_upper,
        mode='lines',
        marker=dict(color="444"),
        line=dict(width=0),
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty')

    forecast = go.Scatter(
        name='Prediction',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat,
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
    )

    lower_bound = go.Scatter(
        name='Lower Bound',
        x=cmp_df.tail(num_predictions).index,
        y=cmp_df.tail(num_predictions).yhat_lower,
        marker=dict(color="444"),
        line=dict(width=0),
        mode='lines')

    fact = go.Scatter(
        name='Fact',
        x=cmp_df.tail(num_values).index,
        y=cmp_df.tail(num_values).y,
        marker=dict(color="red"),
        mode='lines',
    )

    # Trace order can be important
    # with continuous error bars
    data = [lower_bound, upper_bound, forecast, fact]

    layout = go.Layout(
        yaxis=dict(title='Posts'),
        title='Posts published on Habrahabr',
        showlegend = False)

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, show_link=False)
In [29]:
show_forecast(cmp_df, predictions, 200)
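
The same call works for the Box-Cox model, e.g. show_forecast(cmp_df2, predictions, 200).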

Comparison with an ARIMA model

In [22]:
train_df = train_df.fillna(14).set_index('ds')
In [23]:
plt.figure(figsize=(15,10))
sm.tsa.seasonal_decompose(train_df['y'].values, freq=7).plot()
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1])
Dickey-Fuller test: p=0.040528
In [24]:
train_df.index = pd.to_datetime(train_df.index)
In [25]:
train_df['y_box'], lmbda = stats.boxcox(map(lambda x: 1 if x == 0 else x, train_df['y']))  # replace zeros: Box-Cox needs positive values
plt.figure(figsize=(15,7))
train_df.y_box.plot()
plt.ylabel(u'Transformed posts')
print("Optimal Box-Cox transformation parameter: %f" % lmbda)
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1])
Optimal Box-Cox transformation parameter: 0.415948
Dickey-Fuller test: p=0.040528
In [26]:
train_df['y_box_diff'] = train_df.y_box - train_df.y_box.shift(7)
plt.figure(figsize=(15,10))
sm.tsa.seasonal_decompose(train_df.y_box_diff[12:].values, freq=7).plot()
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df.y_box_diff[8:])[1])
Dickey-Fuller test: p=0.000000
In [27]:
plt.figure(figsize=(15,8))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)
pylab.show()

Initial approximations based on the ACF/PACF plots above: Q = 1, q = 4, P = 5, p = 3

In [42]:
ps = range(0, 4)
d=1
qs = range(0, 5)
Ps = range(0, 7)
D=1
Qs = range(0, 2)
In [43]:
from itertools import product

parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
Out[43]:
280
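
(280 is simply the product of the range sizes: 4 values of p × 5 of q × 7 of P × 2 of Q.)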
In [1]:
%%time
results = []
best_aic = float("inf")
import warnings
warnings.filterwarnings('ignore')


for param in parameters_list:
    print param
    # try/except is needed because the model fails to fit on some parameter sets
    try:
        %time model=sm.tsa.statespace.SARIMAX(train_df.y_box, order=(param[0], d, param[1]), seasonal_order=(param[2], D, param[3], 7)).fit(disp=-1)
    # report the parameters the model cannot be fit with and move on to the next set
    except ValueError:
        print('wrong parameters:', param)
        continue
    aic = model.aic
    # keep the best model, its AIC and its parameters
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, model.aic])
    
warnings.filterwarnings('default')
In [45]:
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']
print(result_table.sort_values(by = 'aic', ascending=True).head())
       parameters          aic
212  (3, 4, 1, 1)  9597.537226
214  (3, 4, 2, 1)  9598.586890
216  (3, 4, 3, 1)  9599.168322
218  (3, 4, 4, 1)  9600.863045
220  (3, 4, 5, 1)  9601.172349
In [46]:
print(best_model.summary())
                                 Statespace Model Results                                
=========================================================================================
Dep. Variable:                             y_box   No. Observations:                 3197
Model:             SARIMAX(3, 1, 4)x(1, 1, 1, 7)   Log Likelihood               -4788.769
Date:                           Wed, 22 Mar 2017   AIC                           9597.537
Time:                                   23:58:32   BIC                           9658.237
Sample:                               01-01-2008   HQIC                          9619.300
                                    - 10-01-2016                                         
Covariance Type:                             opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.2350      0.218     -1.079      0.280      -0.662       0.192
ar.L2          0.1784      0.145      1.231      0.218      -0.106       0.463
ar.L3          0.5557      0.118      4.697      0.000       0.324       0.788
ma.L1         -0.5165      0.219     -2.360      0.018      -0.946      -0.087
ma.L2         -0.4611      0.268     -1.721      0.085      -0.986       0.064
ma.L3         -0.4182      0.177     -2.365      0.018      -0.765      -0.072
ma.L4          0.4091      0.097      4.220      0.000       0.219       0.599
ar.S.L7        0.0685      0.021      3.296      0.001       0.028       0.109
ma.S.L7       -0.9492      0.007   -139.187      0.000      -0.963      -0.936
sigma2         1.1726      0.021     57.043      0.000       1.132       1.213
===================================================================================
Ljung-Box (Q):                       73.39   Jarque-Bera (JB):              1200.17
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               1.25   Skew:                            -0.69
Prob(H) (two-sided):                  0.00   Kurtosis:                         5.67
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [47]:
plt.figure(figsize=(15,8))
plt.subplot(211)
best_model.resid[13:].plot()
plt.ylabel(u'Residuals')

ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(best_model.resid[13:].values.squeeze(), lags=48, ax=ax)

print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(best_model.resid[13:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[13:])[1])
Критерий Стьюдента: p=0.050717
Критерий Дики-Фуллера: p=0.000000
In [55]:
train_df['arima_model'] = invboxcox(best_model.fittedvalues, lmbda)
plt.figure(figsize=(15,7))
train_df.y.tail(200).plot()
train_df.arima_model[13:].tail(200).plot(color='r')
plt.ylabel('Posts on Habr')
pylab.show()
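
The fitted values above are in-sample. To compare the ARIMA model with Prophet on the same footing, one can forecast the 30 held-out days and compute the same MAPE/MAE (a sketch, assuming aggr_habr_df still holds the full daily series; forecast() is the standard SARIMAXResults method):

# out-of-sample forecast for the held-out days, mapped back to the original scale
arima_pred = invboxcox(best_model.forecast(steps=predictions), lmbda)
actual = aggr_habr_df['posts'][-predictions:].values

e = actual - arima_pred
p = 100 * e / actual
print np.mean(abs(p)), np.mean(abs(e))  # MAPE, MAE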