#!/usr/bin/env python # coding: utf-8 # In[1]: import os import pandas as pd from plotly import __version__ from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot from plotly import graph_objs as go import requests import StringIO import pandas as pd print __version__ # need 1.9.0 or greater init_notebook_mode(connected = True) def plotly_df(df, title = ''): data = [] for column in df.columns: trace = go.Scatter( x = df.index, y = df[column], mode = 'lines', name = column ) data.append(trace) layout = dict(title = title) fig = dict(data = data, layout = layout) iplot(fig, show_link=False) get_ipython().run_line_magic('pylab', 'inline') import matplotlib.pyplot as plt from scipy import stats import statsmodels.api as sm def invboxcox(y,lmbda): if lmbda == 0: return(np.exp(y)) else: return(np.exp(np.log(lmbda*y+1)/lmbda)) # ## Загрузка и предобработка данных # In[2]: habr_df = pd.read_csv('howpop_train.csv') # In[3]: habr_df['published'] = pd.to_datetime(habr_df.published) habr_df = habr_df[['published', 'url']] habr_df = habr_df.drop_duplicates() # In[4]: aggr_habr_df = habr_df.groupby('published')[['url']].count() aggr_habr_df.columns = ['posts'] # In[5]: aggr_habr_df = aggr_habr_df.resample('D').apply(sum) plotly_df(aggr_habr_df.resample('W').apply(sum), title = 'Опубликованные посты на Хабрахабре') # ## Построение прогноза Prophet # In[6]: from fbprophet import Prophet # In[12]: predictions = 30 train_df = aggr_habr_df[:-predictions] train_df.reset_index(inplace=True) train_df.columns = ['ds', 'y'] train_df.tail() # Python m = Prophet() m.fit(train_df); future = m.make_future_dataframe(periods=predictions) forecast = m.predict(future) forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail() # In[13]: print ', '.join(forecast.columns) # In[16]: m.plot(forecast); # In[17]: m.plot_components(forecast); # ## Оценка качества Prophet # In[18]: cmp_df = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds')) # In[19]: import numpy as np cmp_df['e'] = cmp_df['y'] - cmp_df['yhat'] cmp_df['p'] = 100*cmp_df['e']/cmp_df['y'] np.mean(abs(cmp_df[-predictions:]['p'])), np.mean(abs(cmp_df[-predictions:]['e'])) # ## Прогноз с BoxCox # In[20]: train_df2 = train_df.copy().fillna(14) train_df2 = train_df2.set_index('ds') train_df2['y'], lmbda_prophet = stats.boxcox(train_df2['y']) train_df2.reset_index(inplace=True) m2 = Prophet() m2.fit(train_df2) future2 = m2.make_future_dataframe(periods=30) forecast2 = m2.predict(future2) forecast2['yhat'] = invboxcox(forecast2.yhat, lmbda_prophet) forecast2['yhat_lower'] = invboxcox(forecast2.yhat_lower, lmbda_prophet) forecast2['yhat_upper'] = invboxcox(forecast2.yhat_upper, lmbda_prophet) cmp_df2 = forecast2.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(df.set_index('ds')) cmp_df2['e'] = cmp_df2['y'] - cmp_df2['yhat'] cmp_df2['p'] = 100*cmp_df2['e']/cmp_df2['y'] np.mean(abs(cmp_df2[-predictions:]['p'])), np.mean(abs(cmp_df2[-predictions:]['e'])) # ## Визуализация результатов # In[28]: def show_forecast(cmp_df, num_predictions, num_values): upper_bound = go.Scatter( name='Upper Bound', x=cmp_df.tail(num_predictions).index, y=cmp_df.tail(num_predictions).yhat_upper, mode='lines', marker=dict(color="444"), line=dict(width=0), fillcolor='rgba(68, 68, 68, 0.3)', fill='tonexty') forecast = go.Scatter( name='Prediction', x=cmp_df.tail(predictions).index, y=cmp_df.tail(predictions).yhat, mode='lines', line=dict(color='rgb(31, 119, 180)'), ) lower_bound = go.Scatter( name='Lower Bound', x=cmp_df.tail(num_predictions).index, y=cmp_df.tail(num_predictions).yhat_lower, marker=dict(color="444"), line=dict(width=0), mode='lines') fact = go.Scatter( name='Fact', x=cmp_df.tail(num_values).index, y=cmp_df.tail(num_values).y, marker=dict(color="red"), mode='lines', ) # Trace order can be important # with continuous error bars data = [lower_bound, upper_bound, forecast, fact] layout = go.Layout( yaxis=dict(title='Посты'), title='Опубликованные посты на Хабрахабре', showlegend = False) fig = go.Figure(data=data, layout=layout) iplot(fig, show_link=False) # In[29]: show_forecast(cmp_df, predictions, 200) # ## Сравнение с ARIMA моделью # In[14]: get_ipython().run_line_magic('pylab', 'inline') import matplotlib.pyplot as plt from scipy import stats import statsmodels.api as sm def invboxcox(y,lmbda): if lmbda == 0: return(np.exp(y)) else: return(np.exp(np.log(lmbda*y+1)/lmbda)) # In[22]: train_df = train_df.fillna(14).set_index('ds') # In[23]: plt.figure(figsize(15,10)) sm.tsa.seasonal_decompose(train_df['y'].values, freq=7).plot() print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1]) # In[24]: train_df.index = pd.to_datetime(train_df.index) # In[25]: train_df['y_box'], lmbda = stats.boxcox(map(lambda x: 1 if x == 0 else x, train_df['y'])) plt.figure(figsize(15,7)) train_df.y.plot() plt.ylabel(u'Transformed wine sales') print("Оптимальный параметр преобразования Бокса-Кокса: %f" % lmbda) print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df['y'])[1]) # In[26]: train_df['y_box_diff'] = train_df.y_box - train_df.y_box.shift(7) plt.figure(figsize(15,10)) sm.tsa.seasonal_decompose(train_df.y_box_diff[12:].values, freq=7).plot() print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df.y_box_diff[8:])[1]) # In[27]: plt.figure(figsize(15,8)) ax = plt.subplot(211) sm.graphics.tsa.plot_acf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax) pylab.show() ax = plt.subplot(212) sm.graphics.tsa.plot_pacf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax) pylab.show() # Начальные приближения Q = 1, q = 4, P = 5, p = 3 # In[42]: ps = range(0, 4) d=1 qs = range(0, 5) Ps = range(0, 7) D=1 Qs = range(0, 2) # In[43]: from itertools import product parameters = product(ps, qs, Ps, Qs) parameters_list = list(parameters) len(parameters_list) # In[1]: get_ipython().run_cell_magic('time', '', 'results = []\nbest_aic = float("inf")\nwarnings.filterwarnings(\'ignore\')\n\n\nfor param in parameters_list:\n print param\n #try except нужен, потому что на некоторых наборах параметров модель не обучается\n try:\n %time model=sm.tsa.statespace.SARIMAX(train_df.y_box, order=(param[0], d, param[1]), seasonal_order=(param[2], D, param[3], 7)).fit(disp=-1)\n #выводим параметры, на которых модель не обучается и переходим к следующему набору\n except ValueError:\n print(\'wrong parameters:\', param)\n continue\n aic = model.aic\n #сохраняем лучшую модель, aic, параметры\n if aic < best_aic:\n best_model = model\n best_aic = aic\n best_param = param\n results.append([param, model.aic])\n \nwarnings.filterwarnings(\'default\')\n') # In[45]: result_table = pd.DataFrame(results) result_table.columns = ['parameters', 'aic'] print(result_table.sort_values(by = 'aic', ascending=True).head()) # In[46]: print(best_model.summary()) # In[47]: plt.figure(figsize(15,8)) plt.subplot(211) best_model.resid[13:].plot() plt.ylabel(u'Residuals') ax = plt.subplot(212) sm.graphics.tsa.plot_acf(best_model.resid[13:].values.squeeze(), lags=48, ax=ax) print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(best_model.resid[13:], 0)[1]) print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[13:])[1]) # In[55]: train_df['arima_model'] = invboxcox(best_model.fittedvalues, lmbda) plt.figure(figsize(15,7)) train_df.y.tail(200).plot() train_df.arima_model[13:].tail(200).plot(color='r') plt.ylabel('Posts on Habr') pylab.show() # In[95]: arima_df = train_df.set_index('ds')[['y']] date_list = [datetime.datetime.strptime("2016-10-01", "%Y-%m-%d") + datetime.timedelta(x) for x in range(0,predictions+1)] future = pd.DataFrame(index=date_list, columns= arima_df.columns) arima_df = pd.concat([arima_df, future]) arima_df['forecast'] = invboxcox(best_model.predict(start=train_df.shape[0], end=train_df.shape[0]+predictions-1), lmbda) plt.figure(figsize(15,7)) arima_df.y.tail(200).plot() arima_df.forecast.tail(200).plot(color='r') plt.ylabel('Habr posts') pylab.show() # In[98]: cmp_df = cmp_df.join(arima_df[['forecast']]) # In[99]: import numpy as np cmp_df['e_arima'] = cmp_df['y'] - cmp_df['forecast'] cmp_df['p_arima'] = 100*cmp_df['e_arima']/cmp_df['y'] np.mean(abs(cmp_df[-predictions:]['p_arima'])), np.mean(abs(cmp_df[-predictions:]['e_arima'])) # In[102]: num_values = 200 forecast = go.Scatter( name='Prophet', x=cmp_df.tail(predictions).index, y=cmp_df.tail(predictions).yhat, mode='lines', line=dict(color='rgb(31, 119, 180)'), ) fact = go.Scatter( name='Fact', x=cmp_df.tail(num_values).index, y=cmp_df.tail(num_values).y, marker=dict(color="red"), mode='lines', ) arima = go.Scatter( name='ARIMA', x=cmp_df.tail(predictions).index, y=cmp_df.tail(predictions).forecast, mode='lines' ) # Trace order can be important # with continuous error bars data = [forecast, fact, arima] layout = go.Layout( yaxis=dict(title='Посты'), title='Опубликованные посты на Хабрахабре', showlegend = True) fig = go.Figure(data=data, layout=layout) iplot(fig, show_link=False) # In[ ]: