#!/usr/bin/env python
# coding: utf-8

# Notebook from [Forecast Website Traffic Using Facebook's Prophet Library](http://pbpython.com/prophet-overview.html)

# Note: The recommended way to install prophet is to use the following command:
#
#     conda install -c conda-forge fbprophet

# In[1]:
import pandas as pd
import numpy as np
from fbprophet import Prophet
import matplotlib.pyplot as plt

# Use ggplot style for aesthetic reasons
# In[2]:
get_ipython().run_line_magic('matplotlib', 'inline')  # NOTE: requires an IPython/Jupyter kernel
plt.style.use('ggplot')

# Read in the data file and store as a pandas dataframe
# In[3]:
data_file = "https://github.com/chris1610/pbpython/blob/master/data/All-Web-Site-Data-Audience-Overview.xlsx?raw=True"
df = pd.read_excel(data_file)
df.head()

# Check the data types to make sure the Day Index is a datetime type
# In[4]:
df.dtypes

# Do a simple plot
# In[5]:
df.set_index('Day Index').plot();

# Filter out the outlier traffic spike by marking it as missing data
# In[6]:
df.loc[(df['Sessions'] > 5000), 'Sessions'] = np.nan
df.set_index('Day Index').plot();

# More info on why we use log here - https://people.duke.edu/~rnau/411log.htm
# In[7]:
df['Sessions'] = np.log(df['Sessions'])
df.head()

# In[8]:
df.set_index('Day Index').plot();

# Rename the columns to comply with the prophet API (it expects 'ds' and 'y')
# In[9]:
df.columns = ["ds", "y"]
df.head()

# Create a prophet object and fit it to our data
# In[10]:
m1 = Prophet()
m1.fit(df)

# Create the future days we want to predict
# In[11]:
future1 = m1.make_future_dataframe(periods=365)
future1.tail()

# Predict the future!
# In[12]:
forecast1 = m1.predict(future1)

# Look at the values contained in the forecast dataframe
# In[13]:
forecast1.head()

# In[14]:
forecast1[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

# In[15]:
# Undo the log transform so the tail of the forecast is in real session counts
np.exp(forecast1[['yhat', 'yhat_lower', 'yhat_upper']].tail())

# Plot the forecast for the next year
# In[16]:
m1.plot(forecast1);

# In[17]:
m1.plot_components(forecast1);

# Create a dataframe with all the blog posts as well as the new one happening on March 6th, 2017.
# Each publish date is modeled as a "holiday" whose effect lasts 5 days (upper_window).
# In[18]:
articles = pd.DataFrame({
    'holiday': 'publish',
    'ds': pd.to_datetime(['2014-09-27', '2014-10-05', '2014-10-14', '2014-10-26', '2014-11-9',
                          '2014-11-18', '2014-11-30', '2014-12-17', '2014-12-29', '2015-01-06',
                          '2015-01-20', '2015-02-02', '2015-02-16', '2015-03-23', '2015-04-08',
                          '2015-05-04', '2015-05-17', '2015-06-09', '2015-07-02', '2015-07-13',
                          '2015-08-17', '2015-09-14', '2015-10-26', '2015-12-07', '2015-12-30',
                          '2016-01-26', '2016-04-06', '2016-05-16', '2016-06-15', '2016-08-23',
                          '2016-08-29', '2016-09-06', '2016-11-21', '2016-12-19', '2017-01-17',
                          '2017-02-06', '2017-02-21', '2017-03-06']),
    'lower_window': 0,
    'upper_window': 5,
})

# In[19]:
articles.head()

# Create a new model with the holidays defined
# In[20]:
m2 = Prophet(holidays=articles).fit(df)

# In[21]:
future2 = m2.make_future_dataframe(periods=90)

# In[22]:
forecast2 = m2.predict(future2)

# In[23]:
m2.plot(forecast2);

# In[24]:
m2.plot_components(forecast2);

# Create the final model using holidays and applying some bayesian sampling to handle the errors with the holidays.
# This will take a couple of minutes to run.
# In[25]:
m3 = Prophet(holidays=articles, mcmc_samples=500).fit(df)
future3 = m3.make_future_dataframe(periods=90)
forecast3 = m3.predict(future3)

# Convert the log values back to the "real world" values.
# In[26]:
# Undo the log transform applied earlier so the forecast columns are in
# real session counts, rounded to whole sessions.
forecast3["Sessions"] = np.exp(forecast3.yhat).round()
forecast3["Sessions_lower"] = np.exp(forecast3.yhat_lower).round()
forecast3["Sessions_upper"] = np.exp(forecast3.yhat_upper).round()
# Show the March 2017 window. Use unambiguous ISO-8601 dates: the original
# "3-5-2017"/"4-1-2017" strings relied on pandas' month-first parsing and read
# as day-first to many readers; these parse to the identical timestamps.
forecast3[(forecast3.ds > "2017-03-05") & (forecast3.ds < "2017-04-01")][["ds", "yhat", "Sessions_lower", "Sessions", "Sessions_upper"]]

# In[27]:
forecast3.to_excel("March-2017-forecast.xlsx")

# In[ ]: