#!/usr/bin/env python # coding: utf-8 # ## Data Loading # In[1]: import warnings warnings.filterwarnings("ignore") import itertools import pandas as pd import numpy as np from pylab import rcParams import statsmodels.api as sm import matplotlib.pyplot as plt from matplotlib.pyplot import figure df = pd.read_csv("avocado.csv") df.head(3) # In[2]: # Index and sort data by Date df = df.set_index("Date") df.index = pd.to_datetime(df.index) df.sort_values(by=['Date'], inplace=True) # ## Data Exploration # ### Organic vs. Conventional # In[3]: organic = df.loc[(df['region'] == "TotalUS") & (df['type'] == "organic")] conv = df.loc[(df['region'] == "TotalUS") & (df['type'] == "conventional")] plt.figure(figsize=(12,6)) plt.plot(organic.index, organic['AveragePrice'], label="organic", color='b') plt.plot(conv.index, conv['AveragePrice'], label="conventional", color='r') plt.legend() plt.title("Conventional vs Organic Avocado Prices in the US", fontsize=15) plt.xlabel("Time") plt.ylabel("Price ($)") plt.show() # #### We can observe that: # 1. Generally, organic and conventional avocado prices have similar fluctuations. # 2. There is a sudden drop in organic prices in August 2015. # ### New York vs. Boston vs. Dallas vs. Total US # In[4]: conventional = df.loc[df['type'] == "conventional"] plt.figure(figsize=(16,8)) cities = ["NewYork", "Boston", "DallasFtWorth", "TotalUS"] for c in cities: data = conventional.loc[conventional['region'] == c] data = data['AveragePrice'].resample("SMS").mean() if c == "TotalUS": linewidth = 3.5 else: linewidth = 1.5 plt.plot(data.index, data, label=c, linewidth=linewidth) plt.legend() plt.title("Avocado prices in Different Regions of the US", fontsize=17) plt.xlabel("Time") plt.ylabel("Price ($)") plt.show() # - New York has the highest prices # - Dallas has the lowest prices # ## Time-series Analysis # In[5]: # Date range: Jan 2015 to March 2018 conv.index.min(), conv.index.max() # In[6]: y = conv['AveragePrice'].resample("MS").mean() plt.figure(figsize=(10,5)) y.plot() plt.title("Monthly Avocado Prices in the US", fontsize=15) plt.xlabel("Month") plt.ylabel("Price ($)") plt.show() # ### Seasonal decomposition # In[7]: rcParams['figure.figsize'] = 13, 10 decomposition = sm.tsa.seasonal_decompose(y, model='additive') fig = decomposition.plot() plt.show() # #### We can observe that # * The fluctuation in prices is *seasonal* # * The trend in price is overall *increasing* # ## Predictive Modeling # ### Training ARIMA time series model # In[19]: import warnings warnings.filterwarnings('ignore') warnings.simplefilter('ignore') p = d = q = range(0, 2) pdq = list(itertools.product(p, d, q)) seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))] for param in pdq: for param_seasonal in seasonal_pdq: try: mod = sm.tsa.statespace.SARIMAX(y, order=param, seasonal_order=param_seasonal, enforce_stationarity=False, enforce_invertibility=False) results = mod.fit(disp=-1) print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic)) except: continue # In[9]: # Check for convergence results.mle_retvals # #### After 27 iterations, the model converged with the optimal values (1, 1, 1) x (1, 1, 0, 12). We will use this for our predictive model. # In[10]: model = sm.tsa.statespace.SARIMAX(y, order=(1, 1, 1), seasonal_order=(1, 1, 0, 12), enforce_stationarity=True, enforce_invertibility=False) results = model.fit() results.summary().tables[1] # In[11]: results.plot_diagnostics(figsize=(16, 8)) plt.show() # #### The model residuals do not exactly follow a normal distribution, but they are close to one. # ### Predicted prices vs. True prices # # #### We use the model above to predict the sale prices of avocados from January 2017 onwards. # In[12]: pred_date = '2017-01-01' pred = results.get_prediction(start=pd.to_datetime(pred_date), dynamic=False) pred_ci = pred.conf_int() ax = y['2015':].plot(label='Truth') pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7)) ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], color='k', alpha=.2) ax.set_xlabel('Date') ax.set_ylabel('Average Avocado Price ($)') plt.title("Predicted Prices vs. Observed Prices for Avocado in USA", fontsize=17) plt.legend() plt.show() # #### Although the forecasted values are not exactly the same, they follow the upward then downward trend of the observed data. # ### Evaluation # In[13]: y_forecasted = pred.predicted_mean y_truth = y[pred_date:] mse = ((y_forecasted - y_truth) ** 2).mean() print('Mean Squared Error = ${}'.format(round(mse, 5))) print('Root Mean Squared Error = is ${}'.format(round(np.sqrt(mse), 5))) # ### Future Forecasting # In[14]: pred_uc = results.get_forecast(steps=60) pred_ci = pred_uc.conf_int() ax = y.plot(label='observed', figsize=(15, 7)) pred_uc.predicted_mean.plot(ax=ax, label='Forecast') ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], color='k', alpha=.2) ax.set_xlabel('Date') ax.set_ylabel('Average Price ($)') plt.legend() plt.show() # #### The predictive model captures both the seasonality and the overall increasing trend of the price values.