#!/usr/bin/env python
# coding: utf-8

# ## Data Loading

# In[1]:


import warnings
warnings.filterwarnings("ignore")

import itertools
import pandas as pd
import numpy as np
from pylab import rcParams
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

df = pd.read_csv("avocado.csv")
df.head(3)


# In[2]:


# Index and sort data by Date
df = df.set_index("Date")
df.index = pd.to_datetime(df.index)
df.sort_values(by=['Date'], inplace=True)


# ## Data Exploration

# ### Organic vs. Conventional

# In[3]:


organic = df.loc[(df['region'] == "TotalUS") & (df['type'] == "organic")]
conv = df.loc[(df['region'] == "TotalUS") & (df['type'] == "conventional")]

plt.figure(figsize=(12,6))
plt.plot(organic.index, organic['AveragePrice'], label="organic", color='b')
plt.plot(conv.index, conv['AveragePrice'], label="conventional", color='r')
plt.legend()
plt.title("Conventional vs Organic Avocado Prices in the US", fontsize=15)
plt.xlabel("Time")
plt.ylabel("Price ($)")
plt.show()


# #### We can observe that:
# 1. Generally, organic and conventional avocado prices have similar fluctuations.
# 2. There is a sudden drop in organic prices in August 2015.

# ### New York vs. Boston vs. Dallas vs. Total US

# In[4]:


conventional = df.loc[df['type'] == "conventional"]

plt.figure(figsize=(16,8))

cities = ["NewYork", "Boston", "DallasFtWorth", "TotalUS"]
for c in cities:
    data = conventional.loc[conventional['region'] == c]
    data = data['AveragePrice'].resample("SMS").mean()
    if c == "TotalUS":
        linewidth = 3.5
    else:
        linewidth = 1.5
    plt.plot(data.index, data, label=c, linewidth=linewidth)

plt.legend()
plt.title("Avocado prices in Different Regions of the US", fontsize=17)
plt.xlabel("Time")
plt.ylabel("Price ($)")
plt.show()


# - New York has the highest prices
# - Dallas has the lowest prices

# ## Time-series Analysis

# In[5]:


# Date range: Jan 2015 to March 2018
conv.index.min(), conv.index.max()


# In[6]:


y = conv['AveragePrice'].resample("MS").mean()
plt.figure(figsize=(10,5))
y.plot()
plt.title("Monthly Avocado Prices in the US", fontsize=15)
plt.xlabel("Month")
plt.ylabel("Price ($)")
plt.show()


# ### Seasonal decomposition

# In[7]:


rcParams['figure.figsize'] = 13, 10

decomposition = sm.tsa.seasonal_decompose(y, model='additive')
fig = decomposition.plot()
plt.show()


# #### We can observe that
# * The fluctuation in prices is *seasonal*
# * The trend in price is overall *increasing*

# ## Predictive Modeling

# ### Training ARIMA time series model

# In[19]:


import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]

for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(y,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit(disp=-1)
            print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
        except:
            continue


# In[9]:


# Check for convergence
results.mle_retvals


# #### After 27 iterations, the model converged with the optimal values (1, 1, 1) x (1, 1, 0, 12). We will use this for our predictive model.

# In[10]:


model = sm.tsa.statespace.SARIMAX(y,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 0, 12),
                                enforce_stationarity=True,
                                enforce_invertibility=False)
results = model.fit()
results.summary().tables[1]


# In[11]:


results.plot_diagnostics(figsize=(16, 8))
plt.show()


# #### The model residuals do not exactly follow a normal distribution, but they are close to one.

# ### Predicted prices vs. True prices
# 
# #### We use the model above to predict the sale prices of avocados from January 2017 onwards.

# In[12]:


pred_date = '2017-01-01'
pred = results.get_prediction(start=pd.to_datetime(pred_date), dynamic=False)
pred_ci = pred.conf_int()
ax = y['2015':].plot(label='Truth')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Average Avocado Price ($)')
plt.title("Predicted Prices vs. Observed Prices for Avocado in USA", fontsize=17)
plt.legend()
plt.show()


# #### Although the forecasted values are not exactly the same, they follow the upward then downward trend of the observed data.

# ### Evaluation

# In[13]:


y_forecasted = pred.predicted_mean
y_truth = y[pred_date:]
mse = ((y_forecasted - y_truth) ** 2).mean()
print('Mean Squared Error = ${}'.format(round(mse, 5)))
print('Root Mean Squared Error = is ${}'.format(round(np.sqrt(mse), 5)))


# ### Future Forecasting

# In[14]:


pred_uc = results.get_forecast(steps=60)
pred_ci = pred_uc.conf_int()
ax = y.plot(label='observed', figsize=(15, 7))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Average Price ($)')
plt.legend()
plt.show()


# #### The predictive model captures both the seasonality and the overall increasing trend of the price values.