#!/usr/bin/env python
# coding: utf-8

# # PySpark Demo Plotly Notebook
# ## Demo
# 1. Setup Spark
# 2. Load Kaggle Data
# 3. Transform Data with Spark SQL
# 4. Graph Data with Plotly
# 
# _Prepared by: [Gary A. Stafford](https://twitter.com/GaryStafford)   
# Associated article: https://wp.me/p1RD28-61V_

# ### Setup Spark
# Setup Spark SparkSession

# In[1]:


get_ipython().system(' pip install pip --upgrade --quiet')


# In[2]:


from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


# In[3]:


spark = SparkSession \
    .builder \
    .appName('pyspark_demo_app') \
    .master("local[*]") \
    .getOrCreate()


# ### Load Kaggle Data
# Load the Kaggle dataset from the CSV file, containing ~21K records, into a DataFrame

# In[4]:


bakery_schema = StructType([
    StructField('date', StringType(), True),
    StructField('time', StringType(), True),
    StructField('transaction', IntegerType(), True),
    StructField('item', StringType(), True)
])

df_bakery1 = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load("BreadBasket_DMS.csv", schema=bakery_schema)

df_bakery1.show(10)
df_bakery1.count()


# ### Transform Data with Spark SQL
# Transform the DataFrame's bakery data using Spark SQL

# In[5]:


df_bakery1.createOrReplaceTempView("bakery_table_tmp1")

df_bakery2 = spark.sql("SELECT date, transaction, item " +
                       "FROM bakery_table_tmp1 " +
                       "WHERE item NOT LIKE 'NONE'" +
                       "ORDER BY transaction")
df_bakery2.show(5)
df_bakery2.count()


# In[6]:


df_bakery2.createOrReplaceTempView("bakery_table_tmp2")

df_bakery3 = spark.sql("SELECT date, count(*) as count " +
                       "FROM bakery_table_tmp2 " +
                       "WHERE date >= '2017-01-01' " +
                       "GROUP BY date " +
                       "ORDER BY date")
df_bakery3.show(5)
df_bakery3.count()


# ### Use Plotly to Visualize Data
# Use [Plotly](https://plot.ly/python/) to create a chart showing bakery items sold over time  
# Demostrates linear fit and data smoothing:
# * [Plotly Python Open Source Graphing Library](https://plot.ly/python/)
# * [Smoothing in Python](https://plot.ly/python/smoothing/)
# * [Linear Fit in Python](https://plot.ly/python/linear-fits/)

# In[7]:


get_ipython().system(' pip install plotly chart-studio --upgrade --quiet')


# In[8]:


import chart_studio.tools
import chart_studio.plotly as py
import plotly.graph_objs as go
from numpy import arange
from scipy import stats, signal
import warnings
warnings.filterwarnings('ignore')


# In[9]:


# *** UPDATE WITH YOUR CREDENTIALS FOR PLOTLY ***
chart_studio.tools.set_credentials_file(username='username_goes_here', api_key='api_key_goes_here')


# In[10]:


df_bakery4 = df_bakery3.toPandas()

# Generated linear fit
xi = arange(0, len(df_bakery4.index))
slope, intercept, r_value, p_value, std_err = stats.linregress(xi, df_bakery4['count'])
line = slope * xi + intercept

layout = dict(title='2017 Bakery Sales',
              xaxis=dict(
                  title='Month',
                  showgrid=True,
                  zeroline=True,
                  showline=True,
                  ticks='outside',
                  tickangle=45,
                  showticklabels=True),
              yaxis=dict(
                  title='Items Sold/Day',
                  showgrid=True,
                  zeroline=True,
                  showline=True,
                  ticks='outside',
                  showticklabels=True))

trace1 = go.Bar(x=df_bakery4['date'], y=df_bakery4['count'], name='Items Sold')
trace2 = go.Scatter(x=df_bakery4['date'], y=line, mode='lines', name='Linear Fit')
trace3 = go.Scatter(x=df_bakery4['date'], y=signal.savgol_filter(df_bakery4['count'], 53, 3),
                    mode='lines', name='Savitzky-Golay')
data = [trace1, trace2, trace3]
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='jupyter-basic_bar.html')