#!/usr/bin/env python # coding: utf-8 # # PySpark Demo Plotly Notebook # ## Demo # 1. Setup Spark # 2. Load Kaggle Data # 3. Transform Data with Spark SQL # 4. Graph Data with Plotly # # _Prepared by: [Gary A. Stafford](https://twitter.com/GaryStafford) # Associated article: https://wp.me/p1RD28-61V_ # ### Setup Spark # Setup Spark SparkSession # In[1]: get_ipython().system(' pip install pip --upgrade --quiet') # In[2]: from pyspark.sql import SparkSession from pyspark.sql.types import StructType, StructField, StringType, IntegerType # In[3]: spark = SparkSession \ .builder \ .appName('pyspark_demo_app') \ .master("local[*]") \ .getOrCreate() # ### Load Kaggle Data # Load the Kaggle dataset from the CSV file, containing ~21K records, into a DataFrame # In[4]: bakery_schema = StructType([ StructField('date', StringType(), True), StructField('time', StringType(), True), StructField('transaction', IntegerType(), True), StructField('item', StringType(), True) ]) df_bakery1 = spark.read \ .format("csv") \ .option("header", "true") \ .load("BreadBasket_DMS.csv", schema=bakery_schema) df_bakery1.show(10) df_bakery1.count() # ### Transform Data with Spark SQL # Transform the DataFrame's bakery data using Spark SQL # In[5]: df_bakery1.createOrReplaceTempView("bakery_table_tmp1") df_bakery2 = spark.sql("SELECT date, transaction, item " + "FROM bakery_table_tmp1 " + "WHERE item NOT LIKE 'NONE'" + "ORDER BY transaction") df_bakery2.show(5) df_bakery2.count() # In[6]: df_bakery2.createOrReplaceTempView("bakery_table_tmp2") df_bakery3 = spark.sql("SELECT date, count(*) as count " + "FROM bakery_table_tmp2 " + "WHERE date >= '2017-01-01' " + "GROUP BY date " + "ORDER BY date") df_bakery3.show(5) df_bakery3.count() # ### Use Plotly to Visualize Data # Use [Plotly](https://plot.ly/python/) to create a chart showing bakery items sold over time # Demostrates linear fit and data smoothing: # * [Plotly Python Open Source Graphing Library](https://plot.ly/python/) # * [Smoothing in Python](https://plot.ly/python/smoothing/) # * [Linear Fit in Python](https://plot.ly/python/linear-fits/) # In[7]: get_ipython().system(' pip install plotly chart-studio --upgrade --quiet') # In[8]: import chart_studio.tools import chart_studio.plotly as py import plotly.graph_objs as go from numpy import arange from scipy import stats, signal import warnings warnings.filterwarnings('ignore') # In[9]: # *** UPDATE WITH YOUR CREDENTIALS FOR PLOTLY *** chart_studio.tools.set_credentials_file(username='username_goes_here', api_key='api_key_goes_here') # In[10]: df_bakery4 = df_bakery3.toPandas() # Generated linear fit xi = arange(0, len(df_bakery4.index)) slope, intercept, r_value, p_value, std_err = stats.linregress(xi, df_bakery4['count']) line = slope * xi + intercept layout = dict(title='2017 Bakery Sales', xaxis=dict( title='Month', showgrid=True, zeroline=True, showline=True, ticks='outside', tickangle=45, showticklabels=True), yaxis=dict( title='Items Sold/Day', showgrid=True, zeroline=True, showline=True, ticks='outside', showticklabels=True)) trace1 = go.Bar(x=df_bakery4['date'], y=df_bakery4['count'], name='Items Sold') trace2 = go.Scatter(x=df_bakery4['date'], y=line, mode='lines', name='Linear Fit') trace3 = go.Scatter(x=df_bakery4['date'], y=signal.savgol_filter(df_bakery4['count'], 53, 3), mode='lines', name='Savitzky-Golay') data = [trace1, trace2, trace3] fig = dict(data=data, layout=layout) py.iplot(fig, filename='jupyter-basic_bar.html')