Notebook

almond-spark¶

Based on ammonite-spark
Works for any Spark version >= 2.0
Currently supports local, standalone, and YARN clusters

In [ ]:

import $ivy.`org.apache.spark::spark-sql:2.4.3` // Or use any other 2.x version here
import $ivy.`sh.almond::almond-spark:0.5.0`

import org.apache.spark.sql._, org.apache.log4j.{Level, Logger}
// Switch off log4j output for the "org" logger so cell output stays readable.
Logger.getLogger("org").setLevel(Level.OFF)

// Notebook-aware Spark session built against the "local[*]" master.
val spark = NotebookSparkSession.builder().master("local[*]").getOrCreate()

// Shorthand for the session's SparkContext; a def, so it is read from `spark` on every use.
def sc = spark.sparkContext

Load a DataFrame¶

In [ ]:

import spark.implicits._
// Read titanic.csv into a DataFrame, treating the first line as the header
// and asking the CSV reader to infer the column types.
val titanic = {
  val csvReader = spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
  csvReader.load("titanic.csv")
}

Show as text¶

In [ ]:

// Print the DataFrame as a plain-text table, using show()'s default arguments.
titanic.show()

Let's make the output a bit nicer¶

In [ ]:

/** Extension wrapper that adds an HTML table rendering to any `DataFrame`. */
implicit class RichDF(val df: DataFrame) {

  /**
   * Publishes the first `limit` rows of the wrapped DataFrame as an HTML table.
   *
   * Column names and cell texts are XML-escaped before being placed in the markup.
   *
   * @param limit number of rows requested through `df.take` (defaults to 20)
   * @return whatever `publish.html` returns
   */
  def showHTML(limit:Int = 20) = {
    import xml.Utility.escape

    // Text shown for a single cell: nulls, byte arrays, arrays and sequences
    // get dedicated bracketed forms, everything else falls back to toString.
    def render(value: Any): String = value match {
      case null => "null"
      case bytes: Array[Byte] => bytes.map(b => "%02X".format(b)).mkString("[", " ", "]")
      case items: Array[_] => items.mkString("[", ", ", "]")
      case items: Seq[_] => items.mkString("[", ", ", "]")
      case other => other.toString
    }

    // Rows are fetched before the schema is read, as in the original ordering.
    val fetched = df.take(limit)
    val columnNames = df.schema.fieldNames.toSeq
    val table: Seq[Seq[String]] = fetched.map { record =>
      record.toSeq.map(render)
    }

    // Pre-build the two variable parts of the markup.
    val headerCells = columnNames.map(name => s"<th>${escape(name)}</th>").mkString
    val bodyRows = table.map { cells =>
      val tds = cells.map(text => s"<td>${escape(text)}</td>").mkString
      s"<tr>$tds</tr>"
    }.mkString

    publish.html(s"""
    <div>
      <table border="1" class="dataframe">
      <thead>
        <tr>
        ${headerCells}
        </tr>
         </thead>
         <tbody>
        ${bodyRows}
        </tbody>
      </table>
    </div>""")
  }
}

In [ ]:

// Render the first 8 rows as an HTML table via the RichDF extension.
titanic.showHTML(limit = 8)

Let's try some visualization¶

In [ ]:

import $ivy.`org.vegas-viz::vegas-spark:0.3.11`
import vegas._, vegas.data.External._, vegas.sparkExt._

// Bar chart of record counts: one facet column per "Pclass" value, with bars
// positioned and colored by "Survived" (red / green).
Vegas("Titanic Survivors")
  .withDataFrame(titanic)
  .mark(Bar)
  .encodeY(
    "*",
    aggregate = AggOps.Count,
    axis = Axis(title = "Number of People", grid = false)
  )
  .encodeColumn(
    "Pclass",
    Ord,
    scale = Scale(padding = 10.0),
    axis = Axis(orient = Orient.Bottom, axisWidth = 1.0, offset = -8.0)
  )
  .encodeX("Survived", Nominal, scale = Scale(bandSize = 16.0), hideAxis = true)
  .encodeColor("Survived", Nominal, scale = Scale(rangeNominals = List("red", "green")))
  .configFacet(cell = CellConfig(strokeWidth = 0))
  .configCell(height = 400)
  .show