import $ivy.`org.apache.spark::spark-sql:2.4.3` // Or use any other 2.x version here
import $ivy.`sh.almond::almond-spark:0.5.0`
import org.apache.spark.sql._, org.apache.log4j.{Level, Logger}
// Turn off all log4j output for loggers under the "org" namespace so notebook cells stay readable.
Logger.getLogger("org").setLevel(Level.OFF)

// Notebook-aware Spark session running locally on every available core ("local[*]").
val spark = NotebookSparkSession
  .builder()
  .master("local[*]")
  .getOrCreate()

// Shorthand for the SparkContext behind `spark`; a `def`, so it is looked up on every use.
def sc = spark.sparkContext
import spark.implicits._
// CSV reader settings: take column names from the first line and let Spark infer column types.
val titanic = spark
  .read
  .format("csv")
  .options(Map("inferSchema" -> "true", "header" -> "true"))
  .load("titanic.csv")

// Plain-text preview of the loaded data.
titanic.show()
/** Extension wrapper that adds an HTML table renderer to any `DataFrame`.
  *
  * @param df the DataFrame being wrapped
  */
implicit class RichDF(val df: DataFrame) {

  /** Renders the first `limit` rows as an HTML table and hands the markup to `publish.html`.
    *
    * Column names and cell texts are passed through `escape` before being embedded in the markup.
    *
    * @param limit maximum number of rows to fetch and render (default 20)
    */
  def showHTML(limit:Int = 20) = {
    import xml.Utility.escape

    // Text form of a single cell. The byte-array case must stay ahead of the generic
    // array case, otherwise binary values would never be printed as hex.
    def render(cell: Any): String = cell match {
      case null => "null"
      case bytes: Array[Byte] => bytes.map(b => "%02X".format(b)).mkString("[", " ", "]")
      case items: Array[_] => items.mkString("[", ", ", "]")
      case items: Seq[_] => items.mkString("[", ", ", "]")
      case other => other.toString
    }

    val fetched = df.take(limit)
    val columnNames = df.schema.fieldNames.toSeq
    val textRows: Seq[Seq[String]] = fetched.toSeq.map(row => row.toSeq.map(render))

    val headerCells = columnNames.map(name => s"<th>${escape(name)}</th>").mkString
    val bodyRows = textRows
      .map(cells => cells.map(text => s"<td>${escape(text)}</td>").mkString("<tr>", "", "</tr>"))
      .mkString

    // Fragments are joined with newlines; the leading empty fragment yields the initial newline.
    val markup = Seq(
      "",
      "<div>",
      """<table border="1" class="dataframe">""",
      "<thead>",
      "<tr>",
      headerCells,
      "</tr>",
      "</thead>",
      "<tbody>",
      bodyRows,
      "</tbody>",
      "</table>",
      "</div>"
    ).mkString("\n")

    publish.html(markup)
  }
}
// HTML preview of the first 8 rows.
titanic.showHTML(limit = 8)
import $ivy.`org.vegas-viz::vegas-spark:0.3.11`
import vegas._, vegas.data.External._, vegas.sparkExt._
// Bar chart of passenger counts: one facet column per "Pclass" value, with bars
// positioned and coloured by "Survived".
Vegas("Titanic Survivors")
  .withDataFrame(titanic)
  .mark(Bar)
  .encodeY(
    "*",
    aggregate = AggOps.Count,
    axis = Axis(title = "Number of People", grid = false)
  )
  .encodeColumn(
    "Pclass",
    Ord,
    scale = Scale(padding = 10.0),
    axis = Axis(orient = Orient.Bottom, axisWidth = 1.0, offset = -8.0)
  )
  .encodeX("Survived", Nominal, scale = Scale(bandSize = 16.0), hideAxis = true)
  .encodeColor("Survived", Nominal, scale = Scale(rangeNominals = List("red", "green")))
  .configFacet(cell = CellConfig(strokeWidth = 0))
  .configCell(height = 400)
  .show