In [1]:
import java.sql.{Date,Timestamp}
In [2]:
spark.version
Out[2]:
2.4.3

when/otherwise

In [7]:
import org.apache.spark.sql.functions.when

val df = Seq(
    ("notebook","2019-01-19"),
    ("notebook", "2019-01-10"),
    ("small_phone", "2019-01-15"),
    ("small_phone", "2019-01-30"),
    ("tv_set", "2019-01-22"),
    ("large_phone", "2019-01-30"),
    ("medium_phone", "2019-01-22")
).toDF("device", "purchase_date").select("purchase_date","device").sort("device","purchase_date")
df = [purchase_date: string, device: string]
Out[7]:
[purchase_date: string, device: string]
In [8]:
%%dataframe
df
Out[8]:
purchase_date  device
2019-01-30     large_phone
2019-01-22     medium_phone
2019-01-10     notebook
2019-01-19     notebook
2019-01-15     small_phone
2019-01-30     small_phone
2019-01-22     tv_set
In [9]:
%%dataframe
df.withColumn("is_phone",when($"device".endsWith("phone"),true).otherwise(false))
Out[9]:
purchase_date  device        is_phone
2019-01-30     large_phone   true
2019-01-22     medium_phone  true
2019-01-10     notebook      false
2019-01-19     notebook      false
2019-01-15     small_phone   true
2019-01-30     small_phone   true
2019-01-22     tv_set        false
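
If you prefer SQL syntax, the same flag can be written as a CASE WHEN expression via expr. A minimal sketch on the same DataFrame (the withFlag name is just for illustration):

import org.apache.spark.sql.functions.expr

// LIKE '%phone' plays the role of endsWith("phone") above
val withFlag = df.withColumn(
    "is_phone",
    expr("CASE WHEN device LIKE '%phone' THEN true ELSE false END")
)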

when without otherwise

In [19]:
import org.apache.spark.sql.functions.{element_at,split,when}

val df = Seq(
    ("notebook","2019-01-19"),
    ("notebook", "2019-01-10"),
    ("small_phone", "2019-01-15"),
    ("small_phone", "2019-01-30"),
    ("tv_set", "2019-01-22"),
    ("large_phone", "2019-01-30"),
    ("medium_phone", "2019-01-22")
).toDF("device", "purchase_date").select("purchase_date","device").sort("device","purchase_date")
df = [purchase_date: string, device: string]
Out[19]:
[purchase_date: string, device: string]
In [20]:
%%dataframe
df
Out[20]:
purchase_date  device
2019-01-30     large_phone
2019-01-22     medium_phone
2019-01-10     notebook
2019-01-19     notebook
2019-01-15     small_phone
2019-01-30     small_phone
2019-01-22     tv_set
In [21]:
%%dataframe

df.withColumn("phone_size", when($"device".endsWith("phone"), element_at(split($"device","_"),1)))
Out[21]:
purchase_date  device        phone_size
2019-01-30     large_phone   large
2019-01-22     medium_phone  medium
2019-01-10     notebook      null
2019-01-19     notebook      null
2019-01-15     small_phone   small
2019-01-30     small_phone   small
2019-01-22     tv_set        null
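
Without an otherwise clause, rows that fail the condition come back as null. If a default value is preferable, one option is to wrap the when in coalesce; a sketch on the same DataFrame (the "not_a_phone" label and the withDefault name are made up for illustration):

import org.apache.spark.sql.functions.{coalesce, element_at, lit, split, when}

// coalesce picks the first non-null value, so unmatched rows get the default label
val withDefault = df.withColumn(
    "phone_size",
    coalesce(
        when($"device".endsWith("phone"), element_at(split($"device", "_"), 1)),
        lit("not_a_phone")
    )
)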

multiple when clauses

In [23]:
import org.apache.spark.sql.functions.when

val df = Seq(
    ("notebook","2019-01-19"),
    ("notebook", "2019-01-10"),
    ("small_phone", "2019-01-15"),
    ("small_phone", "2019-01-30"),
    ("tv_set", "2019-01-22"),
    ("large_phone", "2019-01-30"),
    ("medium_phone", "2019-01-22")
).toDF("device", "purchase_date").select("purchase_date","device").sort("device","purchase_date")
df = [purchase_date: string, device: string]
Out[23]:
[purchase_date: string, device: string]
In [24]:
%%dataframe
df
Out[24]:
purchase_date  device
2019-01-30     large_phone
2019-01-22     medium_phone
2019-01-10     notebook
2019-01-19     notebook
2019-01-15     small_phone
2019-01-30     small_phone
2019-01-22     tv_set
In [27]:
%%dataframe

df.withColumn("shipping_rate", 
              when($"device"==="tv_set",30)
              .when($"device".isin("large_phone","notebook"),20)
              .when($"device".isin("small_phone","medium_phone"),10)
              .otherwise(25)) // in case we sell other devices
Out[27]:
purchase_date  device        shipping_rate
2019-01-30     large_phone   20
2019-01-22     medium_phone  10
2019-01-10     notebook      20
2019-01-19     notebook      20
2019-01-15     small_phone   10
2019-01-30     small_phone   10
2019-01-22     tv_set        30
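
When the device-to-rate mapping grows, it can be handier to keep it in a plain Scala Map and fold it into a single chained when column instead of writing each clause by hand. A sketch using the same rates as above (the rates, shippingRate and result names are just for illustration):

import org.apache.spark.sql.functions.{lit, when}

// rates copied from the cell above
val rates = Map(
    "tv_set"       -> 30,
    "large_phone"  -> 20,
    "notebook"     -> 20,
    "small_phone"  -> 10,
    "medium_phone" -> 10
)

// start from the fallback rate and wrap one when(...) per map entry
val shippingRate = rates.foldLeft(lit(25)) { case (acc, (device, rate)) =>
    when($"device" === device, rate).otherwise(acc)
}

val result = df.withColumn("shipping_rate", shippingRate)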