In [1]:
import java.sql.{Date,Timestamp}
In [2]:
spark.version
Out[2]:
2.4.3

to date

In [5]:
import org.apache.spark.sql.functions.to_date

val df = Seq(
    ("notebook","2019-01-01"),
    ("notebook", "2019-01-10"),
    ("small_phone", "2019-01-15"),
    ("small_phone", "2019-01-30")
).toDF("device", "purchase_date").sort("device","purchase_date")

df.dtypes
df = [device: string, purchase_date: string]
Out[5]:
Array((device,StringType), (purchase_date,StringType))
In [6]:
df.withColumn("purchase_date",to_date($"purchase_date")).dtypes
Out[6]:
Array((device,StringType), (purchase_date,DateType))

to datetime

In [17]:
val df = Seq(
    ("notebook","2019-01-01 00:00:00"),
    ("notebook", "2019-01-10 13:00:00"),
    ("small_phone", "2019-01-15 12:00:00"),
    ("small_phone", "2019-01-30 09:30:00")
).toDF("device", "purchase_time").sort("device","purchase_time")
df = [device: string, purchase_time: string]
Out[17]:
[device: string, purchase_time: string]
In [20]:
df.dtypes
Out[20]:
Array((device,StringType), (purchase_time,StringType))
In [18]:
import org.apache.spark.sql.functions.to_timestamp
In [21]:
df.withColumn("purchase_time",to_timestamp($"purchase_time")).dtypes
Out[21]:
Array((device,StringType), (purchase_time,TimestampType))

to datetime custom format

In [23]:
val df = Seq(
    ("notebook","27/12/2019 12:00"),
    ("notebook", "01/12/2019 00:00"),
    ("small_phone", "23/01/2019 12:00"),
    ("small_phone", "27/12/2019 12:00")
).toDF("device", "purchase_time").sort("device","purchase_time")

df.dtypes
df = [device: string, purchase_time: string]
Out[23]:
Array((device,StringType), (purchase_time,StringType))
In [29]:
%%dataframe
df.withColumn("purchase_time",to_timestamp($"purchase_time"))
Out[29]:
devicepurchase_time
notebooknull
notebooknull
small_phonenull
small_phonenull
In [26]:
%%dataframe
df.withColumn("purchase_time",to_timestamp($"purchase_time","d/M/y H:m"))
Out[26]:
devicepurchase_time
notebook2019-12-01 00:00:00.0
notebook2019-12-27 12:00:00.0
small_phone2019-01-23 12:00:00.0
small_phone2019-12-27 12:00:00.0

timestamp to date

In [37]:
val df = Seq(
    ("notebook",Timestamp.valueOf("2019-01-29 12:00:00")),
    ("notebook", Timestamp.valueOf("2019-01-01 00:00:00")),
    ("small_phone", Timestamp.valueOf("2019-01-15 23:00:00")),
    ("small_phone", Timestamp.valueOf("2019-01-01 09:00:00"))
).toDF("device", "purchase_time").sort("device","purchase_time")

df.dtypes
df = [device: string, purchase_time: timestamp]
Out[37]:
Array((device,StringType), (purchase_time,TimestampType))
In [38]:
%%dataframe
df.withColumn("purchase_date",to_date($"purchase_time"))
Out[38]:
devicepurchase_timepurchase_date
notebook2019-01-01 00:00:00.02019-01-01
notebook2019-01-29 12:00:00.02019-01-29
small_phone2019-01-01 09:00:00.02019-01-01
small_phone2019-01-15 23:00:00.02019-01-15

date to timestamp with zero hours

In [39]:
import java.sql.Date
import org.apache.spark.sql.functions.to_timestamp

val df = Seq(
    ("notebook",Date.valueOf("2019-01-29")),
    ("notebook", Date.valueOf("2019-01-01")),
    ("small_phone", Date.valueOf("2019-01-15")),
    ("small_phone", Date.valueOf("2019-01-01"))
).toDF("device", "purchase_date").sort("device","purchase_date")
df = [device: string, purchase_date: date]
Out[39]:
[device: string, purchase_date: date]
In [40]:
%%dataframe
df
Out[40]:
devicepurchase_date
notebook2019-01-01
notebook2019-01-29
small_phone2019-01-01
small_phone2019-01-15
In [41]:
%%dataframe
df.withColumn("purchase_time",to_timestamp($"purchase_date"))
Out[41]:
devicepurchase_datepurchase_time
notebook2019-01-012019-01-01 00:00:00.0
notebook2019-01-292019-01-29 00:00:00.0
small_phone2019-01-012019-01-01 00:00:00.0
small_phone2019-01-152019-01-15 00:00:00.0

custom date/timestamp formatting

In [42]:
import java.sql.Timestamp
import org.apache.spark.sql.functions.date_format

val df = Seq(
    ("notebook",Timestamp.valueOf("2019-01-29 12:00:00")),
    ("notebook", Timestamp.valueOf("2019-01-01 00:00:00")),
    ("small_phone", Timestamp.valueOf("2019-01-15 23:00:00")),
    ("small_phone", Timestamp.valueOf("2019-01-01 09:00:00"))
).toDF("device", "purchase_time").sort("device","purchase_time")
df = [device: string, purchase_time: timestamp]
Out[42]:
[device: string, purchase_time: timestamp]
In [44]:
%%dataframe
df.withColumn("formatted_purchase_time",date_format($"purchase_time","y-MM"))
Out[44]:
devicepurchase_timeformatted_purchase_time
notebook2019-01-01 00:00:00.02019-01
notebook2019-01-29 12:00:00.02019-01
small_phone2019-01-01 09:00:00.02019-01
small_phone2019-01-15 23:00:00.02019-01

add sub

In [46]:
import org.apache.spark.sql.functions.{date_add,date_sub}

// note that the dates are just string
val df = Seq(
    ("notebook","2019-01-29 12:00:00"),
    ("notebook", "2019-01-01 00:00:00"),
    ("small_phone","2019-01-15 23:00:00"),
    ("small_phone", "2019-01-01 09:00:00")
).toDF("device", "purchase_time").sort("device","purchase_time")
df = [device: string, purchase_time: string]
Out[46]:
[device: string, purchase_time: string]
In [47]:
%%dataframe
df.withColumn("plus_2_days",date_add($"purchase_time",2))
Out[47]:
devicepurchase_timeplus_2_days
notebook2019-01-01 00:00:002019-01-03
notebook2019-01-29 12:00:002019-01-31
small_phone2019-01-01 09:00:002019-01-03
small_phone2019-01-15 23:00:002019-01-17

datediff

In [48]:
import org.apache.spark.sql.functions.datediff

// note that the dates are just strings
val df = Seq(
    ("notebook","2019-01-29", "2019-02-10"),
    ("notebook", "2019-01-01","2019-01-15"),
    ("small_phone","2019-01-15","2019-01-05"),
    ("small_phone", "2019-01-01","2019-01-20")
).toDF("device", "purchase_date", "arrival_date").sort("device","purchase_date")
df = [device: string, purchase_date: string ... 1 more field]
Out[48]:
[device: string, purchase_date: string ... 1 more field]
In [51]:
%%dataframe
df
Out[51]:
devicepurchase_datearrival_date
notebook2019-01-012019-01-15
notebook2019-01-292019-02-10
small_phone2019-01-012019-01-20
small_phone2019-01-152019-01-05
In [50]:
%%dataframe
df.withColumn("days_to_arrive",datediff($"arrival_date",$"purchase_date"))
Out[50]:
devicepurchase_datearrival_datedays_to_arrive
notebook2019-01-012019-01-1514
notebook2019-01-292019-02-1012
small_phone2019-01-012019-01-2019
small_phone2019-01-152019-01-05-10
In [ ]: