// Spark window-functions walkthrough: aggregate windows (mean/max per group),
// filtering to per-group extremes, percentiles via percent_rank, cumulative
// sums, row numbering, and basic join styles — all on a small purchases dataset.
// `%%dataframe` / `:paste` are notebook magics and must stay on their own lines.
spark.version

import java.sql.Date
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// One row per purchase: (purchase_date, device category, price).
val devicesDf = Seq(
  (Date.valueOf("2019-01-01"), "notebook", 600.00),
  (Date.valueOf("2019-05-10"), "notebook", 1200.00),
  (Date.valueOf("2019-03-05"), "small phone", 100.00),
  (Date.valueOf("2019-02-20"), "camera", 150.00),
  (Date.valueOf("2019-01-20"), "small phone", 300.00),
  (Date.valueOf("2019-02-15"), "large phone", 700.00),
  (Date.valueOf("2019-07-01"), "camera", 300.00),
  (Date.valueOf("2019-04-01"), "small phone", 50.00)
).toDF("purchase_date", "device", "price")

%%dataframe
devicesDf.sort("purchase_date")

// Aggregate over a window: unlike groupBy, every input row is kept and the
// group-level value is attached as an extra column.
%%dataframe
:paste
devicesDf
  .withColumn("average_price_in_group", mean("price") over Window.partitionBy("device"))

%%dataframe
:paste
devicesDf
  .withColumn("max_price_in_group", max("price") over Window.partitionBy("device"))

// "Most expensive purchase per device": keep only rows equal to their group max.
%%dataframe
:paste
devicesDf
  .withColumn("max_price_in_group", max("price") over Window.partitionBy("device"))
  .filter($"price" === $"max_price_in_group")

%%dataframe
:paste
devicesDf
  .withColumn("most_recent_purchase_in_group", max("purchase_date") over Window.partitionBy("device"))

// "Most recent purchase per device", same keep-the-extreme-row pattern.
%%dataframe
:paste
devicesDf
  .withColumn("most_recent_purchase_in_group", max("purchase_date") over Window.partitionBy("device"))
  .filter($"purchase_date" === $"most_recent_purchase_in_group")

// percent_rank needs a global ordering; a Window with orderBy but no
// partitionBy pulls all rows into a single partition (fine for demo data).
%%dataframe
:paste
devicesDf
  .withColumn("percentile", percent_rank() over Window.orderBy("price"))

// Median lookup: smallest row at or above the 50th percentile.
// FIX: Spark does not guarantee row order after filter, so limit(1) alone is
// nondeterministic — sort by percentile first to reliably get the lowest match.
%%dataframe
:paste
devicesDf
  .withColumn("percentile", percent_rank() over Window.orderBy("price"))
  .filter($"percentile" >= 0.5)
  .orderBy("percentile")
  .limit(1)

%%dataframe
:paste
devicesDf
  .withColumn("percentile", percent_rank() over Window.orderBy("price"))

// Same pattern for the 85th percentile (same determinism fix as above).
%%dataframe
:paste
devicesDf
  .withColumn("percentile", percent_rank() over Window.orderBy("price"))
  .filter($"percentile" >= 0.85)
  .orderBy("percentile")
  .limit(1)

// Running total: with orderBy and no explicit frame the window defaults to
// rows from the start of the ordering up to the current row.
%%dataframe
:paste
devicesDf
  .withColumn("cumulative_sum", sum("price") over Window.orderBy("purchase_date"))

// Global row numbering by purchase date.
%%dataframe
:paste
devicesDf
  .withColumn("row_number", row_number() over Window.orderBy("purchase_date"))

// Row numbering restarted within each device group.
%%dataframe
:paste
devicesDf
  .withColumn("row_number", row_number() over Window.partitionBy("device").orderBy("purchase_date"))

// Join examples: two tiny frames sharing key column "a".
val df1 = Seq(("1", "x"), ("2", "y")).toDF("a", "b")
val df2 = Seq(("1", "x2"), ("3", "z")).toDF("a", "b")

%%dataframe
df1

%%dataframe
df2

// Full outer join with aliases so the duplicated column names stay addressable.
%%dataframe
df1.as("df1").join(df2.as("df2"), col("df1.a") === col("df2.a"), "outer")

// Inner join on the shared key via USING-style syntax (deduplicates column "a").
%%dataframe
df1.join(df2, Seq("a"), "inner")