%lsmagic
Available line magics: %lsmagic %showtypes %showoutput %adddeps %truncation %addjar Available cell magics: %%sql %%html %%javascript %%dataframe %%scala Type %<magic_name> for usage info.
sc.version
2.4.3
val unpivotedDf = Seq(
("john", "notebook", 2),
("gary", "notebook", 3),
("john", "small_phone", 2),
("mary", "small_phone", 3),
("john", "large_phone", 3),
("john", "camera", 3)
).toDF("salesperson","device", "amount_sold").sort("salesperson","device")
unpivotedDf = [salesperson: string, device: string ... 1 more field]
[salesperson: string, device: string ... 1 more field]
%%dataframe
unpivotedDf
salesperson | device | amount_sold |
---|---|---|
gary | notebook | 3 |
john | camera | 3 |
john | large_phone | 3 |
john | notebook | 2 |
john | small_phone | 2 |
mary | small_phone | 3 |
%%dataframe
unpivotedDf.groupBy("salesperson").pivot("device").sum("amount_sold")
salesperson | camera | large_phone | notebook | small_phone |
---|---|---|---|---|
gary | null | null | 3 | null |
mary | null | null | null | 3 |
john | 3 | 3 | 2 | 2 |
val pivotedDf = Seq(
("gary", None, None, Some(3),None),
("mary", None, None, None, Some(3)),
("john", Some(3),Some(3),Some(2),Some(2))
).toDF("salesperson", "camera", "large_phone", "notebook", "small_phone")
pivotedDf = [salesperson: string, camera: int ... 3 more fields]
[salesperson: string, camera: int ... 3 more fields]
%%dataframe
pivotedDf
salesperson | camera | large_phone | notebook | small_phone |
---|---|---|---|---|
gary | null | null | 3 | null |
mary | null | null | null | 3 |
john | 3 | 3 | 2 | 2 |
val unpivotedDf = pivotedDf
.selectExpr("salesperson","stack(4,'camera',camera,'large_phone',large_phone,'notebook',notebook,'small_phone',small_phone)")
.withColumnRenamed("col0","device") // default name of this column is col0
.withColumnRenamed("col1","amount_sold") // default name of this column is col1
.filter($"amount_sold".isNotNull) // must explicitly remove nulls
unpivotedDf = [salesperson: string, device: string ... 1 more field]
[salesperson: string, device: string ... 1 more field]
%%dataframe --limit=100
unpivotedDf.sort("salesperson","device")
salesperson | device | amount_sold |
---|---|---|
gary | notebook | 3 |
john | camera | 3 |
john | large_phone | 3 |
john | notebook | 2 |
john | small_phone | 2 |
mary | small_phone | 3 |