rm(list=ls())
library(LalRUtils)
libreq(data.table, magrittr, tidyverse, microbenchmark,
nycflights13, tidyfast, dtplyr)
wants loaded [1,] "data.table" TRUE [2,] "magrittr" TRUE [3,] "tidyverse" TRUE [4,] "microbenchmark" TRUE [5,] "nycflights13" TRUE [6,] "tidyfast" TRUE [7,] "dtplyr" TRUE
input <- if (file.exists("flights14.csv")) {
"flights14.csv"
} else {
"https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
}
flights <- fread(input)
flights %>% head
year | month | day | dep_delay | arr_delay | carrier | origin | dest | air_time | distance | hour |
---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <int> | <int> | <int> |
2014 | 1 | 1 | 14 | 13 | AA | JFK | LAX | 359 | 2475 | 9 |
2014 | 1 | 1 | -3 | 13 | AA | JFK | LAX | 363 | 2475 | 11 |
2014 | 1 | 1 | 2 | 9 | AA | JFK | LAX | 351 | 2475 | 19 |
2014 | 1 | 1 | -8 | -26 | AA | LGA | PBI | 157 | 1035 | 7 |
2014 | 1 | 1 | 2 | 1 | AA | JFK | LAX | 350 | 2475 | 13 |
2014 | 1 | 1 | 4 | 0 | AA | EWR | LAX | 339 | 2454 | 18 |
fwrite(flights, "flights14.csv")
DT = data.table(
ID = c("b","b","b","a","a","c"),
a = 1:6,
b = 7:12,
c = 13:18
)
DT
ID | a | b | c |
---|---|---|---|
<chr> | <int> | <int> | <int> |
b | 1 | 7 | 13 |
b | 2 | 8 | 14 |
b | 3 | 9 | 15 |
a | 4 | 10 | 16 |
a | 5 | 11 | 17 |
c | 6 | 12 | 18 |
flights[origin == "JFK" & month == 6L] %>% head
year | month | day | dep_delay | arr_delay | carrier | origin | dest | air_time | distance | hour |
---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <int> | <int> | <int> |
2014 | 6 | 1 | -9 | -5 | AA | JFK | LAX | 324 | 2475 | 8 |
2014 | 6 | 1 | -10 | -13 | AA | JFK | LAX | 329 | 2475 | 12 |
2014 | 6 | 1 | 18 | -1 | AA | JFK | LAX | 326 | 2475 | 7 |
2014 | 6 | 1 | -6 | -16 | AA | JFK | LAX | 320 | 2475 | 10 |
2014 | 6 | 1 | -4 | -45 | AA | JFK | LAX | 326 | 2475 | 18 |
2014 | 6 | 1 | -6 | -23 | AA | JFK | LAX | 329 | 2475 | 14 |
flights[, arr_delay] %>% head
Slice and return dt
flights[, .(arr_delay)] %>% head
arr_delay |
---|
<int> |
13 |
13 |
9 |
-26 |
1 |
0 |
flights[order(origin, -dest)] %>% head
year | month | day | dep_delay | arr_delay | carrier | origin | dest | air_time | distance | hour |
---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <int> | <int> | <int> |
2014 | 1 | 5 | 6 | 49 | EV | EWR | XNA | 195 | 1131 | 8 |
2014 | 1 | 6 | 7 | 13 | EV | EWR | XNA | 190 | 1131 | 8 |
2014 | 1 | 7 | -6 | -13 | EV | EWR | XNA | 179 | 1131 | 8 |
2014 | 1 | 8 | -7 | -12 | EV | EWR | XNA | 184 | 1131 | 8 |
2014 | 1 | 9 | 16 | 7 | EV | EWR | XNA | 181 | 1131 | 8 |
2014 | 1 | 13 | 66 | 66 | EV | EWR | XNA | 188 | 1131 | 9 |
setorder(flights, origin, -dest)
flights %>% head
year | month | day | dep_delay | arr_delay | carrier | origin | dest | air_time | distance | hour |
---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <int> | <int> | <int> |
2014 | 1 | 5 | 6 | 49 | EV | EWR | XNA | 195 | 1131 | 8 |
2014 | 1 | 6 | 7 | 13 | EV | EWR | XNA | 190 | 1131 | 8 |
2014 | 1 | 7 | -6 | -13 | EV | EWR | XNA | 179 | 1131 | 8 |
2014 | 1 | 8 | -7 | -12 | EV | EWR | XNA | 184 | 1131 | 8 |
2014 | 1 | 9 | 16 | 7 | EV | EWR | XNA | 181 | 1131 | 8 |
2014 | 1 | 13 | 66 | 66 | EV | EWR | XNA | 188 | 1131 | 9 |
# example data
carsDT = data.table(mtcars, keep.rownames = TRUE)
# quick inspection
carsDT %>% head
rn | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
Mazda RX4 | 21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
Mazda RX4 Wag | 21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
Hornet 4 Drive | 21.4 | 6 | 258 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
Hornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
Valiant | 18.1 | 6 | 225 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
carsDT[, .(mean_hp = mean(hp), mean_wt = mean(wt)), by=.(am, vs)]
am | vs | mean_hp | mean_wt |
---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> |
1 | 0 | 180.83 | 2.857 |
1 | 1 | 80.57 | 2.028 |
0 | 1 | 102.14 | 3.194 |
0 | 0 | 194.17 | 4.104 |
carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = c("hp", "wt", "disp")]
am | vs | hp | wt | disp |
---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
1 | 0 | 180.83 | 2.857 | 206.2 |
1 | 1 | 80.57 | 2.028 | 89.8 |
0 | 1 | 102.14 | 3.194 | 175.1 |
0 | 0 | 194.17 | 4.104 | 357.6 |
:=
syntax¶# example data
DT = data.table(
x = letters[c(1, 2, 3, 4, 5)],
y = c(1, 2, 3, 4, 5),
z = c(1, 2, 3, 4, 5) > 3
)
# create one
DT[, u := 5:1]
# creating multiple
DT[, `:=`(v = 2, w = 3L)]
# creating with dynamic names
nms = c("a", "b", "c")
DT[, (nms) := .(1, 2, 3)]
nms = c("u", "v", "w", "a", "b", "c")
DT[, (nms) := NULL]
DT
x | y | z |
---|---|---|
<chr> | <dbl> | <lgl> |
a | 1 | FALSE |
b | 2 | FALSE |
c | 3 | FALSE |
d | 4 | TRUE |
e | 5 | TRUE |
DT[, b := "Aardvark"][y > 1, b := "Zebra", verbose = TRUE]
Detected that j uses these columns: b Assigning to 4 row subset of 5 rows RHS_list_of_columns == false
cols = c("hp", "wt", "disp")
carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = cols] %>%
setnames(cols, sprintf("mean_%s", cols)) %>% print
am vs mean_hp mean_wt mean_disp 1: 1 0 180.83 2.857 206.2 2: 1 1 80.57 2.028 89.8 3: 0 1 102.14 3.194 175.1 4: 0 0 194.17 4.104 357.6
keep_cols = c("arr_delay", "dep_delay")
flights[, ..keep_cols] %>% head
arr_delay | dep_delay |
---|---|
<int> | <int> |
49 | 6 |
13 | 7 |
-13 | -6 |
-12 | -7 |
7 | 16 |
66 | 66 |
drop_cols = c("arr_delay", "dep_delay")
flights[, -..drop_cols] %>% head
year | month | day | carrier | origin | dest | air_time | distance | hour |
---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <chr> | <chr> | <chr> | <int> | <int> | <int> |
2014 | 1 | 5 | EV | EWR | XNA | 195 | 1131 | 8 |
2014 | 1 | 6 | EV | EWR | XNA | 190 | 1131 | 8 |
2014 | 1 | 7 | EV | EWR | XNA | 179 | 1131 | 8 |
2014 | 1 | 8 | EV | EWR | XNA | 184 | 1131 | 8 |
2014 | 1 | 9 | EV | EWR | XNA | 181 | 1131 | 8 |
2014 | 1 | 13 | EV | EWR | XNA | 188 | 1131 | 9 |
flights[, .(.N), by = .(origin)]
origin | N |
---|---|
<chr> | <int> |
EWR | 87400 |
JFK | 81483 |
LGA | 84433 |
flights[carrier == "AA", .N, by = origin]
origin | N |
---|---|
<chr> | <int> |
EWR | 2649 |
JFK | 11923 |
LGA | 11730 |
flights2 = copy(flights)
setkey(flights2, carrier, origin, dest)
key(flights2)
collapse_dplyr = function() {
flights %>% as_tibble() %>%
group_by(carrier, origin, dest) %>%
summarize(mean_arr_delay = mean(arr_delay),
pressure = mean(dep_delay))
}
collapse_dt = function() {
flights[,
.(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
by = .(carrier, origin, dest)]
}
collapse_dt_key = function() {
flights2[,
.(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
by = .(carrier, origin, dest)]
}
microbenchmark(collapse_dplyr(), collapse_dt(), collapse_dt_key(), times = 10) %>%
print
Unit: milliseconds expr min lq mean median uq max neval cld collapse_dplyr() 38.479 39.400 47.325 43.363 44.39 95.52 10 b collapse_dt() 7.241 7.746 9.833 8.969 11.65 13.94 10 a collapse_dt_key() 19.357 19.479 20.962 19.678 20.44 29.73 10 a
keybied = flights[carrier == "AA",
.(mean(arr_delay), mean(dep_delay)),
keyby = .(origin, dest, month)]
key(keybied)
flights[, .N, .(dep_delay>0, arr_delay>0)]
dep_delay | arr_delay | N |
---|---|---|
<lgl> | <lgl> | <int> |
TRUE | TRUE | 72836 |
FALSE | FALSE | 119304 |
FALSE | TRUE | 34583 |
TRUE | FALSE | 26593 |
.SD
notation¶.SD
stands for Subset of Data
DT
x | y | z | b |
---|---|---|---|
<chr> | <dbl> | <lgl> | <chr> |
a | 1 | FALSE | Aardvark |
b | 2 | FALSE | Zebra |
c | 3 | FALSE | Zebra |
d | 4 | TRUE | Zebra |
e | 5 | TRUE | Zebra |
flights[carrier == "AA", ## Only on trips with carrier "AA"
lapply(.SD, mean), ## compute the mean
by = .(origin, dest, month), ## for every 'origin,dest,month'
.SDcols = c("arr_delay", "dep_delay")] %>% ## for just those specified in .SDcols
head
origin | dest | month | arr_delay | dep_delay |
---|---|---|---|---|
<chr> | <chr> | <int> | <dbl> | <dbl> |
EWR | PHX | 7 | -5.103 | 0.2759 |
EWR | PHX | 8 | 3.548 | 6.2258 |
EWR | PHX | 9 | -4.233 | -1.6667 |
EWR | PHX | 10 | -3.032 | -4.2903 |
EWR | MIA | 1 | 11.011 | 12.1236 |
EWR | MIA | 2 | 1.564 | 4.7564 |
emp = 'Employee EmployeeName Department Salary
1 Alice 11 800
2 Bob 11 600
3 Carla 12 900
4 Daniel 12 1000
5 Evelyn 13 800
6 Ferdinand 21 700'
dept = '
table Departments
Department DepartmentName Manager
11 Production 1
12 Sales 4
13 Marketing 5
14 Research NA'
X = fread(emp)
Y = fread(dept)
X
Y
Employee | EmployeeName | Department | Salary |
---|---|---|---|
<int> | <chr> | <int> | <int> |
1 | Alice | 11 | 800 |
2 | Bob | 11 | 600 |
3 | Carla | 12 | 900 |
4 | Daniel | 12 | 1000 |
5 | Evelyn | 13 | 800 |
6 | Ferdinand | 21 | 700 |
Department | DepartmentName | Manager |
---|---|---|
<int> | <chr> | <int> |
11 | Production | 1 |
12 | Sales | 4 |
13 | Marketing | 5 |
14 | Research | NA |
merge(X, Y, all=F) # inner
merge(X, Y, all.x=T) # right
merge(X, Y, all.y=T) # left
merge(X, Y, all=T) # cartesian
Department | Employee | EmployeeName | Salary | DepartmentName | Manager |
---|---|---|---|---|---|
<int> | <int> | <chr> | <int> | <chr> | <int> |
11 | 1 | Alice | 800 | Production | 1 |
11 | 2 | Bob | 600 | Production | 1 |
12 | 3 | Carla | 900 | Sales | 4 |
12 | 4 | Daniel | 1000 | Sales | 4 |
13 | 5 | Evelyn | 800 | Marketing | 5 |
Department | Employee | EmployeeName | Salary | DepartmentName | Manager |
---|---|---|---|---|---|
<int> | <int> | <chr> | <int> | <chr> | <int> |
11 | 1 | Alice | 800 | Production | 1 |
11 | 2 | Bob | 600 | Production | 1 |
12 | 3 | Carla | 900 | Sales | 4 |
12 | 4 | Daniel | 1000 | Sales | 4 |
13 | 5 | Evelyn | 800 | Marketing | 5 |
21 | 6 | Ferdinand | 700 | NA | NA |
Department | Employee | EmployeeName | Salary | DepartmentName | Manager |
---|---|---|---|---|---|
<int> | <int> | <chr> | <int> | <chr> | <int> |
11 | 1 | Alice | 800 | Production | 1 |
11 | 2 | Bob | 600 | Production | 1 |
12 | 3 | Carla | 900 | Sales | 4 |
12 | 4 | Daniel | 1000 | Sales | 4 |
13 | 5 | Evelyn | 800 | Marketing | 5 |
14 | NA | NA | NA | Research | NA |
Department | Employee | EmployeeName | Salary | DepartmentName | Manager |
---|---|---|---|---|---|
<int> | <int> | <chr> | <int> | <chr> | <int> |
11 | 1 | Alice | 800 | Production | 1 |
11 | 2 | Bob | 600 | Production | 1 |
12 | 3 | Carla | 900 | Sales | 4 |
12 | 4 | Daniel | 1000 | Sales | 4 |
13 | 5 | Evelyn | 800 | Marketing | 5 |
14 | NA | NA | NA | Research | NA |
21 | 6 | Ferdinand | 700 | NA | NA |
# example data
a = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
t = c(1L, 2L, 1L, 2L, NA_integer_), x = 11:15)
b = data.table(id = 1:2, y = c(11L, 15L))
a
b
id | t | x |
---|---|---|
<int> | <int> | <int> |
1 | 1 | 11 |
1 | 2 | 12 |
2 | 1 | 13 |
3 | 2 | 14 |
NA | NA | 15 |
id | y |
---|---|
<int> | <int> |
1 | 11 |
2 | 15 |
a[b, on=.(id)]
id | t | x | y |
---|---|---|---|
<int> | <int> | <int> | <int> |
1 | 1 | 11 | 11 |
1 | 2 | 12 | 11 |
2 | 1 | 13 | 15 |
Different keys
a[b, on=.(id, x = y)]
id | t | x |
---|---|---|
<int> | <int> | <int> |
1 | 1 | 11 |
2 | NA | 15 |
a[b, on=.(id), y := i.y ]
a
id | t | x | y |
---|---|---|---|
<int> | <int> | <int> | <int> |
1 | 1 | 11 | 11 |
1 | 2 | 12 | 11 |
2 | 1 | 13 | 15 |
3 | 2 | 14 | NA |
NA | NA | 15 | NA |
mDT = data.table(id = 1:3, x_dn = 10L, x_up = 13L)
mDT
id | x_dn | x_up |
---|---|---|
<int> | <int> | <int> |
1 | 10 | 13 |
2 | 10 | 13 |
3 | 10 | 13 |
a[mDT, on=.(id, x >= x_dn, x <= x_up), .(id, i.x_dn, i.x_up, x.x)]
id | i.x_dn | i.x_up | x.x |
---|---|---|---|
<int> | <int> | <int> | <int> |
1 | 10 | 13 | 11 |
1 | 10 | 13 | 12 |
2 | 10 | 13 | 13 |
3 | 10 | 13 | NA |
Main ref https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html
s = 'id y a b
A X 1 3
A Z 2 4
B X 1 3
B Z 2 4'
DT = fread(s)
dcast(DT, id ~ y, value.var = c('a', 'b'))
id | a_X | a_Z | b_X | b_Z |
---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> |
A | 1 | 2 | 3 | 4 |
B | 1 | 2 | 3 | 4 |
s = 'id a_X a_Z b_X b_Z
A 1 2 3 4
B 1 2 3 4'
DT = fread(s)
DT
melt(DT,
id.vars = c("id"),
measure = patterns("^a", "^b"),
variable.name = "y",
value.name = c("a", "b"))
id | a_X | a_Z | b_X | b_Z |
---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> |
A | 1 | 2 | 3 | 4 |
B | 1 | 2 | 3 | 4 |
id | y | a | b |
---|---|---|---|
<chr> | <fct> | <int> | <int> |
A | 1 | 1 | 3 |
B | 1 | 1 | 3 |
A | 2 | 2 | 4 |
B | 2 | 2 | 4 |
DT = data.table(a = 1:5)
DT[, `:=`(b = letters[a], c = 1 * (a > 3))][, d := .I , by= .(c)][]
a | b | c | d |
---|---|---|---|
<int> | <chr> | <dbl> | <int> |
1 | a | 0 | 1 |
2 | b | 0 | 2 |
3 | c | 0 | 3 |
4 | d | 1 | 4 |
5 | e | 1 | 5 |
DT[, l := shift(b, 1), by = c][]
a | b | c | d | l |
---|---|---|---|---|
<int> | <chr> | <dbl> | <int> | <chr> |
1 | a | 0 | 1 | NA |
2 | b | 0 | 2 | a |
3 | c | 0 | 3 | b |
4 | d | 1 | 4 | NA |
5 | e | 1 | 5 | d |
carsDT[, .N, by=.(drat_bin = cut(drat, 4))]
drat_bin | N |
---|---|
<fct> | <int> |
(3.84,4.39] | 11 |
(2.76,3.3] | 13 |
(3.3,3.84] | 6 |
(4.39,4.93] | 2 |
dplyr
and data.table
with dtplyr
¶collapse_dtplyr = function() {
flights %>% lazy_dt() %>%
group_by(carrier, origin, dest) %>%
summarize(mean_arr_delay = mean(arr_delay),
pressure = mean(dep_delay))
}
## Just compare dtplyr with normal dplyr and data.table versions (i.e. no keys)
microbenchmark::microbenchmark(collapse_dplyr(), collapse_dt_key(), collapse_dt(), collapse_dtplyr(), times = 10) %>%
print
Unit: milliseconds expr min lq mean median uq max neval cld collapse_dplyr() 38.607 39.551 41.795 40.784 42.971 48.918 10 b collapse_dt_key() 19.307 19.527 21.858 20.520 24.677 28.266 10 ab collapse_dt() 7.333 7.681 28.249 10.530 11.551 193.618 10 ab collapse_dtplyr() 3.296 3.512 3.809 3.561 3.646 6.179 10 a