In [1]:

rm(list=ls())
library(LalRUtils)
libreq(data.table, magrittr, tidyverse, microbenchmark, 
       nycflights13, tidyfast, dtplyr)

     wants            loaded
[1,] "data.table"     TRUE  
[2,] "magrittr"       TRUE  
[3,] "tidyverse"      TRUE  
[4,] "microbenchmark" TRUE  
[5,] "nycflights13"   TRUE  
[6,] "tidyfast"       TRUE  
[7,] "dtplyr"         TRUE

Basics¶

https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html

In [2]:

input <- if (file.exists("flights14.csv")) {
   "flights14.csv"
} else {
  "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
}

In [3]:

flights <- fread(input)
flights %>% head

A data.table: 6 × 11
year	month	day	dep_delay	arr_delay	carrier	origin	dest	air_time	distance	hour
<int>	<int>	<int>	<int>	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>
2014	1	1	14	13	AA	JFK	LAX	359	2475	9
2014	1	1	-3	13	AA	JFK	LAX	363	2475	11
2014	1	1	2	9	AA	JFK	LAX	351	2475	19
2014	1	1	-8	-26	AA	LGA	PBI	157	1035	7
2014	1	1	2	1	AA	JFK	LAX	350	2475	13
2014	1	1	4	0	AA	EWR	LAX	339	2454	18

In [4]:

fwrite(flights, "flights14.csv")

Creating DT¶

In [5]:

DT = data.table(
  ID = c("b","b","b","a","a","c"),
  a = 1:6,
  b = 7:12,
  c = 13:18
)
DT

A data.table: 6 × 4
ID	a	b	c
<chr>	<int>	<int>	<int>
b	1	7	13
b	2	8	14
b	3	9	15
a	4	10	16
a	5	11	17
c	6	12	18

Slicing¶

In [6]:

flights[origin == "JFK" & month == 6L] %>% head

A data.table: 6 × 11
year	month	day	dep_delay	arr_delay	carrier	origin	dest	air_time	distance	hour
<int>	<int>	<int>	<int>	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>
2014	6	1	-9	-5	AA	JFK	LAX	324	2475	8
2014	6	1	-10	-13	AA	JFK	LAX	329	2475	12
2014	6	1	18	-1	AA	JFK	LAX	326	2475	7
2014	6	1	-6	-16	AA	JFK	LAX	320	2475	10
2014	6	1	-4	-45	AA	JFK	LAX	326	2475	18
2014	6	1	-6	-23	AA	JFK	LAX	329	2475	14

Slice and return vector¶

In [7]:

flights[, arr_delay] %>% head

13
13
9
-26
1
0

Slice and return dt

In [8]:

flights[, .(arr_delay)] %>% head

A data.table: 6 × 1
arr_delay
<int>
13
13
9
-26
1
0

Sorting¶

In [9]:

flights[order(origin, -dest)] %>% head

A data.table: 6 × 11
year	month	day	dep_delay	arr_delay	carrier	origin	dest	air_time	distance	hour
<int>	<int>	<int>	<int>	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>
2014	1	5	6	49	EV	EWR	XNA	195	1131	8
2014	1	6	7	13	EV	EWR	XNA	190	1131	8
2014	1	7	-6	-13	EV	EWR	XNA	179	1131	8
2014	1	8	-7	-12	EV	EWR	XNA	184	1131	8
2014	1	9	16	7	EV	EWR	XNA	181	1131	8
2014	1	13	66	66	EV	EWR	XNA	188	1131	9

Sorting inplace¶

In [10]:

setorder(flights, origin, -dest)
flights %>% head

A data.table: 6 × 11
year	month	day	dep_delay	arr_delay	carrier	origin	dest	air_time	distance	hour
<int>	<int>	<int>	<int>	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>
2014	1	5	6	49	EV	EWR	XNA	195	1131	8
2014	1	6	7	13	EV	EWR	XNA	190	1131	8
2014	1	7	-6	-13	EV	EWR	XNA	179	1131	8
2014	1	8	-7	-12	EV	EWR	XNA	184	1131	8
2014	1	9	16	7	EV	EWR	XNA	181	1131	8
2014	1	13	66	66	EV	EWR	XNA	188	1131	9

Subset and Calculate¶

In [11]:

# example data
carsDT = data.table(mtcars, keep.rownames = TRUE)
# quick inspection
carsDT %>% head

A data.table: 6 × 12
rn	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
Mazda RX4	21.0	6	160	110	3.90	2.620	16.46	0	1	4	4
Mazda RX4 Wag	21.0	6	160	110	3.90	2.875	17.02	0	1	4	4
Datsun 710	22.8	4	108	93	3.85	2.320	18.61	1	1	4	1
Hornet 4 Drive	21.4	6	258	110	3.08	3.215	19.44	1	0	3	1
Hornet Sportabout	18.7	8	360	175	3.15	3.440	17.02	0	0	3	2
Valiant	18.1	6	225	105	2.76	3.460	20.22	1	0	3	1

Aggregation¶

In [12]:

carsDT[, .(mean_hp = mean(hp), mean_wt = mean(wt)), by=.(am, vs)]

A data.table: 4 × 4
am	vs	mean_hp	mean_wt
<dbl>	<dbl>	<dbl>	<dbl>
1	0	180.83	2.857
1	1	80.57	2.028
0	1	102.14	3.194
0	0	194.17	4.104

In [13]:

carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = c("hp", "wt", "disp")]

A data.table: 4 × 5
am	vs	hp	wt	disp
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	0	180.83	2.857	206.2
1	1	80.57	2.028	89.8
0	1	102.14	3.194	175.1
0	0	194.17	4.104	357.6

`:=` syntax¶

Creating Columns¶

In [14]:

# example data
DT = data.table(
  x = letters[c(1, 2, 3, 4, 5)], 
  y = c(1, 2, 3, 4, 5), 
  z = c(1, 2, 3, 4, 5) > 3
)

In [15]:

# create one
DT[, u := 5:1]
# creating multiple
DT[, `:=`(v = 2, w = 3L)]
# creating with dynamic names
nms = c("a", "b", "c")
DT[, (nms) := .(1, 2, 3)]

Dropping columns¶

In [16]:

nms = c("u", "v", "w", "a", "b", "c")
DT[, (nms) := NULL]

In [17]:

DT

A data.table: 5 × 3
x	y	z
<chr>	<dbl>	<lgl>
a	1	FALSE
b	2	FALSE
c	3	FALSE
d	4	TRUE
e	5	TRUE

Conditional Replace¶

In [18]:

DT[, b := "Aardvark"][y > 1, b := "Zebra", verbose = TRUE]

Detected that j uses these columns: b 
Assigning to 4 row subset of 5 rows
RHS_list_of_columns == false

In [20]:

cols = c("hp", "wt", "disp")
carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = cols] %>% 
  setnames(cols, sprintf("mean_%s", cols)) %>% print

   am vs mean_hp mean_wt mean_disp
1:  1  0  180.83   2.857     206.2
2:  1  1   80.57   2.028      89.8
3:  0  1  102.14   3.194     175.1
4:  0  0  194.17   4.104     357.6

Column selection¶

In [21]:

keep_cols = c("arr_delay", "dep_delay")
flights[, ..keep_cols] %>% head

A data.table: 6 × 2
arr_delay	dep_delay
<int>	<int>
49	6
13	7
-13	-6
-12	-7
7	16
66	66

In [22]:

drop_cols = c("arr_delay", "dep_delay")
flights[, -..drop_cols] %>% head

A data.table: 6 × 9
year	month	day	carrier	origin	dest	air_time	distance	hour
<int>	<int>	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>
2014	1	5	EV	EWR	XNA	195	1131	8
2014	1	6	EV	EWR	XNA	190	1131	8
2014	1	7	EV	EWR	XNA	179	1131	8
2014	1	8	EV	EWR	XNA	184	1131	8
2014	1	9	EV	EWR	XNA	181	1131	8
2014	1	13	EV	EWR	XNA	188	1131	9

Grouping summaries¶

In [23]:

flights[, .(.N), by = .(origin)]

A data.table: 3 × 2
origin	N
<chr>	<int>
EWR	87400
JFK	81483
LGA	84433

In [24]:

flights[carrier == "AA", .N, by = origin]

A data.table: 3 × 2
origin	N
<chr>	<int>
EWR	2649
JFK	11923
LGA	11730

Keys¶

In [25]:

flights2 = copy(flights)
setkey(flights2, carrier, origin, dest)
key(flights2)

'carrier'
'origin'
'dest'

In [26]:

collapse_dplyr = function() {
  flights %>% as_tibble() %>% 
    group_by(carrier, origin, dest) %>% 
    summarize(mean_arr_delay = mean(arr_delay), 
              pressure = mean(dep_delay))
  }

collapse_dt = function() {
  flights[, 
    .(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
            by = .(carrier, origin, dest)]
  }

collapse_dt_key = function() {
  flights2[, 
    .(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
            by = .(carrier, origin, dest)]
  }

In [27]:

microbenchmark(collapse_dplyr(), collapse_dt(), collapse_dt_key(), times = 10) %>% 
    print

Unit: milliseconds
              expr    min     lq   mean median    uq   max neval cld
  collapse_dplyr() 38.479 39.400 47.325 43.363 44.39 95.52    10   b
     collapse_dt()  7.241  7.746  9.833  8.969 11.65 13.94    10  a 
 collapse_dt_key() 19.357 19.479 20.962 19.678 20.44 29.73    10  a

Create key as part of agg step¶

In [28]:

keybied = flights[carrier == "AA",
        .(mean(arr_delay), mean(dep_delay)),
        keyby = .(origin, dest, month)] 
key(keybied)

'origin'
'dest'
'month'

Expressions in By¶

In [29]:

flights[, .N, .(dep_delay>0, arr_delay>0)]

A data.table: 4 × 3
dep_delay	arr_delay	N
<lgl>	<lgl>	<int>
TRUE	TRUE	72836
FALSE	FALSE	119304
FALSE	TRUE	34583
TRUE	FALSE	26593

`.SD` notation¶

.SD stands for Subset of Data

In [30]:

DT

A data.table: 5 × 4
x	y	z	b
<chr>	<dbl>	<lgl>	<chr>
a	1	FALSE	Aardvark
b	2	FALSE	Zebra
c	3	FALSE	Zebra
d	4	TRUE	Zebra
e	5	TRUE	Zebra

In [32]:

flights[carrier == "AA",                           ## Only on trips with carrier "AA"
        lapply(.SD, mean),                         ## compute the mean
        by = .(origin, dest, month),               ## for every 'origin,dest,month'
        .SDcols = c("arr_delay", "dep_delay")] %>% ## for just those specified in .SDcols 
        head

A data.table: 6 × 5
origin	dest	month	arr_delay	dep_delay
<chr>	<chr>	<int>	<dbl>	<dbl>
EWR	PHX	7	-5.103	0.2759
EWR	PHX	8	3.548	6.2258
EWR	PHX	9	-4.233	-1.6667
EWR	PHX	10	-3.032	-4.2903
EWR	MIA	1	11.011	12.1236
EWR	MIA	2	1.564	4.7564

Merges¶

https://rstudio-pubs-static.s3.amazonaws.com/52230_5ae0d25125b544caab32f75f0360e775.html

In [33]:

emp = 'Employee	EmployeeName	Department	Salary
1	Alice	11	800
2	Bob	11	600
3	Carla	12	900
4	Daniel	12	1000
5	Evelyn	13	800
6	Ferdinand	21	700'

dept = '
table Departments
Department	DepartmentName	Manager
11	Production	1
12	Sales	4
13	Marketing	5
14	Research	NA'

X = fread(emp)
Y = fread(dept)

In [34]:

X
Y

A data.table: 6 × 4
Employee	EmployeeName	Department	Salary
<int>	<chr>	<int>	<int>
1	Alice	11	800
2	Bob	11	600
3	Carla	12	900
4	Daniel	12	1000
5	Evelyn	13	800
6	Ferdinand	21	700

A data.table: 4 × 3
Department	DepartmentName	Manager
<int>	<chr>	<int>
11	Production	1
12	Sales	4
13	Marketing	5
14	Research	NA

In [37]:

merge(X, Y, all=F) # inner
merge(X, Y, all.x=T) # right
merge(X, Y, all.y=T) # left
merge(X, Y, all=T) # cartesian

A data.table: 5 × 6
Department	Employee	EmployeeName	Salary	DepartmentName	Manager
<int>	<int>	<chr>	<int>	<chr>	<int>
11	1	Alice	800	Production	1
11	2	Bob	600	Production	1
12	3	Carla	900	Sales	4
12	4	Daniel	1000	Sales	4
13	5	Evelyn	800	Marketing	5

A data.table: 6 × 6
Department	Employee	EmployeeName	Salary	DepartmentName	Manager
<int>	<int>	<chr>	<int>	<chr>	<int>
11	1	Alice	800	Production	1
11	2	Bob	600	Production	1
12	3	Carla	900	Sales	4
12	4	Daniel	1000	Sales	4
13	5	Evelyn	800	Marketing	5
21	6	Ferdinand	700	NA	NA

A data.table: 6 × 6
Department	Employee	EmployeeName	Salary	DepartmentName	Manager
<int>	<int>	<chr>	<int>	<chr>	<int>
11	1	Alice	800	Production	1
11	2	Bob	600	Production	1
12	3	Carla	900	Sales	4
12	4	Daniel	1000	Sales	4
13	5	Evelyn	800	Marketing	5
14	NA	NA	NA	Research	NA

A data.table: 7 × 6
Department	Employee	EmployeeName	Salary	DepartmentName	Manager
<int>	<int>	<chr>	<int>	<chr>	<int>
11	1	Alice	800	Production	1
11	2	Bob	600	Production	1
12	3	Carla	900	Sales	4
12	4	Daniel	1000	Sales	4
13	5	Evelyn	800	Marketing	5
14	NA	NA	NA	Research	NA
21	6	Ferdinand	700	NA	NA

DT idiomatic merges¶

In [38]:

# example data
a = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_), 
               t = c(1L, 2L, 1L, 2L, NA_integer_), x = 11:15)
b = data.table(id = 1:2, y = c(11L, 15L))

a
b

A data.table: 5 × 3
id	t	x
<int>	<int>	<int>
1	1	11
1	2	12
2	1	13
3	2	14
NA	NA	15

A data.table: 2 × 2
id	y
<int>	<int>
1	11
2	15

Equi-joins¶

In [39]:

a[b, on=.(id)]

A data.table: 3 × 4
id	t	x	y
<int>	<int>	<int>	<int>
1	1	11	11
1	2	12	11
2	1	13	15

Different keys

In [40]:

a[b, on=.(id, x = y)]

A data.table: 2 × 3
id	t	x
<int>	<int>	<int>
1	1	11
2	NA	15

In [41]:

a[b, on=.(id), y := i.y ]
a

A data.table: 5 × 4
id	t	x	y
<int>	<int>	<int>	<int>
1	1	11	11
1	2	12	11
2	1	13	15
3	2	14	NA
NA	NA	15	NA

Non-Equi Joins¶

In [42]:

mDT = data.table(id = 1:3, x_dn = 10L, x_up = 13L)
mDT

A data.table: 3 × 3
id	x_dn	x_up
<int>	<int>	<int>
1	10	13
2	10	13
3	10	13

In [43]:

a[mDT, on=.(id, x >= x_dn, x <= x_up), .(id, i.x_dn, i.x_up, x.x)]

A data.table: 4 × 4
id	i.x_dn	i.x_up	x.x
<int>	<int>	<int>	<int>
1	10	13	11
1	10	13	12
2	10	13	13
3	10	13	NA

Reshape¶

Main ref https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html

Long to wide¶

In [44]:

s = 'id y a b
A X 1 3
A Z 2 4
B X 1 3
B Z 2 4'
DT = fread(s)

dcast(DT, id ~ y, value.var = c('a', 'b'))

A data.table: 2 × 5
id	a_X	a_Z	b_X	b_Z
<chr>	<int>	<int>	<int>	<int>
A	1	2	3	4
B	1	2	3	4

Wide to long¶

In [45]:

s = 'id a_X a_Z b_X b_Z
A 1 2 3 4
B 1 2 3 4'
DT = fread(s)
DT

melt(DT,
    id.vars = c("id"),
    measure = patterns("^a", "^b"),
    variable.name = "y",
    value.name = c("a", "b"))

A data.table: 2 × 5
id	a_X	a_Z	b_X	b_Z
<chr>	<int>	<int>	<int>	<int>
A	1	2	3	4
B	1	2	3	4

A data.table: 4 × 4
id	y	a	b
<chr>	<fct>	<int>	<int>
A	1	1	3
B	1	1	3
A	2	2	4
B	2	2	4

https://franknarf1.github.io/r-tutorial/_book/tables.html

Advanced operations¶

https://ugoproto.github.io/ugo_r_doc/datatable.pdf

In [46]:

DT = data.table(a = 1:5) 
DT[, `:=`(b = letters[a], c = 1 * (a > 3))][, d := .I , by= .(c)][]

A data.table: 5 × 4
a	b	c	d
<int>	<chr>	<dbl>	<int>
1	a	0	1
2	b	0	2
3	c	0	3
4	d	1	4
5	e	1	5

In [47]:

DT[, l := shift(b, 1), by = c][]

A data.table: 5 × 5
a	b	c	d	l
<int>	<chr>	<dbl>	<int>	<chr>
1	a	0	1	NA
2	b	0	2	a
3	c	0	3	b
4	d	1	4	NA
5	e	1	5	d

Plot by group¶

Merges¶

Binned summary¶

In [48]:

carsDT[, .N, by=.(drat_bin = cut(drat, 4))]

A data.table: 4 × 2
drat_bin	N
<fct>	<int>
(3.84,4.39]	11
(2.76,3.3]	13
(3.3,3.84]	6
(4.39,4.93]	2

Combining `dplyr` and `data.table` with `dtplyr`¶

https://atrebas.github.io/post/2019-03-03-datatable-dplyr

In [50]:

collapse_dtplyr = function() {
  flights %>% lazy_dt() %>% 
    group_by(carrier, origin, dest) %>% 
    summarize(mean_arr_delay = mean(arr_delay), 
              pressure = mean(dep_delay))
  }

## Just compare dtplyr with normal dplyr and data.table versions (i.e. no keys)

In [54]:

microbenchmark::microbenchmark(collapse_dplyr(), collapse_dt_key(), collapse_dt(), collapse_dtplyr(), times = 10) %>% 
    print

Unit: milliseconds
              expr    min     lq   mean median     uq     max neval cld
  collapse_dplyr() 38.607 39.551 41.795 40.784 42.971  48.918    10   b
 collapse_dt_key() 19.307 19.527 21.858 20.520 24.677  28.266    10  ab
     collapse_dt()  7.333  7.681 28.249 10.530 11.551 193.618    10  ab
 collapse_dtplyr()  3.296  3.512  3.809  3.561  3.646   6.179    10  a

Basics¶

Creating DT¶

Slicing¶

Slice and return vector¶

Sorting¶

Sorting inplace¶

Subset and Calculate¶

Aggregation¶

:= syntax¶

Creating Columns¶

Dropping columns¶

Conditional Replace¶

Column selection¶

Grouping summaries¶

Keys¶

Create key as part of agg step¶

Expressions in By¶

.SD notation¶

Merges¶

DT idiomatic merges¶

Equi-joins¶

Non-Equi Joins¶

Reshape¶

Long to wide¶

Wide to long¶

Advanced operations¶

Plot by group¶

Merges¶

Binned summary¶

Combining dplyr and data.table with dtplyr¶

`:=` syntax¶

`.SD` notation¶

Combining `dplyr` and `data.table` with `dtplyr`¶