using BenchmarkTools
using DataFrames
using Transducers
using VegaLite
resultpath = joinpath(@__DIR__, "result.json")
result, = BenchmarkTools.load(resultpath)
df_raw =
BenchmarkTools.leaves(result) |>
Map() do ((basesize, needleloc, ex), trial)
(
basesize = parse(Int, basesize),
needleloc = parse(Int, needleloc),
executor = Symbol(ex),
trial = trial,
)
end |>
DataFrame
basesize | needleloc | executor | trial | |
---|---|---|---|---|
Int64 | Int64 | Symbol | Trial… | |
1 | 4096 | 65536 | DepthFirstEx | 151.959 μs |
2 | 4096 | 65536 | WorkStealingEx | 767.383 μs |
3 | 4096 | 65536 | ThreadedEx | 260.148 μs |
4 | 4096 | 8388608 | DepthFirstEx | 16.429 ms |
5 | 4096 | 8388608 | WorkStealingEx | 1.422 ms |
6 | 4096 | 8388608 | ThreadedEx | 2.188 ms |
7 | 4096 | 1048576 | DepthFirstEx | 2.030 ms |
8 | 4096 | 1048576 | WorkStealingEx | 1.408 ms |
9 | 4096 | 1048576 | ThreadedEx | 458.276 μs |
10 | 4096 | 16384 | DepthFirstEx | 80.750 μs |
11 | 4096 | 16384 | WorkStealingEx | 684.364 μs |
12 | 4096 | 16384 | ThreadedEx | 216.298 μs |
13 | 4096 | 8192 | DepthFirstEx | 69.600 μs |
14 | 4096 | 8192 | WorkStealingEx | 688.154 μs |
15 | 4096 | 8192 | ThreadedEx | 167.048 μs |
16 | 4096 | 262144 | DepthFirstEx | 530.555 μs |
17 | 4096 | 262144 | WorkStealingEx | 1.112 ms |
18 | 4096 | 262144 | ThreadedEx | 339.667 μs |
19 | 4096 | 4096 | DepthFirstEx | 69.820 μs |
20 | 4096 | 4096 | WorkStealingEx | 665.344 μs |
21 | 4096 | 4096 | ThreadedEx | 130.399 μs |
22 | 4096 | 2097152 | DepthFirstEx | 3.700 ms |
23 | 4096 | 2097152 | WorkStealingEx | 1.296 ms |
24 | 4096 | 2097152 | ThreadedEx | 649.834 μs |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
begin
df_tmp = select(df_raw, Not(:trial))
df_tmp[!, :minimum] = map(trial -> minimum(trial).time, df_raw.trial)
df_tmp[!, :median] = map(trial -> median(trial).time, df_raw.trial)
df_tmp[!, :memory] = map(trial -> trial.memory, df_raw.trial)
df_stats = stack(
df_tmp,
[:minimum, :median],
variable_name = :time_stat,
value_name = :time_ns,
)
end
basesize | needleloc | executor | memory | time_stat | time_ns | |
---|---|---|---|---|---|---|
Int64 | Int64 | Symbol | Int64 | String | Float64 | |
1 | 4096 | 65536 | DepthFirstEx | 52832 | minimum | 151959.0 |
2 | 4096 | 65536 | WorkStealingEx | 6342800 | minimum | 767383.0 |
3 | 4096 | 65536 | ThreadedEx | 573328 | minimum | 260148.0 |
4 | 4096 | 8388608 | DepthFirstEx | 4917072 | minimum | 1.6429e7 |
5 | 4096 | 8388608 | WorkStealingEx | 6696048 | minimum | 1.42161e6 |
6 | 4096 | 8388608 | ThreadedEx | 4882080 | minimum | 2.18779e6 |
7 | 4096 | 1048576 | DepthFirstEx | 622656 | minimum | 2.03032e6 |
8 | 4096 | 1048576 | WorkStealingEx | 6466768 | minimum | 1.40804e6 |
9 | 4096 | 1048576 | ThreadedEx | 950528 | minimum | 458276.0 |
10 | 4096 | 16384 | DepthFirstEx | 28576 | minimum | 80750.0 |
11 | 4096 | 16384 | WorkStealingEx | 6333584 | minimum | 684364.0 |
12 | 4096 | 16384 | ThreadedEx | 370048 | minimum | 216298.0 |
13 | 4096 | 8192 | DepthFirstEx | 26080 | minimum | 69600.0 |
14 | 4096 | 8192 | WorkStealingEx | 6332848 | minimum | 688154.0 |
15 | 4096 | 8192 | ThreadedEx | 345600 | minimum | 167048.0 |
16 | 4096 | 262144 | DepthFirstEx | 166096 | minimum | 530555.0 |
17 | 4096 | 262144 | WorkStealingEx | 6381600 | minimum | 1.11229e6 |
18 | 4096 | 262144 | ThreadedEx | 680016 | minimum | 339667.0 |
19 | 4096 | 4096 | DepthFirstEx | 25888 | minimum | 69820.0 |
20 | 4096 | 4096 | WorkStealingEx | 6332576 | minimum | 665344.0 |
21 | 4096 | 4096 | ThreadedEx | 93168 | minimum | 130399.0 |
22 | 4096 | 2097152 | DepthFirstEx | 1239120 | minimum | 3.69973e6 |
23 | 4096 | 2097152 | WorkStealingEx | 6591872 | minimum | 1.29592e6 |
24 | 4096 | 2097152 | ThreadedEx | 1442032 | minimum | 649834.0 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
begin
df = transform(groupby(df_stats, [:basesize, :needleloc, :time_stat])) do group
baseline = only(eachrow(filter(:executor => ==(:ThreadedEx), group)))
(speedup = baseline.time_ns ./ group.time_ns,)
end
filter!(:executor => !(==(:ThreadedEx)), df)
df
end
basesize | needleloc | executor | memory | time_stat | time_ns | speedup | |
---|---|---|---|---|---|---|---|
Int64 | Int64 | Symbol | Int64 | String | Float64 | Float64 | |
1 | 4096 | 65536 | DepthFirstEx | 52832 | minimum | 151959.0 | 1.71196 |
2 | 4096 | 65536 | WorkStealingEx | 6342800 | minimum | 767383.0 | 0.339007 |
3 | 4096 | 8388608 | DepthFirstEx | 4917072 | minimum | 1.6429e7 | 0.133167 |
4 | 4096 | 8388608 | WorkStealingEx | 6696048 | minimum | 1.42161e6 | 1.53896 |
5 | 4096 | 1048576 | DepthFirstEx | 622656 | minimum | 2.03032e6 | 0.225716 |
6 | 4096 | 1048576 | WorkStealingEx | 6466768 | minimum | 1.40804e6 | 0.325471 |
7 | 4096 | 16384 | DepthFirstEx | 28576 | minimum | 80750.0 | 2.67861 |
8 | 4096 | 16384 | WorkStealingEx | 6333584 | minimum | 684364.0 | 0.316057 |
9 | 4096 | 8192 | DepthFirstEx | 26080 | minimum | 69600.0 | 2.40011 |
10 | 4096 | 8192 | WorkStealingEx | 6332848 | minimum | 688154.0 | 0.242748 |
11 | 4096 | 262144 | DepthFirstEx | 166096 | minimum | 530555.0 | 0.640211 |
12 | 4096 | 262144 | WorkStealingEx | 6381600 | minimum | 1.11229e6 | 0.305376 |
13 | 4096 | 4096 | DepthFirstEx | 25888 | minimum | 69820.0 | 1.86765 |
14 | 4096 | 4096 | WorkStealingEx | 6332576 | minimum | 665344.0 | 0.195987 |
15 | 4096 | 2097152 | DepthFirstEx | 1239120 | minimum | 3.69973e6 | 0.175644 |
16 | 4096 | 2097152 | WorkStealingEx | 6591872 | minimum | 1.29592e6 | 0.501447 |
17 | 4096 | 2048 | DepthFirstEx | 25840 | minimum | 63919.0 | 1.81556 |
18 | 4096 | 2048 | WorkStealingEx | 6332576 | minimum | 675144.0 | 0.171888 |
19 | 4096 | 131072 | DepthFirstEx | 89120 | minimum | 272897.0 | 0.781899 |
20 | 4096 | 131072 | WorkStealingEx | 6356880 | minimum | 877112.0 | 0.243273 |
21 | 4096 | 524288 | DepthFirstEx | 318000 | minimum | 990061.0 | 0.3434 |
22 | 4096 | 524288 | WorkStealingEx | 6508976 | minimum | 1.39974e6 | 0.242893 |
23 | 4096 | 4194304 | DepthFirstEx | 2470832 | minimum | 7.6164e6 | 0.142991 |
24 | 4096 | 4194304 | WorkStealingEx | 6624112 | minimum | 1.35167e6 | 0.805731 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
plt1 = @vlplot(
facet = {row = {field = :executor, type = :nominal}, column = {field = :time_stat, type = :nominal}},
spec = {
layer = [
{
mark = {type = :line, point = true},
encoding = {
x = {field = :needleloc, scale = {type = :log, base = 2}},
y = {field = :speedup, axis = {title = "Speedup (T_default / T_\$executor)"}},
color = {field = :basesize, type = :nominal},
},
},
{
mark = :rule,
encoding = {y = {datum = 1}},
},
],
},
data = df,
)
plt2 = @vlplot(
mark = {type = :line, point = true},
x = {field = :needleloc, scale = {type = :log, base = 2}},
y = {field = :time_ns, axis = {title = "Time [ns]"}},
color = {field = :basesize, type = :nominal},
row = :executor,
column = :time_stat,
resolve = {scale = {y = "independent"}},
data = df_stats,
)
plt3 = @vlplot(
mark = {type = :line, point = true, clip = true},
x = {field = :needleloc, scale = {type = :log, base = 2}},
y = {field = :time_ns, axis = {title = "Time [ns]"}, scale = {domain = [0, 2_500_000]}},
color = {field = :basesize, type = :nominal},
row = :executor,
column = :time_stat,
data = df_stats,
)
nothing
Peformance of parallel findfirst
with different scheduling strategies
("executors") are benchmarked. DepthFirstEx
is useful when the latency of
the "hit case" is important and/or the needle is expected to be at somehwere
in the beginning of the collection. Otherwise, ThreadedEx
(default) is a
good option. For consistent latency, WorkStealingEx
may be useful.
const DATA_LENGTH = 2^23
xs = rand(DATA_LENGTH)
xs[needleloc] = 2
@benchmarkable(Folds.findfirst(>(1), xs, $Executor(basesize = $basesize)))
plt1
Speedup of DepthFirstEx
(first row) and WorkStealingEx
(second row) with
respect to the default ThreadedEx
executor.
Since the x axis logarithmic, note that the range of large speedup for
DepthFirstEx
is overly emphasized; i.e., DepthFirstEx
does not perform
better on average if the location of the needle is expected to be uniformly
distributed. On the other hand, DepthFirstEx
is a good choice when it is
important to finish fast when the reduction can be terminated early.
WorkStealingEx
seems to perform better in some limited range.
plt2
NOTE: x axis logarithmic.
DepthFirstEx
behaves consistently well up until the location of the needle
is around the basesize
.
The run-time of ThreadedEx
(middle row) shows drastically different
shape of when the median (left) or the minimum (right) is plotted. This
is likely due to the randomization nature of Julia's scheduling (as of v1.5).
The peformance of WorkStealingEx
is more consistent than ThreadedEx
.
plt3
NOTE: x axis is logarithmic.
Same as above (plt2
) but with the range of Y axis fixed for all plots.
This notebook was generated using Literate.jl.