using BenchmarkTools
using DataFrames
using Statistics
using Transducers
using VegaLite
# The benchmark results are read from `result.json`, located next to this script.
resultpath = joinpath(@__DIR__, "result.json")
# Only the first item returned by `load` is used.
result = first(BenchmarkTools.load(resultpath))

# Build one table row per benchmark leaf.  Each leaf is keyed by the path
# `(nworks, executor)`; `nworks` is stored as a string and parsed back to `Int`.
df_raw = DataFrame(
    BenchmarkTools.leaves(result) |> Map() do (keypath, trial)
        nworks_str, executor = keypath
        (nworks = parse(Int, nworks_str), executor = executor, trial = trial)
    end
)
nworks | executor | trial | |
---|---|---|---|
Int64 | String | Trial… | |
1 | 20 | WorkStealingEx | 1.787 ms |
2 | 20 | ThreadedEx | 6.172 ms |
3 | 25 | WorkStealingEx | 1.817 ms |
4 | 25 | ThreadedEx | 5.048 ms |
5 | 35 | WorkStealingEx | 2.841 ms |
6 | 35 | ThreadedEx | 6.023 ms |
7 | 5 | WorkStealingEx | 1.471 ms |
8 | 5 | ThreadedEx | 5.516 ms |
9 | 15 | WorkStealingEx | 1.590 ms |
10 | 15 | ThreadedEx | 5.790 ms |
11 | 50 | WorkStealingEx | 3.826 ms |
12 | 50 | ThreadedEx | 5.891 ms |
13 | 40 | WorkStealingEx | 2.821 ms |
14 | 40 | ThreadedEx | 4.900 ms |
15 | 45 | WorkStealingEx | 2.838 ms |
16 | 45 | ThreadedEx | 6.907 ms |
17 | 10 | WorkStealingEx | 1.475 ms |
18 | 10 | ThreadedEx | 5.616 ms |
19 | 0 | WorkStealingEx | 838.562 μs |
20 | 0 | ThreadedEx | 3.907 ms |
21 | 30 | WorkStealingEx | 1.841 ms |
22 | 30 | ThreadedEx | 6.459 ms |
begin
    # Replace the raw `trial` column by per-trial summary columns.
    df_tmp = select(df_raw, Not(:trial))
    df_tmp.minimum = [minimum(t).time for t in df_raw.trial]
    df_tmp.median = [median(t).time for t in df_raw.trial]
    df_tmp.memory = [t.memory for t in df_raw.trial]

    # Long format: one row per (nworks, executor, statistic), with the statistic
    # name in `time_stat` and its value in `time_ns`.
    df_stats = stack(
        df_tmp,
        [:minimum, :median];
        variable_name = :time_stat,
        value_name = :time_ns,
    )
end
nworks | executor | memory | time_stat | time_ns | |
---|---|---|---|---|---|
Int64 | String | Int64 | String | Float64 | |
1 | 20 | WorkStealingEx | 11230240 | minimum | 1.78651e6 |
2 | 20 | ThreadedEx | 6682896 | minimum | 6.17157e6 |
3 | 25 | WorkStealingEx | 11231680 | minimum | 1.81712e6 |
4 | 25 | ThreadedEx | 6685184 | minimum | 5.04755e6 |
5 | 35 | WorkStealingEx | 11234496 | minimum | 2.84102e6 |
6 | 35 | ThreadedEx | 6685696 | minimum | 6.02276e6 |
7 | 5 | WorkStealingEx | 11315536 | minimum | 1.47101e6 |
8 | 5 | ThreadedEx | 6769712 | minimum | 5.51611e6 |
9 | 15 | WorkStealingEx | 11241776 | minimum | 1.59026e6 |
10 | 15 | ThreadedEx | 6689872 | minimum | 5.79008e6 |
11 | 50 | WorkStealingEx | 11238368 | minimum | 3.82622e6 |
12 | 50 | ThreadedEx | 6689216 | minimum | 5.89149e6 |
13 | 40 | WorkStealingEx | 11235776 | minimum | 2.82139e6 |
14 | 40 | ThreadedEx | 6687328 | minimum | 4.89968e6 |
15 | 45 | WorkStealingEx | 11237056 | minimum | 2.83812e6 |
16 | 45 | ThreadedEx | 6688224 | minimum | 6.90663e6 |
17 | 10 | WorkStealingEx | 11280656 | minimum | 1.47544e6 |
18 | 10 | ThreadedEx | 6730480 | minimum | 5.61629e6 |
19 | 0 | WorkStealingEx | 11346672 | minimum | 838562.0 |
20 | 0 | ThreadedEx | 6802016 | minimum | 3.90678e6 |
21 | 30 | WorkStealingEx | 11233120 | minimum | 1.8411e6 |
22 | 30 | ThreadedEx | 6685328 | minimum | 6.45944e6 |
23 | 20 | WorkStealingEx | 11230240 | median | 2.23865e6 |
24 | 20 | ThreadedEx | 6682896 | median | 8.62916e6 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
# For every (nworks, statistic) pair, compute how many times faster
# WorkStealingEx is than ThreadedEx (time of ThreadedEx / time of WorkStealingEx).
df = combine(groupby(df_stats, [:nworks, :time_stat])) do group
    # Look up the timing of each executor by name within this group.
    time_of = Dict(group.executor .=> group.time_ns)
    (speedup = time_of["ThreadedEx"] / time_of["WorkStealingEx"],)
end
nworks | time_stat | speedup | |
---|---|---|---|
Int64 | String | Float64 | |
1 | 20 | minimum | 3.45454 |
2 | 25 | minimum | 2.77777 |
3 | 35 | minimum | 2.11993 |
4 | 5 | minimum | 3.74989 |
5 | 15 | minimum | 3.64095 |
6 | 50 | minimum | 1.53977 |
7 | 40 | minimum | 1.73662 |
8 | 45 | minimum | 2.43352 |
9 | 10 | minimum | 3.80653 |
10 | 0 | minimum | 4.6589 |
11 | 30 | minimum | 3.50846 |
12 | 20 | median | 3.85463 |
13 | 25 | median | 3.65468 |
14 | 35 | median | 2.64295 |
15 | 5 | median | 4.12985 |
16 | 15 | median | 4.09793 |
17 | 50 | median | 1.97676 |
18 | 40 | median | 2.47478 |
19 | 45 | median | 2.71579 |
20 | 10 | median | 4.16194 |
21 | 0 | median | 5.41905 |
22 | 30 | median | 3.81937 |
# Speedup (ThreadedEx time / WorkStealingEx time) against `nworks`, with one
# line per time statistic and a horizontal rule marking a speedup of 1.
plt1 = @vlplot(
    data = df,
    width = 400,
    height = 200,
    layer = [
        {
            # Speedup curves, colored by statistic (minimum / median).
            mark = {type = :line, point = true},
            encoding = {
                x = {field = :nworks},
                y = {
                    field = :speedup,
                    axis = {title = "Speedup (T_default / T_WS)"},
                },
                color = {field = :time_stat},
            },
        },
        {
            # Reference line at y = 1, where both executors take the same time.
            mark = :rule,
            encoding = {y = {datum = 1}},
        },
    ],
)
# Raw timings against `nworks`: one panel (column) per executor and one line
# per time statistic.
plt2 = @vlplot(
    data = df_stats,
    mark = {type = :line, point = true},
    column = :executor,
    x = :nworks,
    y = {field = :time_ns, axis = {title = "Time [ns]"}},
    color = {field = :time_stat},
)
# End the cell with `nothing` (presumably to suppress the displayed value of
# the last expression in the generated notebook).
nothing
Performance of parallel reduce with a wildly skewed run-time distribution is
benchmarked with the default ThreadedEx
executor and WorkStealingEx
executor. WorkStealingEx
consistently performs better than ThreadedEx
(if the run-time distribution is unbalanced enough). Furthermore, the
run-time of WorkStealingEx
is much more consistent than ThreadedEx
.
The following problem is benchmarked
xs = 1:2^13
Folds.sum($f, xs, $Executor(basesize = 1))
where f
spins for 100 μs for nworks
items (i.e., every length(xs) ÷ nworks
) in the input collection xs
.
plt1
For this range of parameters, WorkStealingEx
performs better than
ThreadedEx
. Note that the run-time distribution is very skewed. The
largest tried nworks
is 50; i.e., only 50 / 2^13 * 100 ≈ 0.6
% of the
items are actually compute-intensive.
plt2
Note the big difference in the minimum and median run-time of ThreadedEx
(left). This is probably due to the randomization by Julia's parallel task
runtime. On the other hand, the performance of WorkStealingEx
(right) is
more consistent (smaller difference between minimum and median).
This notebook was generated using Literate.jl.