Presented by Chiyoung Ahn (https://github.com/chiyahn)
Consider minimizing following objective function
$$
f(x) = \sum_{i=1}^N \left[ (x_i - m_i)^2 + \log(x_i) \right]
$$
with a fixed $m$ and some fairly large N
by limited-memory BFGS (LBFGS):
using NLopt, BenchmarkTools, ForwardDiff, NLSolversBase, DiffResults, Flux, ReverseDiff, DiffResults
N = 5000
x0 = fill(1.0, N)
m = range(2,5,length = N)
f(x) = sum((x .- m).^2) + sum(log.(x))
f (generic function with 1 method)
ForwardDiff.jl
¶function g!(G::Vector, x::Vector)
ForwardDiff.gradient!(G, f, x)
end
function fg!(x::Vector, grad::Vector)
if length(grad) > 0 # gradient of f(x)
g!(grad, x)
end
f(x)
end
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(1.0, N)) # find `x` above 0
min_objective!(opt, fg!) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0) seconds = 30.0
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
696.862 ms (67239 allocations: 3.70 GiB) got 5966.9406953073385 after 9 iterations (returned SUCCESS)
NLoptAdapter
with autodiff¶# See nlopt-tutorial.ipynb
struct NLoptAdapter{T} <: Function where T <: AbstractObjective
nlsolver_base::T
end
# implement fg!; note that the order is reversed
(adapter::NLoptAdapter)(x, df) = adapter.nlsolver_base.fdf(df, x)
(adapter::NLoptAdapter)(result, x, jacobian_transpose) = adapter.nlsolver_base.fdf(result, jacobian_transpose', x)
# constructors
NLoptAdapter(f, x, autodiff) = NLoptAdapter(OnceDifferentiable(f, x, autodiff = autodiff))
NLoptAdapter(f!, x::Vector, F::Vector, autodiff) = NLoptAdapter(OnceDifferentiable(f!, x, F, autodiff = autodiff))
NLoptAdapter
f_opt = NLoptAdapter(f, zeros(N), :forward)
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(0.0, N)) # find `x` above -2.0
min_objective!(opt, f_opt) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0) seconds = 30.0
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
546.883 ms (67112 allocations: 3.69 GiB) got 5966.9406953073385 after 9 iterations (returned SUCCESS)
NLoptAdapter
with numerical gradients¶f_opt = NLoptAdapter(f, zeros(N), :finite)
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(0.0, N)) # find `x` above -2.0
min_objective!(opt, f_opt) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0) seconds = 30.0
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
6.025 s (1809596 allocations: 8.24 GiB) got 5966.94069530734 after 11 iterations (returned FTOL_REACHED)
Flux.jl
¶using Flux
using Flux.Tracker: gradient_
function fg!(x::Vector, grad::Vector)
val = f(x)
grad[:] = gradient_(f, x)[1]
return val
end
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(1.0, N)) # find `x` above 0
min_objective!(opt, fg!) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0)
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
7.268 ms (3752 allocations: 5.35 MiB) got 5966.9406953073385 after 9 iterations (returned SUCCESS)
ReverseDiff.jl
¶# precompile tape
result = DiffResults.GradientResult(similar(x0));
tape = ReverseDiff.compile(ReverseDiff.GradientTape(f, similar(x0)));
function fg!(x::Vector, grad::Vector)
ReverseDiff.gradient!(result, tape, x)
grad[:] = DiffResults.gradient(result)
DiffResults.value(result)
end
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(1.0, N)) # find `x` above 0
min_objective!(opt, fg!) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0)
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
16.604 ms (62 allocations: 41.42 KiB) got 5966.9406953073385 after 9 iterations (returned SUCCESS)
The current version of ReverseDiff.jl
(v0.3.1) has broadcast implementation targetting an older version of Julia, v0.6. To fully exploit the performance of ReverseDiff.jl
, some explicit broadcasting is needed:
# explicit broadcasting
ReverseDiff.@forward g(x, m) = (x - m)^2
f(x) = sum(broadcast(g, x, m)) + sum(broadcast(log, x))
f (generic function with 1 method)
Compare the performance:
# precompile tape
result = DiffResults.GradientResult(similar(x0));
tape = ReverseDiff.compile(ReverseDiff.GradientTape(f, similar(x0)));
function fg!(x::Vector, grad::Vector)
ReverseDiff.gradient!(result, tape, x)
grad[:] = DiffResults.gradient(result)
DiffResults.value(result)
end
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(1.0, N)) # find `x` above 0
min_objective!(opt, fg!) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0)
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
5.637 ms (125 allocations: 44.23 KiB) got 5966.9406953073385 after 9 iterations (returned SUCCESS)
Flux.jl
and ReverseDiff.jl
have insanely good performance¶Let's see how it goes when we have N=1000000
:
N = 1000000 # one million!
x0 = fill(1.0, N)
m = range(2,5,length = N)
f(x) = sum((x .- m).^2) + sum(log.(x))
f (generic function with 1 method)
Flux.jl
¶function fg!(x::Vector, grad::Vector)
val = f(x)
grad[:] = gradient_(f, x)[1]
return val
end
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(1.0, N)) # find `x` above 0
min_objective!(opt, fg!) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0)
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
3.266 s (3752 allocations: 1.01 GiB) got 1.193404758611993e6 after 9 iterations (returned SUCCESS)
ReverseDiff.jl
¶# explicit broadcasting
ReverseDiff.@forward g(x, m) = (x - m)^2
f(x) = sum(broadcast(g, x, m)) + sum(broadcast(log, x))
# precompile tape
result = DiffResults.GradientResult(similar(x0));
tape = ReverseDiff.compile(ReverseDiff.GradientTape(f, similar(x0)));
function fg!(x::Vector, grad::Vector)
ReverseDiff.gradient!(result, tape, x)
grad[:] = DiffResults.gradient(result)
DiffResults.value(result)
end
# define the optimization problem
opt = Opt(:LD_LBFGS, N) # 2 indicates the length of `x`
lower_bounds!(opt, fill(1.0, N)) # find `x` above 0
min_objective!(opt, fg!) # specifies that optimization problem is on minimization
# solve the optimization problem
(minf,minx,ret) = @btime optimize($opt, $x0)
numevals = opt.numevals # the number of function evaluations
println("got $minf after $numevals iterations (returned $ret)")
1.363 s (125 allocations: 7.63 MiB) got 1.193404758611993e6 after 9 iterations (returned SUCCESS)
🤯