Type-dependent default maximum chunk size.
From the following benchmark:

```julia
using BenchmarkTools
import ForwardDiff

function rosenbrock(x)
    a = one(eltype(x))
    b = 100 * a
    result = zero(eltype(x))
    for i in 1:length(x)-1
        result += (a - x[i])^2 + b * (x[i+1] - x[i]^2)^2
    end
    return result
end

for T in (Float32, Float64)
    println("T = $T")
    x = rand(T, 64)
    y = similar(x)
    for chunk in (2, 4, 8, 16, 32)
        print("Chunk: $chunk -")
        gradient_cache = ForwardDiff.GradientConfig(rosenbrock, x, ForwardDiff.Chunk{chunk}())
        @btime ForwardDiff.gradient!($y, rosenbrock, $x, $gradient_cache)
    end
end
```
with the result:

```
T = Float32
Chunk: 2 - 10.411 μs (0 allocations: 0 bytes)
Chunk: 4 - 6.778 μs (0 allocations: 0 bytes)
Chunk: 8 - 7.124 μs (0 allocations: 0 bytes)
Chunk: 16 - 4.576 μs (0 allocations: 0 bytes)
Chunk: 32 - 3.200 μs (0 allocations: 0 bytes)
T = Float64
Chunk: 2 - 9.029 μs (0 allocations: 0 bytes)
Chunk: 4 - 7.851 μs (0 allocations: 0 bytes)
Chunk: 8 - 5.353 μs (0 allocations: 0 bytes)
Chunk: 16 - 5.005 μs (0 allocations: 0 bytes)
Chunk: 32 - 4.351 μs (0 allocations: 0 bytes)
```
This suggests that the default maximum chunk size should perhaps be a function of the element type: both types benefit from larger chunks, but Float32 more so than Float64.
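As a rough illustration, a type-dependent default could look something like the sketch below. Note that `type_max_chunk` and `type_aware_chunk` are hypothetical helpers (not part of ForwardDiff's API), and the cutoff values are placeholders that would have to be tuned from benchmarks like the one above.

```julia
import ForwardDiff

# Hypothetical per-type upper bound on the chunk size.
# The numbers are illustrative only.
type_max_chunk(::Type{Float32}) = 32
type_max_chunk(::Type{Float64}) = 16
type_max_chunk(::Type) = 8  # conservative fallback for other number types

# Pick a chunk for an input vector, capped by the type-dependent maximum.
function type_aware_chunk(x::AbstractVector{T}) where {T}
    return ForwardDiff.Chunk{min(length(x), type_max_chunk(T))}()
end

x = rand(Float32, 64)
cfg = ForwardDiff.GradientConfig(rosenbrock, x, type_aware_chunk(x))
```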
Another thing to consider is that we should perhaps try to use a chunk size that fits well into the vector registers (assuming this doesn't cost an extra function evaluation):
```julia
T = Float32
println("T = $T")
x = rand(T, 21)
y = similar(x)
for chunk in (8, 10)
    print("Chunk: $chunk -")
    gradient_cache = ForwardDiff.GradientConfig(rosenbrock, x, ForwardDiff.Chunk{chunk}())
    @btime ForwardDiff.gradient!($y, rosenbrock, $x, $gradient_cache)
end
```
with the result:

```
Chunk: 8 - 13.837 μs (0 allocations: 0 bytes)
Chunk: 10 - 15.309 μs (0 allocations: 0 bytes)
```
so 8 is a better choice here than 10 (8 Float32 values fit exactly into a 256-bit AVX register).
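A minimal sketch of how the chunk choice could take the SIMD width into account follows. The 256-bit register width and the helper names are assumptions for illustration, not ForwardDiff internals:

```julia
# Hypothetical sketch: round the chunk size down to a multiple of how many
# elements of T fit in a 256-bit (AVX) register, so the dual-number partials
# fill whole vector registers.
simd_width(::Type{Float32}) = 8   # 256 bits / 32 bits per element
simd_width(::Type{Float64}) = 4   # 256 bits / 64 bits per element
simd_width(::Type) = 1            # no assumption for other element types

function simd_friendly_chunk(::Type{T}, input_length::Integer; maxchunk::Integer=32) where {T}
    w = simd_width(T)
    candidate = min(input_length, maxchunk)
    # Round down to a multiple of the SIMD width, but keep at least one full register.
    return max(w, (candidate ÷ w) * w)
end

simd_friendly_chunk(Float32, 21)  # 16 rather than 21
simd_friendly_chunk(Float64, 64)  # 32
```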