IncrementalInference.jl
Improve Parametric Batch Solve Performance
This issue is currently just a placeholder for performance-related issues around parametric batch solve.
using IncrementalInference #v0.29
using RoME #v0.19
##
fg = initfg()
addVariable!(fg, :x0, ContinuousScalar)
addVariable!(fg, :x1, ContinuousScalar)
addVariable!(fg, :x2, ContinuousScalar)
addVariable!(fg, :x3, ContinuousScalar)
addVariable!(fg, :x4, ContinuousScalar)
addVariable!(fg, :l1, ContinuousScalar)
addFactor!(fg, [:x0], Prior(Normal(0.0,0.1)))
addFactor!(fg, [:x0,:x1], LinearRelative(Normal(1.0,0.1)))
addFactor!(fg, [:x1,:x2], LinearRelative(Normal(1.0,0.1)))
addFactor!(fg, [:x2,:x3], LinearRelative(Normal(1.0,0.1)))
addFactor!(fg, [:x3,:x4], LinearRelative(Normal(1.0,0.1)))
addFactor!(fg, [:x0,:l1], LinearRelative(Normal(1.0,0.1)))
addFactor!(fg, [:x4,:l1], LinearRelative(Normal(-3.0,0.1)))
initAll!(fg)
@time r = IIF.solveGraphParametric!(fg)
# 0.033984 seconds (151.65 k allocations: 6.195 MiB)
# Seconds run: 0 (vs limit 100)
# Iterations: 6
# f(x) calls: 13
# ∇f(x) calls: 13
##
fg = initfg()
addVariable!(fg, :x0, Point3)
addVariable!(fg, :x1, Point3)
addVariable!(fg, :x2, Point3)
addVariable!(fg, :x3, Point3)
addVariable!(fg, :x4, Point3)
addVariable!(fg, :l1, Point3)
addFactor!(fg, [:x0], PriorPoint3(MvNormal([0.,0,0], [0.1,0.1,0.1])))
addFactor!(fg, [:x0,:x1], Point3Point3(MvNormal([1.,0,0], [0.1,0.1,0.1])))
addFactor!(fg, [:x1,:x2], Point3Point3(MvNormal([1.,0,0], [0.1,0.1,0.1])))
addFactor!(fg, [:x2,:x3], Point3Point3(MvNormal([1.,0,0], [0.1,0.1,0.1])))
addFactor!(fg, [:x3,:x4], Point3Point3(MvNormal([1.,0,0], [0.1,0.1,0.1])))
addFactor!(fg, [:x0,:l1], Point3Point3(MvNormal([1.,0,0], [0.1,0.1,0.1])))
addFactor!(fg, [:x4,:l1], Point3Point3(MvNormal([-3.,0,0], [0.1,0.1,0.1])))
initAll!(fg)
@time r = IIF.solveGraphParametric!(fg)
# 1.313179 seconds (1.71 M allocations: 79.619 MiB, 5.35% gc time, 99.39% compilation time)
# 0.002358 seconds (10.93 k allocations: 1.604 MiB)
# Seconds run: 0 (vs limit 100)
# Iterations: 6
# f(x) calls: 13
# ∇f(x) calls: 13
##
fg = initfg()
addVariable!(fg, :x0, Pose2)
addVariable!(fg, :x1, Pose2)
addVariable!(fg, :x2, Pose2)
addVariable!(fg, :x3, Pose2)
addVariable!(fg, :x4, Pose2)
addVariable!(fg, :l1, Pose2)
addFactor!(fg, [:x0], PriorPose2(MvNormal([0.,0,0], [0.1,0.1,0.01])))
addFactor!(fg, [:x0,:x1], Pose2Pose2(MvNormal([1.,0,0], [0.1,0.1,0.01])))
addFactor!(fg, [:x1,:x2], Pose2Pose2(MvNormal([1.,0,0], [0.1,0.1,0.01])))
addFactor!(fg, [:x2,:x3], Pose2Pose2(MvNormal([1.,0,0], [0.1,0.1,0.01])))
addFactor!(fg, [:x3,:x4], Pose2Pose2(MvNormal([1.,0,0], [0.1,0.1,0.01])))
addFactor!(fg, [:x0,:l1], Pose2Pose2(MvNormal([1.,0,0], [0.1,0.1,0.01])))
addFactor!(fg, [:x4,:l1], Pose2Pose2(MvNormal([-3.,0,0], [0.1,0.1,0.01])))
initAll!(fg)
@time r = IIF.solveGraphParametric!(fg)
# 0.024620 seconds (277.42 k allocations: 26.630 MiB)
# Seconds run: 0 (vs limit 100)
# Iterations: 21
# f(x) calls: 52
# ∇f(x) calls: 52
##
fg = initfg()
addVariable!(fg, :x0, Pose3)
addVariable!(fg, :x1, Pose3)
addVariable!(fg, :x2, Pose3)
addVariable!(fg, :x3, Pose3)
addVariable!(fg, :x4, Pose3)
addVariable!(fg, :l1, Pose3)
addFactor!(fg, [:x0], PriorPose3(MvNormal([0.,0,0,0,0,0], [0.1,0.1,0.1,0.01,0.01,0.01])))
addFactor!(fg, [:x0,:x1], Pose3Pose3(MvNormal([1.,0,0,0,0,0], [0.1,0.1,0.1,0.01,0.01,0.01])))
addFactor!(fg, [:x1,:x2], Pose3Pose3(MvNormal([1.,0,0,0,0,0], [0.1,0.1,0.1,0.01,0.01,0.01])))
addFactor!(fg, [:x2,:x3], Pose3Pose3(MvNormal([1.,0,0,0,0,0], [0.1,0.1,0.1,0.01,0.01,0.01])))
addFactor!(fg, [:x3,:x4], Pose3Pose3(MvNormal([1.,0,0,0,0,0], [0.1,0.1,0.1,0.01,0.01,0.01])))
addFactor!(fg, [:x0,:l1], Pose3Pose3(MvNormal([1.,0,0,0,0,0], [0.1,0.1,0.1,0.01,0.01,0.01])))
addFactor!(fg, [:x4,:l1], Pose3Pose3(MvNormal([-3.,0,0,0,0,0], [0.1,0.1,0.1,0.01,0.01,0.01])))
initAll!(fg)
@time r = IIF.solveGraphParametric!(fg)
# 86.839226 seconds (54.76 M allocations: 2.379 GiB, 0.78% gc time, 99.75% compilation time)
# 0.094435 seconds (531.50 k allocations: 101.043 MiB)
# Seconds run: 0 (vs limit 100)
# Iterations: 39
# f(x) calls: 92
# ∇f(x) calls: 92
For future reference, @profview got a nice upgrade
Here is the flame graph for the parametric solve; memory reuse or stack allocation looks like it will have the biggest impact:
[flame graph image]
Yeah, I suspect caching in the residual and getSample functions will help with the allocations problem. The next step in that story is the preambleCache for CalcFactor.cache. Link to docs:
- https://juliarobotics.org/Caesar.jl/latest/concepts/stash_and_cache/
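As a rough sketch of the hookup (the signature follows the stash-and-cache docs linked above; the cached field names mirror the snippets later in this thread):

using Manifolds, DistributedFactorGraphs, IncrementalInference, RoME
import IncrementalInference: preambleCache

# Build the per-factor cache once, before solving; it arrives in the residual
# as cf.cache. Here we stash the manifold and its identity element.
function preambleCache(dfg::AbstractDFG, vars::AbstractVector{<:DFGVariable}, ::Pose2Pose2)
    M = getManifold(Pose2)
    return (; manifold = M, ϵ0 = identity_element(M))
end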
Yep, the flame graph is cool. The way I read it, the hot-loop calling of Manifolds.exp is allocating Arrays many times. So my guess would be to allocate once in cf.cache._tempmem (whatever name works best) and then use Manifolds.exp! for in-place operations.
Not sure how that is going to influence the AD story which you ran into last time.
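As a tiny illustration of that in-place pattern (a sketch; the cache buffer would live in something like the hypothetical cf.cache._tempmem above):

using Manifolds

M = SpecialEuclidean(2)
p = identity_element(M)
X = hat(M, p, [1.0, 0.0, 0.1])

# out-of-place: allocates a fresh point on M every call
q = exp(M, p, X)

# in-place: allocate a buffer once, outside the hot loop ...
q_buf = allocate(p)
# ... then reuse it, with no per-call allocation
exp!(M, q_buf, p, X)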
> Not sure how that is going to influence the AD story which you ran into last time.
Yep, that is the biggest problem. The type cannot be hardcoded and is actually not consistent within one solve. I can try a sort of operational-memory “hot-swap” cache that won’t be type stable, but we can annotate the type. Or I’d also like to try using isbits types such as static arrays.
It’s the garbage collector that runs and slows it down.
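To see that GC effect in isolation, a small hedged comparison (the function names are illustrative):

using StaticArrays, BenchmarkTools

# heap version: every call allocates a new Vector the GC must later collect
bump_heap(v) = [v[1] + 1.0, v[2], v[3]]

# isbits version: SVector is stack-allocated, so the hot loop produces no garbage
bump_stack(v) = SA[v[1] + 1.0, v[2], v[3]]

@btime bump_heap($[1.0, 2.0, 3.0])        # 1 allocation per call
@btime bump_stack($(SA[1.0, 2.0, 3.0]))   # 0 allocations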
Testing different options that work with AD in Pose2Pose2 from:
https://github.com/JuliaRobotics/RoME.jl/blob/d1b325df3609f5810c757db38afab24713037446/src/factors/Pose2D.jl#L44-L52
# Full cache/lazy version (11 allocations)
M = cf.cache.manifold
ϵ0 = cf.cache.ϵ0
q̂ = cf.cache.lazy[q, :q_hat]
exp!(M, q̂, ϵ0, X)
Manifolds.compose!(M, q̂, p, q̂)
X = cf.cache.lazy[q, :X]
log!(M, X, q, q̂)
Xc = IIF.getCoordCache!(cf.cache.lazy, M, number_eltype(q), :Xc)
vee!(M, Xc, q, X)
# no cache version (28 allocations)
M = getManifold(Pose2)
q̂ = Manifolds.compose(M, p, exp(M, identity_element(M, p), X))
Xc = vee(M, q, log(M, q, q̂))
# no cache version shared (19 allocations)
M = getManifold(Pose2)
q̂ = allocate(q)
exp!(M, q̂, identity_element(M, p), X)
Manifolds.compose!(M, q̂, p, q̂)
Xc = vee(M, q, log(M, q, q̂))
# as much as possible cache version (17 allocations)
q̂ = allocate(q)
M = cf.cache.manifold
ϵ0 = cf.cache.ϵ0
exp!(M, q̂, ϵ0, X)
Manifolds.compose!(M, q̂, p, q̂)
Xc = vee(M, q, log(M, q, q̂))
# as much as possible and weird reuse cache version (10 allocations)
q̂ = allocate(q)
M = cf.cache.manifold
ϵ0 = cf.cache.ϵ0
exp!(M, q̂, ϵ0, X)
Manifolds.compose!(M, q̂, p, q̂)
Xc = vee(M, q, log!(M, q̂, q, q̂))
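Pulling the pieces together, the 10-allocation variant as a complete residual might look like this (a sketch; it assumes cf.cache was populated with manifold and ϵ0 fields, e.g. via preambleCache as linked earlier, and uses the CalcFactor callable signature from the RoME link above):

using Manifolds, IncrementalInference, RoME

function (cf::CalcFactor{<:Pose2Pose2})(X, p, q)
    M  = cf.cache.manifold            # SpecialEuclidean(2), cached once
    ϵ0 = cf.cache.ϵ0                  # identity element, cached once
    q̂  = allocate(q)                  # the single fresh allocation per call
    exp!(M, q̂, ϵ0, X)                 # measurement tangent -> group element
    Manifolds.compose!(M, q̂, p, q̂)    # prediction of q from p and measurement
    # reuse q̂ as the output buffer of log! to avoid another allocation
    return vee(M, q, log!(M, q̂, q, q̂))
end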
Tested with the @time macro:
N=10
fg = initfg(GraphsDFG;solverParams=SolverParams(algorithms=[:default, :parametric]))
fg.solverParams.graphinit = false
addVariable!(fg, :x0, Pose2)
addFactor!(fg, [:x0], PriorPose2(MvNormal([0.,0,0], [0.1,0.1,0.01])))
dist = MvNormal([1.,0,0], [0.1,0.1,0.01])
for i = 1:N
    addVariable!(fg, Symbol("x$i"), Pose2)
    addFactor!(fg, [Symbol("x$(i-1)"), Symbol("x$i")], Pose2Pose2(dist))
end
IIF.solveGraphParametric(fg)
# solveGraph!(fg)
I still want to figure out, on this issue, whether we can't just move the preambleCache step sooner in the solve code so that the parametric solve CAN have the type information. Might it be as simple as that?
> I still want to figure out, on this issue, whether we can't just move the preambleCache step sooner in the solve code so that the parametric solve CAN have the type information. Might it be as simple as that?
Sorry, I don’t understand?
I found this package specifically for this issue: https://github.com/SciML/PreallocationTools.jl. The README describes the problem.
Ah, perhaps I can ask it this way round: where is the first line of code in a parametric solve where AD is called?
I'll go look from there in IIF at why the CalcFactor type is not stable or assigned yet.
The question I'm trying to figure out is whether the current intended type assignment to the cache object "happens after" or "happens before" the first AD request in the parametric solve. And maybe the answer here is just: let's move the preambleCache call "earlier" somehow, so that by the time AD happens all the types are set. Or I'm misunderstanding the issue.
> Ah, perhaps I can ask it this way round: where is the first line of code in a parametric solve where AD is called?
Currently from Optim.jl, depending on the value passed for autodiff.
From the link above, this is what is needed:
> The dualcache function builds a DualCache object that stores both a version of the cache for u and for the Dual version of u, allowing use of pre-cached vectors with forward-mode automatic differentiation. Note that dualcache, due to its design, is only compatible with arrays that contain concretely typed elements.
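For illustration, a minimal sketch of the dualcache/get_tmp pattern from that README (the residual function itself is made up; names follow the PreallocationTools API of the time):

using PreallocationTools, ForwardDiff

# one buffer that transparently serves both Float64 and ForwardDiff.Dual calls
tmp_cache = dualcache(zeros(3))

function residual(x, cache)
    tmp = get_tmp(cache, x)   # returns a buffer whose eltype matches x
    tmp .= 2 .* x             # in-place intermediate work, no fresh allocation
    return sum(abs2, tmp)
end

residual(ones(3), tmp_cache)                                    # plain call
g = ForwardDiff.gradient(x -> residual(x, tmp_cache), ones(3))  # AD call reuses the Dual buffer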
> Currently from Optim.jl, depending on the value passed for autodiff.
Right ... so I'd expect cf.cache to be all concrete types by then. Sounds like the issue is a bit trickier.
> [need] DualCache object
Oops, I'm not much help here yet; I need to read up on this. I'd suggest you keep going on this track and I'll work on other data-upload work. I'll come back here after AMP41 and IIF1010, and since 1010 is also about AD it would be a good time for me to loop back.
I'm completely refactoring to use a product manifold of power manifolds. I think that way the same data structure can support both Manopt.jl and Optim.jl.
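One possible shape of that structure, sketched with made-up sizes (the power representation choice is illustrative):

using Manifolds

# hypothetical: 5 Pose2 variables in one power manifold, 3 Point2 landmarks in
# another, combined into a single product manifold for the whole graph state
M_poses     = PowerManifold(SpecialEuclidean(2), NestedReplacingPowerRepresentation(), 5)
M_landmarks = PowerManifold(Euclidean(2), NestedReplacingPowerRepresentation(), 3)
M_state     = ProductManifold(M_poses, M_landmarks)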
> Not sure how that is going to influence the AD story which you ran into last time.
> Yep, that is the biggest problem. The type cannot be hardcoded and is actually not consistent within one solve. I can try a sort of operational-memory “hot-swap” cache that won’t be type stable, but we can annotate the type. Or I’d also like to try using isbits types such as static arrays.
Yes, SArray is both AD-friendly and fast -- I'd suggest going that way instead of caches. One note about AD: for product manifolds, Manifolds.jl supports both ProductRepr and ArrayPartition, and the latter is more reverse-AD friendly. There are almost no drawbacks to ArrayPartition vs ProductRepr.
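For example, an SE(2) point as an ArrayPartition of isbits static arrays might look like this (a sketch; ArrayPartition comes from RecursiveArrayTools):

using Manifolds, StaticArrays, RecursiveArrayTools

M = SpecialEuclidean(2)
# translation part and rotation matrix, both stack-allocated static arrays
p = ArrayPartition(SA[0.0, 0.0], SA[1.0 0.0; 0.0 1.0])
q = ArrayPartition(SA[1.0, 2.0], SA[0.0 -1.0; 1.0 0.0])
r = compose(M, p, q)   # group composition stays fully static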
Also note that, for power manifolds, Manifolds.jl supports https://github.com/JuliaArrays/HybridArrays.jl.
BTW, there are some issues with AD through exp and log on (among others) SO(n). I have it largely solved for ForwardDiff.jl and ChainRules.jl, but I think this is something worth discussing.
xref https://github.com/JuliaRobotics/IncrementalInference.jl/issues/1372
A rough benchmark of where we are.
fg = initfg()
fg = generateGraph_Beehive!(1000; dfg=fg, graphinit=false, locality=0.5);
r=IIF.solveGraphParametric(fg; autodiff=:finite, computeCovariance=false);
# Seconds run: 164 (vs limit 100)
# Iterations: 1
# f(x) calls: 8
# ∇f(x) calls: 8
r = IIF.solveGraphParametric(fg; ...) # Nelder-Mead
# Seconds run: 109 (vs limit 100)
# Iterations: 1226
# f(x) calls: 41265
Hi @Affie, that's awesome, thank you!
Update: new Parametric RLM performance on Beehive1000
fg = initfg()
fg = generateGraph_Beehive!(1000; dfg=fg, graphinit=false, locality=0.5);
IIF.autoinitParametricManopt!(fg, [ls(fg, r"x"); ls(fg, r"l")])
#Progress: 100%|███████████████████████| Time: 0:00:15
v,r = IIF.solve_RLM(fg; damping_term_min=1e-18, expect_zero_residual=true);
# takes less than 1 second from an initialized fg with noise added, but cannot solve from an uninitialized one.
# converges in 5 iterations
For the Pose3 graph in the first post (6 variables, 7 factors):
@time v,r = IIF.solve_RLM(fg; is_sparse=true, debug, stopping_criterion, damping_term_min=1e-18, expect_zero_residual=true);
# 5.567798 seconds (6.62 M allocations: 453.195 MiB, 6.78% gc time, 99.79% compilation time)
# 0.009137 seconds (24.01 k allocations: 2.460 MiB)
# 4 iterations
In my experiments, time spent in RLM (not counting compilation) was dominated by the Cholesky decomposition for the linear subproblem. Is it the same for you? It's possible that in certain situations the linear subproblem could be solved better than in the existing implementation, or there could be some other optimizations.
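For reference, the linear subproblem in question is the damped normal-equations solve; a hedged sketch with made-up sizes:

using SparseArrays, LinearAlgebra

# LM step: solve (JᵀJ + λI) Δx = -Jᵀr with a sparse Cholesky factorization (CHOLMOD)
J  = sprandn(100, 60, 0.3)   # illustrative sparse Jacobian
r  = randn(100)              # illustrative residual
λ  = 1e-3                    # damping term
Δx = cholesky(Symmetric(J' * J + λ * I)) \ (-(J' * r))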
> In my experiments, time spent in RLM (not counting compilation) was dominated by the Cholesky decomposition for the linear subproblem. Is it the same for you? It's possible that in certain situations the linear subproblem could be solved better than in the existing implementation, or there could be some other optimizations.
Currently a lot of time is spent in the factor cost function itself; there are still a lot of allocations that shouldn't be there. I haven't been able to figure it out yet; I'll post a flame graph. I did notice that for bigger problems the time spent shifted more toward the Cholesky decomposition.
Sure, calculating the cost function and its Jacobian can take a lot of time too.
Here is the @profview flame graph of a larger graph; as you can see, most of the time is spent in the cost function to calculate the Jacobian. The Jacobian has size (59148, 58182). We were unable to solve problems of this size previously, so this is already a big improvement.
Here is the allocations flame graph for a smaller problem, from @profview_allocs.
We upgraded to static arrays and ArrayPartition, but I think we are still hitting generic implementations in Manifolds.jl. I did try to implement specialized dispatches for static arrays but haven't had success yet.
Yes, it looks like we are missing some specialized implementations in Manifolds.jl. I can take a look and see what could be improved if you give me code to reproduce the benchmarks. There are many snippets in this thread and I'm not sure which one to try.
Also, it's cool that it's already an improvement. I didn't even implement RLM with robotics in mind, and this RLM doesn't use many of the tricks developed for the Euclidean case :smile: .
> Yes, it looks like we are missing some specialized implementations in Manifolds.jl. I can take a look and see what could be improved if you give me code to reproduce the benchmarks. There are many snippets in this thread and I'm not sure which one to try.
Thanks for offering. The best results currently need branches in RoME and IIF; give me a few days to get them merged and I can put an example together that will work on the master branches.
I quickly made this; it is the minimum needed to evaluate a single cost function (factor).
using Manifolds
using StaticArrays
using DistributedFactorGraphs
using IncrementalInference
using RoME
function runFactorLots(factor::T, N = 100000) where T <: AbstractRelative
    cf = CalcFactor(factor, 0, nothing, false, nothing, (), 0, getManifold(factor))
    M = getManifold(T)
    # point from where the measurement is taken ∈ M
    p = getPointIdentity(M)
    # point to where the measurement is taken ∈ M
    q = convert(typeof(p), exp(M, p, hat(M, p, mean(factor.Z))))
    # q = exp(M, p, hat(M, p, mean(factor.Z)))
    res = zeros(6)  # placeholder; overwritten by the factor call
    for _ in 1:N
        # measurement vector in TpM - hat returns a mutable array
        m = convert(typeof(p), hat(M, p, rand(factor.Z)))
        # m = hat(M, p, rand(factor.Z))
        # run the factor residual
        res = cf(m, p, q)
    end
    return res
end
## Factor types to test:
# Factor for SE2 to SE2 measurement
f = Pose2Pose2(MvNormal(SA[1.,0,0], SA[0.1,0.1,0.01]))
# Factor for SE3 to SE3 measurement
f = Pose3Pose3(MvNormal(SA[1.,0,0,0,0,0], SA[0.1,0.1,0.1,0.01,0.01,0.01]))
##
@time runFactorLots(f)
@profview runFactorLots(f)
@profview_allocs runFactorLots(f)
Here are some basic special-cases that make it much faster:
function Manifolds.exp(M::SpecialEuclidean, p::ArrayPartition, X::ArrayPartition)
    return ArrayPartition(
        exp(M.manifold.manifolds[1].manifold, p.x[1], X.x[1]),
        exp(M.manifold.manifolds[2].manifold, p.x[2], X.x[2]),
    )
end
function Manifolds.log(M::SpecialEuclidean, p::ArrayPartition, q::ArrayPartition)
    return ArrayPartition(
        log(M.manifold.manifolds[1].manifold, p.x[1], q.x[1]),
        log(M.manifold.manifolds[2].manifold, p.x[2], q.x[2]),
    )
end
function Manifolds.vee(M::SpecialEuclidean, p::ArrayPartition, X::ArrayPartition)
    return vcat(
        vee(M.manifold.manifolds[1].manifold, p.x[1], X.x[1]),
        vee(M.manifold.manifolds[2].manifold, p.x[2], X.x[2]),
    )
end
function Manifolds.compose(M::SpecialEuclidean, p::ArrayPartition, q::ArrayPartition)
    return ArrayPartition(p.x[2] * q.x[1] + p.x[1], p.x[2] * q.x[2])
end
@kellertuer would you be fine with adding them to Manifolds.jl?
vee can be even faster if special-cased for SE(2) and SE(3).
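For instance, a hand-rolled SE(2) vee could skip the generic machinery entirely (a sketch; it assumes the ArrayPartition tangent layout used above, translation part first, then the 2x2 skew-symmetric matrix):

using StaticArrays, RecursiveArrayTools

# SE(2) tangent: X.x[1] is the translation part, X.x[2] = [0 -θ; θ 0],
# so the coordinates are simply (v₁, v₂, θ)
se2_vee(X::ArrayPartition) = SA[X.x[1][1], X.x[1][2], X.x[2][2, 1]]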
Sure, we can add that to Manifolds.jl; it looks good to me performance-wise.
@mateuszbaran, thanks very much, it already makes a big difference. Just a note: log might be wrong (I think it misses the semidirect part and only does the product manifold). If I run the optimization with the log dispatch above included, I don't get convergence.
You're welcome :slightly_smiling_face: . I made a small mistake in the log written above; I've already fixed it in my PR and in an edit to my comment. This doesn't have any semidirect part because it is the log corresponding to the product metric, not the Lie group logarithm (see log_lie).
I would also still consider the above a great retraction for the Lie one (a first-order approximation, that is), so your failure to converge may be due to other parameters.
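To make the metric-log vs Lie-group-log distinction concrete, a small sketch (assuming the Manifolds.jl API used elsewhere in this thread):

using Manifolds

M = SpecialEuclidean(2)
p = identity_element(M)
q = exp(M, p, hat(M, p, [1.0, 0.0, 0.3]))

X_metric = log(M, p, q)    # product-metric log: translation and rotation handled separately
X_group  = log_lie(M, q)   # Lie group logarithm: couples translation and rotation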