Question about possible assembly over predefined indices
Hi, I'm trying to write a parallel reduction but haven't seen it in the docs and cannot make it work.
The idea is the following (MWE):
function assemble!(K, F, elements)
@floop for i in eachindex(elements)
Fe, Ke = compute_contribution(elements[i])
@reduce() do (Kl = zeros(size(K)); Ke), (Fl = zeros(size(F)); Fe)
Fl[elements[i].vec_idxs] .+= Fe
Kl[elements[i].mat_idxs] .+= Ke
end
end
K .= Kl
F .= Fl
end
Note that this is a reduce operation as several elements may share the same indices vec_idxs and mat_idxs.
Is this kind of operation supported in FLoops? If so, could you point me how to make it work (even better without allocating intermediate Fl and Kl arrays?
Thanks!
It's a bit unclear what assemble! actually does, without knowing what elements, Fe, and Ke are. But, I'm guessing that something like the following could work
function assemble!(K, F, elements)
@floop for i in eachindex(elements)
Fe, Ke = compute_contribution(elements[i])
Klinc = (Ke, elements[i].vec_idxs)
Flinc = (Fe, elements[i].mat_idxs)
@reduce() do (Kl = zeros(size(K)); Klinc), (Fl = zeros(size(F)); Flinc)
if Flinc isa Tuple
(Fe, mat_idxs) = Flinc
Fl[vec_idxs] .+= Fe
else
Fl .+= Flinc
end
if Klinc isa Tuple
(Ke, mat_idxs) = Klinc
Kl[mat_idxs] .+= Ke
else
Kl .+= Klinc
end
end
end
K .= Kl
F .= Fl
end
(This pattern would be easier to write once #117 lands)
Sorry for not being more explicit but you guessed it right, and the above code works well. However I benchmarked it against the usual approach with a lock, and I don't see any benefit in using FLoops:
using StaticArrays
using FLoops
using BenchmarkTools
const Vec3{T} = SVector{3, T}
N_els = 1000
N_redvec = 500
struct Element
vec_idxs::Vec3{Int}
val::Vector{Vec3{Float64}}
end
elements = [Element(rand(1:N_redvec, 3), [rand(3) for _ in 1:1000]) for _ in 1:N_els]
F = zeros(N_redvec)
# Dummy compute function
compute_contribution(e) = sum(tan.(v) for v in e.val)
function assemble_floops!(F, elements)
@floop for i in eachindex(elements)
Fe = compute_contribution(elements[i])
Flinc = (Fe, elements[i].vec_idxs)
@reduce() do (Fl = zeros(size(F)); Flinc)
if Flinc isa Tuple
(Fe, vec_idxs) = Flinc
Fl[vec_idxs] .+= Fe
else
Fl .+= Flinc
end
end
end
F .= Fl
end
@btime assemble_floops!($F, $elements)
function assemble_threads!(F, elements)
lck = Threads.SpinLock()
Threads.@threads for i in eachindex(elements)
Fe = compute_contribution(elements[i])
lock(lck) do
F[elements[i].vec_idxs] += Fe
end
end
end
@btime assemble_threads!($F, $elements)
3.566 ms (76 allocations: 37.52 KiB)
3.567 ms (42 allocations: 4.89 KiB)
Am I doing something wrong or is FLoops not really expected to help in this case?
Thanks!