FLoops.jl Question about possible assembly over predefined indices

Hi, I'm trying to write a parallel reduction but haven't seen it in the docs and cannot make it work.

The idea is the following (MWE):

function assemble!(K, F, elements)

    @floop for i in eachindex(elements)

        Fe, Ke = compute_contribution(elements[i])

        @reduce() do (Kl = zeros(size(K)); Ke), (Fl = zeros(size(F)); Fe)
            Fl[elements[i].vec_idxs] .+= Fe
            Kl[elements[i].mat_idxs] .+= Ke
        end

    end

    K .= Kl
    F .= Fl

end

Note that this is a reduce operation as several elements may share the same indices vec_idxs and mat_idxs. Is this kind of operation supported in FLoops? If so, could you point me how to make it work (even better without allocating intermediate Fl and Kl arrays?

Thanks!

Feb 25 '22 17:02 touste

It's a bit unclear what assemble! actually does, without knowing what elements, Fe, and Ke are. But, I'm guessing that something like the following could work

function assemble!(K, F, elements)

    @floop for i in eachindex(elements)

        Fe, Ke = compute_contribution(elements[i])

        Klinc = (Ke, elements[i].vec_idxs)
        Flinc = (Fe, elements[i].mat_idxs)
        @reduce() do (Kl = zeros(size(K)); Klinc), (Fl = zeros(size(F)); Flinc)
            if Flinc isa Tuple
                (Fe, mat_idxs) = Flinc
                Fl[vec_idxs] .+= Fe
            else
                Fl .+= Flinc
            end
            if Klinc isa Tuple
                (Ke, mat_idxs) = Klinc
                Kl[mat_idxs] .+= Ke
            else
                Kl .+= Klinc
            end
        end
    end

    K .= Kl
    F .= Fl

end

(This pattern would be easier to write once #117 lands)

Feb 26 '22 03:02 tkf

Sorry for not being more explicit but you guessed it right, and the above code works well. However I benchmarked it against the usual approach with a lock, and I don't see any benefit in using FLoops:

using StaticArrays
using FLoops
using BenchmarkTools

const Vec3{T} = SVector{3, T}

N_els = 1000
N_redvec = 500

struct Element
    vec_idxs::Vec3{Int}
    val::Vector{Vec3{Float64}}
end

elements = [Element(rand(1:N_redvec, 3), [rand(3) for _ in 1:1000]) for _ in 1:N_els]

F = zeros(N_redvec)

# Dummy compute function
compute_contribution(e) = sum(tan.(v) for v in e.val)


function assemble_floops!(F, elements)

    @floop for i in eachindex(elements)

        Fe = compute_contribution(elements[i])

        Flinc = (Fe, elements[i].vec_idxs)
        @reduce() do (Fl = zeros(size(F)); Flinc)
            if Flinc isa Tuple
                (Fe, vec_idxs) = Flinc
                Fl[vec_idxs] .+= Fe
            else
                Fl .+= Flinc
            end
        end
    end

    F .= Fl

end


@btime assemble_floops!($F, $elements)





function assemble_threads!(F, elements)

    lck = Threads.SpinLock()

    Threads.@threads for i in eachindex(elements)
        Fe = compute_contribution(elements[i])
        lock(lck) do 
            F[elements[i].vec_idxs] += Fe
        end 
    end

end

@btime assemble_threads!($F, $elements)

  3.566 ms (76 allocations: 37.52 KiB)
  3.567 ms (42 allocations: 4.89 KiB)

Am I doing something wrong or is FLoops not really expected to help in this case?

Thanks!

Feb 26 '22 18:02 touste