dense-matrix mul!(C, A, B, alpha, beta) allocates
On Julia 1.8.1 I noticed the following:
using LinearAlgebra

function manymul(N, C, A, B, alpha, beta)
    for i in 1:N
        mul!(C, A, B, alpha, beta)
        #BLAS.gemm!('N', 'N', alpha, A, B, beta, C) # eliminates allocations
        C, A = A, C
    end
    C
end

D = 16
A = randn(D, D)
B = randn(D, D)
C = zero(A)
N = 100000
@time manymul(N, C, A, B, 1.0, 0.5) # allocates N times (32 bytes each) with mul!(), 0 times with gemm!()
Cthulhu suggests this is due to runtime dispatch related to MulAddMul(). This can impact the performance of e.g. ODE solving involving mul!() for small matrix sizes. The example above takes around 10% longer with mul!() than with gemm!(), according to BenchmarkTools (single-threaded BLAS).
Is this known/intended?
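For reproducing, a minimal check that avoids @time noise is to measure with @allocated from inside small wrapper functions (calls at global scope with non-const globals would add dispatch allocations of their own). This is a sketch; the 32-byte figure is the one from the report above and depends on the Julia version:

```julia
using LinearAlgebra
using LinearAlgebra: BLAS

# Function barriers so non-const globals don't add measurement noise.
alloc_mul!(C, A, B) = @allocated mul!(C, A, B, 1.0, 0.5)
alloc_gemm!(C, A, B) = @allocated BLAS.gemm!('N', 'N', 1.0, A, B, 0.5, C)

A = randn(16, 16); B = randn(16, 16); C = zero(A)
alloc_mul!(C, A, B); alloc_gemm!(C, A, B)  # warm up: first calls include compilation
println(alloc_mul!(C, A, B))   # 32 bytes per call on 1.8.1 per the report; 0 where fixed
println(alloc_gemm!(C, A, B))  # expected 0: the direct BLAS call does not allocate
```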
My versioninfo():
Julia Version 1.8.1
Commit afb6c60d69a (2022-09-06 15:09 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin21.4.0)
CPU: 12 × Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-13.0.1 (ORCJIT, skylake)
Threads: 1 on 6 virtual cores
Environment:
JULIA_EDITOR = code
JULIA_NUM_THREADS =
JULIA_PKG_USE_CLI_GIT = true
Isn't that because you're accessing non-const globals? Does it change if you make those arrays const? (I'm away from my computer now, so I can't try it myself.)
[EDIT: It probably allocates more often than not, but not always, so it's not deterministic; if I recall correctly, though, it's not data-dependent. BLAS threads? See below.]
No, it allocates 3*N [EDIT: 3*N was likely true at that time; see below, sometimes it's 0] times even if I put it into a function. I don't know if this is intended, nor whether it's promised not to allocate. I did comment out e.g. this line, which allocates extra: C, A = A, C
I think there's the problem:
https://github.com/JuliaLang/julia/blob/99225ab1d11925b8e904b0b12f1ee8b0f6946660/stdlib/LinearAlgebra/src/matmul.jl#L639
It's strange; it's defined like this:
@inline function mul!(C::StridedMatrix{T}, A::StridedVecOrMat{T}, B::StridedVecOrMat{T},
                      alpha::Number, beta::Number) where {T<:BlasFloat}
    return gemm_wrapper!(C, 'N', 'N', A, B, MulAddMul(alpha, beta))
end
but I ruled out MulAddMul (by running/timing it separately) and gemm_wrapper! as the problem (though maybe not BlasFloat: I cloned mul! to mul2! to test, but substituted Float64):
@time gemm_wrapper!(C, 'N', 'N', A, B, MulAddMul(1.0, 1.5))
0.000015 seconds
I did (and maybe still do) suspect threading, because I actually DID get 3*N allocations (on 1.7.3, and likely I remember right for 1.8.1 too), but then that went away, down to 0 [or N] allocations, and then I got N allocations.
I'm pasting verbatim, to show I'm not wrong (the function definitions below are just redefinitions):
julia> @time manymul(N, C, A, B, 1.0, 0.5)
0.000024 seconds (3 allocations: 64 bytes)
julia> N
1
julia> function manymul(N, C, A, B, alpha, beta)
for i in 1:N
#mul!(C, A, B, alpha, beta)
BLAS.gemm!('N', 'N', alpha, A, B, beta, C) # eliminates allocations
# C, A = A, C
end
nothing # C
end
manymul (generic function with 1 method)
julia> @time manymul(N, C, A, B, 1.0, 0.5)
0.018076 seconds (14.48 k allocations: 848.309 KiB, 99.68% compilation time)
julia> @time manymul(N, C, A, B, 1.0, 0.5)
0.000022 seconds
julia> function manymul(N, C, A, B, alpha, beta)
for i in 1:N
mul!(C, A, B, alpha, beta)
#BLAS.gemm!('N', 'N', alpha, A, B, beta, C) # eliminates allocations
# C, A = A, C
end
nothing # C
end
manymul (generic function with 1 method)
julia> @time manymul(N, C, A, B, 1.0, 0.5)
0.032734 seconds (14.71 k allocations: 852.665 KiB, 99.85% compilation time)
julia> @time manymul(N, C, A, B, 1.0, 0.5)
0.000021 seconds (1 allocation: 32 bytes)
I did run again with 1.7.3 with:
$ julia -t1
and got 3 allocations. Maybe I ruled out regular threads, but not BLAS threads.
running on 1.8.1 (trying to rule out BLAS threads, but I'm not sure OMP_NUM_THREADS does anything in my version):
$ OMP_NUM_THREADS=1 julia -t1
julia> @time for i in 1:1000 manymul(N, C, A, B, 1.0, 0.5) end
0.001940 seconds (1000 allocations: 31.250 KiB)
running the same in 1.7.3 I got:
julia> @time for i in 1:1000 manymul(N, C, A, B, 1.0, 0.5) end
0.002373 seconds (3.00 k allocations: 62.500 KiB)
Three times the allocations (likely not always 3*N, as I wrote before), but not three times the bytes, if that's helpful for anyone to know: only twice the KiB allocated.
I think I ruled out (BLAS) threads as an issue below:
julia> versioninfo()
Julia Version 1.8.1
Commit afb6c60d69a (2022-09-06 15:09 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 16 × Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-13.0.1 (ORCJIT, broadwell)
Threads: 1 on 16 virtual cores
julia> @time @time @time mul!(C, A, B, 1.0, 1.5)
0.000016 seconds (1 allocation: 32 bytes)
0.004511 seconds (45 allocations: 2.570 KiB, 97.40% compilation time)
0.004548 seconds (72 allocations: 4.961 KiB, 96.60% compilation time)
julia> BLAS.set_num_threads(1) # still allocates after this, and this should be effective; I can pass in 0 or even -1, though...
julia> @time @time @time mul!(C, A, B, 1.0, 1.5)
0.000017 seconds (1 allocation: 32 bytes)
0.000110 seconds (26 allocations: 1.477 KiB)
0.000135 seconds (50 allocations: 3.148 KiB)
In that terminal, however, I was running julia, not julia -t1.
I added version info to the report above. On my system, the allocation behavior (post-compilation) seems to be deterministic, and it must be something in mul!() that is not in gemm!(), as the gemm!() version does not allocate. @benchmark also agrees that the mul!() version allocates, whereas the gemm!() version does not.
This version also does not allocate:
function manymul(N, out, op, in, alpha, beta)
    _add = LinearAlgebra.MulAddMul{false, false, typeof(alpha), typeof(beta)}(alpha, beta)
    for _ in 1:N
        LinearAlgebra.gemm_wrapper!(out, 'N', 'N', op, in, _add)
        out, in = in, out
    end
    out
end
The difference is that a fully concrete MulAddMul is used. If constructed via MulAddMul(alpha, beta) (as in mul!), the resulting struct type is not fully inferable. It seems like this isn't getting fixed by the compiler in mul!().
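The inference gap is visible directly; a sketch using the internal LinearAlgebra.MulAddMul (so details may differ between Julia versions): the constructor picks its type parameters from the values of alpha and beta, while writing the parameters out by hand is fully concrete.

```julia
using LinearAlgebra
using LinearAlgebra: MulAddMul  # internal type; not part of the public API

# The constructor selects the {ais1, bis0} type parameters from the *values*
# isone(alpha) and iszero(beta), so type-only inference cannot pin them down:
for T in Base.return_types(MulAddMul, (Float64, Float64))
    println(T, " is concrete: ", isconcretetype(T))
end

# Spelling the parameters out gives a fully concrete struct type:
println(isconcretetype(MulAddMul{false, false, Float64, Float64}))  # true
```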
Even if constant propagation were working to eliminate this here, I guess this would not fix the non-constant case, where e.g. alpha and beta are values taken from a vector of Float64? In my actual code, this is what happens, so I care about the non-constant case. I would call gemm more directly, but then I lose all the transpose/adjoint handling etc., which I really want to keep!
Using _add = LinearAlgebra.MulAddMul(alpha, beta) instead (still outside the loop) leads to 1 allocation. Putting that inside the loop gets us back to N allocations. It seems pretty clear that MulAddMul() is the issue here.
Now that I'm back at the computer, I'm more and more convinced the only thing here is that you're accessing non-const globals:
julia> using LinearAlgebra
julia> function manymul(N, C, A, B, alpha, beta)
for i in 1:N
mul!(C, A, B, alpha, beta)
C, A = A, C
end
C
end
manymul (generic function with 1 method)
julia> const D = 16;
julia> const A = randn(D, D);
julia> const B = randn(D, D);
julia> const C = zero(A);
julia> const N = 100000;
julia> @time manymul(N, C, A, B, 1.0, 0.5);
0.075428 seconds
So I don't see what's the issue here.
Yes, const fixed it for me too, but shouldn't this work as well? It doesn't:
function test()
    D = 16
    A = randn(D, D)
    B = randn(D, D)
    C = zero(A)
    N = 100000
    manymul(N, C, A, B, 1.0, 0.5)
end
julia> @time test()
0.124667 seconds (100.00 k allocations: 3.058 MiB)
Why it doesn't work with local variables (unlike with const globals) might be a clue; shouldn't it behave the same?! The differing allocation counts for the same code also puzzle me.
Interesting! I also see 0 allocations in @giordano's example. However:
D = 16
const A = randn(D, D)
const B = randn(D, D)
const C = zero(A)
const N = 100000
const alphas = [1.0]
@time manymul(N, C, A, B, alphas[1], 0.5) #allocates N times
Considering also @PallHaraldsson's latest evidence, it seems we rely on constant propagation of alpha and beta to avoid allocation, and it is quite fragile.
Personally, I don't think we should be relying on constant prop. here, at least not in the BLAS case - I don't think reading alpha or beta from a vector is a particularly crazy thing to do - but also the cases where alpha and beta really are constants seem to be fragile in ways I wouldn't have expected. @PallHaraldsson's example is particularly odd!
Actually, it's also very surprising to me that having non-const globals for A, B, and C makes a difference vs. @giordano's example. I might have expected some constant dispatch overhead when calling manymul() in this case, but after that I would have expected the behavior to be the same as with const globals. Instead we see overhead that is linear in N.
we rely on constant propagation of alpha and beta to avoid allocation, and it is quite fragile.
You mean non-deterministic. I would have thought the compiler compiles the same way each time for same inputs, is that not to be relied upon, and why might that be?
You mean non-deterministic. I would have thought the compiler compiles the same way each time for same inputs, is that not to be relied upon, and why might that be?
I didn't observe any non-determinism. Constant propagation depends on the compiler figuring out that certain values are constant and can be converted to some kind of Const type. Functions called with different types involve different methods. I do think it is fragile, though, since it seems to be easy to prevent the compiler from doing this. In your example, I can get const. prop. to work by adding an @inline to the manymul() callsite.
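For reference, the call-site @inline workaround mentioned above might look like this (call-site @inline requires Julia 1.8 or later; manymul is as defined earlier in the thread, and driver is a hypothetical name for the enclosing function):

```julia
using LinearAlgebra

function manymul(N, C, A, B, alpha, beta)
    for _ in 1:N
        mul!(C, A, B, alpha, beta)
        C, A = A, C
    end
    C
end

function driver()
    A = randn(16, 16); B = randn(16, 16); C = zero(A)
    # Call-site @inline forces manymul to be inlined here, which can let
    # constant propagation reach MulAddMul for the literal 1.0 and 0.5:
    @inline manymul(100, C, A, B, 1.0, 0.5)
end
driver()
```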
I mean non-deterministic (allocations, then none, then back on), as I showed here: https://github.com/JuliaLang/julia/issues/46865#issuecomment-1255637879
Yes, I did "compile", but the same code in between. I doubt I can help more; I started debugging this thinking it might be simple, but it seems above my (zero) pay-grade. Not that I don't want to help.
Huh, that is indeed puzzling.
Btw, I took a peek at matmul.jl and friends in LinearAlgebra: MulAddMul is everywhere! Even if constant prop were working perfectly, why would we only want non-allocating versions of all these functions for constant alpha and beta? If I am using a 5-arg mul!, I am trying to squeeze out as much performance as I can! I agree with @Micket's comments on Slack: seems like a questionable choice.
At least in gemm_wrapper!, it looks like one could quite easily push MulAddMul down the call stack so that it is not needed for the BLAS case at least (where it really is entirely superfluous).
Just want to add that these allocations, although small, can be disastrous for scaling with threaded parallelism due to GC contention. And yes, we have seen this in real code.
It's also easy to make a simple example that shows this:
# start julia with nonzero thread count!
function manymul_threads(N, Cs, A, B, alpha, beta)
    Threads.@threads for C in Cs
        manymul(N ÷ length(Cs), C, A, B, alpha, beta)
    end
    Cs
end
BLAS.set_num_threads(1)
Cs = [zero(A) for _ in 1:10]
@time manymul(N, C, A, B, 1.0, 0.5) # as in previous examples, for comparison
@time manymul_threads(N, Cs, A, B, 1.0, 0.5) # same number of allocations as above, but *slower* at 4 threads
using SparseArrays
Asp = sparse(A)
@time manymul(N, C, Asp, B, 1.0, 0.5) # slower than dense, but *does not allocate*
@time manymul_threads(N, Cs, Asp, B, 1.0, 0.5) # actually faster than the single-threaded version
Oh man, this issue has history. https://github.com/JuliaLang/julia/issues/34013, https://github.com/JuliaLang/julia/pull/29634 In a previous version, the BLAS path was already free of MulAddMul, but this apparently broke constant-prop. for the small-matrix specializations. Seems it's kind of a "whac-a-mole" situation....
Worth noting that the 2x2 and 3x3 versions of the tests above also allocate on 1.8.1 (any propagated constants are not making it as far as MulAddMul(), I guess).
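The small-size claim is easy to probe; a sketch that checks a few sizes (on the BlasFloat path, 2x2 and 3x3 reportedly hit LinearAlgebra's hand-written kernels while 16x16 goes through BLAS; the per-call byte counts depend on the Julia version):

```julia
using LinearAlgebra

# Function barrier so the measurement itself doesn't allocate via globals.
allocbytes(C, A, B) = @allocated mul!(C, A, B, 1.0, 0.5)

for D in (2, 3, 16)
    A = randn(D, D); B = randn(D, D); C = zero(A)
    allocbytes(C, A, B)  # warm up: first call includes compilation
    println(D, "x", D, " mul! allocates ", allocbytes(C, A, B), " bytes per call")
end
```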
I wrote a macro that pulls the value-dependent branching for MulAddMul() outside the enclosing function call. I'm sure this is not the prettiest way to do it:
macro mambranch(expr)
    expr.head == :call || throw(ArgumentError("Can only handle function calls."))
    for (i, e) in enumerate(expr.args)
        e isa Expr || continue
        if e.head == :call && e.args[1] == :(LinearAlgebra.MulAddMul)
            local asym = e.args[2]
            local bsym = e.args[3]
            local e_sub11 = copy(expr)
            e_sub11.args[i] = :(LinearAlgebra.MulAddMul{true, true, typeof($asym), typeof($bsym)}($asym, $bsym))
            local e_sub10 = copy(expr)
            e_sub10.args[i] = :(LinearAlgebra.MulAddMul{true, false, typeof($asym), typeof($bsym)}($asym, $bsym))
            local e_sub01 = copy(expr)
            e_sub01.args[i] = :(LinearAlgebra.MulAddMul{false, true, typeof($asym), typeof($bsym)}($asym, $bsym))
            local e_sub00 = copy(expr)
            e_sub00.args[i] = :(LinearAlgebra.MulAddMul{false, false, typeof($asym), typeof($bsym)}($asym, $bsym))
            local e_out = quote
                if isone($asym) && iszero($bsym)
                    $e_sub11
                elseif isone($asym)
                    $e_sub10
                elseif iszero($bsym)
                    $e_sub01
                else
                    $e_sub00
                end
            end
            return esc(e_out)
        end
    end
    throw(ArgumentError("No valid MulAddMul expression found."))
end
With the macro, you can write a fully inferable manymul like this:
function manymul(N, out, op, in, alpha, beta)
    for _ in 1:N
        @mambranch LinearAlgebra.gemm_wrapper!(out, 'N', 'N', op, in, LinearAlgebra.MulAddMul(alpha, beta))
        out, in = in, out
    end
    out
end
It gets transformed by the macro into approximately this:
function manymul(N, out, op, in, alpha, beta)
    for _ in 1:N
        ais1, bis0 = isone(alpha), iszero(beta)
        if ais1 && bis0
            LinearAlgebra.gemm_wrapper!(out, 'N', 'N', op, in, LinearAlgebra.MulAddMul{true, true, typeof(alpha), typeof(beta)}(alpha, beta))
        elseif ais1
            LinearAlgebra.gemm_wrapper!(out, 'N', 'N', op, in, LinearAlgebra.MulAddMul{true, false, typeof(alpha), typeof(beta)}(alpha, beta))
        elseif bis0
            LinearAlgebra.gemm_wrapper!(out, 'N', 'N', op, in, LinearAlgebra.MulAddMul{false, true, typeof(alpha), typeof(beta)}(alpha, beta))
        else
            LinearAlgebra.gemm_wrapper!(out, 'N', 'N', op, in, LinearAlgebra.MulAddMul{false, false, typeof(alpha), typeof(beta)}(alpha, beta))
        end
        out, in = in, out
    end
    out
end
This does not allocate for either const or variable alpha/beta! Adding something like this to mul!() in matmul.jl seems a fair way to remove the const-prop fragility and make the non-const cases non-allocating as well. I don't see any disadvantages: this branching won't happen at runtime in the const-prop case, and it was happening anyway in the MulAddMul() constructor otherwise. Now it just doesn't lead to runtime dispatch.
Edit: See #47088 for a PR.
There ought to be a simpler solution that exploits union-splitting? I couldn't seem to make that happen, though.
Any update on this issue? We've been using Static.jl to get around this for now.
Should be fixed in 1.12! https://github.com/JuliaLang/julia/pull/52439