LoopVectorization.jl icon indicating copy to clipboard operation
LoopVectorization.jl copied to clipboard

No method matching `_vstore_unroll!` on ARM

Open benegee opened this issue 10 months ago • 4 comments

We are using @turbo extensively in Trixi.jl. Recently, we have started running our code on ARM-based machines and encountered the following error:

LoadError: MethodError: no method matching _vstore_unroll!(::LayoutPointers.StridedPointer{Float64, 4, 1, 0, (1, 2, 3, 4), Tuple{Static.StaticInt{8}, Static.StaticInt{8}, Static.StaticInt{40}, Static.StaticInt{200}}, NTuple{4, Static.StaticInt{0}}}, ::VectorizationBase.VecUnroll{4, 1, Float64, VectorizationBase.VecUnroll{4, 1, Float64, Float64}}, ::VectorizationBase.Unroll{2, 1, 5, 1, 1, 0x0000000000000000, 1, VectorizationBase.Unroll{4, 1, 5, 1, 1, 0x0000000000000000, 1, Static.StaticInt{0}}}, ::Static.False, ::Static.False, ::Static.False, ::Static.StaticInt{16}, ::Static.StaticInt{8})
  
  Closest candidates are:
    _vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::M, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS, M}
     @ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2552
    _vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T, <:VectorizationBase.VecUnroll{<:Any, W, T, VectorizationBase.Vec{W, T}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::Static.StaticInt{SVUS}) where {W, T, A<:Static.StaticBool, S<:Static.StaticBool, NT<:Static.StaticBool, RS, D, C, SVUS, UU<:(VectorizationBase.Unroll{AUO, FO, NO, AV, W, MO, X, VectorizationBase.Unroll{AUI, FI, NI, AV, W, MI, X, I}} where {AV, X, I, AUO, FO, NO, MO, AUI, FI, NI, MI})}
     @ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2575
    _vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS}
     @ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2531
    ...

This was caused by one of our helper functions, which basically does a specialized matrix-vector multiplication. We were able to reproduce this issue with this example:

MWE
using StaticArrays
using StrideArrays: PtrArray, StaticInt
using LoopVectorization: @turbo

# Reproducer kernel: for every index (v, i, j, k) of `data_out`, store the sum over
# `kk` of `matrix[k, kk] * tmp[v, i, j, kk]` into `data_out[v, i, j, k]`.
# `data_out` is indexed with four indices and `matrix` with two. Returns `nothing`.
function multiply_dimensionwise!(data_out, matrix)

    # Zero-filled scratch input whose extents are taken from `data_out` and `matrix`.
    tmp = zeros(eltype(data_out), size(data_out, 1), size(matrix, 1), size(matrix, 2), size(matrix, 2))

    # The full four-level loop nest, including the inner reduction, is passed to `@turbo`.
    @turbo for k in axes(data_out, 4), j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1)

        # Accumulator starts at zero in the element type of `data_out`.
        res = zero(eltype(data_out))
        # Reduction over the second axis of `matrix`.
        for kk in axes(matrix, 2)
            res += matrix[k, kk] * tmp[v, i, j, kk]
        end
        data_out[v, i, j, k] = res
    end

    return nothing
end

# Problem size used by the reproducer.
dims = 3
nodes = 5 # important!
els = 1

# Flat buffer with nodes^dims * els entries, all set to 2.0.
test_u = fill(2.0, nodes^dims * els)
# Wrap the buffer's raw pointer as a PtrArray whose size tuple is
# (StaticInt(1), StaticInt(nodes) repeated `dims` times, els).
# NOTE(review): presumably `test_u` must stay alive while `test_ptr` is in use — confirm.
test_ptr = PtrArray(pointer(test_u), (StaticInt(1), ntuple(_ -> StaticInt(nodes), dims)..., els))
# nodes-by-nodes matrix of ones, and the same data converted to an SMatrix.
test_mat = fill(1.0, nodes, nodes)
test_smat = SMatrix{nodes, nodes}(test_mat)

# Call the kernel on the view of `test_ptr` with the last index fixed to 1.
multiply_dimensionwise!(view(test_ptr, :, :, :, :, 1), test_smat)

Xref: https://github.com/trixi-framework/Trixi.jl/issues/2075

benegee avatar Feb 14 '25 12:02 benegee

Slightly reduced MWE:

julia> using StaticArrays, LoopVectorization

julia> function foo!(data_out, matrix, data_in)
           @turbo for j in axes(data_out, 3), i in axes(data_out, 2),
                   v in axes(data_out, 1)
               
               res = zero(eltype(data_out))
               for jj in axes(matrix, 2)
                   res += matrix[j, jj] * data_in[v, i, jj]
               end
               data_out[v, i, j] = res
           end
           
           return nothing
       end
foo! (generic function with 1 method)

julia> begin
       v = 1 # works for v = 2
       n = 5 # works for n = 4
       # works if not all dimensions are statically known, e.g., from MArray or PtrArray
       data_out = @MArray zeros(v, n, n)
       matrix = @SMatrix zeros(n, n)
       # sizes of data_in are not used
       data_in = zeros(v, n, n)
       foo!(data_out, matrix, data_in)
       end
ERROR: MethodError: no method matching _vstore_unroll!(::LayoutPointers.StridedPointer{…}, ::VectorizationBase.VecUnroll{…}, ::VectorizationBase.Unroll{…}, ::Static.False, ::Static.False, ::Static.False, ::Static.StaticInt{…}, ::Static.StaticInt{…})

Closest candidates are:
  _vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::M, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS, M}
   @ VectorizationBase ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2552
  _vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T, <:VectorizationBase.VecUnroll{<:Any, W, T, VectorizationBase.Vec{W, T}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::Static.StaticInt{SVUS}) where {W, T, A<:Static.StaticBool, S<:Static.StaticBool, NT<:Static.StaticBool, RS, D, C, SVUS, UU<:(VectorizationBase.Unroll{AUO, FO, NO, AV, W, MO, X, VectorizationBase.Unroll{AUI, FI, NI, AV, W, MI, X, I}} where {AV, X, I, AUO, FO, NO, MO, AUI, FI, NI, MI})}
   @ VectorizationBase ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2575
  _vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS}
   @ VectorizationBase ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2531
  ...

Stacktrace:
 [1] _vstore!
   @ ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2093 [inlined]
 [2] macro expansion
   @ ~/.julia/packages/LoopVectorization/GKxH5/src/reconstruct_loopset.jl:1107 [inlined]
 [3] _turbo_!
   @ ~/.julia/packages/LoopVectorization/GKxH5/src/reconstruct_loopset.jl:1107 [inlined]
 [4] macro expansion
   @ ~/.julia/packages/LoopVectorization/GKxH5/src/condense_loopset.jl:1179 [inlined]
 [5] foo!(data_out::MArray{Tuple{1, 5, 5}, Float64, 3, 25}, matrix::SMatrix{5, 5, Float64, 25}, data_in::Array{Float64, 3})
   @ Main ./REPL[2]:2
 [6] top-level scope
   @ REPL[3]:9
Some type information was truncated. Use `show(err)` to see complete types.

ranocha avatar Nov 27 '25 20:11 ranocha

VectorizationBase.VecUnroll{4, 1, Float64, VectorizationBase.VecUnroll{4, 1, Float64, Float64}}

That is a 4x4 tile of scalars. It's odd that it is trying to store a register tile of scalars instead of SIMD vectors.

I'm guessing that adding a method for that is an okay fix, but a better (performance-wise) one would be to find out why that is happening.

chriselrod avatar Nov 28 '25 11:11 chriselrod

If it knows that the `v` dimension has size 1 and therefore doesn't vectorize it, it should vectorize the next axis, `i`, instead. But I guess it doesn't realize those are contiguous.

Also, it lost the static size information here, hence the 4x4 instead of 5x5 tile? If real code is like that, you could try using indices instead of axes to collect static size info.

chriselrod avatar Nov 28 '25 11:11 chriselrod

Thanks a lot for your comments, @chriselrod! So https://github.com/JuliaSIMD/VectorizationBase.jl/pull/125 is an acceptable fix that will likely not result in optimal performance, correct?

In our use case, the performance of the case where the v dim is 1 is not the most critical part, but we need it to work correctly to avoid having to define another method for this case (or losing the performance of LoopVectorization.jl for the case where the v dim is not 1).

Sadly, I am not familiar enough with the internals to propose a better fix within LoopVectorization.jl, avoiding the performance pitfalls, during the time I can allocate for this right now.

ranocha avatar Dec 01 '25 08:12 ranocha