No method matching `_vstore_unroll!` on ARM
We are using @turbo extensively in Trixi.jl.
Recently, we have started running our code on ARM-based machines and encountered the following error:
LoadError: MethodError: no method matching _vstore_unroll!(::LayoutPointers.StridedPointer{Float64, 4, 1, 0, (1, 2, 3, 4), Tuple{Static.StaticInt{8}, Static.StaticInt{8}, Static.StaticInt{40}, Static.StaticInt{200}}, NTuple{4, Static.StaticInt{0}}}, ::VectorizationBase.VecUnroll{4, 1, Float64, VectorizationBase.VecUnroll{4, 1, Float64, Float64}}, ::VectorizationBase.Unroll{2, 1, 5, 1, 1, 0x0000000000000000, 1, VectorizationBase.Unroll{4, 1, 5, 1, 1, 0x0000000000000000, 1, Static.StaticInt{0}}}, ::Static.False, ::Static.False, ::Static.False, ::Static.StaticInt{16}, ::Static.StaticInt{8})
Closest candidates are:
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::M, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS, M}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2552
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T, <:VectorizationBase.VecUnroll{<:Any, W, T, VectorizationBase.Vec{W, T}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::Static.StaticInt{SVUS}) where {W, T, A<:Static.StaticBool, S<:Static.StaticBool, NT<:Static.StaticBool, RS, D, C, SVUS, UU<:(VectorizationBase.Unroll{AUO, FO, NO, AV, W, MO, X, VectorizationBase.Unroll{AUI, FI, NI, AV, W, MI, X, I}} where {AV, X, I, AUO, FO, NO, MO, AUI, FI, NI, MI})}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2575
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2531
...
This was caused by one of our helper functions, which basically does a specialized matrix-vector multiplication. We were able to reproduce this issue with this example:
MWE
using StaticArrays
using StrideArrays: PtrArray, StaticInt
using LoopVectorization: @turbo
# Minimal reproducer kernel. For every index (v, i, j, k) of the 4-D array `data_out` it computes
#     data_out[v, i, j, k] = sum over kk of matrix[k, kk] * tmp[v, i, j, kk]
# where `tmp` is a zero-filled temporary allocated inside the function.
# `data_out` is overwritten in place; the function returns `nothing`.
function multiply_dimensionwise!(data_out, matrix)
# Zero-filled temporary that plays the role of the input data; its shape is
# (size(data_out, 1), size(matrix, 1), size(matrix, 2), size(matrix, 2)).
tmp = zeros(eltype(data_out), size(data_out, 1), size(matrix, 1), size(matrix, 2), size(matrix, 2))
# The four output loops and the inner reduction loop are all handed to `@turbo`.
@turbo for k in axes(data_out, 4), j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1)
# Accumulator with the element type of `data_out`.
res = zero(eltype(data_out))
# Reduction over the second index of `matrix` / fourth index of `tmp`.
for kk in axes(matrix, 2)
res += matrix[k, kk] * tmp[v, i, j, kk]
end
data_out[v, i, j, k] = res
end
return nothing
end
# Driver for the reproducer: build the arguments and call `multiply_dimensionwise!`.
dims = 3
nodes = 5 # important!
els = 1
# Backing storage with nodes^dims * els entries, all set to 2.0.
test_u = fill(2.0, nodes^dims * els)
# Wrap the storage as a 5-D PtrArray of size (1, nodes, nodes, nodes, els);
# every size except the last one (`els`) is passed as a `StaticInt`.
test_ptr = PtrArray(pointer(test_u), (StaticInt(1), ntuple(_ -> StaticInt(nodes), dims)..., els))
test_mat = fill(1.0, nodes, nodes)
# Statically sized (nodes x nodes) copy of `test_mat`.
test_smat = SMatrix{nodes, nodes}(test_mat)
# Fix the last index to 1 via a view and run the kernel on the resulting 4-D slice.
multiply_dimensionwise!(view(test_ptr, :, :, :, :, 1), test_smat)
Xref: https://github.com/trixi-framework/Trixi.jl/issues/2075
Slightly reduced MWE:
julia> using StaticArrays, LoopVectorization
julia> function foo!(data_out, matrix, data_in)
# Reduced reproducer. For every index (v, i, j) of the 3-D array `data_out` it computes
#     data_out[v, i, j] = sum over jj of matrix[j, jj] * data_in[v, i, jj]
# `data_out` is overwritten in place; the function returns `nothing`.
# All loop ranges are taken from `data_out` and `matrix`; `data_in` is only indexed.
@turbo for j in axes(data_out, 3), i in axes(data_out, 2),
v in axes(data_out, 1)
# Accumulator with the element type of `data_out`.
res = zero(eltype(data_out))
# Reduction over the second index of `matrix` / third index of `data_in`.
for jj in axes(matrix, 2)
res += matrix[j, jj] * data_in[v, i, jj]
end
data_out[v, i, j] = res
end
return nothing
end
foo! (generic function with 1 method)
julia> begin
# Sizes used for the failing call below (first dimension v, remaining dimensions n).
v = 1 # works for v = 2
n = 5 # works for n = 4
# works if not all dimensions are statically known, e.g., from MArray or PtrArray
# Output: statically sized MArray of size (v, n, n).
data_out = @MArray zeros(v, n, n)
# Statically sized n x n matrix of zeros.
matrix = @SMatrix zeros(n, n)
# sizes of data_in are not used
# Input: ordinary (dynamically sized) Array of size (v, n, n).
data_in = zeros(v, n, n)
# This call raises the MethodError shown below.
foo!(data_out, matrix, data_in)
end
ERROR: MethodError: no method matching _vstore_unroll!(::LayoutPointers.StridedPointer{…}, ::VectorizationBase.VecUnroll{…}, ::VectorizationBase.Unroll{…}, ::Static.False, ::Static.False, ::Static.False, ::Static.StaticInt{…}, ::Static.StaticInt{…})
Closest candidates are:
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::M, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS, M}
@ VectorizationBase ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2552
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T, <:VectorizationBase.VecUnroll{<:Any, W, T, VectorizationBase.Vec{W, T}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::Static.StaticInt{SVUS}) where {W, T, A<:Static.StaticBool, S<:Static.StaticBool, NT<:Static.StaticBool, RS, D, C, SVUS, UU<:(VectorizationBase.Unroll{AUO, FO, NO, AV, W, MO, X, VectorizationBase.Unroll{AUI, FI, NI, AV, W, MI, X, I}} where {AV, X, I, AUO, FO, NO, MO, AUI, FI, NI, MI})}
@ VectorizationBase ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2575
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS}
@ VectorizationBase ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2531
...
Stacktrace:
[1] _vstore!
@ ~/.julia/packages/VectorizationBase/7mwzi/src/vecunroll/memory.jl:2093 [inlined]
[2] macro expansion
@ ~/.julia/packages/LoopVectorization/GKxH5/src/reconstruct_loopset.jl:1107 [inlined]
[3] _turbo_!
@ ~/.julia/packages/LoopVectorization/GKxH5/src/reconstruct_loopset.jl:1107 [inlined]
[4] macro expansion
@ ~/.julia/packages/LoopVectorization/GKxH5/src/condense_loopset.jl:1179 [inlined]
[5] foo!(data_out::MArray{Tuple{1, 5, 5}, Float64, 3, 25}, matrix::SMatrix{5, 5, Float64, 25}, data_in::Array{Float64, 3})
@ Main ./REPL[2]:2
[6] top-level scope
@ REPL[3]:9
Some type information was truncated. Use `show(err)` to see complete types.
VectorizationBase.VecUnroll{4, 1, Float64, VectorizationBase.VecUnroll{4, 1, Float64, Float64}}
That is a 4x4 tile of scalars. That's odd for it to be trying to store a register tile of scalars instead of SIMD vectors.
I'm guessing that adding a method for that is an okay fix, but a better one (performance-wise) would be to find out why that is happening.
If it knows that the v dim is 1 and thus doesn't SIMD that, it should SIMD the next axis i instead. But I guess it doesn't realize those are contiguous.
Also, it lost the static size information here, hence the 4x4 instead of 5x5 tile?
If real code is like that, you could try using indices instead of axes to collect static size info.
Thanks a lot for your comments, @chriselrod! So https://github.com/JuliaSIMD/VectorizationBase.jl/pull/125 is an acceptable fix that will likely not result in optimal performance, correct?
In our use case, the performance of the case where the v dim is 1 is not the most critical part, but we need it to work correctly to avoid having to define another method for this case (or losing the performance of LoopVectorization.jl for the case where the v dim is not 1).
Sadly, I am not familiar enough with the internals to propose a better fix within LoopVectorization.jl, avoiding the performance pitfalls, during the time I can allocate for this right now.