LoopVectorization error in AMR on ARM
p4est_3d_dgsem/elixir_advection_amr_unstructured_curved.jl fails when run on aarch64.
This was observed, e.g., in this CI run: https://github.com/trixi-framework/Trixi.jl/actions/runs/10800155321/job/29957509012 It is not an issue with MPI, though; I ran the elixir on a Grace Hopper node and got the same error.
It has gone unnoticed so far.
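For reference, a minimal sketch of how the failing elixir can be run directly, outside the test harness (assuming the standard Trixi.jl helpers examples_dir and trixi_include; the CI job wraps this in additional test machinery):
using Trixi
# Sketch: run the failing elixir directly; the CI test adds further checks around this call.
trixi_include(joinpath(examples_dir(), "p4est_3d_dgsem",
                       "elixir_advection_amr_unstructured_curved.jl"))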
Error message
elixir_advection_amr_unstructured_curved.jl: Error During Test at elixir_advection_amr_unstructured_curved.jl: /Users/runner/work/Trixi.jl/Trixi.jl/test/test_trixi.jl:242
Got exception outside of a @test
elixir_advection_amr_unstructured_curved.jl: LoadError: MethodError: no method matching _vstore_unroll!(::LayoutPointers.StridedPointer{Float64, 4, 1, 0, (1, 2, 3, 4), Tuple{Static.StaticInt{8}, Static.StaticInt{8}, Static.StaticInt{40}, Static.StaticInt{200}}, NTuple{4, Static.StaticInt{0}}}, ::VectorizationBase.VecUnroll{4, 1, Float64, VectorizationBase.VecUnroll{4, 1, Float64, Float64}}, ::VectorizationBase.Unroll{2, 1, 5, 1, 1, 0x0000000000000000, 1, VectorizationBase.Unroll{4, 1, 5, 1, 1, 0x0000000000000000, 1, Static.StaticInt{0}}}, ::Static.False, ::Static.False, ::Static.False, ::Static.StaticInt{16}, ::Static.StaticInt{8})
Closest candidates are:
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::M, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS, M}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2552
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T, <:VectorizationBase.VecUnroll{<:Any, W, T, VectorizationBase.Vec{W, T}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::Static.StaticInt{SVUS}) where {W, T, A<:Static.StaticBool, S<:Static.StaticBool, NT<:Static.StaticBool, RS, D, C, SVUS, UU<:(VectorizationBase.Unroll{AUO, FO, NO, AV, W, MO, X, VectorizationBase.Unroll{AUI, FI, NI, AV, W, MI, X, I}} where {AV, X, I, AUO, FO, NO, MO, AUI, FI, NI, MI})}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2575
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2531
...
Can you please create an MWE and file an issue upstream?
It is not an issue with MPI, though; I ran the elixir on a Grace Hopper node and got the same error.
Does that mean it also occurs when running serially?
Just double-checked. Yes, it was a serial run.
This is as minimal as I could get so far:
using StaticArrays
using StrideArrays: PtrArray, StaticInt
using LoopVectorization: @turbo
function multiply_dimensionwise!(data_out, matrix)
    tmp = zeros(eltype(data_out), size(data_out, 1), size(matrix, 1), size(matrix, 2), size(matrix, 2))
    @turbo for k in axes(data_out, 4), j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1)
        res = zero(eltype(data_out))
        for kk in axes(matrix, 2)
            res += matrix[k, kk] * tmp[v, i, j, kk]
        end
        data_out[v, i, j, k] = res
    end
    return nothing
end
dims = 3
nodes = 5 # important!
els = 1
test_u = fill(2.0, nodes^dims * els)
test_ptr = PtrArray(pointer(test_u), (StaticInt(1), ntuple(_ -> StaticInt(nodes), dims)..., els))
test_mat = fill(1.0, nodes, nodes)
test_smat = SMatrix{nodes, nodes}(test_mat)
multiply_dimensionwise!(view(test_ptr, :, :, :, :, 1), test_smat)
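As a sanity check (not part of the original report), the same kernel with @turbo removed appears to run through on the same arrays, which suggests the problem lies in LoopVectorization's unrolled store code path rather than in the kernel itself:
function multiply_dimensionwise_plain!(data_out, matrix)
    # Same kernel as above, just without @turbo, for comparison.
    tmp = zeros(eltype(data_out), size(data_out, 1), size(matrix, 1), size(matrix, 2), size(matrix, 2))
    for k in axes(data_out, 4), j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1)
        res = zero(eltype(data_out))
        for kk in axes(matrix, 2)
            res += matrix[k, kk] * tmp[v, i, j, kk]
        end
        data_out[v, i, j, k] = res
    end
    return nothing
end
multiply_dimensionwise_plain!(view(test_ptr, :, :, :, :, 1), test_smat)  # completes without error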
Do you think it is minimal enough to file an issue upstream? @ranocha @vchuravy
👍