error for scatter with Metal arrays
gather works fine on Apple Silicon, but with scatter I get an error.
The scatter kernel works fine with CUDA and AMDGPU arrays.
cc @maleadt @pxl-th
julia> using Metal, NNlib, Flux
julia> Metal.versioninfo()
macOS 14.0.0, Darwin 23.0.0
Toolchain:
- Julia: 1.9.3
- LLVM: 14.0.6
Julia packages:
- Metal.jl: 0.5.1
- Metal_LLVM_Tools_jll: 0.5.1+0
1 device:
- Apple M1 Pro (384.000 KiB allocated)
julia> device = Flux.get_device("Metal")
julia> NNlib.gather([1 2 3; 4 5 6] |> device, [1,3,1,3,1] |> device)
2×5 MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
1 3 1 3 1
4 6 4 6 4
julia> NNlib.scatter(+, [1 2 3 4; 5 6 7 8] |> device, [2,1,1,5] |> device)
ERROR: Compilation to native code failed; see below for details.
If you think this is a bug, please file an issue and attach /var/folders/z_/n_d2vxmx4jj95q7hzmwngnyc0000gn/T/jl_A3uHqJIPoH.metallib.
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:78
[3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
[4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:132
[5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:103
[6] macro expansion
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
[7] macro expansion
@ ./lock.jl:267 [inlined]
[8] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
[9] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
[10] macro expansion
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77 [inlined]
[11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
@ Metal.MetalKernels ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:105
[12] Kernel
@ ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:101 [inlined]
[13] scatter!
@ ~/.julia/packages/NNlib/lOntC/src/scatter.jl:104 [inlined]
[14] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}; init::Nothing, dstsize::Nothing)
@ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:177
[15] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate})
@ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:168
[16] top-level scope
@ REPL[32]:1
[17] top-level scope
@ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57
caused by: NSError: Compiler encountered an internal error (AGXMetalG13X, code 3)
Stacktrace:
[1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
@ Metal.MTL ~/.julia/packages/Metal/lnkVP/lib/mtl/compute_pipeline.jl:60
[2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:70
[3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
[4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:132
[5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:103
[6] macro expansion
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
[7] macro expansion
@ ./lock.jl:267 [inlined]
[8] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
[9] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
[10] macro expansion
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77 [inlined]
[11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
@ Metal.MetalKernels ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:105
[12] Kernel
@ ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:101 [inlined]
[13] scatter!
@ ~/.julia/packages/NNlib/lOntC/src/scatter.jl:104 [inlined]
[14] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}; init::Nothing, dstsize::Nothing)
@ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:177
[15] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate})
@ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:168
[16] top-level scope
@ REPL[32]:1
[17] top-level scope
@ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57
This is likely because of atomic operations, which rely on Atomix: https://github.com/FluxML/NNlib.jl/blob/83df6426d5a669754399bc4a9d8920c4b52e1a76/src/scatter.jl#L112
IIUC, Metal does not yet support Atomix.
> IIUC, Metal does not yet support Atomix.
That is correct; I think we have the necessary intrinsics, but nobody has implemented the Atomix.jl interface yet.
The compiler shouldn't crash like that, though, unless of course Atomix.jl is falling back to LLVM atomics, which aren't supported by the back-end.