NNlib.jl icon indicating copy to clipboard operation
NNlib.jl copied to clipboard

error for scatter with Metal arrays

Open CarloLucibello opened this issue 2 years ago • 8 comments

`gather` works fine on Apple Silicon, but with `scatter` I get an error. The `scatter` kernel works fine with CUDA and AMDGPU arrays.

cc @maleadt @pxl-th

julia> using Metal, NNlib, Flux

julia> Metal.versioninfo()
macOS 14.0.0, Darwin 23.0.0

Toolchain:
- Julia: 1.9.3
- LLVM: 14.0.6

Julia packages:
- Metal.jl: 0.5.1
- Metal_LLVM_Tools_jll: 0.5.1+0

1 device:
- Apple M1 Pro (384.000 KiB allocated)

julia> device = Flux.get_device("Metal")

julia> NNlib.gather([1 2 3; 4 5 6] |> device, [1,3,1,3,1] |> device)
2×5 MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
 1  3  1  3  1
 4  6  4  6  4

julia> NNlib.scatter(+, [1 2 3 4; 5 6 7 8] |> device, [2,1,1,5] |> device)
ERROR: Compilation to native code failed; see below for details.
If you think this is a bug, please file an issue and attach /var/folders/z_/n_d2vxmx4jj95q7hzmwngnyc0000gn/T/jl_A3uHqJIPoH.metallib.
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:35
  [2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:78
  [3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
  [4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:132
  [5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:103
  [6] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
  [7] macro expansion
    @ ./lock.jl:267 [inlined]
  [8] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
  [9] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
 [10] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77 [inlined]
 [11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ Metal.MetalKernels ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:105
 [12] Kernel
    @ ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:101 [inlined]
 [13] scatter!
    @ ~/.julia/packages/NNlib/lOntC/src/scatter.jl:104 [inlined]
 [14] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}; init::Nothing, dstsize::Nothing)
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:177
 [15] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate})
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:168
 [16] top-level scope
    @ REPL[32]:1
 [17] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57

caused by: NSError: Compiler encountered an internal error (AGXMetalG13X, code 3)
Stacktrace:
  [1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
    @ Metal.MTL ~/.julia/packages/Metal/lnkVP/lib/mtl/compute_pipeline.jl:60
  [2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:70
  [3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
  [4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:132
  [5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:103
  [6] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
  [7] macro expansion
    @ ./lock.jl:267 [inlined]
  [8] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
  [9] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
 [10] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77 [inlined]
 [11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ Metal.MetalKernels ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:105
 [12] Kernel
    @ ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:101 [inlined]
 [13] scatter!
    @ ~/.julia/packages/NNlib/lOntC/src/scatter.jl:104 [inlined]
 [14] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}; init::Nothing, dstsize::Nothing)
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:177
 [15] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate})
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:168
 [16] top-level scope
    @ REPL[32]:1
 [17] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57

CarloLucibello avatar Sep 19 '23 12:09 CarloLucibello

This is likely because of atomic operations, which rely on Atomix: https://github.com/FluxML/NNlib.jl/blob/83df6426d5a669754399bc4a9d8920c4b52e1a76/src/scatter.jl#L112

IIUC, Metal does not yet support Atomix.

pxl-th avatar Sep 19 '23 12:09 pxl-th

> IIUC, Metal does not yet support Atomix.

That is correct; I think we have the necessary intrinsics, but nobody has implemented the Atomix.jl interface yet.

The compiler shouldn't crash like that, though. Unless, of course, Atomix.jl is falling back to LLVM atomics, which aren't supported by the back-end.

maleadt avatar Sep 19 '23 12:09 maleadt