KernelAbstractions.jl

LLVM error when using Atomix.@atomic with FP16

Open • ymtoo opened this issue on Jun 27, 2024 • 6 comments

With an FP16 (Float16) input, the following example

using CUDA, KernelAbstractions, Atomix

function index_fun_fixed(arr; backend=get_backend(arr))
	out = similar(arr)
	fill!(out, 0)
	kernel! = my_kernel_fixed!(backend)
	kernel!(out, arr, ndrange=(size(arr, 1), size(arr, 2)))
	return out
end

@kernel function my_kernel_fixed!(out, arr)
	i, j = @index(Global, NTuple)
	for k in 1:size(out, 1)
		Atomix.@atomic out[k, i] += arr[i, j]
	end
end

img_f16 = zeros(Float16, (50, 50))
index_fun_fixed(CuArray(img_f16))

throws an LLVM error when the kernel is compiled to PTX:

ERROR: LLVM error: Cannot select: 0x434d1250: f16,ch = AtomicLoadFAdd<(load store seq_cst (s16) on %ir.15, addrspace 1)> 0x43223230:1, 0x434d0fe0, 0x43223230, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:363 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/internal.jl:20 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:33 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ]
  0x434d0fe0: i64 = add 0x43222f58, Constant:i64<-2>, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
    0x43222f58: i64 = add 0x434cd998, 0x43222598, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
      0x434cd998: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %5, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
        0x41eb8eb8: i64 = Register %5
      0x43222598: i64 = shl 0x43222600, Constant:i32<1>, int.jl:88 @[ abstractarray.jl:1244 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ]
        0x43222600: i64 = add 0x43222bb0, 0x434d0a98, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
          0x43222bb0: i64 = mul 0x434cde78, 0x41eb9538, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
            0x434cde78: i64 = AssertZext 0x41eb97a8, ValueType:ch:i63, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
              0x41eb97a8: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %23, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
                0x434d08f8: i64 = Register %23
            0x41eb9538: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %24, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
              0x43222c18: i64 = Register %24
          0x434d0a98: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %12, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
            0x434cde10: i64 = Register %12
        0x41eb9190: i32 = Constant<1>
    0x41eb8aa8: i64 = Constant<-2>
  0x43223230: f16,ch = load<(load (s16) from %ir.64, !tbaa !511, addrspace 1)> 0x45ac35c8, 0x434d1180, undef:i64, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
    0x434d1180: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %27, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
      0x43222a78: i64 = Register %27
    0x43222ce8: i64 = undef
In function: _Z20gpu_my_kernel_fixed_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEE7NDRangeILi2ES0_S0_S2_ILi2ES3_IS4_IS5_ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEEEE13CuDeviceArrayI7Float16Li2ELi1EES7_IS8_Li2ELi1EE
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/6cDbl/src/core/context.jl:168
  [2] LLVMTargetMachineEmitToMemoryBuffer(T::LLVM.TargetMachine, M::LLVM.Module, codegen::LLVM.API.LLVMCodeGenFileType, ErrorMessage::Base.RefValue{…}, OutMemBuf::Base.RefValue{…})
    @ LLVM.API ~/.julia/packages/LLVM/6cDbl/lib/15/libLLVM.jl:5318
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM ~/.julia/packages/LLVM/6cDbl/src/targetmachine.jl:45
  [4] mcgen
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/mcgen.jl:84 [inlined]
  [5] mcgen(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:131
  [6] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:481 [inlined]
  [8] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [9] macro expansion
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:478 [inlined]
 [10] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:103
 [11] emit_asm
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:97 [inlined]
 [12] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:156
 [13] codegen
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:115 [inlined]
 [14] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:111
 [15] compile
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:103 [inlined]
 [16] #1145
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:254 [inlined]
 [17] JuliaContext(f::CUDA.var"#1145#1148"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:52
 [18] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:42
 [19] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:253
 [20] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:128
 [21] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:103
 [22] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:369 [inlined]
 [23] macro expansion
    @ ./lock.jl:267 [inlined]
 [24] cufunction(f::typeof(gpu_my_kernel_fixed!), tt::Type{Tuple{…}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:364
 [25] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:112 [inlined]
 [26] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:103
 [27] Kernel
    @ ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:89 [inlined]
 [28] #index_fun_fixed#1
    @ ./REPL[96]:5 [inlined]
 [29] index_fun_fixed(arr::CuArray{Float16, 2, CUDA.DeviceMemory})
    @ Main ./REPL[96]:1
 [30] top-level scope
    @ REPL[110]:1
Some type information was truncated. Use `show(err)` to see complete types.

The same code works fine with FP32 (Float32) inputs.
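
For reference, re-running the reproducer above with a Float32 array (same functions, only the element type changed) completes without error:

img_f32 = zeros(Float32, (50, 50))
index_fun_fixed(CuArray(img_f32))  # works; 32-bit float atomic add has a native PTX instruction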

Julia and package versions:

julia> versioninfo()
Julia Version 1.10.4
Commit 48d4fd48430 (2024-06-04 10:41 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 24 × AMD Ryzen 9 5900X 12-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 1 default, 0 interactive, 1 GC (on 24 virtual cores)

(TestKA) pkg> st
Project TestKA v0.1.10
Status `~/Projects/TestKA.jl/Project.toml`
  [a9b6321e] Atomix v0.1.0
  [052768ef] CUDA v5.4.2
  [63c18a36] KernelAbstractions v0.9.21

ymtoo • Jun 27, 2024

This is tricky to fix.

IIRC one needs to perform a CAS (compare-and-swap) loop for atomics narrower than 4 bytes.
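
To illustrate the idea only (this is not the Atomix or CUDA.jl implementation, and the helper name is made up): a CAS loop reads the current bit pattern, computes the updated value, and retries until the compare-and-swap succeeds. A CPU-side Julia sketch follows; real device code typically has to CAS the containing 32-bit word, because hardware CAS below 32 bits is generally unavailable.

# Illustration only: emulate atomic `x[] += v` for Float16 with a CAS loop
# on the 16-bit pattern.
function atomic_add_fp16_cas!(x::Threads.Atomic{UInt16}, v::Float16)
	while true
		old_bits = x[]  # current 16-bit pattern
		new_bits = reinterpret(UInt16, reinterpret(Float16, old_bits) + v)
		# atomic_cas! returns the value it observed; if that is still old_bits,
		# our update won the race, otherwise another writer intervened: retry
		Threads.atomic_cas!(x, old_bits, new_bits) === old_bits && return
	end
end

acc = Threads.Atomic{UInt16}(reinterpret(UInt16, Float16(0)))
atomic_add_fp16_cas!(acc, Float16(1.5))
reinterpret(Float16, acc[])  # Float16(1.5)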

Atomix should use https://github.com/JuliaConcurrent/Atomix.jl/blob/e60c518e3ffd2c9d4e96104f16f2a970a69e4289/lib/AtomixCUDA/src/AtomixCUDA.jl#L38

That path does claim to support Float16: https://github.com/JuliaGPU/CUDA.jl/blob/14de0097ff7c26932cc4a175840961cc7d3f396e/src/device/intrinsics/atomics.jl#L195

What does `]status -m` show?

x-ref: https://github.com/JuliaGPU/CUDA.jl/pull/1790

vchuravy • Jun 27, 2024

It might be that we end up in https://github.com/JuliaConcurrent/UnsafeAtomicsLLVM.jl instead of the CUDA-specific atomics (AtomixCUDA).
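
Not part of the original exchange, but one hypothetical way to check which lowering is picked is to dump the device IR for the failing launch (reusing my_kernel_fixed! from the reproducer); a raw `atomicrmw fadd` on `half` would point at the generic UnsafeAtomicsLLVM path:

using CUDA, KernelAbstractions  # assumes my_kernel_fixed! from the reproducer is already defined

out = CUDA.zeros(Float16, 50, 50)
arr = CUDA.zeros(Float16, 50, 50)
kernel! = my_kernel_fixed!(get_backend(arr))
# @device_code_llvm prints the device-side LLVM IR; the dump should appear
# before PTX emission, which is where the selection error is thrown
@device_code_llvm kernel!(out, arr, ndrange=size(arr))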

vchuravy • Jun 27, 2024

> What does `]status -m` show?

(jl_hHeJiL) pkg> status -m
Status `/tmp/jl_hHeJiL/Manifest.toml`
  [621f4979] AbstractFFTs v1.5.0
  [79e6a3ab] Adapt v4.0.4
  [a9b6321e] Atomix v0.1.0
  [ab4f0b2a] BFloat16s v0.5.0
  [fa961155] CEnum v0.5.0
  [052768ef] CUDA v5.4.2
  [1af6417a] CUDA_Runtime_Discovery v0.3.4
  [3da002f7] ColorTypes v0.11.5
  [5ae59095] Colors v0.12.11
  [34da2185] Compat v4.15.0
  [a8cc5b0e] Crayons v4.1.1
  [9a962f9c] DataAPI v1.16.0
  [a93c6f00] DataFrames v1.6.1
  [864edb3b] DataStructures v0.18.20
  [e2d170a0] DataValueInterfaces v1.0.0
  [e2ba6199] ExprTools v0.1.10
  [53c48c17] FixedPointNumbers v0.8.5
  [0c68f7d7] GPUArrays v10.2.1
  [46192b85] GPUArraysCore v0.1.6
⌃ [61eb1bfa] GPUCompiler v0.26.5
  [842dd82b] InlineStrings v1.4.1
  [41ab1584] InvertedIndices v1.3.0
  [82899510] IteratorInterfaceExtensions v1.0.0
  [692b3bcd] JLLWrappers v1.5.0
  [63c18a36] KernelAbstractions v0.9.21
⌅ [929cbde3] LLVM v7.2.1
  [8b046642] LLVMLoopInfo v1.0.0
  [b964fa9f] LaTeXStrings v1.3.1
  [1914dd2f] MacroTools v0.5.13
  [e1d29d7a] Missings v1.2.0
  [5da4648a] NVTX v0.3.4
  [bac558e1] OrderedCollections v1.6.3
  [69de0a69] Parsers v2.8.1
  [2dfb63ee] PooledArrays v1.4.3
  [aea7be01] PrecompileTools v1.2.1
  [21216c6a] Preferences v1.4.3
  [08abe8d2] PrettyTables v2.3.2
  [74087812] Random123 v1.7.0
  [e6cf234a] RandomNumbers v1.5.3
  [189a3867] Reexport v1.2.2
  [ae029012] Requires v1.3.0
  [6c6a2e73] Scratch v1.2.1
  [91c51154] SentinelArrays v1.4.3
  [a2af1166] SortingAlgorithms v1.2.1
  [90137ffa] StaticArrays v1.9.6
  [1e83bf80] StaticArraysCore v1.4.3
  [892a3eda] StringManipulation v0.3.4
  [3783bdb8] TableTraits v1.0.1
  [bd369af6] Tables v1.11.1
  [a759f4b9] TimerOutputs v0.5.24
  [013be700] UnsafeAtomics v0.2.1
  [d80eeb9a] UnsafeAtomicsLLVM v0.1.5
  [4ee394cb] CUDA_Driver_jll v0.9.0+0
  [76a88914] CUDA_Runtime_jll v0.14.0+1
  [9c1d0b0a] JuliaNVTXCallbacks_jll v0.2.1+0
⌅ [dad2f222] LLVMExtra_jll v0.0.29+0
  [e98f9f5b] NVTX_jll v3.1.0+2
  [0dad84c5] ArgTools v1.1.1
  [56f22d72] Artifacts
  [2a0f44e3] Base64
  [ade2ca70] Dates
  [f43a241f] Downloads v1.6.0
  [7b1f6079] FileWatching
  [9fa8497b] Future
  [b77e0a4c] InteractiveUtils
  [4af54fe1] LazyArtifacts
  [b27032c2] LibCURL v0.6.4
  [76f85450] LibGit2
  [8f399da3] Libdl
  [37e2e46d] LinearAlgebra
  [56ddb016] Logging
  [d6f4376e] Markdown
  [ca575930] NetworkOptions v1.2.0
  [44cfe95a] Pkg v1.10.0
  [de0858da] Printf
  [3fa0cd96] REPL
  [9a3f8284] Random
  [ea8e919c] SHA v0.7.0
  [9e88b42a] Serialization
  [6462fe0b] Sockets
  [2f01184e] SparseArrays v1.10.0
  [10745b16] Statistics v1.10.0
  [fa267f1f] TOML v1.0.3
  [a4e569a6] Tar v1.10.0
  [8dfed614] Test
  [cf7118a7] UUIDs
  [4ec0a83e] Unicode
  [e66e0078] CompilerSupportLibraries_jll v1.1.1+0
  [deac9b47] LibCURL_jll v8.4.0+0
  [e37daf67] LibGit2_jll v1.6.4+0
  [29816b5a] LibSSH2_jll v1.11.0+1
  [c8ffd9c3] MbedTLS_jll v2.28.2+1
  [14a3606d] MozillaCACerts_jll v2023.1.10
  [4536629a] OpenBLAS_jll v0.3.23+4
  [bea87d4a] SuiteSparse_jll v7.2.1+1
  [83775a58] Zlib_jll v1.2.13+1
  [8e850b90] libblastrampoline_jll v5.8.0+1
  [8e850ede] nghttp2_jll v1.52.0+1
  [3f19e933] p7zip_jll v17.4.0+2
Info Packages marked with ⌃ and ⌅ have new versions available. Those with ⌃ may be upgradable, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m`

ymtoo • Jun 28, 2024

What happens when you load AtomixCUDA?

vchuravy • Jun 28, 2024

The error still occurs after adding and loading AtomixCUDA.

(jl_OV6Zim) pkg> st
Status `/tmp/jl_OV6Zim/Project.toml`
  [a9b6321e] Atomix v0.1.0
  [6171a885] AtomixCUDA v0.1.0-DEV `https://github.com/JuliaConcurrent/Atomix.jl#main:lib/AtomixCUDA`
  [052768ef] CUDA v5.4.2
  [63c18a36] KernelAbstractions v0.9.22

julia> using AtomixCUDA

ymtoo • Jun 28, 2024

I won't be able to look at this in detail until August.

For now I would recommend just writing a CUDA.jl kernel and using `CUDA.@atomic`.
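
A minimal sketch of that workaround, assuming the same accumulation as the reproducer above (the names index_fun_cuda/my_kernel_cuda! and the launch configuration are made up for illustration; `CUDA.@atomic` is the Float16-capable path linked earlier in the thread):

using CUDA

function my_kernel_cuda!(out, arr)
	# one thread per (i, j) element of `arr`
	i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
	j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
	# guard against out-of-bounds threads from the rounded-up grid
	if i <= size(arr, 1) && j <= size(arr, 2)
		for k in 1:size(out, 1)
			CUDA.@atomic out[k, i] += arr[i, j]
		end
	end
	return nothing
end

function index_fun_cuda(arr::CuArray)
	out = similar(arr)
	fill!(out, 0)
	threads = (16, 16)
	blocks = cld.(size(arr), threads)  # enough blocks to cover the array
	@cuda threads=threads blocks=blocks my_kernel_cuda!(out, arr)
	return out
end

index_fun_cuda(CuArray(zeros(Float16, (50, 50))))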

vchuravy • Jun 28, 2024