LLVM error when using Atomix.@atomic with FP16
With an FP16 input, the example
using CUDA, KernelAbstractions, Atomix
function index_fun_fixed(arr; backend=get_backend(arr))
out = similar(arr)
fill!(out, 0)
kernel! = my_kernel_fixed!(backend)
kernel!(out, arr, ndrange=(size(arr, 1), size(arr, 2)))
return out
end
@kernel function my_kernel_fixed!(out, arr)
i, j = @index(Global, NTuple)
for k in 1:size(out, 1)
Atomix.@atomic out[k, i] += arr[i, j]
end
end
img_f16 = zeros(Float16, (50, 50))
index_fun_fixed(CuArray(img_f16))
throws an error.
ERROR: LLVM error: Cannot select: 0x434d1250: f16,ch = AtomicLoadFAdd<(load store seq_cst (s16) on %ir.15, addrspace 1)> 0x43223230:1, 0x434d0fe0, 0x43223230, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:363 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/internal.jl:20 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:33 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ]
0x434d0fe0: i64 = add 0x43222f58, Constant:i64<-2>, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
0x43222f58: i64 = add 0x434cd998, 0x43222598, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
0x434cd998: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %5, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
0x41eb8eb8: i64 = Register %5
0x43222598: i64 = shl 0x43222600, Constant:i32<1>, int.jl:88 @[ abstractarray.jl:1244 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ]
0x43222600: i64 = add 0x43222bb0, 0x434d0a98, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x43222bb0: i64 = mul 0x434cde78, 0x41eb9538, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x434cde78: i64 = AssertZext 0x41eb97a8, ValueType:ch:i63, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x41eb97a8: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %23, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x434d08f8: i64 = Register %23
0x41eb9538: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %24, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x43222c18: i64 = Register %24
0x434d0a98: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %12, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x434cde10: i64 = Register %12
0x41eb9190: i32 = Constant<1>
0x41eb8aa8: i64 = Constant<-2>
0x43223230: f16,ch = load<(load (s16) from %ir.64, !tbaa !511, addrspace 1)> 0x45ac35c8, 0x434d1180, undef:i64, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
0x434d1180: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %27, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
0x43222a78: i64 = Register %27
0x43222ce8: i64 = undef
In function: _Z20gpu_my_kernel_fixed_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEE7NDRangeILi2ES0_S0_S2_ILi2ES3_IS4_IS5_ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEEEE13CuDeviceArrayI7Float16Li2ELi1EES7_IS8_Li2ELi1EE
Stacktrace:
[1] handle_error(reason::Cstring)
@ LLVM ~/.julia/packages/LLVM/6cDbl/src/core/context.jl:168
[2] LLVMTargetMachineEmitToMemoryBuffer(T::LLVM.TargetMachine, M::LLVM.Module, codegen::LLVM.API.LLVMCodeGenFileType, ErrorMessage::Base.RefValue{…}, OutMemBuf::Base.RefValue{…})
@ LLVM.API ~/.julia/packages/LLVM/6cDbl/lib/15/libLLVM.jl:5318
[3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
@ LLVM ~/.julia/packages/LLVM/6cDbl/src/targetmachine.jl:45
[4] mcgen
@ ~/.julia/packages/GPUCompiler/nWT2N/src/mcgen.jl:84 [inlined]
[5] mcgen(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:131
[6] macro expansion
@ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
[7] macro expansion
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:481 [inlined]
[8] macro expansion
@ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
[9] macro expansion
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:478 [inlined]
[10] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:103
[11] emit_asm
@ ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:97 [inlined]
[12]
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:156
[13] codegen
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:115 [inlined]
[14]
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:111
[15] compile
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:103 [inlined]
[16] #1145
@ ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:254 [inlined]
[17] JuliaContext(f::CUDA.var"#1145#1148"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:52
[18] JuliaContext(f::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:42
[19] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:253
[20] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:128
[21] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:103
[22] macro expansion
@ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:369 [inlined]
[23] macro expansion
@ ./lock.jl:267 [inlined]
[24] cufunction(f::typeof(gpu_my_kernel_fixed!), tt::Type{Tuple{…}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:364
[25] macro expansion
@ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:112 [inlined]
[26] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
@ CUDA.CUDAKernels ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:103
[27] Kernel
@ ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:89 [inlined]
[28] #index_fun_fixed#1
@ ./REPL[96]:5 [inlined]
[29] index_fun_fixed(arr::CuArray{Float16, 2, CUDA.DeviceMemory})
@ Main ./REPL[96]:1
[30] top-level scope
@ REPL[110]:1
Some type information was truncated. Use `show(err)` to see complete types.
It works fine on FP32 inputs.
Julia and package version:
julia> versioninfo()
Julia Version 1.10.4
Commit 48d4fd48430 (2024-06-04 10:41 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 24 × AMD Ryzen 9 5900X 12-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 1 default, 0 interactive, 1 GC (on 24 virtual cores)
(TestKA) pkg> st
Project TestKA v0.1.10
Status `~/Projects/TestKA.jl/Project.toml`
[a9b6321e] Atomix v0.1.0
[052768ef] CUDA v5.4.2
[63c18a36] KernelAbstractions v0.9.21
This is tricky to fix.
IIRC one needs to perform a CAS loop for < 4 byte atomics.
Atomix should use https://github.com/JuliaConcurrent/Atomix.jl/blob/e60c518e3ffd2c9d4e96104f16f2a970a69e4289/lib/AtomixCUDA/src/AtomixCUDA.jl#L38
Which does claim to support Float16: https://github.com/JuliaGPU/CUDA.jl/blob/14de0097ff7c26932cc4a175840961cc7d3f396e/src/device/intrinsics/atomics.jl#L195
What is the output of `]status -m`?
x-ref: https://github.com/JuliaGPU/CUDA.jl/pull/1790
It might be that we end up in https://github.com/JuliaConcurrent/UnsafeAtomicsLLVM.jl instead of UnsafeAtomicsCUDA.jl
> This is tricky to fix.
> IIRC one needs to perform a CAS loop for < 4 byte atomics.
> Atomix should use https://github.com/JuliaConcurrent/Atomix.jl/blob/e60c518e3ffd2c9d4e96104f16f2a970a69e4289/lib/AtomixCUDA/src/AtomixCUDA.jl#L38
> Which does claim to support Float16: https://github.com/JuliaGPU/CUDA.jl/blob/14de0097ff7c26932cc4a175840961cc7d3f396e/src/device/intrinsics/atomics.jl#L195
> What is the output of `]status -m`?
x-ref: JuliaGPU/CUDA.jl#1790
(jl_hHeJiL) pkg> status -m
Status `/tmp/jl_hHeJiL/Manifest.toml`
[621f4979] AbstractFFTs v1.5.0
[79e6a3ab] Adapt v4.0.4
[a9b6321e] Atomix v0.1.0
[ab4f0b2a] BFloat16s v0.5.0
[fa961155] CEnum v0.5.0
[052768ef] CUDA v5.4.2
[1af6417a] CUDA_Runtime_Discovery v0.3.4
[3da002f7] ColorTypes v0.11.5
[5ae59095] Colors v0.12.11
[34da2185] Compat v4.15.0
[a8cc5b0e] Crayons v4.1.1
[9a962f9c] DataAPI v1.16.0
[a93c6f00] DataFrames v1.6.1
[864edb3b] DataStructures v0.18.20
[e2d170a0] DataValueInterfaces v1.0.0
[e2ba6199] ExprTools v0.1.10
[53c48c17] FixedPointNumbers v0.8.5
[0c68f7d7] GPUArrays v10.2.1
[46192b85] GPUArraysCore v0.1.6
⌃ [61eb1bfa] GPUCompiler v0.26.5
[842dd82b] InlineStrings v1.4.1
[41ab1584] InvertedIndices v1.3.0
[82899510] IteratorInterfaceExtensions v1.0.0
[692b3bcd] JLLWrappers v1.5.0
[63c18a36] KernelAbstractions v0.9.21
⌅ [929cbde3] LLVM v7.2.1
[8b046642] LLVMLoopInfo v1.0.0
[b964fa9f] LaTeXStrings v1.3.1
[1914dd2f] MacroTools v0.5.13
[e1d29d7a] Missings v1.2.0
[5da4648a] NVTX v0.3.4
[bac558e1] OrderedCollections v1.6.3
[69de0a69] Parsers v2.8.1
[2dfb63ee] PooledArrays v1.4.3
[aea7be01] PrecompileTools v1.2.1
[21216c6a] Preferences v1.4.3
[08abe8d2] PrettyTables v2.3.2
[74087812] Random123 v1.7.0
[e6cf234a] RandomNumbers v1.5.3
[189a3867] Reexport v1.2.2
[ae029012] Requires v1.3.0
[6c6a2e73] Scratch v1.2.1
[91c51154] SentinelArrays v1.4.3
[a2af1166] SortingAlgorithms v1.2.1
[90137ffa] StaticArrays v1.9.6
[1e83bf80] StaticArraysCore v1.4.3
[892a3eda] StringManipulation v0.3.4
[3783bdb8] TableTraits v1.0.1
[bd369af6] Tables v1.11.1
[a759f4b9] TimerOutputs v0.5.24
[013be700] UnsafeAtomics v0.2.1
[d80eeb9a] UnsafeAtomicsLLVM v0.1.5
[4ee394cb] CUDA_Driver_jll v0.9.0+0
[76a88914] CUDA_Runtime_jll v0.14.0+1
[9c1d0b0a] JuliaNVTXCallbacks_jll v0.2.1+0
⌅ [dad2f222] LLVMExtra_jll v0.0.29+0
[e98f9f5b] NVTX_jll v3.1.0+2
[0dad84c5] ArgTools v1.1.1
[56f22d72] Artifacts
[2a0f44e3] Base64
[ade2ca70] Dates
[f43a241f] Downloads v1.6.0
[7b1f6079] FileWatching
[9fa8497b] Future
[b77e0a4c] InteractiveUtils
[4af54fe1] LazyArtifacts
[b27032c2] LibCURL v0.6.4
[76f85450] LibGit2
[8f399da3] Libdl
[37e2e46d] LinearAlgebra
[56ddb016] Logging
[d6f4376e] Markdown
[ca575930] NetworkOptions v1.2.0
[44cfe95a] Pkg v1.10.0
[de0858da] Printf
[3fa0cd96] REPL
[9a3f8284] Random
[ea8e919c] SHA v0.7.0
[9e88b42a] Serialization
[6462fe0b] Sockets
[2f01184e] SparseArrays v1.10.0
[10745b16] Statistics v1.10.0
[fa267f1f] TOML v1.0.3
[a4e569a6] Tar v1.10.0
[8dfed614] Test
[cf7118a7] UUIDs
[4ec0a83e] Unicode
[e66e0078] CompilerSupportLibraries_jll v1.1.1+0
[deac9b47] LibCURL_jll v8.4.0+0
[e37daf67] LibGit2_jll v1.6.4+0
[29816b5a] LibSSH2_jll v1.11.0+1
[c8ffd9c3] MbedTLS_jll v2.28.2+1
[14a3606d] MozillaCACerts_jll v2023.1.10
[4536629a] OpenBLAS_jll v0.3.23+4
[bea87d4a] SuiteSparse_jll v7.2.1+1
[83775a58] Zlib_jll v1.2.13+1
[8e850b90] libblastrampoline_jll v5.8.0+1
[8e850ede] nghttp2_jll v1.52.0+1
[3f19e933] p7zip_jll v17.4.0+2
Info Packages marked with ⌃ and ⌅ have new versions available. Those with ⌃ may be upgradable, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m`
What happens when you load AtomixCUDA?
The error still occurs after adding and loading AtomixCUDA.
(jl_OV6Zim) pkg> st
Status `/tmp/jl_OV6Zim/Project.toml`
[a9b6321e] Atomix v0.1.0
[6171a885] AtomixCUDA v0.1.0-DEV `https://github.com/JuliaConcurrent/Atomix.jl#main:lib/AtomixCUDA`
[052768ef] CUDA v5.4.2
[63c18a36] KernelAbstractions v0.9.22
julia> using AtomixCUDA
I won't be able to look at this in detail until August.
For now I would recommend just writing a CUDA.jl kernel and using CUDA.@atomic