`autodiff_deferred` fails with ERROR_ILLEGAL_ADDRESS for large `n` in a CUDA kernel
Version info
- Julia version: 1.10.8
- CUDA.jl version: 5.6.1
- Enzyme.jl version: 0.13.30 (commit 43654f904beeeed866305bd5b9e282818a098e50)
- GPU Model: NVIDIA GeForce RTX 4090
- Driver Version: 550.90.07
CUDA kernel
The kernel simply computes the pairwise norms of the differences between n static 3-vectors, filling an n x n matrix. The primal call to norm_kernel! works fine for every n, but grad_norm_kernel! raises an illegal address error. More specifically, the code runs successfully for "small" values of n (say 100, for instance), but it crashes for n=500 and higher values.
using CUDA
using Enzyme
using StaticArrays
using Random
using LinearAlgebra
T = Float32
n = 500
n_threads = 10
norms = CUDA.zeros(T, (n, n))
d_norms = CUDA.ones(T, (n, n))
coords_cpu = rand(SVector{3, T}, n)
coords = CuArray(coords_cpu)
d_coords = CUDA.zeros(eltype(coords), size(coords))
function norm_kernel!(norms, coords, ::Val{n}, ::Val{NTH}) where {n, NTH}
    tix = threadIdx().x
    idx = tix
    while idx <= n^2
        ai = cld(idx, n)
        aj = (idx - Int32(1)) % n + Int32(1)
        if ai <= n && aj <= n
            coord_i = coords[ai]
            coord_j = coords[aj]
            norms[ai, aj] = norm(coord_i - coord_j)
        end
        idx += NTH
    end
    return nothing
end
function grad_norm_kernel!(norms, d_norms, coords, d_coords, ::Val{n}, ::Val{n_threads}) where {n, n_threads}
    autodiff_deferred(
        Reverse,
        Const(norm_kernel!),
        Const,
        Duplicated(norms, d_norms),
        Duplicated(coords, d_coords),
        Const(Val(n)),
        Const(Val(n_threads))
    )
    return nothing
end
CUDA.@sync @cuda blocks=1 threads=n_threads norm_kernel!(norms, coords, Val(n), Val(n_threads))
CUDA.@sync @cuda blocks=1 threads=n_threads grad_norm_kernel!(norms, d_norms, coords, d_coords, Val(n), Val(n_threads))
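For reference, the primal result can be checked against a plain CPU version computed from coords_cpu right after the norm_kernel! launch (a minimal sketch; `norms_cpu` is just an illustrative name):

# CPU reference for the primal kernel, compared element-wise to the GPU result
norms_cpu = [norm(coords_cpu[i] - coords_cpu[j]) for i in 1:n, j in 1:n]
@assert Array(norms) ≈ norms_cpu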
Crash output
In the Julia REPL I get the following message:
ERROR: LoadError: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
This leaves the Julia session in an unstable state, and CUDA memory deallocation subsequently fails. In fact, when I call `exit()` I get another long error message suggesting that the issue might be related to how Enzyme.jl handles memory in CUDA kernels.
error in running finalizer: CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/.julia/packages/CUDA/1kIOw/lib/cudadrv/libcuda.jl:30
[2] check
@ ~/.julia/packages/CUDA/1kIOw/lib/cudadrv/libcuda.jl:37 [inlined]
[3] cuMemFreeAsync
@ ~/.julia/packages/CUDA/1kIOw/lib/utils/call.jl:34 [inlined]
[4] free(mem::CUDA.DeviceMemory; stream::CuStream)
@ CUDA ~/.julia/packages/CUDA/1kIOw/lib/cudadrv/memory.jl:87
[5] free
@ ~/.julia/packages/CUDA/1kIOw/lib/cudadrv/memory.jl:82 [inlined]
[6] #1102
@ ~/.julia/packages/CUDA/1kIOw/src/memory.jl:710 [inlined]
[7] #context!#990
@ ~/.julia/packages/CUDA/1kIOw/lib/cudadrv/state.jl:168 [inlined]
[8] context!
@ ~/.julia/packages/CUDA/1kIOw/lib/cudadrv/state.jl:163 [inlined]
[9] _pool_free
@ ~/.julia/packages/CUDA/1kIOw/src/memory.jl:709 [inlined]
[10] macro expansion
@ ./timing.jl:395 [inlined]
[11] pool_free(managed::CUDA.Managed{CUDA.DeviceMemory})
@ CUDA ~/.julia/packages/CUDA/1kIOw/src/memory.jl:691
[12] release(::GPUArrays.RefCounted{CUDA.Managed{CUDA.DeviceMemory}})
@ GPUArrays ~/.julia/packages/GPUArrays/uiVyU/src/host/abstractarray.jl:42
[13] unsafe_free!
@ ~/.julia/packages/GPUArrays/uiVyU/src/host/abstractarray.jl:100 [inlined]
[14] unsafe_free!(x::CuArray{SVector{3, Float32}, 1, CUDA.DeviceMemory})
@ GPUArrays ~/.julia/packages/GPUArrays/uiVyU/src/host/abstractarray.jl:115
[15] exit
@ ./initdefs.jl:28 [inlined]
[16] exit()
@ Base ./initdefs.jl:29
[17] eval
@ ./boot.jl:385 [inlined]
[18] eval_user_input(ast::Any, backend::REPL.REPLBackend, mod::Module)
@ REPL ~/.julia/juliaup/julia-1.10.8+0.x64.linux.gnu/share/julia/stdlib/v1.10/REPL/src/REPL.jl:150
[19] repl_backend_loop(backend::REPL.REPLBackend, get_module::Function)
@ REPL ~/.julia/juliaup/julia-1.10.8+0.x64.linux.gnu/share/julia/stdlib/v1.10/REPL/src/REPL.jl:246
[20] start_repl_backend(backend::REPL.REPLBackend, consumer::Any; get_module::Function)
@ REPL ~/.julia/juliaup/julia-1.10.8+0.x64.linux.gnu/share/julia/stdlib/v1.10/REPL/src/REPL.jl:231
[21] run_repl(repl::REPL.AbstractREPL, consumer::Any; backend_on_current_task::Bool, backend::Any)
@ REPL ~/.julia/juliaup/julia-1.10.8+0.x64.linux.gnu/share/julia/stdlib/v1.10/REPL/src/REPL.jl:389
[22] run_repl(repl::REPL.AbstractREPL, consumer::Any)
@ REPL ~/.julia/juliaup/julia-1.10.8+0.x64.linux.gnu/share/julia/stdlib/v1.10/REPL/src/REPL.jl:375
[23] (::Base.var"#1014#1016"{Bool, Bool, Bool})(REPL::Module)
@ Base ./client.jl:437
[24] #invokelatest#2
@ ./essentials.jl:892 [inlined]
[25] invokelatest
@ ./essentials.jl:889 [inlined]
[26] run_main_repl(interactive::Bool, quiet::Bool, banner::Bool, history_file::Bool, color_set::Bool)
@ Base ./client.jl:421
[27] exec_options(opts::Base.JLOptions)
@ Base ./client.jl:338
[28] _start()
@ Base ./client.jl:557
WARNING: Error while freeing DeviceMemory(5.859 KiB at 0x00000004021fd000):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc))
Is there a better way to manage memory when using Enzyme's `autodiff_deferred` with CUDA?
I include the full error message as an attachment.
@jgreener64, tagging you so you can follow the discussion.
@vchuravy I don't suppose you have any ideas on this one?
No, nothing immediate. I would run it through compute-sanitizer.
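One way to do this is to launch Julia under the sanitizer from the shell, or to spawn a fresh Julia process under it from within a session (a minimal sketch, assuming the `CUDA.run_compute_sanitizer` helper is available in this CUDA.jl version):

# From the shell, wrapping the whole Julia process:
#   compute-sanitizer --launch-timeout=0 --target-processes=all julia min_ex.jl
# Or from inside Julia, using CUDA.jl's bundled sanitizer (starts a new process):
using CUDA
CUDA.run_compute_sanitizer()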
> related to how Enzyme.jl handles memory in CUDA kernels.
No, after a segmentation fault the device must be reset. The finalizer error you are seeing is basically just saying "the device is in an error state".
I ran the kernel with compute-sanitizer and have attached the log file obtained inside a Julia session.
Below are a few lines from the output that might be useful:
========= Invalid __global__ write of size 1 bytes
========= at 0x1e90 in /lmb/home/ggambini/Desktop/MLIP_examples/GPU_kernel/min_ex.jl:21:grad_norm_kernel_
========= Address 0x0 is out of bounds
========= and is 8,724,152,320 bytes before the nearest allocation at 0x208000000 of size 8,388,864 bytes
The error appears to be similar to a basic "out of bounds" CUDA error (see the attached screenshot).
Finally, here is the summary when the same kernel is run with n=50:
========= ERROR SUMMARY: 0 errors
Process(`/net/nfs1/public/EM/CUDA/cuda-12.1/bin/compute-sanitizer --launch-timeout=0 --target-processes=all --report-api-errors=no /lmb/home/ggambini/.julia/juliaup/julia-1.10.8+0.x64.linux.gnu/bin/julia -C native -J/lmb/home/ggambini/.julia/juliaup/julia-1.10.8+0.x64.linux.gnu/lib/julia/sys.so -g1`, ProcessExited(0))