KernelAbstractions.jl
KernelAbstractions.jl copied to clipboard
Issue with using Enzyme on KernelAbstractions kernels with the CUDA backend
I want to design an `rrule` (from ChainRules) for my kernel. Below is a simple reproducible example.
System info:
Julia 1.10
CUDA v5.4.0
ChainRulesCore v1.23.0
ChainRulesTestUtils v1.13.0
Enzyme v0.12.9 https://github.com/EnzymeAD/Enzyme.jl.git#main
EnzymeTestUtils v0.1.7
KernelAbstractions v0.9.20 https://github.com/JuliaGPU/KernelAbstractions.jl#main
GPU: Nvidia RTX 3090
code
using KernelAbstractions
using ChainRulesCore, Zygote, CUDA, Enzyme, Test
# Copy kernel: every work-item stages one element of `A` in workgroup-local
# memory and then writes that same element to `A_out`.
#
# Arguments
#   A      input array, declared read-only via `@Const`
#   A_out  output array, written at the work-item's global index
@kernel function example_kenr(@Const(A), A_out)
    index = @index(Global)
    # One Float32 slot per work-item of the group, declared as an (N, 1) array.
    shared_arr = @localmem Float32 (@groupsize()[1], 1)
    local_index = @index(Local, Linear)
    # Each work-item reads back only the slot it wrote itself, so no
    # `@synchronize` is needed between the store and the load.
    shared_arr[local_index, 1] = A[index]
    A_out[index] = shared_arr[local_index, 1]
end
# Launch `example_kenr` on the backend that owns `A`, using a workgroup size of
# 256 and one work-item per entry along the first dimension of `A`, then block
# until the backend has finished. `A_out` is overwritten in place; the function
# itself returns `nothing`.
function call_example(A, A_out)
    backend = get_backend(A)
    launch = example_kenr(backend, 256)
    launch(A, A_out; ndrange=size(A, 1))
    KernelAbstractions.synchronize(backend)
    return nothing
end
# Forward-pass sanity check: the kernel must copy `A` (all twos) into `A_out`.
A = 2 .* CUDA.ones(10)
A_out = CUDA.ones(10)
call_example(A, A_out)
@test A_out == 2 .* CUDA.ones(10)
# Reverse rule for `call_example`, with the pullback delegated to Enzyme.
#
# The primal pass runs `call_example(A, A_out)`, which overwrites `A_out` in
# place, and returns `A_out` as the primal result. The pullback takes the
# cotangent of `A_out` and returns `(NoTangent(), d_A, d_A_out)`: no tangent for
# the function itself, followed by the shadow arrays handed to Enzyme for `A`
# and `A_out`.
function ChainRulesCore.rrule(::typeof(call_example), A, A_out)
    # Primal pass: fills A_out by mutation.
    call_example(A, A_out)

    function call_test_kernel1_pullback(d_A_out)
        # Materialise the incoming cotangent as a fresh device array (via a host
        # round-trip), so the array passed in by the caller is not the one
        # handed to Enzyme as a shadow.
        shadow_out = CuArray(collect(d_A_out))
        # The shadow wrapped in `Duplicated` must have the same type as its
        # primal; `zero(A)` follows A's element type and shape instead of
        # assuming Float32.
        d_A = zero(A)
        Enzyme.autodiff_deferred(
            Enzyme.Reverse,
            call_example,
            Const,
            Duplicated(A, d_A),
            Duplicated(A_out, shadow_out),
        )
        # NoTangent for the function itself.
        return NoTangent(), d_A, shadow_out
    end

    return A_out, call_test_kernel1_pullback
end
# Build the rule for the arrays defined above, then seed the pullback with a
# cotangent of ones; this call is the one that raises the error shown below.
out, pull_back = ChainRulesCore.rrule(call_example, A, A_out)
pull_back(CUDA.ones(10))
error
ERROR: Enzyme.Compiler.EnzymeRuntimeException(Cstring(0x00007d0b6b39b32c))
Stacktrace:
[1] throwerr(cstr::Cstring)
@ Enzyme.Compiler ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:1338
[2] getindex
@ ./essentials.jl:13 [inlined]
[3] get
@ ./dict.jl:525 [inlined]
[4] compiler_cache
@ ~/.julia/packages/CUDA/DS19C/src/compiler/compilation.jl:166 [inlined]
[5] macro expansion
@ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:366 [inlined]
[6] macro expansion
@ ./lock.jl:267 [inlined]
[7] #cufunction#1169
@ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:364
[8] cufunction
@ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:361 [inlined]
[9] macro expansion
@ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:112 [inlined]
[10] #_#4
@ ~/.julia/packages/CUDA/DS19C/src/CUDAKernels.jl:103 [inlined]
[11] augmented_julia____4_10560_inner_1wrap
@ ~/.julia/packages/CUDA/DS19C/src/CUDAKernels.jl:0
[12] macro expansion
@ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5916 [inlined]
[13] enzyme_call
@ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5566 [inlined]
[14] AugmentedForwardThunk
@ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5454 [inlined]
[15] runtime_generic_augfwd(activity::Type{…}, width::Val{…}, ModifiedBetween::Val{…}, RT::Val{…}, f::CUDA.CUDAKernels.var"##_#4", df::Nothing, primal_1::Int64, shadow_1_1::Nothing, primal_2::Nothing, shadow_2_1::Nothing, primal_3::KernelAbstractions.Kernel{…}, shadow_3_1::Nothing, primal_4::CuArray{…}, shadow_4_1::CuArray{…}, primal_5::CuArray{…}, shadow_5_1::CuArray{…})
@ Enzyme.Compiler ~/.julia/packages/Enzyme/sDjFs/src/rules/jitrules.jl:179
[16] Kernel
@ ~/.julia/packages/CUDA/DS19C/src/CUDAKernels.jl:89 [inlined]
[17] call_example
@ ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:15 [inlined]
[18] diffejulia_call_example_4822wrap
@ ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:0
[19] macro expansion
@ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5916 [inlined]
[20] enzyme_call
@ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5566 [inlined]
[21] CombinedAdjointThunk
@ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5443 [inlined]
[22] autodiff_deferred
@ ~/.julia/packages/Enzyme/sDjFs/src/Enzyme.jl:440 [inlined]
[23] autodiff_deferred
@ ~/.julia/packages/Enzyme/sDjFs/src/Enzyme.jl:510 [inlined]
[24] (::var"#call_test_kernel1_pullback#5"{CuArray{…}, CuArray{…}})(d_A_out::CuArray{Float32, 1, CUDA.DeviceMemory})
@ Main ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:36
[25] top-level scope
@ ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:48
Some type information was truncated. Use `show(err)` to see complete types.
X-ref #454
OK, thanks for the reference. So, if I understand correctly, this issue is close to being solved — fantastic!