Enzyme.jl
KernelGradients: GPU compilation of kernel failed.
Running the matmul example from the KernelGradients package on CUDA throws an error. As far as I can tell, the problem is not specific to this kernel: a simple copy kernel fails with the same error (a sketch is included after the reproducer below). On the CPU everything runs as expected.
To reproduce:
using Enzyme, KernelAbstractions, KernelGradients, CUDAKernels, CUDA
@kernel function matmul_kernel!(a, b, c)
    i, j = @index(Global, NTuple)
    # temporary accumulator for the dot product of row i of a with column j of b
    tmp_sum = zero(eltype(c))
    for k in 1:size(a, 2)
        @inbounds tmp_sum += a[i, k] * b[k, j]
    end
    c[i, j] = tmp_sum
end
matmul = matmul_kernel!(CUDADevice())
a = CUDA.rand(128, 256)
b = CUDA.rand(256, 128)
c = CUDA.zeros(128, 128)
wait(matmul(a, b, c, ndrange=size(c)))
c ≈ a*b # OK
dmatmul = Enzyme.autodiff(matmul)
da = similar(a)
da .= 0
db = similar(b)
db .= 0
dc = similar(c)
dc .= 1
c .= 0
compare_dc = copy(dc)
wait(dmatmul(
    Duplicated(a, da),
    Duplicated(b, db),
    Duplicated(c, dc), ndrange=size(c)))
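For reference, here is the kind of copy kernel I mean; it fails with the same KernelError. (A minimal sketch; the kernel name and sizes are illustrative, not the exact code I ran.)

@kernel function copy_kernel!(dst, src)
    i = @index(Global, Linear)
    @inbounds dst[i] = src[i]
end

copyk = copy_kernel!(CUDADevice())
src = CUDA.rand(1024)
dst = CUDA.zeros(1024)
dcopyk = Enzyme.autodiff(copyk)
dsrc = CUDA.zeros(1024)
ddst = CUDA.ones(1024)
# fails during GPU compilation with the same error as below
wait(dcopyk(Duplicated(dst, ddst), Duplicated(src, dsrc), ndrange=length(dst)))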
Error and stack trace:
ERROR: GPU compilation of kernel df(Cassette.Context{nametype(CUDACtx), Nothing, Nothing, KernelAbstractions.var"##PassType#274", Nothing, Cassette.DisableHooks}, KernelGradients.var"#df#5"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}, KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}) failed
KernelError: kernel returns a value of type `Tuple{}`
Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.
Stacktrace:
[1] check_method(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/1Ajz2/src/validation.jl:21
[2] macro expansion
@ ~/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
[3] macro expansion
@ ~/.julia/packages/GPUCompiler/1Ajz2/src/driver.jl:89 [inlined]
[4] emit_julia(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/1Ajz2/src/utils.jl:64
[5] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/bki2w/src/compiler/execution.jl:324
[6] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/1Ajz2/src/cache.jl:90
[7] cufunction(f::typeof(Cassette.overdub), tt::Type{Tuple{Cassette.Context{nametype(CUDACtx), Nothing, Nothing, KernelAbstractions.var"##PassType#274", Nothing, Cassette.DisableHooks}, KernelGradients.var"#df#5"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}, KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}}}; name::String, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA ~/.julia/packages/CUDA/bki2w/src/compiler/execution.jl:297
[8] macro expansion
@ ~/.julia/packages/CUDA/bki2w/src/compiler/execution.jl:102 [inlined]
[9] (::KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, KernelGradients.var"#df#5"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}})(::Duplicated{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, ::Vararg{Duplicated{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}; ndrange::Tuple{Int64, Int64}, dependencies::CUDAKernels.CudaEvent, workgroupsize::Nothing, progress::Function)
@ CUDAKernels ~/.julia/packages/CUDAKernels/kCOA4/src/CUDAKernels.jl:194
[10] top-level scope
@ REPL[16]:12
[11] top-level scope
@ ~/.julia/packages/CUDA/bki2w/src/initialization.jl:52
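For contrast, the same differentiation runs as expected on the CPU. A sketch of what I mean (assuming KernelAbstractions' CPU() device with plain Arrays; otherwise identical to the reproducer above):

matmul_cpu = matmul_kernel!(CPU(), 16)
a_h = rand(Float32, 128, 256)
b_h = rand(Float32, 256, 128)
c_h = zeros(Float32, 128, 128)
da_h = zero(a_h); db_h = zero(b_h)
dc_h = ones(Float32, 128, 128)
dmatmul_cpu = Enzyme.autodiff(matmul_cpu)
wait(dmatmul_cpu(Duplicated(a_h, da_h), Duplicated(b_h, db_h),
                 Duplicated(c_h, dc_h), ndrange=size(c_h)))
da_h ≈ dc_h * b_h'  # reverse-mode gradient w.r.t. a, seeded with dc
db_h ≈ a_h' * dc_h  # reverse-mode gradient w.r.t. b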
Update: I am now seeing a different error (note the CUDA.jl version path in the trace has changed):
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/error.jl:89
[2] isdone
@ ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/stream.jl:139 [inlined]
[4] nonblocking_synchronize
@ ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/context.jl:325 [inlined]
[5] device_synchronize()
@ CUDA ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/context.jl:319
[6] top-level scope
@ ~/.julia/packages/CUDA/DfvRa/src/initialization.jl:54
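For comparison (bypassing the KernelGradients path entirely), Enzyme's GPU documentation shows differentiating a hand-written CUDA kernel on-device with Enzyme.autodiff_deferred. A sketch along those lines; the exact autodiff_deferred signature has changed between Enzyme versions, so treat this as indicative rather than exact:

using CUDA, Enzyme

function mul_kernel(A)
    i = threadIdx().x
    if i <= length(A)
        @inbounds A[i] *= A[i]
    end
    return nothing
end

# wrapper kernel: run reverse-mode AD of mul_kernel on the device
function grad_mul_kernel(A, dA)
    Enzyme.autodiff_deferred(Reverse, mul_kernel, Const, Duplicated(A, dA))
    return nothing
end

A = CUDA.ones(64)
dA = similar(A); dA .= 1
@cuda threads=length(A) grad_mul_kernel(A, dA)
# with A .== 1, dA should now be 2 (derivative of A[i]^2, seeded with 1)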