Enzyme.jl icon indicating copy to clipboard operation
Enzyme.jl copied to clipboard

KernelGradients: GPU compilation of kernel failed.

Open jumerckx opened this issue 3 years ago • 1 comments

Running the matmul example from the KernelGradients package using CUDA throws an error. As far as I can tell it's not caused by the specific kernel, because a simple copy kernel produces the same error. On CPU things run as expected.

To reproduce:

using Enzyme, KernelAbstractions, KernelGradients, CUDAKernels, CUDA

# Naive matrix-multiply kernel: each work-item computes one entry c[i, j]
# as the dot product of row i of `a` with column j of `b`.
@kernel function matmul_kernel!(a, b, c)
    i, j = @index(Global, NTuple)

    # Accumulate in a register-local variable typed after `c`'s eltype
    # so the kernel stays type-stable for any element type.
    acc = zero(eltype(c))
    for k in 1:size(a, 2)
        @inbounds acc += a[i, k] * b[k, j]
    end

    c[i, j] = acc
end


# Forward pass: instantiate the kernel for the CUDA device and run it.
matmul = matmul_kernel!(CUDADevice())
a = CUDA.rand(128, 256)
b = CUDA.rand(256, 128)
c = CUDA.zeros(128, 128)
wait(matmul(a, b, c, ndrange=size(c)))

c ≈ a*b # OK

# Reverse pass: build the differentiated kernel and seed the shadow buffers
# (zero gradients for the inputs, ones for the output seed).
dmatmul = Enzyme.autodiff(matmul)
da = similar(a)
db = similar(b)
dc = similar(c)
fill!(da, 0)
fill!(db, 0)
fill!(dc, 1)
fill!(c, 0)

compare_dc = copy(dc)
wait(dmatmul(
    Duplicated(a, da),
    Duplicated(b, db),
    Duplicated(c, dc), ndrange=size(c)))

Error+stacktrace:

ERROR: GPU compilation of kernel df(Cassette.Context{nametype(CUDACtx), Nothing, Nothing, KernelAbstractions.var"##PassType#274", Nothing, Cassette.DisableHooks}, KernelGradients.var"#df#5"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}, KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}) failed
KernelError: kernel returns a value of type `Tuple{}`

Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.

Stacktrace:
  [1] check_method(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/1Ajz2/src/validation.jl:21
  [2] macro expansion
    @ ~/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/GPUCompiler/1Ajz2/src/driver.jl:89 [inlined]
  [4] emit_julia(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/1Ajz2/src/utils.jl:64
  [5] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/bki2w/src/compiler/execution.jl:324
  [6] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/1Ajz2/src/cache.jl:90
  [7] cufunction(f::typeof(Cassette.overdub), tt::Type{Tuple{Cassette.Context{nametype(CUDACtx), Nothing, Nothing, KernelAbstractions.var"##PassType#274", Nothing, Cassette.DisableHooks}, KernelGradients.var"#df#5"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}, KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceMatrix{Float32, 1}}}}; name::String, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/bki2w/src/compiler/execution.jl:297
  [8] macro expansion
    @ ~/.julia/packages/CUDA/bki2w/src/compiler/execution.jl:102 [inlined]
  [9] (::KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, KernelGradients.var"#df#5"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}})(::Duplicated{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, ::Vararg{Duplicated{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}; ndrange::Tuple{Int64, Int64}, dependencies::CUDAKernels.CudaEvent, workgroupsize::Nothing, progress::Function)
    @ CUDAKernels ~/.julia/packages/CUDAKernels/kCOA4/src/CUDAKernels.jl:194
 [10] top-level scope
    @ REPL[16]:12
 [11] top-level scope
    @ ~/.julia/packages/CUDA/bki2w/src/initialization.jl:52

jumerckx avatar Feb 04 '22 22:02 jumerckx

I am now seeing a different error:


ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
 [1] throw_api_error(res::CUDA.cudaError_enum)
   @ CUDA ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/error.jl:89
 [2] isdone
   @ ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/stream.jl:109 [inlined]
 [3] nonblocking_synchronize
   @ ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/stream.jl:139 [inlined]
 [4] nonblocking_synchronize
   @ ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/context.jl:325 [inlined]
 [5] device_synchronize()
   @ CUDA ~/.julia/packages/CUDA/DfvRa/lib/cudadrv/context.jl:319
 [6] top-level scope
   @ ~/.julia/packages/CUDA/DfvRa/src/initialization.jl:54

wsmoses avatar Aug 06 '22 18:08 wsmoses