Enzyme.jl
Enzyme.jl copied to clipboard
[Julia 1.11] CUDA support
Before tackling the big complicated things in https://github.com/EnzymeAD/Enzyme.jl/issues/2365, here is a minimal reproducer:
using Enzyme
using CUDA
# Primal kernel: squares the first element of `A` in place and returns nothing.
# Kept deliberately minimal, since this is the function being differentiated in `df`.
function f(A)
# Read A[1], multiply it by itself, and store the result back into A[1].
A[1] *= A[1]
return nothing
end
# One-element device array filled with ones; it reaches the kernel as a
# CuDeviceVector{Float32, 1} (see the `cufunction` frames in the stack trace).
A = CUDA.ones(1)
# Launch the primal kernel `f` on the GPU.
@cuda f(A)
# Derivative kernel: differentiates `f` in reverse mode from inside a GPU kernel.
# `A` is the primal array and `dA` its shadow; they are passed together as `Duplicated`.
function df(A, dA)
# `f` itself is marked `Const`, and the return activity is `Const`
# because `f` returns nothing.
autodiff_deferred(Reverse, Const(f), Const, Duplicated(A, dA))
return nothing
end
# Shadow array for `A`, initialised to ones.
dA = CUDA.ones(1)
# Launching the derivative kernel is the step that fails during compilation
# with the AssertionError reported in this issue.
@cuda df(A, dA)
Fails with:
ERROR: LoadError: AssertionError: actualRetType != Union{}
Stacktrace:
[1] codegen(output::Symbol, job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, toplevel::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
@ Enzyme.Compiler ~/src/Enzyme/src/compiler.jl:4226
[2] codegen(output::Symbol, job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams})
@ Enzyme.Compiler ~/src/Enzyme/src/compiler.jl:3455
[3] (::GPUCompiler.var"#157#161"{GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams}})()
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:226
[4] get!(default::GPUCompiler.var"#157#161"{GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams}}, h::Dict{GPUCompiler.CompilerJob, String}, key::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams})
@ Base ./dict.jl:458
[5] macro expansion
@ ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:224 [inlined]
[6] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/utils.jl:116
[7] emit_llvm(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/utils.jl:114
[8] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:95
[9] compile_unhooked
@ ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:80 [inlined]
[10] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:67
[11] compile
@ ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:55 [inlined]
[12] #1181
@ ~/.julia/packages/CUDA/LhtzZ/src/compiler/compilation.jl:250 [inlined]
[13] JuliaContext(f::CUDA.var"#1181#1184"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:34
[14] JuliaContext(f::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/driver.jl:25
[15] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/LhtzZ/src/compiler/compilation.jl:249
[16] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/execution.jl:245
[17] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/3QaEh/src/execution.jl:159
[18] macro expansion
@ ~/.julia/packages/CUDA/LhtzZ/src/compiler/execution.jl:373 [inlined]
[19] macro expansion
@ ./lock.jl:273 [inlined]
[20] cufunction(f::typeof(df), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}; kwargs::@Kwargs{})
@ CUDA ~/.julia/packages/CUDA/LhtzZ/src/compiler/execution.jl:368
[21] cufunction(f::typeof(df), tt::Type{Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}})
@ CUDA ~/.julia/packages/CUDA/LhtzZ/src/compiler/execution.jl:365
[22] top-level scope
@ ~/.julia/packages/CUDA/LhtzZ/src/compiler/execution.jl:112
This makes me think that we are doing something fundamentally wrong with type inference in the case of nested compilation.
So yeah, we are constructing a `primal_job` that is hella wrong:
primal_job = GPUCompiler.CompilerJob{GPUCompiler.NativeCompilerTarget, Enzyme.Compiler.PrimalCompilerParams}(MethodInstance for f(::CuDeviceVector{Float32, 1}), CompilerConfig for GPUCompiler.NativeCompilerTarget, 0x00000000000068a4)
Do we have a parent that's valid?
This is the announced change from https://github.com/JuliaGPU/GPUCompiler.jl/pull/668#issuecomment-2665477244
We are matching the wrong function and should be looking at the config instead. I will try rewriting all that tomorrow morning.