Support for `sincos` NVIDIA intrinsic
Hi,
I have a kernel that depends on NVIDIA's `sincos` intrinsic, which I am not able to differentiate with Enzyme.
MWE in Julia:
using Enzyme, CUDA, KernelAbstractions, Adapt
# Host-side launcher: instantiates `sincos_kernel!` for `backend`, runs it over
# `x` with `ndrange=length(x)`, then calls `KernelAbstractions.synchronize` on
# the backend before returning.
function sincos_caller!(x, backend)
sincos_kernel!(backend)(x; ndrange=length(x))
KernelAbstractions.synchronize(backend)
end
# Kernel: at global linear index `i`, overwrite `x[i]` with the sum of the sine
# and cosine of its previous value, both obtained from a single `sincos` call.
# This `sincos` call is what the issue reports Enzyme cannot differentiate.
@kernel function sincos_kernel!(x)
i = @index(Global, Linear)
s, c = sincos(x[i])
x[i] = s + c
end
# Reproducer driver: build a 100-element Float32 input `x` and a shadow `∂x`
# filled with ones, adapting both to the CUDA backend.
backend = CUDABackend()
x = rand(Float32, 100) |> adapt(backend)
∂x = ones(Float32, 100) |> adapt(backend)
# Reverse-mode autodiff of the launcher; this is the call that raises the
# AssertionError reported in this issue.
autodiff(Reverse, sincos_caller!, Duplicated(x, ∂x), Const(backend))
This gives the following error:
ERROR: AssertionError: expectedTapeType === TapeType
Stacktrace:
[1] macro expansion
@ ~/BlochHole/.julia/packages/Enzyme/EITgk/src/compiler.jl:2597 [inlined]
[2] macro expansion
@ ~/BlochHole/.julia/packages/LLVM/iza6e/src/base.jl:97 [inlined]
[3] enzyme!(job::GPUCompiler.CompilerJob{…}, interp::Enzyme.Compiler.Interpreter.EnzymeInterpreter{…}, mod::LLVM.Module, primalf::LLVM.Function, TT::Type, mode::Enzyme.API.CDerivativeMode, width::Int64, parallel::Bool, actualRetType::Type, wrap::Bool, modifiedBetween::Tuple{…} where N, returnPrimal::Bool, expectedTapeType::Type, loweredArgs::Set{…}, boxedArgs::Set{…})
@ Enzyme.Compiler ~/BlochHole/.julia/packages/Enzyme/EITgk/src/compiler.jl:2534
[4] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob{…})
@ Enzyme.Compiler ~/BlochHole/.julia/packages/Enzyme/EITgk/src/compiler.jl:5138
[5] codegen(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:77
[6] codegen
@ ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:71 [inlined]
[7] (::GPUCompiler.var"#156#161"{GPUCompiler.CompilerJob{…}})()
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:256
[8] get!(default::GPUCompiler.var"#156#161"{…}, h::Dict{…}, key::GPUCompiler.CompilerJob{…})
@ Base ./dict.jl:479
[9] macro expansion
@ ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:252 [inlined]
[10] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/utils.jl:116
[11] emit_llvm
@ ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/utils.jl:114 [inlined]
[12] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:95
[13] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:80
[14] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:67
[15] compile
@ ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:55 [inlined]
[16] #1188
@ ~/BlochHole/.julia/packages/CUDA/g94EB/src/compiler/compilation.jl:250 [inlined]
[17] JuliaContext(f::CUDA.var"#1188#1191"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{})
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:34
[18] JuliaContext(f::Function)
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/driver.jl:25
[19] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/BlochHole/.julia/packages/CUDA/g94EB/src/compiler/compilation.jl:249
[20] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/execution.jl:245
[21] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/BlochHole/.julia/packages/GPUCompiler/4uj4i/src/execution.jl:159
[22] macro expansion
@ ~/BlochHole/.julia/packages/CUDA/g94EB/src/compiler/execution.jl:373 [inlined]
[23] macro expansion
@ ./lock.jl:267 [inlined]
[24] cufunction(f::typeof(EnzymeExt.gpu_aug_fwd), tt::Type{…}; kwargs::@Kwargs{…})
@ CUDA ~/BlochHole/.julia/packages/CUDA/g94EB/src/compiler/execution.jl:368
[25] macro expansion
@ ~/BlochHole/.julia/packages/CUDA/g94EB/src/compiler/execution.jl:112 [inlined]
[26] (::KernelAbstractions.Kernel{…})(::Function, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
@ CUDA.CUDAKernels ~/BlochHole/.julia/packages/CUDA/g94EB/src/CUDAKernels.jl:127
[27] #augmented_primal#7
@ ~/BlochHole/.julia/packages/KernelAbstractions/X5fk1/ext/EnzymeCore08Ext.jl:264
[28] augmented_primal
@ ~/BlochHole/.julia/packages/KernelAbstractions/X5fk1/ext/EnzymeCore08Ext.jl:214 [inlined]
[29] sincos_caller!
@ ./REPL[5]:2 [inlined]
[30] sincos_caller!
@ ./REPL[5]:0 [inlined]
[31] diffejulia_sincos_caller__1063_inner_1wrap
@ ./REPL[5]:0
[32] macro expansion
@ ~/BlochHole/.julia/packages/Enzyme/EITgk/src/compiler.jl:5923 [inlined]
[33] enzyme_call
@ ~/BlochHole/.julia/packages/Enzyme/EITgk/src/compiler.jl:5454 [inlined]
[34] CombinedAdjointThunk
@ ~/BlochHole/.julia/packages/Enzyme/EITgk/src/compiler.jl:5340 [inlined]
[35] autodiff
@ ~/BlochHole/.julia/packages/Enzyme/EITgk/src/Enzyme.jl:534 [inlined]
[36] autodiff
@ ~/BlochHole/.julia/packages/Enzyme/EITgk/src/Enzyme.jl:575 [inlined]
[37] autodiff(::ReverseMode{…}, ::typeof(sincos_caller!), ::Duplicated{…}, ::Const{…})
@ Enzyme ~/BlochHole/.julia/packages/Enzyme/EITgk/src/Enzyme.jl:547
[38] top-level scope
@ REPL[10]:1
[39] top-level scope
@ none:1
Some type information was truncated. Use `show(err)` to see complete types.
My understanding is that this would be an easy addition to the list linked below (adding `__nv_sincosf` should be enough, I think):
https://github.com/EnzymeAD/Enzyme/blob/5655a0c72214755d886e8dff1bb47788908be999/enzyme/Enzyme/InstructionDerivatives.td#L541
Thanks for all the help! If this belongs in the Enzyme.jl repo, feel free to move it.
Bumping this issue; I'm experiencing it as well. Any insight on how it could be fixed would be appreciated, and I can try to create a PR.
I believe @cncastillo identified the correct fix, so would either of you be up to opening a PR?
Awesome! I submitted a PR.