KernelAbstractions.jl
Broadcasting fails with CUDA: unsupported dynamic function invocation (call to overdub)
This code fails when using CUDADevice(). When using CPU(), however, everything works just fine. Michael Abbott identified this as a problem with the broadcasting: replacing the broadcast calls with map resolves the error (see the sketch at the end of this post). (slack thread)
using KernelAbstractions, CUDA

@kernel function ker(a)
    I = @index(Global, NTuple)
    # the broadcasts on the next line are what trigger the error on the GPU
    a[I...] = round.(I .+ 0.5)[1]
end

f = ker(CUDADevice(), 256)
a = cu(rand(10, 10) .* 10)
event = f(a, ndrange=size(a))
ERROR: InvalidIRError: compiling kernel gpu_ker(Cassette.Context{nametype(CUDACtx),KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.DynamicCheck,Nothing,CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},KernelAbstractions.NDIteration.NDRange{2,KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.StaticSize{(256, 1)},CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},Nothing}},Nothing,KernelAbstractions.var"##PassType#253",Nothing,Cassette.DisableHooks}, typeof(gpu_ker), CuDeviceArray{Float32,2,CUDA.AS.Global}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to overdub)
Stacktrace:
[1] copy at broadcast.jl:1023
[2] materialize at broadcast.jl:820
[3] macro expansion at Untitled-2:37
[4] gpu_ker at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\macros.jl:80
[5] overdub at C:\Users\jules\.julia\packages\Cassette\158rp\src\overdub.jl:0
Reason: unsupported dynamic function invocation (call to overdub)
Stacktrace:
[1] copy at broadcast.jl:1024
[2] materialize at broadcast.jl:820
[3] macro expansion at Untitled-2:37
[4] gpu_ker at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\macros.jl:80
[5] overdub at C:\Users\jules\.julia\packages\Cassette\158rp\src\overdub.jl:0
Reason: unsupported dynamic function invocation (call to overdub)
Stacktrace:
[1] macro expansion at Untitled-2:37
[2] gpu_ker at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\macros.jl:80
[3] overdub at C:\Users\jules\.julia\packages\Cassette\158rp\src\overdub.jl:0
Reason: unsupported dynamic function invocation (call to overdub(overdub_context::Cassette.Context, overdub_arguments...) in Cassette at C:\Users\jules\.julia\packages\Cassette\158rp\src\overdub.jl:587)
Stacktrace:
[1] axes at broadcast.jl:206
[2] copy at broadcast.jl:1021
[3] materialize at broadcast.jl:820
[4] macro expansion at Untitled-2:37
[5] gpu_ker at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\macros.jl:80
[6] overdub at C:\Users\jules\.julia\packages\Cassette\158rp\src\overdub.jl:0
Reason: unsupported dynamic function invocation (call to overdub)
Stacktrace:
[1] copy at broadcast.jl:1022
[2] materialize at broadcast.jl:820
[3] macro expansion at Untitled-2:37
[4] gpu_ker at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\macros.jl:80
[5] overdub at C:\Users\jules\.julia\packages\Cassette\158rp\src\overdub.jl:0
Stacktrace:
[1] check_ir(::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget,CUDA.CUDACompilerParams}, ::LLVM.Module) at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\validation.jl:123
[2] macro expansion at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\driver.jl:241 [inlined]
[3] macro expansion at C:\Users\jules\.julia\packages\TimerOutputs\dVnaw\src\TimerOutput.jl:206 [inlined]
[4] codegen(::Symbol, ::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget,CUDA.CUDACompilerParams}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\driver.jl:239
[5] compile(::Symbol, ::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget,CUDA.CUDACompilerParams}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\driver.jl:39
[6] compile at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\driver.jl:35 [inlined]
[7] _cufunction(::GPUCompiler.FunctionSpec{typeof(Cassette.overdub),Tuple{Cassette.Context{nametype(CUDACtx),KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.DynamicCheck,Nothing,CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},KernelAbstractions.NDIteration.NDRange{2,KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.StaticSize{(256, 1)},CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},Nothing}},Nothing,KernelAbstractions.var"##PassType#253",Nothing,Cassette.DisableHooks},typeof(gpu_ker),CuDeviceArray{Float32,2,CUDA.AS.Global}}}; kwargs::Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:maxthreads,),Tuple{Int64}}}) at C:\Users\jules\.julia\packages\CUDA\h38pe\src\compiler\execution.jl:308
[8] #87 at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\cache.jl:21 [inlined]
[9] get!(::GPUCompiler.var"#87#88"{Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:maxthreads,),Tuple{Int64}}},typeof(CUDA._cufunction),GPUCompiler.FunctionSpec{typeof(Cassette.overdub),Tuple{Cassette.Context{nametype(CUDACtx),KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.DynamicCheck,Nothing,CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},KernelAbstractions.NDIteration.NDRange{2,KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.StaticSize{(256, 1)},CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},Nothing}},Nothing,KernelAbstractions.var"##PassType#253",Nothing,Cassette.DisableHooks},typeof(gpu_ker),CuDeviceArray{Float32,2,CUDA.AS.Global}}}}, ::Dict{UInt64,Any}, ::UInt64) at .\dict.jl:452
[10] macro expansion at .\lock.jl:183 [inlined]
[11] check_cache(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(Cassette.overdub),Tuple{Cassette.Context{nametype(CUDACtx),KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.DynamicCheck,Nothing,CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},KernelAbstractions.NDIteration.NDRange{2,KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.StaticSize{(256, 1)},CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},Nothing}},Nothing,KernelAbstractions.var"##PassType#253",Nothing,Cassette.DisableHooks},typeof(gpu_ker),CuDeviceArray{Float32,2,CUDA.AS.Global}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:maxthreads,),Tuple{Int64}}}) at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\cache.jl:19
[12] + at .\int.jl:53 [inlined]
[13] hash_64_64 at .\hashing.jl:35 [inlined]
[14] hash_uint64 at .\hashing.jl:62 [inlined]
[15] hx at .\float.jl:568 [inlined]
[16] hash at .\float.jl:571 [inlined]
[17] cached_compilation(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(Cassette.overdub),Tuple{Cassette.Context{nametype(CUDACtx),KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.DynamicCheck,Nothing,CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},KernelAbstractions.NDIteration.NDRange{2,KernelAbstractions.NDIteration.DynamicSize,KernelAbstractions.NDIteration.StaticSize{(256, 1)},CartesianIndices{2,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}},Nothing}},Nothing,KernelAbstractions.var"##PassType#253",Nothing,Cassette.DisableHooks},typeof(gpu_ker),CuDeviceArray{Float32,2,CUDA.AS.Global}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:maxthreads,),Tuple{Int64}}}) at C:\Users\jules\.julia\packages\GPUCompiler\4e9CU\src\cache.jl:0
[18] cufunction(::Function, ::Type{T} where T; name::String, kwargs::Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:maxthreads,),Tuple{Int64}}}) at C:\Users\jules\.julia\packages\CUDA\h38pe\src\compiler\execution.jl:296
[19] macro expansion at C:\Users\jules\.julia\packages\CUDA\h38pe\src\compiler\execution.jl:108 [inlined]
[20] (::KernelAbstractions.Kernel{CUDADevice,KernelAbstractions.NDIteration.StaticSize{(256,)},KernelAbstractions.NDIteration.DynamicSize,typeof(gpu_ker)})(::CuArray{Float32,2,Nothing}; ndrange::Tuple{Int64,Int64}, dependencies::Nothing, workgroupsize::Nothing, progress::Function) at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\backends\cuda.jl:211
[21] top-level scope at REPL[2]:1
Is this a bug, or is it just something KernelAbstractions doesn't support? Thanks!
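For anyone hitting the same error, here is a minimal sketch of the map-based workaround mentioned above (my own rewrite of the kernel based on that suggestion, not verified code from the slack thread). map over the @index NTuple is unrolled at compile time, so it sidesteps the Base.Broadcast machinery that Cassette fails to overdub:

using KernelAbstractions, CUDA

@kernel function ker_map(a)
    I = @index(Global, NTuple)
    # map over the index tuple instead of broadcasting over it
    a[I...] = map(i -> round(i + 0.5), I)[1]
end

f = ker_map(CUDADevice(), 256)
a = cu(rand(10, 10) .* 10)
event = f(a, ndrange=size(a))
wait(event)

Since only the first element of the rounded tuple is used anyway, the broadcast can also be avoided entirely with a plain scalar expression, a[I...] = round(I[1] + 0.5).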