BUG: @synchronize inside a loop does not work on the CPU backend
Hi! I found a bug when using the CPU backend (the same code works with the CUDA backend).
Here is the example:
using KernelAbstractions, CUDA
# Backend on which the arrays are allocated and the kernel is launched.
# The report uses CPU() here and states that the same code works with the CUDA backend.
const backend = CPU()
# Upper bound on the block edge length; run_gpu launches block_size^2 work items per workgroup.
const BLOCK_SIZE = 4
# Minimal reproducer kernel for the reported CPU-backend failure.
# It writes the product of the leading varr×varr corners of A and B into the same corner of C.
# No @index is queried, so every work item executes exactly the same loops and writes.
@kernel function kk(A, B, C)
# Ordinary local variable defined before the barrier. With the CPU backend the report shows
# `UndefVarError: varr not defined` being raised from this kernel.
varr = 2
for i in 1:varr
for j in 1:varr
# Barrier placed inside nested loops; this is the construct the issue is about.
@synchronize()
C[i, j] = 0
# Accumulate the dot product of row i of A and column j of B (restricted to 1:varr).
for k in 1:varr
C[i, j] += A[i, k] * B[k, j]
end
end
end
end
"""
    run_gpu()

Allocate three zero-initialised `Int` matrices on `backend` and launch the `kk` kernel
once per anti-diagonal of a square grid of blocks, synchronising the backend after every
launch. The whole sweep is timed with `@time`. Returns `nothing`.
"""
function run_gpu()
    m = 10
    n = 20
    # Allocate the matrices on the selected backend
    A = KernelAbstractions.zeros(backend, Int, m, n)
    B = KernelAbstractions.zeros(backend, Int, m, n)
    C = KernelAbstractions.zeros(backend, Int, m, n)
    # Block edge length: BLOCK_SIZE, clamped to the largest matrix dimension
    mn = max(m, n)
    block_size = min(BLOCK_SIZE, mn)
    # Number of blocks along one side of the square block grid (ceiling division)
    total_blocks = cld(mn, block_size)
    threads_per_block = block_size * block_size
    # The kernel object does not depend on the diagonal, so build it once
    kernel! = kk(backend)
    # Anti-diagonal loop. A total_blocks × total_blocks grid has 2*total_blocks - 1
    # anti-diagonals, so `diag` stops at 2*total_blocks - 2; going one further would
    # launch a kernel with zero blocks.
    @time for diag in 0:(2 * total_blocks - 2)
        # Number of blocks to launch on this anti-diagonal
        num_blocks_diagonal = min(diag + 1, 2 * total_blocks - diag - 1)
        kernel!(
            A, B, C;
            ndrange = (threads_per_block, num_blocks_diagonal),
            workgroupsize = threads_per_block,
        )
        KernelAbstractions.synchronize(backend)
    end
    return nothing
end
# Script entry point: run the reproducer.
run_gpu()
And this is the error:
ERROR: LoadError: UndefVarError: `varr` not defined
Stacktrace:
[1] cpu_kk
@ ~/.julia/packages/KernelAbstractions/Zcyra/src/macros.jl:288 [inlined]
[2] cpu_kk(__ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{2}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, A::Matrix{Int64}, B::Matrix{Int64}, C::Matrix{Int64})
@ Main ./none:0
[3] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(cpu_kk)}, ndrange::Tuple{Int64, Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}, args::Tuple{Matrix{Int64}, Matrix{Int64}, Matrix{Int64}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:115
[4] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(cpu_kk)}, ndrange::Tuple{Int64, Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}, args::Tuple{Matrix{Int64}, Matrix{Int64}, Matrix{Int64}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:82
[5] (::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(cpu_kk)})(::Matrix{Int64}, ::Vararg{Matrix{Int64}}; ndrange::Tuple{Int64, Int64}, workgroupsize::Int64)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:44
[6] Kernel
@ ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:37 [inlined]
[7] macro expansion
@ /app/manuel/test3.jl:48 [inlined]
[8] macro expansion
@ ./timing.jl:279 [inlined]
[9] run_gpu()
@ Main /app/manuel/test3.jl:44
[10] top-level scope
@ /app/manuel/test3.jl:55
So it seems that after @synchronize all local variables are lost, and this only happens on the CPU backend.
Any thoughts on this? Thank you very much!
I realized that @uniform works for this, but I'm having issues querying the thread id and block id. For example, this does not work because `index` is not defined:
```julia
index = @index(Global, NTuple)
@uniform tid = index[1] - 1
```
I've never run into this for variables defined using @index, but otherwise, according to the docs, defining variables is not supposed to work unless @private is used.
Even when doing so, I run into this:
# Second reproducer: a counter declared with @private is incremented in a while loop
# whose body contains a barrier.
@kernel function test_kernel()
# Counter declared through @private and initialised to 1.
@private s = 1
# Print the value and type of s before the loop; the report shows "s: 1 type: Int64" here.
@print("s: ", s, " type: ", typeof(s), "\n")
# On the CPU backend the report shows this comparison failing because s is seen as a
# Tuple{Int64} instead of an Int64.
while s < 6
s += 1
# Barrier inside the loop body (control flow around @synchronize).
@synchronize()
end
end
For some reason, s goes from being an Int64 to a Tuple{Int64} when the while loop starts. Here is the error I get:
s: 1 type: Int64
ERROR: LoadError: MethodError: no method matching isless(::Tuple{Int64}, ::Int64)
Closest candidates are:
isless(::Missing, ::Any)
@ Base missing.jl:87
isless(::Any, ::Missing)
@ Base missing.jl:88
isless(::Tuple, ::Tuple{})
@ Base tuple.jl:556
...
Stacktrace:
[1] <(x::Tuple{Int64}, y::Int64)
@ Base ./operators.jl:352
[2] cpu_test_kernel
@ ~/.julia/packages/KernelAbstractions/zPAn3/src/macros.jl:288 [inlined]
[3] cpu_test_kernel(__ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{1}, Nothing, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}})
@ Main ./none:0
[4] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:115
[5] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:82
[6] #_#16
@ ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:44 [inlined]
If you set the while condition to true, I get:
s: 1 type: Int64
ERROR: LoadError: MethodError: no method matching setindex!(::Tuple{Int64}, ::Int64, ::Int64)
Stacktrace:
[1] cpu_test_kernel
@ ~/.julia/packages/KernelAbstractions/zPAn3/src/macros.jl:287 [inlined]
[2] cpu_test_kernel(__ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{1}, Nothing, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}})
@ Main ./none:0
[3] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:115
[4] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:82
[5] #_#16
@ ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:44 [inlined]
I cannot comprehend what makes this happen to s.
Sadly, @synchronize doesn't work on the CPU within arbitrary control flow; see #330.