KernelAbstractions.jl icon indicating copy to clipboard operation
KernelAbstractions.jl copied to clipboard

BUG: @synchronize inside loop in CPU is not working

Open ManuelCostanzo opened this issue 1 year ago • 3 comments

Hi! I found a bug when using the CPU backend (the same code works with the CUDA backend).

This is the example:


using KernelAbstractions, CUDA

# Backend the kernels are launched on. The failure described in this report occurs with
# CPU(); the report states that the same code works with the CUDA backend.
const backend = CPU()
# Edge length of a square block; run_gpu uses block_size * block_size as the workgroup size.
const BLOCK_SIZE = 4



# Reproducer kernel: a doubly nested loop whose inner body starts with a `@synchronize()`
# barrier. As reported in this issue, launching it on the CPU backend fails with
# "UndefVarError: `varr` not defined".
# No `@index` is used, so every work-item executes the same reads and writes.
@kernel function kk(A, B, C)
	# Plain local assigned before the barrier; this is the variable named in the reported error.
	varr = 2
	for i in 1:varr
		for j in 1:varr
			# Barrier placed inside loop control flow -- the construct under test.
			@synchronize()
			C[i, j] = 0
			for k in 1:varr
				# Accumulate entry (i, j) of the product of the leading varr-by-varr blocks of A and B.
				C[i, j] += A[i, k] * B[k, j]
			end
		end
	end
end


"""
    run_gpu()

Launch the `kk` kernel once per anti-diagonal of a square grid of blocks on the global
`backend`, synchronizing the backend after every launch, and print the elapsed time of
the whole sweep via `@time`.

Despite the name, the kernels run on whichever backend the global `backend` selects.
Returns `nothing`.
"""
function run_gpu()
	m = 10
	n = 20

	# Allocate the matrices on the selected backend
	A = KernelAbstractions.zeros(backend, Int, m, n)
	B = KernelAbstractions.zeros(backend, Int, m, n)
	C = KernelAbstractions.zeros(backend, Int, m, n)

	# Block edge length, clamped to the largest matrix dimension
	mn = max(m, n)
	block_size = min(BLOCK_SIZE, mn)

	# Number of blocks per grid side (ceiling division)
	total_blocks = cld(mn, block_size)

	# Loop-invariant values: instantiate the kernel and compute the workgroup size once
	kernel! = kk(backend)
	workgroup = block_size * block_size

	# Anti-diagonal loop. A total_blocks x total_blocks grid has 2 * total_blocks - 1
	# anti-diagonals, numbered 0 through 2 * total_blocks - 2. The previous upper bound
	# of 2 * total_blocks - 1 added one extra iteration in which num_blocks_diagonal
	# evaluated to 0, i.e. an empty kernel launch.
	@time @inbounds for diag in 0:(2 * total_blocks - 2)
		# Number of blocks to launch on this anti-diagonal
		num_blocks_diagonal = min(diag + 1, 2 * total_blocks - diag - 1)
		kernel!(A, B, C; ndrange = (workgroup, num_blocks_diagonal), workgroupsize = workgroup)
		KernelAbstractions.synchronize(backend)
	end

	return nothing
end


# Script entry point: run the reproducer.
run_gpu() 

and this is the error:

ERROR: LoadError: UndefVarError: `varr` not defined
Stacktrace:
  [1] cpu_kk
    @ ~/.julia/packages/KernelAbstractions/Zcyra/src/macros.jl:288 [inlined]
  [2] cpu_kk(__ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{2}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, A::Matrix{Int64}, B::Matrix{Int64}, C::Matrix{Int64})
    @ Main ./none:0
  [3] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(cpu_kk)}, ndrange::Tuple{Int64, Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}, args::Tuple{Matrix{Int64}, Matrix{Int64}, Matrix{Int64}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
    @ KernelAbstractions ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:115
  [4] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(cpu_kk)}, ndrange::Tuple{Int64, Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{2, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}, args::Tuple{Matrix{Int64}, Matrix{Int64}, Matrix{Int64}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
    @ KernelAbstractions ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:82
  [5] (::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(cpu_kk)})(::Matrix{Int64}, ::Vararg{Matrix{Int64}}; ndrange::Tuple{Int64, Int64}, workgroupsize::Int64)
    @ KernelAbstractions ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:44
  [6] Kernel
    @ ~/.julia/packages/KernelAbstractions/Zcyra/src/cpu.jl:37 [inlined]
  [7] macro expansion
    @ /app/manuel/test3.jl:48 [inlined]
  [8] macro expansion
    @ ./timing.jl:279 [inlined]
  [9] run_gpu()
    @ Main /app/manuel/test3.jl:44
 [10] top-level scope
    @ /app/manuel/test3.jl:55

So it seems that all local variables are lost after @synchronize, and this only happens with the CPU backend.

Any thoughts on this? Thank you very much!

ManuelCostanzo avatar Feb 29 '24 16:02 ManuelCostanzo

I realized that @uniform works for this, but I'm having issues querying the thread id and block id. For example, this does not work because index is not defined:

# Fragment of a kernel body: `index` is assigned by an ordinary statement and then read
# inside a `@uniform` statement. As reported, this fails because `index` is not defined.
index = @index(Global, NTuple)
@uniform tid = index[1] - 1
	```

ManuelCostanzo avatar Feb 29 '24 17:02 ManuelCostanzo

I've never run into this for variables defined using @index, but otherwise, according to the docs, defining variables is not supposed to work unless you use @private.

Even when doing so, I run into this:

# Reproducer: a `@private` counter incremented in a `while` loop that contains a
# `@synchronize()` barrier. Reported outcome on the CPU backend: `s` prints as an Int64
# before the loop, then the loop condition throws a MethodError because `s` is a
# Tuple{Int64} at that point.
@kernel function test_kernel()
  @private s = 1

  # Prints the value and type of `s` before the loop is entered.
  @print("s: ", s, " type: ", typeof(s), "\n")
  while s < 6
    s += 1
    # Barrier inside the loop body -- the construct under test.
    @synchronize()
  end
end

For some reason, s goes from being an Int64 to a Tuple{Int64} when the while loop starts. Here is the error I get:

s: 1 type: Int64
ERROR: LoadError: MethodError: no method matching isless(::Tuple{Int64}, ::Int64)

Closest candidates are:
  isless(::Missing, ::Any)
   @ Base missing.jl:87
  isless(::Any, ::Missing)
   @ Base missing.jl:88
  isless(::Tuple, ::Tuple{})
   @ Base tuple.jl:556
  ...

Stacktrace:
 [1] <(x::Tuple{Int64}, y::Int64)
   @ Base ./operators.jl:352
 [2] cpu_test_kernel
   @ ~/.julia/packages/KernelAbstractions/zPAn3/src/macros.jl:288 [inlined]
 [3] cpu_test_kernel(__ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{1}, Nothing, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}})
   @ Main ./none:0
 [4] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
   @ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:115
 [5] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
   @ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:82
 [6] #_#16
   @ ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:44 [inlined]

If you set the while condition to true, I get:

s: 1 type: Int64
ERROR: LoadError: MethodError: no method matching setindex!(::Tuple{Int64}, ::Int64, ::Int64)
Stacktrace:
 [1] cpu_test_kernel
   @ ~/.julia/packages/KernelAbstractions/zPAn3/src/macros.jl:287 [inlined]
 [2] cpu_test_kernel(__ctx__::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{1}, Nothing, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}})
   @ Main ./none:0
 [3] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
   @ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:115
 [4] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(6000,)}, typeof(cpu_test_kernel)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(6000,)}, KernelAbstractions.NDIteration.StaticSize{(1,)}, Nothing, Nothing}, args::Tuple{}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
   @ KernelAbstractions ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:82
 [5] #_#16
   @ ~/.julia/packages/KernelAbstractions/zPAn3/src/cpu.jl:44 [inlined]

I cannot comprehend what makes this happen to s.

DhruvDh avatar Mar 08 '24 14:03 DhruvDh

Sadly, @synchronize doesn't work on the CPU within arbitrary control flow; see #330.

vchuravy avatar Mar 08 '24 14:03 vchuravy