KernelAbstractions.jl

`@synchronize` inside while loops doesn't work on CPUs

Open • tom91136 opened this issue 3 years ago • 0 comments

I'm trying to port the classic GPU tree reduction to KernelAbstractions.jl. See this for the direct CUDA implementation I'm porting from. This is what I have implemented currently:


const TBSize = 1024::Int
const DotBlocks = 256::Int

@kernel function dot(@Const(a), @Const(b), size, partial)  
    local_i = @index(Local)
    group_i = @index(Group)
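    # T is the element type the kernel closes over from the enclosing function
    # (Float32 in the failing run below)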
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = 0.0
  
    # do dot first
    i = @index(Global)
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks
    end
  
    # then tree reduction
    offset = @private Int64 (1,)
    @inbounds begin
      offset[1] = @groupsize()[1] ÷ 2
      while offset[1] > 0
        @synchronize
        if (local_i - 1) < offset[1]
          tb_sum[local_i] += tb_sum[local_i+offset[1]]
        end
        offset[1] ÷= 2
      end
    end
  
    if (local_i == 1)
      @inbounds partial[group_i] = tb_sum[local_i]
    end
end

# driver
wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
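
To spell out what the tree reduction is supposed to do, here is a plain host-side sketch of one workgroup's halving steps (illustrative only, no KernelAbstractions involved; in the kernel the inner loop runs concurrently across work-items, with a barrier before each step):

# Host-side sketch of one workgroup's tree reduction (illustrative only)
function tree_reduce!(tb_sum::Vector{Float32})
  offset = length(tb_sum) ÷ 2          # 512 for a 1024-wide workgroup
  while offset > 0
    # the kernel has @synchronize here; on the host this loop is sequential
    for local_i in 1:offset
      tb_sum[local_i] += tb_sum[local_i+offset]
    end
    offset ÷= 2                        # 512, 256, ..., 2, 1
  end
  return tb_sum[1]
end

tree_reduce!(ones(Float32, 1024))      # == 1024.0f0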

On the GPU backends I was able to get correct results, and performance seems mostly on par with our CUDA.jl (https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/CUDAStream.jl#L112) and AMDGPU.jl (https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/AMDGPUStream.jl#L135) implementations. On the CPU, however, I get the following error:

Using device: AMD Ryzen 9 3900X 12-Core Processor (1T)
ERROR: LoadError: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:322 [inlined]
 [2] wait
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:65 [inlined]
 [3] wait (repeats 2 times)
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:29 [inlined]
 [4] runDot(device::Tuple{UndefInitializer, String, Backend}, size::Int64, ainit::Float32, binit::Float32)
   @ Main ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:102
 [5] top-level scope
   @ ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:107

    nested task error: MethodError: no method matching isless(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
    Closest candidates are:
      isless(::AbstractVector{T} where T, ::AbstractVector{T} where T) at abstractarray.jl:1989
      isless(::Any, ::Missing) at missing.jl:88
      isless(::Missing, ::Any) at missing.jl:87
      ...
    Stacktrace:
      [1] call
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:456 [inlined]
      [2] fallback
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:454 [inlined]
      [3] _overdub_fallback(::Any, ::Vararg{Any, N} where N)
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [4] overdub
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [5] <(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
        @ ./operators.jl:279 [inlined]
      [6] overdub
        @ ./operators.jl:279 [inlined]
      [7] >(::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true}, ::Int64)
        @ ./operators.jl:305 [inlined]
      [8] overdub
        @ ./operators.jl:305 [inlined]
      [9] overdub
        @ ~/.julia/packages/KernelAbstractions/8W8KX/src/KernelAbstractions.jl:266 [inlined]
     [10] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:157
     [11] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:130
     [12] (::KernelAbstractions.var"#33#34"{Nothing, Nothing, typeof(KernelAbstractions.__run), Tuple{KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, Tuple{Int64}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, KernelAbstractions.NDIteration.NoDynamicCheck}})()
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:22
in expression starting at /home/tom/babelstream-upstream/JuliaStream.jl/src/Test.jl:107
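
If I'm reading the nested MethodError right, the @private offset seems to end up on the CPU backend as a view into a StaticArrays.MMatrix (one row per private slot, one column per work-item), and by the time the while condition is checked the comparison is made against that view itself instead of the Int64 stored in it. The failing comparison is easy to reproduce on the host; this is just my reading of the trace, not the actual code path KernelAbstractions takes:

using StaticArrays

# hypothetical stand-in for the @private storage on the CPU backend
priv = MMatrix{1, 1024, Int64}(undef)
offset = view(priv, :, 1)   # same SubArray type as in the MethodError above
offset[1] = 512

offset[1] > 0               # true: indexes the slot first, compares two Int64s
# offset > 0                # MethodError: no method matching isless(::Int64, ::SubArray{...})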

Removing the @synchronize macro in the while loop makes the error go away, but the answer then becomes incorrect. I also tried @print(eltype(offset[1])); it prints the correct generic type (Float32 in this case), so I'm not sure what the @synchronize transform is doing to the loop here.
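
For what it's worth, the only restructuring I can think of that keeps the barrier out of data-dependent control flow is to drive the reduction with a for loop whose trip count comes from the static group size, so nothing @private has to survive across the @synchronize. This is only a sketch (untested, assuming @synchronize is allowed inside loops whose bounds are uniform across the workgroup, and with Float32 hard-coded where the real kernel closes over T):

using KernelAbstractions

const TBSize = 1024::Int
const DotBlocks = 256::Int

@kernel function dot_for(@Const(a), @Const(b), size, partial)
  local_i = @index(Local)
  group_i = @index(Group)
  tb_sum = @localmem Float32 TBSize
  @inbounds tb_sum[local_i] = 0.0f0

  # grid-stride accumulation, same as before
  i = @index(Global)
  while i <= size
    @inbounds tb_sum[local_i] += a[i] * b[i]
    i += TBSize * DotBlocks
  end

  # TBSize is a power of two, so trailing_zeros(TBSize) halving steps cover the
  # whole reduction; the loop bound is a constant rather than @private state, and
  # offset is recomputed from the loop counter after each barrier
  for step in 1:trailing_zeros(TBSize)
    @synchronize
    offset = TBSize >> step   # 512, 256, ..., 1
    if (local_i - 1) < offset
      @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
    end
  end

  if local_i == 1
    @inbounds partial[group_i] = tb_sum[local_i]
  end
end

If the CPU transform handles that shape, it would also make the @private bookkeeping unnecessary, but I haven't verified it on any backend.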

For reference, here is what pkg status says:

  [21141c5a] AMDGPU v0.2.7
  [c7e460c6] ArgParse v1.1.4
  [052768ef] CUDA v3.2.1
  [72cfdca4] CUDAKernels v0.3.0
  [e2ba6199] ExprTools v0.1.3 ⚲
  [63c18a36] KernelAbstractions v0.7.0
  [d96e819e] Parameters v0.12.2
  [7eb9e9f0] ROCKernels v0.2.0
  [8ba89e20] Distributed

And the complete Test.jl reproducer:

using Base: Float32
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU

const TBSize = 1024::Int
const DotBlocks = 256::Int

@enum Backend cuda rocm cpu
const DeviceWithRepr = Tuple{Any,String,Backend}

function list_rocm_devices()::Vector{DeviceWithRepr}
  try
    # AMDGPU.agents()'s internal iteration order isn't stable
    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
    map(x -> (x, repr(x), rocm), sorted)
  catch
    # probably unsupported
    []
  end
end

function list_cuda_devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? [] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end

function devices()::Vector{DeviceWithRepr}
  cudas = list_cuda_devices()
  rocms = list_rocm_devices()
  cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
  vcat(cpus, cudas, rocms)
end


function runDot(device::DeviceWithRepr, size::Int, ainit::T, binit::T)::Tuple{T,T} where {T}
  (actual, name, backend) = device
  println("Using device: ", name)
  as = fill(ainit, size)
  bs = fill(binit, size)
  if backend == cpu
    partial_sum = Vector{T}(undef, DotBlocks)
    a = Vector{T}(as)
    b = Vector{T}(bs)
    backendDevice = CPU()
  elseif backend == cuda
    CUDA.device!(actual)
    partial_sum = CuArray{T}(undef, DotBlocks)
    a = CuArray{T}(as)
    b = CuArray{T}(bs)
    backendDevice = CUDADevice()
  elseif backend == rocm
    AMDGPU.DEFAULT_AGENT[] = actual
    partial_sum = ROCArray{T}(undef, DotBlocks)
    a = ROCArray{T}(as)
    b = ROCArray{T}(bs)
    backendDevice = ROCDevice()
  else
    error("unsupported backend $(backend)")
  end

  @kernel function dot(@Const(a), @Const(b), size, partial)
    local_i = @index(Local)
    group_i = @index(Group)
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = 0.0

    # do dot first
    i = @index(Global)
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks
    end

    # then tree reduction
    offset = @private Int64 (1,)
    @inbounds begin
      offset[1] = @groupsize()[1] ÷ 2
      while offset[1] > 0
        @synchronize
        if (local_i - 1) < offset[1]
          tb_sum[local_i] += tb_sum[local_i+offset[1]]
        end
        offset[1] ÷= 2
      end
    end

    if (local_i == 1)
      @inbounds partial[group_i] = tb_sum[local_i]
    end
  end

  function referenceDot()
    sum = zero(T)
    for i = 1:size
      @inbounds sum += a[i] * b[i]
    end
    return sum
  end

  wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
  return (referenceDot(), sum(partial_sum))
end

device = devices()[1]
(expected, actual) = runDot(device, TBSize * 2, 1.0f0, 2.0f0)
println("actual=", actual, ", expected=", expected)

tom91136 · Jul 20 '21 14:07