KernelAbstractions.jl
`@synchronize` inside while loops doesn't work on CPUs
I'm trying to port the classic GPU tree reduction to KernelAbstractions.jl. See this for the direct CUDA implementation I'm porting from. This is what I have implemented currently:
const TBSize = 1024::Int
const DotBlocks = 256::Int
@kernel function dot(@Const(a), @Const(b), size, partial)
    local_i = @index(Local)
    group_i = @index(Group)
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = 0.0
    # do dot first
    i = @index(Global)
    while i <= size
        @inbounds tb_sum[local_i] += a[i] * b[i]
        i += TBSize * DotBlocks
    end
    # then tree reduction
    offset = @private Int64 (1,)
    @inbounds begin
        offset[1] = @groupsize()[1] ÷ 2
        while offset[1] > 0
            @synchronize
            if (local_i - 1) < offset[1]
                tb_sum[local_i] += tb_sum[local_i+offset[1]]
            end
            offset[1] ÷= 2
        end
    end
    if (local_i == 1)
        @inbounds partial[group_i] = tb_sum[local_i]
    end
end
# driver
wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
I was able to get correct results, and performance seems mostly on par with our CUDA.jl (https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/CUDAStream.jl#L112) and AMDGPU.jl (https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/AMDGPUStream.jl#L135) implementations. On the CPU, however, I got the following error:
Using device: AMD Ryzen 9 3900X 12-Core Processor (1T)
ERROR: LoadError: TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] wait
@ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:65 [inlined]
[3] wait (repeats 2 times)
@ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:29 [inlined]
[4] runDot(device::Tuple{UndefInitializer, String, Backend}, size::Int64, ainit::Float32, binit::Float32)
@ Main ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:102
[5] top-level scope
@ ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:107
nested task error: MethodError: no method matching isless(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
Closest candidates are:
isless(::AbstractVector{T} where T, ::AbstractVector{T} where T) at abstractarray.jl:1989
isless(::Any, ::Missing) at missing.jl:88
isless(::Missing, ::Any) at missing.jl:87
...
Stacktrace:
[1] call
@ ~/.julia/packages/Cassette/N5kbV/src/context.jl:456 [inlined]
[2] fallback
@ ~/.julia/packages/Cassette/N5kbV/src/context.jl:454 [inlined]
[3] _overdub_fallback(::Any, ::Vararg{Any, N} where N)
@ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
[4] overdub
@ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
[5] <(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
@ ./operators.jl:279 [inlined]
[6] overdub
@ ./operators.jl:279 [inlined]
[7] >(::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true}, ::Int64)
@ ./operators.jl:305 [inlined]
[8] overdub
@ ./operators.jl:305 [inlined]
[9] overdub
@ ~/.julia/packages/KernelAbstractions/8W8KX/src/KernelAbstractions.jl:266 [inlined]
[10] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:157
[11] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:130
[12] (::KernelAbstractions.var"#33#34"{Nothing, Nothing, typeof(KernelAbstractions.__run), Tuple{KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, Tuple{Int64}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, KernelAbstractions.NDIteration.NoDynamicCheck}})()
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:22
in expression starting at /home/tom/babelstream-upstream/JuliaStream.jl/src/Test.jl:107
Removing the @synchronize macro in the while loop makes the error go away, but the answer becomes incorrect.
I've tried @print eltype(offset[1]); it prints the correct generic type (Float32 in this case), so I'm not sure what @synchronize is doing here.
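To make the question concrete: is the problem specifically that the loop condition reads back a @private value, or is @synchronize inside any loop unsupported on the CPU backend? One (untested) restructuring I have in mind fixes the trip count up front, replacing the offset = @private ... block above with something like the sketch below (step and offset are just my names; it assumes TBSize is a power of two and equals the workgroup size, as in the launch above):
for step in 1:trailing_zeros(TBSize)   # log2(TBSize) halving steps, known statically
    @synchronize
    offset = TBSize >> step            # 512, 256, ..., 1 for TBSize = 1024
    if (local_i - 1) < offset
        @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
    end
end
I don't know whether the CPU implementation accepts @synchronize inside a for loop either, so this is only meant to clarify what I'm asking about.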
For reference, here is what pkg status says:
[21141c5a] AMDGPU v0.2.7
[c7e460c6] ArgParse v1.1.4
[052768ef] CUDA v3.2.1
[72cfdca4] CUDAKernels v0.3.0
[e2ba6199] ExprTools v0.1.3 ⚲
[63c18a36] KernelAbstractions v0.7.0
[d96e819e] Parameters v0.12.2
[7eb9e9f0] ROCKernels v0.2.0
[8ba89e20] Distributed
And the complete Test.jl reproducer:
using Base: Float32
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU
const TBSize = 1024::Int
const DotBlocks = 256::Int
@enum Backend cuda rocm cpu
const DeviceWithRepr = Tuple{Any,String,Backend}
function list_rocm_devices()::Vector{DeviceWithRepr}
    try
        # AMDGPU.agents()'s internal iteration order isn't stable
        sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
        map(x -> (x, repr(x), rocm), sorted)
    catch
        # probably unsupported
        []
    end
end

function list_cuda_devices()::Vector{DeviceWithRepr}
    return !CUDA.functional(false) ? [] :
           map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end

function devices()::Vector{DeviceWithRepr}
    cudas = list_cuda_devices()
    rocms = list_rocm_devices()
    cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
    vcat(cpus, cudas, rocms)
end

function runDot(device::DeviceWithRepr, size::Int, ainit::T, binit::T)::Tuple{T,T} where {T}
    (actual, name, backend) = device
    println("Using device: ", name)
    as = fill(ainit, size)
    bs = fill(binit, size)
    if backend == cpu
        partial_sum = Vector{T}(undef, DotBlocks)
        a = Vector{T}(as)
        b = Vector{T}(bs)
        backendDevice = CPU()
    elseif backend == cuda
        CUDA.device!(actual)
        partial_sum = CuArray{T}(undef, DotBlocks)
        a = CuArray{T}(as)
        b = CuArray{T}(bs)
        backendDevice = CUDADevice()
    elseif backend == rocm
        AMDGPU.DEFAULT_AGENT[] = actual
        partial_sum = ROCArray{T}(undef, DotBlocks)
        a = ROCArray{T}(as)
        b = ROCArray{T}(bs)
        backendDevice = ROCDevice()
    else
        error("unsupported backend $(backend)")
    end

    @kernel function dot(@Const(a), @Const(b), size, partial)
        local_i = @index(Local)
        group_i = @index(Group)
        tb_sum = @localmem T TBSize
        @inbounds tb_sum[local_i] = 0.0
        # do dot first
        i = @index(Global)
        while i <= size
            @inbounds tb_sum[local_i] += a[i] * b[i]
            i += TBSize * DotBlocks
        end
        # then tree reduction
        offset = @private Int64 (1,)
        @inbounds begin
            offset[1] = @groupsize()[1] ÷ 2
            while offset[1] > 0
                @synchronize
                if (local_i - 1) < offset[1]
                    tb_sum[local_i] += tb_sum[local_i+offset[1]]
                end
                offset[1] ÷= 2
            end
        end
        if (local_i == 1)
            @inbounds partial[group_i] = tb_sum[local_i]
        end
    end

    function referenceDot()
        sum = zero(T)
        for i = 1:size
            @inbounds sum += a[i] * b[i]
        end
        return sum
    end

    wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
    return (referenceDot(), sum(partial_sum))
end
device = devices()[1]
(expected, actual) = runDot(device, TBSize * 2, 1.0f0, 2.0f0)
println("actual=", actual, ", expected=", expected)