Using a single array of pointers for multi-GPU AMDGPU computation
Hi folks!
I am working on multi-GPU support for JACC: https://github.com/JuliaORNL/JACC.jl/ For that, I need a single array of pointers that can store pointers to arrays living on different GPUs.
I opened a related issue a few days ago: https://github.com/JuliaGPU/AMDGPU.jl/issues/662 Although that helped me understand the problem better, I still cannot run the test code below. The equivalent code runs fine on CUDA (I have included the CUDA version too, in case it is useful).
@pxl-th mentioned the CU_MEMHOSTALLOC_PORTABLE CUDA flag. Can we use that, or an equivalent, in AMDGPU?
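For context, HIP seems to expose the same concept: hipHostMalloc accepts a hipHostMallocPortable flag (0x1) that makes pinned host memory visible to all devices. Below is a minimal, untested sketch that calls it directly via ccall; it assumes libamdhip64 is on the library search path, and AMDGPU.jl may well offer a nicer wrapper that I am missing.

```julia
# Untested sketch: allocate pinned host memory visible to all GPUs via raw HIP.
const hipHostMallocPortable = Cuint(0x1)  # HIP's analogue of CU_MEMHOSTALLOC_PORTABLE

buf = Ref{Ptr{Cvoid}}(C_NULL)
nbytes = 600 * sizeof(Float64)
err = ccall((:hipHostMalloc, "libamdhip64"), Cint,
            (Ptr{Ptr{Cvoid}}, Csize_t, Cuint),
            buf, nbytes, hipHostMallocPortable)
err == 0 || error("hipHostMalloc failed with code $err")

# Wrap the pinned buffer as a host Vector (no copy) that every device can see.
host_x = unsafe_wrap(Vector{Float64}, Ptr{Float64}(buf[]), 600)
```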
Here are the codes.

AMDGPU:
```julia
using AMDGPU

function multi_scal(N, dev_id, alpha, x)
    # Global work-item index within this device's grid
    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    if i <= N
        # x holds one device vector per GPU; dev_id selects this GPU's chunk
        @inbounds x[dev_id][i] *= alpha
    end
    return nothing
end

x = ones(600)
alpha = 2.0
ndev = length(AMDGPU.devices())
ret = Vector{Any}(undef, 2)
AMDGPU.device!(AMDGPU.device(1))
s_array = length(x)
s_arrays = ceil(Int, s_array / ndev)
array_ret = Vector{Any}(undef, ndev)
pointer_ret = Vector{AMDGPU.Device.ROCDeviceVector{Float64,AMDGPU.Device.AS.Global}}(undef, ndev)
# Allocate one chunk per device and record its device-side handle
for i in 1:ndev
    AMDGPU.device!(AMDGPU.device(i))
    array_ret[i] = AMDGPU.ROCArray(x[((i - 1) * s_arrays + 1):(i * s_arrays)])
    pointer_ret[i] = AMDGPU.rocconvert(array_ret[i])
end
# Upload the array of device pointers to device 1
AMDGPU.device!(AMDGPU.device(1))
amdgpu_pointer_ret = ROCArray(pointer_ret)
ret[1] = amdgpu_pointer_ret
ret[2] = array_ret
numThreads = 256
threads = min(s_arrays, numThreads)
blocks = ceil(Int, s_arrays / threads)
# This works
AMDGPU.device!(AMDGPU.device(1))
@roc groupsize=threads gridsize=blocks multi_scal(s_arrays, 1, alpha, ret[1])
# This does not work
AMDGPU.device!(AMDGPU.device(2))
@roc groupsize=threads gridsize=blocks multi_scal(s_arrays, 2, alpha, ret[1])
```
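One workaround that might sidestep the failing launch (an untested sketch, using only the variables defined above): replicate the small pointer array onto every device, so each launch reads its kernel argument from an array local to the launching device.

```julia
# Untested sketch: keep one copy of the pointer array per device, so each
# kernel launch dereferences an array that lives on its own device.
dev_pointers = Vector{Any}(undef, ndev)
for i in 1:ndev
    AMDGPU.device!(AMDGPU.device(i))
    dev_pointers[i] = ROCArray(pointer_ret)  # same device pointers, local copy
end
for i in 1:ndev
    AMDGPU.device!(AMDGPU.device(i))
    @roc groupsize=threads gridsize=blocks multi_scal(s_arrays, i, alpha, dev_pointers[i])
end
```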
CUDA:
```julia
using CUDA

function multi_scal(N, dev_id, alpha, x)
    # Global thread index within this device's grid
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= N
        # x holds one device vector per GPU; dev_id selects this GPU's chunk
        @inbounds x[dev_id][i] *= alpha
    end
    return nothing
end

x = ones(600)
alpha = 2.0
ndev = length(devices())
ret = Vector{Any}(undef, 2)
device!(0)
s_array = length(x)
s_arrays = ceil(Int, s_array / ndev)
array_ret = Vector{Any}(undef, ndev)
pointer_ret = Vector{CuDeviceVector{Float64,CUDA.AS.Global}}(undef, ndev)
# Allocate one chunk per device and record its device-side handle
for i in 1:ndev
    device!(i - 1)
    array_ret[i] = CuArray(x[((i - 1) * s_arrays + 1):(i * s_arrays)])
    pointer_ret[i] = cudaconvert(array_ret[i])
end
# Upload the array of device pointers to device 0
device!(0)
cuda_pointer_ret = CuArray(pointer_ret)
ret[1] = cuda_pointer_ret
ret[2] = array_ret
numThreads = 256
threads = min(s_arrays, numThreads)
blocks = ceil(Int, s_arrays / threads)
# This works
device!(0)
@cuda threads=threads blocks=blocks multi_scal(s_arrays, 1, alpha, ret[1])
# This works too
device!(1)
@cuda threads=threads blocks=blocks multi_scal(s_arrays, 2, alpha, ret[1])
```
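For completeness, this is how the result can be checked on the CUDA side (a small sketch using only the variables above): synchronize each device, gather the per-device chunks back to the host, and compare against the expected scaled vector.

```julia
# Sketch: verify that every chunk was scaled by alpha.
for i in 1:ndev
    device!(i - 1)
    synchronize()
end
result = reduce(vcat, Array.(ret[2]))  # gather chunks back to the host
@assert result ≈ alpha .* x
```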