ClimaCore.jl
ClimaCore.jl copied to clipboard
Improve CUDA kernels
Using Fortran-style 1D (linear) indexing on the parent array, with any required assertions done upstream, might be easiest for some kernels. E.g.:
"""
    Base.copyto!(dest::IJFH, bc)

Copy `bc` into `dest` on the GPU by launching `knl_copyto!` over the parent
arrays using flat 1D (linear) indexing, one thread per element of
`parent(dest)`.

Returns `dest`. Any shape/compatibility assertions are expected to have been
done upstream; none are performed here.
"""
function Base.copyto!(
    dest::IJFH{S, Nij},
    bc::Union{IJFH{S, Nij, A}, Base.Broadcast.Broadcasted{IJFHStyle{Nij, A}}},
) where {S, Nij, A <: CUDA.CuArray}
    pdest = parent(dest)
    nitems = length(pdest)
    # Nothing to copy. Without this guard `nthreads` would be 0 and
    # `cld(nitems, nthreads)` would throw a `DivideError`.
    iszero(nitems) && return dest
    max_threads = 256 # can be higher if conditions permit
    nthreads = min(max_threads, nitems)
    # Round up so that nthreads * nblocks >= nitems; the kernel guards the tail.
    nblocks = cld(nitems, nthreads)
    # NOTE(review): assumes `parent` is defined for the `Broadcasted` case and
    # yields something linearly indexable on the device — confirm.
    pbc = parent(bc)
    CUDA.@cuda threads = (nthreads) blocks = (nblocks) knl_copyto!(pdest, pbc)
    return dest
end
"""
    knl_copyto!(dest, src)

CUDA kernel: copy `src` into `dest` element by element using linear indexing.
Each thread handles at most one element, selected by its global 1-based
thread index; threads whose index exceeds `length(dest)` do nothing.

Returns `nothing`.
"""
function knl_copyto!(dest, src)
    nitems = length(dest)
    # Global 1-based linear index of this thread across the whole grid.
    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    # `<=` (not `<`): indices are 1-based, so `gidx == nitems` is the last
    # valid element and must be copied too.
    if gidx <= nitems
        # Bounds are guaranteed for `dest` by the guard above; `src` is
        # assumed to be at least as long as `dest` (checked upstream).
        @inbounds dest[gidx] = src[gidx]
    end
    return nothing
end
Originally posted by @sriharshakandala in https://github.com/CliMA/ClimaCore.jl/pull/767#discussion_r1106263606
We can try
cartidx = CartesianIndices(dest)[gidx]