CUDA.jl
CUDA.jl copied to clipboard
Use invariant.load for ldg
Fixes #2531
Trying out the suggestion in https://github.com/JuliaGPU/CUDA.jl/issues/41#issuecomment-2655642904
Currently, the `!invariant.load` metadata is correctly generated in the unoptimized IR, but at some point during optimization it is stripped.
Reproducer on LLVM 16:
using CUDA, LLVM.Interop
"""
    kernel_aligned(X, Y)

Device kernel in which thread `I` reads the pair `Y[2I-1]`, `Y[2I]` through
`CUDA.unsafe_cached_load`, adds 1 to the first value, subtracts 1 from the second, and
writes the results to `X[2I-1]` and `X[2I]`. Returns `nothing`.
"""
function kernel_aligned(X, Y)
    # Optimizer hint: the base address of X is a multiple of two elements.
    assume(UInt(pointer(X)) % (2*sizeof(eltype(X))) == 0)
    @inbounds begin
        I = threadIdx().x
        # First element of the pair: 1-based index 2I-1 is element offset 2I-2, i.e. a
        # multiple of 8 bytes for 4-byte elements, so the 8-byte alignment claim holds.
        # NOTE(review): this relies on Y's base address being 8-byte aligned, which
        # nothing in this kernel asserts -- confirm against the allocator.
        v1 = CUDA.unsafe_cached_load(pointer(Y), 2*I-1, Val(8))
        # Second element of the pair: it sits one 4-byte element past an 8-byte boundary,
        # so only 4-byte alignment may be claimed. Overstating the alignment of a load
        # is undefined behaviour in LLVM.
        v2 = CUDA.unsafe_cached_load(pointer(Y), 2*I, Val(4))
        v1 += 1
        v2 -= 1
        X[2*I-1] = v1
        X[2*I] = v2
    end
    return nothing
end
"""
    foo_aligned(X, Y)

Launch `kernel_aligned(X, Y)` as a single block of 512 threads and return `X`.
"""
function foo_aligned(X, Y)
    # One block of 512 threads; the kernel touches two elements per thread.
    @cuda blocks=1 threads=512 kernel_aligned(X, Y)
    return X
end
# Print the LLVM IR produced while running `foo_aligned` on two 1024-element device
# vectors, with optimization disabled (`optimize=false`) and `raw=true`.
@device_code_llvm raw=true optimize=false foo_aligned(CUDA.ones(1024), CUDA.ones(1024))
; PTX CompilerJob of MethodInstance for kernel_aligned(::CuDeviceVector{Float32, 1}, ::CuDeviceVector{Float32, 1}) for sm_75
; ...
; ┌ @ /home/vchuravy/src/CUDA/src/device/pointer.jl:81 within `unsafe_cached_load`
; │┌ @ none within `pointerref_ldg`
; ││┌ @ none within `macro expansion` @ /home/vchuravy/.julia/packages/LLVM/b3kFs/src/interop/base.jl:39
%121 = bitcast {}* inttoptr (i64 139198630604496 to {}*) to {} addrspace(10)**, !dbg !169
%122 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)** %121, i64 0, !dbg !169
%123 = addrspacecast {}* inttoptr (i64 139198827636848 to {}*) to {} addrspace(10)*, !dbg !169
%124 = insertvalue [2 x {} addrspace(10)*] zeroinitializer, {} addrspace(10)* %123, 0, !dbg !169
%125 = addrspacecast {}* inttoptr (i64 139192734499008 to {}*) to {} addrspace(10)*, !dbg !169
%126 = insertvalue [2 x {} addrspace(10)*] %124, {} addrspace(10)* %125, 1, !dbg !169
; │││┌ @ int.jl:86 within `-`
%127 = bitcast {}* inttoptr (i64 139198668290448 to {}*) to {} addrspace(10)**, !dbg !176
%128 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)** %127, i64 0, !dbg !176
%129 = sub i64 %120, 1, !dbg !176
; │││└
%130 = bitcast {}* inttoptr (i64 139198648237312 to {}*) to {} addrspace(10)**, !dbg !169
%131 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)** %130, i64 0, !dbg !169
%132 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(11)* %111, align 8, !dbg !169, !alias.scope !60, !noalias !63
%133 = bitcast i8 addrspace(1)* %132 to float addrspace(1)*, !dbg !169
%134 = getelementptr inbounds float, float addrspace(1)* %133, i64 %129, !dbg !169
%135 = load float, float addrspace(1)* %134, align 8, !dbg !169, !tbaa !100, !invariant.load !44
; └└└
; ...
!44 = !{}
julia> @device_code_llvm raw=true optimize=true foo_aligned(CUDA.ones(1024), CUDA.ones(1024))
; PTX CompilerJob of MethodInstance for kernel_aligned(::CuDeviceVector{Float32, 1}, ::CuDeviceVector{Float32, 1}) for sm_75
; @ REPL[3]:1 within `kernel_aligned`
define ptx_kernel void @_Z14kernel_aligned13CuDeviceArrayI7Float32Li1ELi1EES1_({ i64, i32 } %state, { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %1) local_unnamed_addr !dbg !41 {
conversion:
; ...
; │┌ @ none within `pointerref_ldg`
; ││┌ @ none within `macro expansion` @ /home/vchuravy/.julia/packages/LLVM/b3kFs/src/interop/base.jl:39
; │││┌ @ int.jl:86 within `-`
%13 = add nsw i64 %8, -1, !dbg !96
; │││└
%14 = getelementptr inbounds float, float addrspace(1)* %10, i64 %13, !dbg !97
%15 = load float, float addrspace(1)* %14, align 8, !dbg !97, !tbaa !93
; └└└