CUDA.jl icon indicating copy to clipboard operation
CUDA.jl copied to clipboard

Use invariant.load for ldg

Open vchuravy opened this issue 10 months ago • 1 comment

Fixes #2531

Trying out the suggestion in https://github.com/JuliaGPU/CUDA.jl/issues/41#issuecomment-2655642904

Currently, the `!invariant.load` metadata is generated correctly, but at some point it is stripped again.

vchuravy avatar Feb 13 '25 08:02 vchuravy

On LLVM 16

using CUDA, LLVM.Interop

# Kernel: each thread `I = threadIdx().x` reads elements `2I-1` and `2I` of `Y` through
# `CUDA.unsafe_cached_load`, adds 1 to the first and subtracts 1 from the second, and
# stores the results at the same indices of `X`. Always returns `nothing`.
function kernel_aligned(X, Y)
    # `assume` is given the condition that X's base address is a multiple of
    # `2*sizeof(eltype(X))` bytes (presumably an optimizer hint from LLVM.Interop).
    # NOTE(review): the condition covers `pointer(X)` only, while the `Val(8)`-tagged
    # loads below read from `pointer(Y)` — confirm whether `Y` needs the same hint.
    assume(UInt(pointer(X)) % (2*sizeof(eltype(X))) == 0)
    # Bounds checks are elided for the indexing inside this block; presumably the launch
    # must use at most `length(X) ÷ 2` threads — TODO confirm against the caller.
    @inbounds begin
        I = threadIdx().x
        # NOTE(review): `Val(8)` is presumably the alignment in bytes promised for each
        # load. For 4-byte elements, index `2*I` sits 4 bytes past an 8-byte boundary, so
        # the promise on the second load looks too strong — verify against the
        # `unsafe_cached_load` API before relying on it.
        v1 = CUDA.unsafe_cached_load(pointer(Y), 2*I-1,Val(8))
        v2 = CUDA.unsafe_cached_load(pointer(Y), 2*I,  Val(8))
        v1 += 1
        v2 -= 1
        X[2*I-1] = v1
        X[2*I] = v2
    end
    return nothing
end

"""
    foo_aligned(X, Y)

Launch `kernel_aligned` on `X` and `Y` with a single block of 512 threads and return `X`.
"""
function foo_aligned(X, Y)
    @cuda blocks=1 threads=512 kernel_aligned(X, Y)
    return X
end
# Print the LLVM IR for the kernel launched by `foo_aligned`, with `optimize=false`.
@device_code_llvm  raw=true optimize=false foo_aligned(CUDA.ones(1024), CUDA.ones(1024))
; PTX CompilerJob of MethodInstance for kernel_aligned(::CuDeviceVector{Float32, 1}, ::CuDeviceVector{Float32, 1}) for sm_75
; ...
; ┌ @ /home/vchuravy/src/CUDA/src/device/pointer.jl:81 within `unsafe_cached_load`
; │┌ @ none within `pointerref_ldg`
; ││┌ @ none within `macro expansion` @ /home/vchuravy/.julia/packages/LLVM/b3kFs/src/interop/base.jl:39
     %121 = bitcast {}* inttoptr (i64 139198630604496 to {}*) to {} addrspace(10)**, !dbg !169
     %122 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)** %121, i64 0, !dbg !169
     %123 = addrspacecast {}* inttoptr (i64 139198827636848 to {}*) to {} addrspace(10)*, !dbg !169
     %124 = insertvalue [2 x {} addrspace(10)*] zeroinitializer, {} addrspace(10)* %123, 0, !dbg !169
     %125 = addrspacecast {}* inttoptr (i64 139192734499008 to {}*) to {} addrspace(10)*, !dbg !169
     %126 = insertvalue [2 x {} addrspace(10)*] %124, {} addrspace(10)* %125, 1, !dbg !169
; │││┌ @ int.jl:86 within `-`
      %127 = bitcast {}* inttoptr (i64 139198668290448 to {}*) to {} addrspace(10)**, !dbg !176
      %128 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)** %127, i64 0, !dbg !176
      %129 = sub i64 %120, 1, !dbg !176
; │││└
     %130 = bitcast {}* inttoptr (i64 139198648237312 to {}*) to {} addrspace(10)**, !dbg !169
     %131 = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)** %130, i64 0, !dbg !169
     %132 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(11)* %111, align 8, !dbg !169, !alias.scope !60, !noalias !63
     %133 = bitcast i8 addrspace(1)* %132 to float addrspace(1)*, !dbg !169
     %134 = getelementptr inbounds float, float addrspace(1)* %133, i64 %129, !dbg !169
     %135 = load float, float addrspace(1)* %134, align 8, !dbg !169, !tbaa !100, !invariant.load !44
; └└└
; ...
!44 = !{}
julia> @device_code_llvm  raw=true optimize=true foo_aligned(CUDA.ones(1024), CUDA.ones(1024))
; PTX CompilerJob of MethodInstance for kernel_aligned(::CuDeviceVector{Float32, 1}, ::CuDeviceVector{Float32, 1}) for sm_75
;  @ REPL[3]:1 within `kernel_aligned`
define ptx_kernel void @_Z14kernel_aligned13CuDeviceArrayI7Float32Li1ELi1EES1_({ i64, i32 } %state, { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %1) local_unnamed_addr !dbg !41 {
conversion:
; ...
; │┌ @ none within `pointerref_ldg`
; ││┌ @ none within `macro expansion` @ /home/vchuravy/.julia/packages/LLVM/b3kFs/src/interop/base.jl:39
; │││┌ @ int.jl:86 within `-`
      %13 = add nsw i64 %8, -1, !dbg !96
; │││└
     %14 = getelementptr inbounds float, float addrspace(1)* %10, i64 %13, !dbg !97
     %15 = load float, float addrspace(1)* %14, align 8, !dbg !97, !tbaa !93
; └└└

vchuravy avatar Feb 13 '25 09:02 vchuravy