Miscompilation with nested loops and control flow
After removing all the KernelAbstractions parts from https://github.com/JuliaGPU/KernelAbstractions.jl/issues/517, this is the reproducer. The `assume` is there to remove an exceptional branch:
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
"""
assume(cond::Bool)
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
@inline assume(cond::Bool) = Base.llvmcall(("""
declare void @llvm.assume(i1)
define void @entry(i8) #0 {
%cond = icmp eq i8 %0, 1
call void @llvm.assume(i1 %cond)
ret void
}
attributes #0 = { alwaysinline }""", "entry"),
Nothing, Tuple{Bool}, cond)
@inline function assume_nonzero(CI::CartesianIndices)
    ntuple(Val(ndims(CI))) do I
        @inline
        indices = CI.indices[I]
        assume(indices.stop > 0)
    end
end
Base.@propagate_inbounds function expand(blocks, workitems, groupidx::Integer, idx::Integer)
    # this causes an exception branch and a div
    assume_nonzero(blocks)
    assume_nonzero(workitems)
    expand(blocks, workitems, blocks[groupidx], workitems[idx])
end
@inline function expand(blocks, workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
    nI = ntuple(Val(N)) do I
        Base.@_inline_meta
        stride = size(workitems, I)
        gidx = groupidx.I[I]
        (gidx - 1) * stride + idx.I[I]
    end
    CartesianIndex(nI)
end
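# For reference, with blocks = CartesianIndices((1, 1, 1)) and workitems = CartesianIndices((10, 1, 1))
# as used below, expand maps block 1 / workitem 3 to (1 - 1) * 10 + 3 = 3 in the first dimension,
# i.e. expand(blocks, workitems, 1, 3) == CartesianIndex(3, 1, 1).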
function gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx::Int64, Ny::Int64; )
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x
    I = @inbounds expand(blocks, workitems, bI, tI)
    if I in ndrange
        I = @inbounds expand(blocks, workitems, bI, tI)
        (i, j, k) = I.I
        sum = zero(eltype(tensor))
        for p = 1:Nx + 2
            for q = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[i, j, k] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1, 1, 1))
workitems = CartesianIndices((10, 1, 1))
@roc groupsize=512 gridsize=size(tensor) gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
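For reference, each element should come out as (Nx + 2) * (2Ny + 1) = 3 * 3 = 9.0; a plain-Julia CPU sketch of the same accumulation:
ref = zeros(Float64, nx, ny, nz)
for I in CartesianIndices(ref)
    s = 0.0
    for p = 1:Nx + 2, q = -Ny:Ny
        s += 1.0
    end
    ref[I] = s
end
println("cpu_reference:", ref) # [9.0, 9.0, 9.0, ...]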
Some more simplification:
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
"""
assume(cond::Bool)
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
@inline assume(cond::Bool) = Base.llvmcall(("""
declare void @llvm.assume(i1)
define void @entry(i8) #0 {
%cond = icmp eq i8 %0, 1
call void @llvm.assume(i1 %cond)
ret void
}
attributes #0 = { alwaysinline }""", "entry"),
Nothing, Tuple{Bool}, cond)
@inline function assume_nonzero(CI::CartesianIndices)
    ntuple(Val(ndims(CI))) do I
        @inline
        indices = CI.indices[I]
        assume(indices.stop > 0)
    end
end
@inline function expand(workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
    nI = ntuple(Val(N)) do I
        Base.@_inline_meta
        stride = size(workitems, I)
        gidx = groupidx.I[I]
        (gidx - 1) * stride + idx.I[I]
    end
    CartesianIndex(nI)
end
function gpu_kernel_xx!(workitems, tensor, Nx::Int64, Ny::Int64; )
    ndrange = CartesianIndices((10, 1, 1))
    blocks = CartesianIndices((1, 1, 1))
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x
    assume_nonzero(blocks)
    assume_nonzero(workitems)
    bI2 = @inbounds blocks[bI]
    wI2 = @inbounds workitems[tI]
    I = @inbounds expand(workitems, bI2, wI2)
    if I in ndrange
        wI2 = @inbounds workitems[tI]
        I = @inbounds expand(workitems, bI2, wI2)
        sum = zero(eltype(tensor))
        for p = 1:Nx + 2
            for q = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[I] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
And again:
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
    workitems = CartesianIndices((10, 1, 1))
    blocks = CartesianIndices((1, 1, 1))
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x
    bI2 = @inbounds blocks[bI]
    wI2 = @inbounds workitems[tI]
    @inbounds begin
        stride = size(workitems, 3)
        gidx = bI2.I[3]
        i3 = (gidx - 1) * stride + wI2.I[3]
    end
    if tI <= 10
        wI2 = @inbounds workitems[tI]
        sum = zero(eltype(tensor))
        for _ = 1:Nx + 2
            for _ = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[tI, 1, i3] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
println("ka_direct:", tensor)
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
    workitems = CartesianIndices((10, 1, 1))
    tI = AMDGPU.threadIdx().x
    wI2 = @inbounds workitems[tI]
    @inbounds begin
        i3 = wI2.I[3]
    end
    if tI <= 10
        wI2 = @inbounds workitems[tI]
        sum = zero(eltype(tensor))
        for _ = -Nx:Nx
            for _ = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[tI, 1, i3] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
AMDGPU.@device_code dir="amd_dump" begin
    @roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
end
The LLVM IR is becoming manageable:
; @ /home/vchuravy/src/KernelAbstractions/issue517/repr.jl:6 within `gpu_kernel_xx!`
define amdgpu_kernel void @_Z14gpu_kernel_xx_14ROCDeviceArrayI7Float64Ll3ELl1EE5Int64S1_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [3 x i64], i8 addrspace(1)*, i64 } %0, i64 signext %1, i64 signext %2) local_unnamed_addr #2 !dbg !43 {
conversion:
%.fca.0.0.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 0, 0
%.fca.0.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 0, 1
%.fca.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 1
%3 = call i32 @llvm.amdgcn.workitem.id.x(), !dbg !47, !range !62
%.lhs.trunc = trunc i32 %3 to i16, !dbg !63
%4 = udiv i16 %.lhs.trunc, 10, !dbg !63
%5 = icmp ugt i32 %3, 9, !dbg !83
br i1 %5, label %L296, label %pass11, !dbg !89
L179: ; preds = %L179.preheader, %L211
%value_phi15 = phi i64 [ %8, %L211 ], [ %18, %L179.preheader ]
%value_phi16 = phi double [ %value_phi24, %L211 ], [ 0.000000e+00, %L179.preheader ]
br i1 %.not50.not, label %L211, label %L198, !dbg !90
L198: ; preds = %L198, %L179
%value_phi20 = phi double [ %6, %L198 ], [ %value_phi16, %L179 ]
%value_phi21 = phi i64 [ %7, %L198 ], [ %20, %L179 ]
%6 = fadd double %value_phi20, 1.000000e+00, !dbg !91
%.not51 = icmp eq i64 %value_phi21, %value_phi17, !dbg !95
%7 = add i64 %value_phi21, 1, !dbg !97
br i1 %.not51, label %L211, label %L198, !dbg !100
L211: ; preds = %L198, %L179
%value_phi24 = phi double [ %value_phi16, %L179 ], [ %6, %L198 ]
%.not52 = icmp eq i64 %value_phi15, %value_phi, !dbg !101
%8 = add i64 %value_phi15, 1, !dbg !102
br i1 %.not52, label %L222, label %L179, !dbg !103
L222: ; preds = %pass11, %L211
%value_phi27 = phi double [ 0.000000e+00, %pass11 ], [ %value_phi24, %L211 ]
%9 = call i64 @llvm.smax.i64(i64 %.fca.0.0.extract, i64 0), !dbg !104
%10 = call i64 @llvm.smax.i64(i64 %.fca.0.1.extract, i64 0), !dbg !104
%11 = mul i64 %9, %10, !dbg !128
%12 = zext i16 %4 to i64, !dbg !136
%13 = mul i64 %11, %12, !dbg !141
%14 = zext i32 %3 to i64, !dbg !142
%15 = add i64 %13, %14, !dbg !144
%16 = bitcast i8 addrspace(1)* %.fca.1.extract to double addrspace(1)*, !dbg !145
%17 = getelementptr inbounds double, double addrspace(1)* %16, i64 %15, !dbg !145
store double %value_phi27, double addrspace(1)* %17, align 8, !dbg !145, !tbaa !157
br label %L296, !dbg !160
L296: ; preds = %L222, %conversion
ret void, !dbg !161
pass11: ; preds = %conversion
%18 = sub i64 0, %1, !dbg !162
%.not = icmp sgt i64 %18, %1, !dbg !164
%19 = sext i1 %.not to i64, !dbg !168
%value_phi = xor i64 %19, %1, !dbg !168
%.not48.not = icmp slt i64 %value_phi, %18, !dbg !174
br i1 %.not48.not, label %L222, label %L179.preheader, !dbg !163
L179.preheader: ; preds = %pass11
%20 = sub i64 0, %2
%.not49 = icmp sgt i64 %20, %2
%21 = sext i1 %.not49 to i64
%value_phi17 = xor i64 %21, %2
%.not50.not = icmp slt i64 %value_phi17, %20
br label %L179, !dbg !90
}
using AMDGPU
function gpu_kernel_xx!(x, Nx::Int64, Ny::Int64)
    i = AMDGPU.threadIdx().x
    i ≤ 10 || return
    workitems = CartesianIndices((10, 1))
    @inbounds workitems[i] # accessing `workitems` triggers miscompilation
    y = 0f0
    for _ in -Nx:Nx
        for _ in -Ny:Ny
            y += 1f0
        end
    end
    @inbounds x[i] = y
    return
end
Nx, Ny = 1, 1
x = AMDGPU.zeros(Float32, 10)
@roc groupsize=length(x) gridsize=1 gpu_kernel_xx!(x, Nx, Ny)
println("ka_direct:", x)
# expected answer [9.0, 9.0, 9.0, ...]
AMDGPU.@device_code dir="devcode" @roc launch=false gpu_kernel_xx!(x, Nx, Ny)
define amdgpu_kernel void @_Z14gpu_kernel_xx_14ROCDeviceArrayI7Float32Ll1ELl1EE5Int64S1_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x i64], i8 addrspace(1)*, i64 } %0, i64 signext %1, i64 signext %2) local_unnamed_addr #1 {
conversion:
%.fca.1.extract = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %0, 1
%3 = call i32 @llvm.amdgcn.workitem.id.x(), !range !7
%4 = icmp ugt i32 %3, 9
br i1 %4, label %common.ret, label %pass
L86: ; preds = %L86.preheader, %L118
%value_phi3 = phi i64 [ %7, %L118 ], [ %11, %L86.preheader ]
%value_phi4 = phi float [ %value_phi12, %L118 ], [ 0.000000e+00, %L86.preheader ]
br i1 %.not3.not, label %L118, label %L105
L105: ; preds = %L105, %L86
%value_phi8 = phi float [ %5, %L105 ], [ %value_phi4, %L86 ]
%value_phi9 = phi i64 [ %6, %L105 ], [ %13, %L86 ]
%5 = fadd float %value_phi8, 1.000000e+00
%.not4 = icmp eq i64 %value_phi9, %value_phi5
%6 = add i64 %value_phi9, 1
br i1 %.not4, label %L118, label %L105
L118: ; preds = %L105, %L86
%value_phi12 = phi float [ %value_phi4, %L86 ], [ %5, %L105 ]
%.not5 = icmp eq i64 %value_phi3, %value_phi
%7 = add i64 %value_phi3, 1
br i1 %.not5, label %L150, label %L86
L150: ; preds = %pass, %L118
%value_phi15 = phi float [ 0.000000e+00, %pass ], [ %value_phi12, %L118 ]
%8 = bitcast i8 addrspace(1)* %.fca.1.extract to float addrspace(1)*
%9 = zext i32 %3 to i64
%10 = getelementptr inbounds float, float addrspace(1)* %8, i64 %9
store float %value_phi15, float addrspace(1)* %10, align 4, !tbaa !8
br label %common.ret
common.ret: ; preds = %L150, %conversion
ret void
pass: ; preds = %conversion
%11 = sub i64 0, %1
%.not = icmp sgt i64 %11, %1
%12 = sext i1 %.not to i64
%value_phi = xor i64 %12, %1
%.not1.not = icmp slt i64 %value_phi, %11
br i1 %.not1.not, label %L150, label %L86.preheader
L86.preheader: ; preds = %pass
%13 = sub i64 0, %2
%.not2 = icmp sgt i64 %13, %2
%14 = sext i1 %.not2 to i64
%value_phi5 = xor i64 %14, %2
%.not3.not = icmp slt i64 %value_phi5, %13
br label %L86
}
@vchuravy so this is a bug with ROCm LLVM?
Yeah, pretty much. As far as I can tell, this is very cursed.
@inbounds workitems[i] # accessing `workitems` triggers miscompilation
Yeah, that's what I figured out as well. Funnily, that access is gone in the optimized IR and the code is basically identical.
Identical LLVM modules lead to a very subtle codegen difference: https://godbolt.org/z/1z3b3fEdn
This seems fixed in LLVM 17: https://godbolt.org/z/97x8Kz4n3
My reproducer ended up looking like:
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
    workitems = CartesianIndices((10, 1, 1))
    tI = AMDGPU.threadIdx().x
    if tI <= 10
        @inbounds workitems[tI]
        sum = 0.0
        for _ = -Nx:Nx # seemingly skipped
            for _ = -Ny:Ny
                sum += 1.0
            end
        end
        Base.unsafe_store!(tensor, sum, tI)
    end
    return nothing
end
@pxl-th do you have time to bisect this? Otherwise I can try and take a look.
If you have some infrastructure set up for this... Going through every commit seems like it'd take a lot of time.
There is the `git bisect` command and related workflow, which lets you nail down a regression without too many iterations.
No worries, I will bisect it.
I observe a similar behavior with LLVM 18 & Julia 1.12.
Julia MWE (reduced from a generic matrix multiplication kernel):
using LLVM.Interop
using AMDGPU
function matmatmul_kernel_bad!(C::AbstractArray{T}, A) where T
    assume(length(C) > 0)
    assume(length(A) > 0)
    idx = workitemIdx().x
    Cij = zero(T)
    for k in 1:length(A) # hardcoding loop to `1:4` gets rid of the bug
        @inbounds Cij += A[k]
    end
    @inbounds C[idx] = Cij
    return
end
function main()
    T = ComplexF16
    x = ROCArray(ones(T, 4))
    z = ROCArray(zeros(T, 4))
    @roc groupsize=4 gridsize=1 matmatmul_kernel_bad!(z, x)
    display(z); println()
    return
end
Running main() results in:
4-element ROCArray{ComplexF16, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
Float16(0.0) + Float16(4.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
With the for-loop hardcoded to the 1:4 range, the output is correct:
4-element ROCArray{ComplexF16, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
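For completeness, the hardcoded-range variant mentioned above could look like the following sketch (the `matmatmul_kernel_good!` name is assumed; only the loop range differs from `matmatmul_kernel_bad!`):
function matmatmul_kernel_good!(C::AbstractArray{T}, A) where T
    assume(length(C) > 0)
    assume(length(A) > 0)
    idx = workitemIdx().x
    Cij = zero(T)
    for k in 1:4 # hardcoded range; the bug disappears
        @inbounds Cij += A[k]
    end
    @inbounds C[idx] = Cij
    return
end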
This is not present on LLVM 16 (Julia 1.11) and older versions.
Attaching LLVM IR, ASM dump of the kernel with LLVM 16 & LLVM 18: devcode.zip
I think this is due to https://github.com/JuliaLang/julia/pull/58837
I can confirm that applying that PR fixes at least your latest example.