Miscompilation with nested loops and control flow
After removing all the KernelAbstractions parts from https://github.com/JuliaGPU/KernelAbstractions.jl/issues/517, this is the reproducer. The `assume` is there to remove an exceptional branch:
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
"""
assume(cond::Bool)
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
@inline assume(cond::Bool) = Base.llvmcall(("""
declare void @llvm.assume(i1)
define void @entry(i8) #0 {
%cond = icmp eq i8 %0, 1
call void @llvm.assume(i1 %cond)
ret void
}
attributes #0 = { alwaysinline }""", "entry"),
Nothing, Tuple{Bool}, cond)
@inline function assume_nonzero(CI::CartesianIndices)
    ntuple(Val(ndims(CI))) do I
        @inline
        indices = CI.indices[I]
        assume(indices.stop > 0)
    end
end
Base.@propagate_inbounds function expand(blocks, workitems, groupidx::Integer, idx::Integer)
    # this causes an exception branch and a div
    assume_nonzero(blocks)
    assume_nonzero(workitems)
    expand(blocks, workitems, blocks[groupidx], workitems[idx])
end
@inline function expand(blocks, workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
    nI = ntuple(Val(N)) do I
        Base.@_inline_meta
        stride = size(workitems, I)
        gidx = groupidx.I[I]
        (gidx - 1) * stride + idx.I[I]
    end
    CartesianIndex(nI)
end
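# For reference, with blocks = CartesianIndices((1, 1, 1)) and workitems = CartesianIndices((10, 1, 1))
# as used below, expand maps block 1 / workitem 3 to (1 - 1) * 10 + 3 = 3 in the first dimension,
# i.e. expand(blocks, workitems, 1, 3) == CartesianIndex(3, 1, 1).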
function gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx::Int64, Ny::Int64; )
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x
    I = @inbounds expand(blocks, workitems, bI, tI)
    if I in ndrange
        I = @inbounds expand(blocks, workitems, bI, tI)
        (i, j, k) = I.I
        sum = zero(eltype(tensor))
        for p = 1:Nx + 2
            for q = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[i, j, k] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1, 1, 1))
workitems = CartesianIndices((10, 1, 1))
@roc groupsize=512 gridsize=size(tensor) gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
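For reference, each element should come out as (Nx + 2) * (2Ny + 1) = 3 * 3 = 9.0; a plain-Julia CPU sketch of the same accumulation:
ref = zeros(Float64, nx, ny, nz)
for I in CartesianIndices(ref)
    s = 0.0
    for p = 1:Nx + 2, q = -Ny:Ny
        s += 1.0
    end
    ref[I] = s
end
println("cpu_reference:", ref) # [9.0, 9.0, 9.0, ...]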
Some more simplification:
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
"""
assume(cond::Bool)
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
@inline assume(cond::Bool) = Base.llvmcall(("""
declare void @llvm.assume(i1)
define void @entry(i8) #0 {
%cond = icmp eq i8 %0, 1
call void @llvm.assume(i1 %cond)
ret void
}
attributes #0 = { alwaysinline }""", "entry"),
Nothing, Tuple{Bool}, cond)
@inline function assume_nonzero(CI::CartesianIndices)
    ntuple(Val(ndims(CI))) do I
        @inline
        indices = CI.indices[I]
        assume(indices.stop > 0)
    end
end
@inline function expand(workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
    nI = ntuple(Val(N)) do I
        Base.@_inline_meta
        stride = size(workitems, I)
        gidx = groupidx.I[I]
        (gidx - 1) * stride + idx.I[I]
    end
    CartesianIndex(nI)
end
function gpu_kernel_xx!(workitems, tensor, Nx::Int64, Ny::Int64; )
    ndrange = CartesianIndices((10, 1, 1))
    blocks = CartesianIndices((1, 1, 1))
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x
    assume_nonzero(blocks)
    assume_nonzero(workitems)
    bI2 = @inbounds blocks[bI]
    wI2 = @inbounds workitems[tI]
    I = @inbounds expand(workitems, bI2, wI2)
    if I in ndrange
        wI2 = @inbounds workitems[tI]
        I = @inbounds expand(workitems, bI2, wI2)
        sum = zero(eltype(tensor))
        for p = 1:Nx + 2
            for q = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[I] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
And again:
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
    workitems = CartesianIndices((10, 1, 1))
    blocks = CartesianIndices((1, 1, 1))
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x
    bI2 = @inbounds blocks[bI]
    wI2 = @inbounds workitems[tI]
    @inbounds begin
        stride = size(workitems, 3)
        gidx = bI2.I[3]
        i3 = (gidx - 1) * stride + wI2.I[3]
    end
    if tI <= 10
        wI2 = @inbounds workitems[tI]
        sum = zero(eltype(tensor))
        for _ = 1:Nx + 2
            for _ = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[tI, 1, i3] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
println("ka_direct:", tensor)
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
    workitems = CartesianIndices((10, 1, 1))
    tI = AMDGPU.threadIdx().x
    wI2 = @inbounds workitems[tI]
    @inbounds begin
        i3 = wI2.I[3]
    end
    if tI <= 10
        wI2 = @inbounds workitems[tI]
        sum = zero(eltype(tensor))
        for _ = -Nx:Nx
            for _ = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[tI, 1, i3] = sum
    end
    return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1,1,1))
workitems = CartesianIndices((10,1,1))
@roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
AMDGPU.@device_code dir="amd_dump" begin
    @roc groupsize=512 gridsize=length(tensor) gpu_kernel_xx!(tensor, Nx, Ny)
end
The LLVM IR is becoming manageable:
; @ /home/vchuravy/src/KernelAbstractions/issue517/repr.jl:6 within `gpu_kernel_xx!`
define amdgpu_kernel void @_Z14gpu_kernel_xx_14ROCDeviceArrayI7Float64Ll3ELl1EE5Int64S1_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [3 x i64], i8 addrspace(1)*, i64 } %0, i64 signext %1, i64 signext %2) local_unnamed_addr #2 !dbg !43 {
conversion:
%.fca.0.0.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 0, 0
%.fca.0.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 0, 1
%.fca.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %0, 1
%3 = call i32 @llvm.amdgcn.workitem.id.x(), !dbg !47, !range !62
%.lhs.trunc = trunc i32 %3 to i16, !dbg !63
%4 = udiv i16 %.lhs.trunc, 10, !dbg !63
%5 = icmp ugt i32 %3, 9, !dbg !83
br i1 %5, label %L296, label %pass11, !dbg !89
L179: ; preds = %L179.preheader, %L211
%value_phi15 = phi i64 [ %8, %L211 ], [ %18, %L179.preheader ]
%value_phi16 = phi double [ %value_phi24, %L211 ], [ 0.000000e+00, %L179.preheader ]
br i1 %.not50.not, label %L211, label %L198, !dbg !90
L198: ; preds = %L198, %L179
%value_phi20 = phi double [ %6, %L198 ], [ %value_phi16, %L179 ]
%value_phi21 = phi i64 [ %7, %L198 ], [ %20, %L179 ]
%6 = fadd double %value_phi20, 1.000000e+00, !dbg !91
%.not51 = icmp eq i64 %value_phi21, %value_phi17, !dbg !95
%7 = add i64 %value_phi21, 1, !dbg !97
br i1 %.not51, label %L211, label %L198, !dbg !100
L211: ; preds = %L198, %L179
%value_phi24 = phi double [ %value_phi16, %L179 ], [ %6, %L198 ]
%.not52 = icmp eq i64 %value_phi15, %value_phi, !dbg !101
%8 = add i64 %value_phi15, 1, !dbg !102
br i1 %.not52, label %L222, label %L179, !dbg !103
L222: ; preds = %pass11, %L211
%value_phi27 = phi double [ 0.000000e+00, %pass11 ], [ %value_phi24, %L211 ]
%9 = call i64 @llvm.smax.i64(i64 %.fca.0.0.extract, i64 0), !dbg !104
%10 = call i64 @llvm.smax.i64(i64 %.fca.0.1.extract, i64 0), !dbg !104
%11 = mul i64 %9, %10, !dbg !128
%12 = zext i16 %4 to i64, !dbg !136
%13 = mul i64 %11, %12, !dbg !141
%14 = zext i32 %3 to i64, !dbg !142
%15 = add i64 %13, %14, !dbg !144
%16 = bitcast i8 addrspace(1)* %.fca.1.extract to double addrspace(1)*, !dbg !145
%17 = getelementptr inbounds double, double addrspace(1)* %16, i64 %15, !dbg !145
store double %value_phi27, double addrspace(1)* %17, align 8, !dbg !145, !tbaa !157
br label %L296, !dbg !160
L296: ; preds = %L222, %conversion
ret void, !dbg !161
pass11: ; preds = %conversion
%18 = sub i64 0, %1, !dbg !162
%.not = icmp sgt i64 %18, %1, !dbg !164
%19 = sext i1 %.not to i64, !dbg !168
%value_phi = xor i64 %19, %1, !dbg !168
%.not48.not = icmp slt i64 %value_phi, %18, !dbg !174
br i1 %.not48.not, label %L222, label %L179.preheader, !dbg !163
L179.preheader: ; preds = %pass11
%20 = sub i64 0, %2
%.not49 = icmp sgt i64 %20, %2
%21 = sext i1 %.not49 to i64
%value_phi17 = xor i64 %21, %2
%.not50.not = icmp slt i64 %value_phi17, %20
br label %L179, !dbg !90
}
using AMDGPU
function gpu_kernel_xx!(x, Nx::Int64, Ny::Int64)
    i = AMDGPU.threadIdx().x
    i ≤ 10 || return
    workitems = CartesianIndices((10, 1))
    @inbounds workitems[i] # accessing `workitems` triggers miscompilation
    y = 0f0
    for _ in -Nx:Nx
        for _ in -Ny:Ny
            y += 1f0
        end
    end
    @inbounds x[i] = y
    return
end
Nx, Ny = 1, 1
x = AMDGPU.zeros(Float32, 10)
@roc groupsize=length(x) gridsize=1 gpu_kernel_xx!(x, Nx, Ny)
println("ka_direct:", x)
# expected answer [9.0, 9.0, 9.0, ...]
AMDGPU.@device_code dir="devcode" @roc launch=false gpu_kernel_xx!(x, Nx, Ny)
define amdgpu_kernel void @_Z14gpu_kernel_xx_14ROCDeviceArrayI7Float32Ll1ELl1EE5Int64S1_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x i64], i8 addrspace(1)*, i64 } %0, i64 signext %1, i64 signext %2) local_unnamed_addr #1 {
conversion:
%.fca.1.extract = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %0, 1
%3 = call i32 @llvm.amdgcn.workitem.id.x(), !range !7
%4 = icmp ugt i32 %3, 9
br i1 %4, label %common.ret, label %pass
L86: ; preds = %L86.preheader, %L118
%value_phi3 = phi i64 [ %7, %L118 ], [ %11, %L86.preheader ]
%value_phi4 = phi float [ %value_phi12, %L118 ], [ 0.000000e+00, %L86.preheader ]
br i1 %.not3.not, label %L118, label %L105
L105: ; preds = %L105, %L86
%value_phi8 = phi float [ %5, %L105 ], [ %value_phi4, %L86 ]
%value_phi9 = phi i64 [ %6, %L105 ], [ %13, %L86 ]
%5 = fadd float %value_phi8, 1.000000e+00
%.not4 = icmp eq i64 %value_phi9, %value_phi5
%6 = add i64 %value_phi9, 1
br i1 %.not4, label %L118, label %L105
L118: ; preds = %L105, %L86
%value_phi12 = phi float [ %value_phi4, %L86 ], [ %5, %L105 ]
%.not5 = icmp eq i64 %value_phi3, %value_phi
%7 = add i64 %value_phi3, 1
br i1 %.not5, label %L150, label %L86
L150: ; preds = %pass, %L118
%value_phi15 = phi float [ 0.000000e+00, %pass ], [ %value_phi12, %L118 ]
%8 = bitcast i8 addrspace(1)* %.fca.1.extract to float addrspace(1)*
%9 = zext i32 %3 to i64
%10 = getelementptr inbounds float, float addrspace(1)* %8, i64 %9
store float %value_phi15, float addrspace(1)* %10, align 4, !tbaa !8
br label %common.ret
common.ret: ; preds = %L150, %conversion
ret void
pass: ; preds = %conversion
%11 = sub i64 0, %1
%.not = icmp sgt i64 %11, %1
%12 = sext i1 %.not to i64
%value_phi = xor i64 %12, %1
%.not1.not = icmp slt i64 %value_phi, %11
br i1 %.not1.not, label %L150, label %L86.preheader
L86.preheader: ; preds = %pass
%13 = sub i64 0, %2
%.not2 = icmp sgt i64 %13, %2
%14 = sext i1 %.not2 to i64
%value_phi5 = xor i64 %14, %2
%.not3.not = icmp slt i64 %value_phi5, %13
br label %L86
}
@vchuravy so this is a bug with ROCm LLVM?
Yeah, pretty much. As far as I can tell, this is very cursed.
@inbounds workitems[i] # accessing `workitems` triggers miscompilation
Yeah, that's what I figured out as well. Funnily, that access is gone in the optimized IR and the code is basically identical.
Identical LLVM modules lead to a very subtle codegen difference: https://godbolt.org/z/1z3b3fEdn
This seems fixed in LLVM 17: https://godbolt.org/z/97x8Kz4n3
My reproducer ended up looking like:
function gpu_kernel_xx!(tensor, Nx::Int64, Ny::Int64; )
    workitems = CartesianIndices((10, 1, 1))
    tI = AMDGPU.threadIdx().x
    if tI <= 10
        @inbounds workitems[tI]
        sum = 0.0
        for _ = -Nx:Nx # seemingly skipped
            for _ = -Ny:Ny
                sum += 1.0
            end
        end
        Base.unsafe_store!(tensor, sum, tI)
    end
    return nothing
end
@pxl-th do you have time to bisect this? Otherwise I can try and take a look.
If you have some infrastructure set up for this... Going through every commit seems like it'd take a lot of time.
There is the `git bisect` command and related workflow, which lets you nail down a regression without too many iterations.
No worries, I will bisect it.
I observe a similar behavior with LLVM 18 & Julia 1.12.
Julia MWE (reduced from a generic matrix multiplication kernel):
using LLVM.Interop
using AMDGPU
function matmatmul_kernel_bad!(C::AbstractArray{T}, A) where T
    assume(length(C) > 0)
    assume(length(A) > 0)
    idx = workitemIdx().x
    Cij = zero(T)
    for k in 1:length(A) # hardcoding loop to `1:4` gets rid of the bug
        @inbounds Cij += A[k]
    end
    @inbounds C[idx] = Cij
    return
end
function main()
    T = ComplexF16
    x = ROCArray(ones(T, 4))
    z = ROCArray(zeros(T, 4))
    @roc groupsize=4 gridsize=1 matmatmul_kernel_bad!(z, x)
    display(z); println()
    return
end
Running main() results in:
4-element ROCArray{ComplexF16, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
Float16(0.0) + Float16(4.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
With the for-loop hardcoded to the 1:4 range, the output is correct:
4-element ROCArray{ComplexF16, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
Float16(4.0) + Float16(0.0)im
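For completeness, the hardcoded-range variant mentioned above could look like the following sketch (the `matmatmul_kernel_good!` name is assumed; only the loop range differs from `matmatmul_kernel_bad!`):
function matmatmul_kernel_good!(C::AbstractArray{T}, A) where T
    assume(length(C) > 0)
    assume(length(A) > 0)
    idx = workitemIdx().x
    Cij = zero(T)
    for k in 1:4 # hardcoded range; the bug disappears
        @inbounds Cij += A[k]
    end
    @inbounds C[idx] = Cij
    return
end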
This is not present on LLVM 16 (Julia 1.11) and older versions.
Attaching LLVM IR, ASM dump of the kernel with LLVM 16 & LLVM 18: devcode.zip
I think this is due to https://github.com/JuliaLang/julia/pull/58837
I can confirm that applying that PR fixes at least your latest example.