llvm-project icon indicating copy to clipboard operation
llvm-project copied to clipboard

[Issue]: enabling `amdgpu-unsafe-fp-atomics` for gfx90a

Open pxl-th opened this issue 11 months ago • 3 comments

Problem Description

Hi! I'm one of the developers of the AMDGPU.jl library, which provides support for AMD GPU programming in Julia. To compile Julia GPU kernels we bypass HIP and use LLVM directly, so the `-munsafe-fp-atomics` flag is not available.

To enable hardware atomics, we instead add the `"amdgpu-unsafe-fp-atomics"="true"` function attribute to our LLVM IR during compilation. This works fine on gfx1100 devices, replacing the CAS loop with a hardware fadd.

However, for gfx90a devices this does nothing. I was wondering if I'm missing something else that needs to be done?

Here's an example Julia kernel, which does atomic fadd on the first array item:

@kernel function ker!(x)
    @inbounds @atomic x[1] += 1f0
end

Here's its optimized LLVM IR, containing `atomicrmw fadd float`, which is the same for gfx1100 and gfx90a (notice the `amdgpu-unsafe-fp-atomics` attribute):

click
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:10:11:12:13"
target triple = "amdgcn-amd-amdhsa"

; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #0

; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Function Attrs: cold noreturn nounwind
declare void @llvm.amdgcn.endpgm() #1

;  @ none within `gpu_ker!`
define amdgpu_kernel void @_Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, { [1 x i64], i8 addrspace(1)*, i64 } %1) local_unnamed_addr #2 !dbg !41 {
conversion:
  %.fca.0.0.0.0.extract = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, 0, 0, 0, 0
  %.fca.1.1.0.0.0.extract = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, 1, 1, 0, 0, 0
  %.fca.1.extract = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %1, 1
;  @ none within `gpu_ker!` @ /home/pxl-th/.julia/packages/KernelAbstractions/Zcyra/src/macros.jl:94
; ┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/ROCKernels.jl:144 within `#__validindex`
; │┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:172 within `blockIdx`
; ││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:95 within `blockIdx_x`
; │││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:93 within `workgroupIdx_x`
; ││││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `_index`
; │││││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `macro expansion` @ /home/pxl-th/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:38
        %2 = call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !45, !range !66
; │└└└└└
; │┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:164 within `threadIdx`
; ││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:89 within `threadIdx_x`
; │││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:87 within `workitemIdx_x`
; ││││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `_index`
; │││││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `macro expansion` @ /home/pxl-th/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:38
        %3 = call i32 @llvm.amdgcn.workitem.id.x(), !dbg !67, !range !76
; ││││└└
; ││││┌ @ int.jl:1068 within `+` @ int.jl:87
       %4 = add nuw nsw i32 %3, 1, !dbg !77
; │└└└└
; │┌ @ /home/pxl-th/.julia/packages/KernelAbstractions/Zcyra/src/nditeration.jl:84 within `expand`
; ││┌ @ abstractarray.jl:1291 within `getindex`
; │││┌ @ indices.jl:350 within `to_indices` @ indices.jl:354
; ││││┌ @ indices.jl:359 within `_to_indices1`
; │││││┌ @ indices.jl:277 within `to_index` @ indices.jl:292
; ││││││┌ @ number.jl:7 within `convert`
; │││││││┌ @ boot.jl:784 within `Int64`
; ││││││││┌ @ boot.jl:708 within `toInt64`
           %5 = zext i32 %4 to i64, !dbg !81
; ││└└└└└└└
; ││ @ /home/pxl-th/.julia/packages/KernelAbstractions/Zcyra/src/nditeration.jl:84 within `expand` @ /home/pxl-th/.julia/packages/KernelAbstractions/Zcyra/src/nditeration.jl:74
; ││┌ @ ntuple.jl:48 within `ntuple`
; │││┌ @ /home/pxl-th/.julia/packages/KernelAbstractions/Zcyra/src/nditeration.jl:78 within `#1`
; ││││┌ @ int.jl:86 within `-`
       %6 = zext i32 %2 to i64, !dbg !104
; ││││└
; ││││┌ @ int.jl:88 within `*`
       %7 = mul i64 %.fca.1.1.0.0.0.extract, %6, !dbg !112
; ││││└
; ││││┌ @ int.jl:87 within `+`
       %8 = add i64 %7, %5, !dbg !114
; │└└└└
; │ @ /home/pxl-th/.julia/dev/AMDGPU/src/ROCKernels.jl:145 within `#__validindex`
; │┌ @ multidimensional.jl:471 within `in`
; ││┌ @ tuple.jl:318 within `map`
; │││┌ @ range.jl:1439 within `in`
; ││││┌ @ int.jl:514 within `<=`
       %9 = icmp slt i64 %8, 1, !dbg !115
       %10 = icmp sgt i64 %8, %.fca.0.0.0.0.extract, !dbg !115
; └└└└└
  %11 = or i1 %9, %10, !dbg !62
  br i1 %11, label %L128, label %L104, !dbg !62

L104:                                             ; preds = %conversion
  %.fca.0.0.extract = extractvalue { [1 x i64], i8 addrspace(1)*, i64 } %1, 0, 0
;  @ none within `gpu_ker!` @ /home/pxl-th/.julia/packages/KernelAbstractions/Zcyra/src/macros.jl:95
; ┌ @ /home/pxl-th/.julia/dev/atomic.jl:6 within `macro expansion`
; │┌ @ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:95 within `getindex`
; ││┌ @ abstractarray.jl:702 within `checkbounds` @ abstractarray.jl:687
; │││┌ @ abstractarray.jl:763 within `checkindex`
; ││││┌ @ int.jl:513 within `<`
       %.not = icmp slt i64 %.fca.0.0.extract, 1, !dbg !127
; │││└└
; │││ @ abstractarray.jl:702 within `checkbounds`
     br i1 %.not, label %L115, label %L119, !dbg !133

L115:                                             ; preds = %L104
; │││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/quirks.jl:8 within `#throw_boundserror`
; ││││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/runtime.jl:113 within `signal_exception`
; │││││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/runtime.jl:11 within `exception_flag`
; ││││││┌ @ none within `kernel_state`
; │││││││┌ @ none within `macro expansion` @ /home/pxl-th/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:38
          %state.i.fca.0.extract.i = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0, !dbg !141
; │││││└└└
; │││││┌ @ pointer.jl:146 within `unsafe_store!` @ pointer.jl:146
        %memcpy_refined_dst.i = inttoptr i64 %state.i.fca.0.extract.i to i32*, !dbg !156
        store i32 1, i32* %memcpy_refined_dst.i, align 1, !dbg !156
; │││││└
; │││││ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/runtime.jl:115 within `signal_exception`
; │││││┌ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/execution_control.jl:52 within `endpgm`
        call void @llvm.amdgcn.endpgm(), !dbg !160
; │││││└
; │││││ @ /home/pxl-th/.julia/dev/AMDGPU/src/device/runtime.jl:116 within `signal_exception`
       unreachable, !dbg !164

L119:                                             ; preds = %L104
; │└└└└
; │┌ @ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:33 within `modify!` @ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/6HZfV/src/internal.jl:20
; ││┌ @ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/6HZfV/src/atomics.jl:359 within `atomic_pointermodify`
; │││┌ @ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/6HZfV/src/atomics.jl:255 within `llvm_atomic_op`
; ││││┌ @ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/6HZfV/src/atomics.jl:255 within `macro expansion` @ /home/pxl-th/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:38
       %12 = bitcast i8 addrspace(1)* %.fca.1.extract to float addrspace(1)*, !dbg !165
       %13 = atomicrmw fadd float addrspace(1)* %12, float 1.000000e+00 seq_cst, align 4, !dbg !165
; ││└└└
; ││ @ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:33 within `modify!`
    br label %L128, !dbg !176

L128:                                             ; preds = %L119, %conversion
; └└
;  @ none within `gpu_ker!` @ /home/pxl-th/.julia/packages/KernelAbstractions/Zcyra/src/macros.jl:97
  ret void, !dbg !179
}

attributes #0 = { nounwind readnone speculatable willreturn "amdgpu-unsafe-fp-atomics"="true"}
attributes #1 = { cold noreturn nounwind }
attributes #2 = { "amdgpu-unsafe-fp-atomics"="true"}

And here's the assembly output for gfx1100, notice global_atomic_add_f32 present:

click
	.text
	.amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
	.globl	_Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE ; -- Begin function _Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE
	.p2align	8
	.type	_Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE,@function
_Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE: ; @_Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE
.Lfunc_begin0:
	.file	1 "." "none"
	.loc	1 0 0                           ; none:0:0
	.cfi_sections .debug_frame
	.cfi_startproc
; %bb.0:                                ; %conversion
	s_clause 0x1
	s_load_b64 s[2:3], s[0:1], 0x68
	s_load_b64 s[4:5], s[0:1], 0x58
.Ltmp0:
	.file	2 "." "boot.jl"
	.loc	2 708 0 prologue_end            ; boot.jl:708:0
	v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 1, v0
.Ltmp1:
	.file	3 "." "int.jl"
	.loc	3 87 0                          ; int.jl:87:0
	s_waitcnt lgkmcnt(0)
	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
	v_mad_u64_u32 v[2:3], null, s2, s15, v[0:1]
	v_mov_b32_e32 v0, v3
	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
	v_mad_u64_u32 v[3:4], null, s3, s15, v[0:1]
.Ltmp2:
	.loc	3 514 0                         ; int.jl:514:0
	v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
	v_cmp_ge_i64_e64 s2, s[4:5], v[2:3]
	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
	s_and_b32 s2, vcc_lo, s2
	s_and_saveexec_b32 s3, s2
	s_cbranch_execz .LBB0_4
.Ltmp3:
; %bb.1:                                ; %L104
	.loc	3 0 0 is_stmt 0                 ; int.jl:0:0
	s_load_b64 s[2:3], s[0:1], 0x70
.Ltmp4:
	.loc	3 513 0 is_stmt 1               ; int.jl:513:0
	s_waitcnt lgkmcnt(0)
	v_cmp_gt_i64_e64 s2, s[2:3], 0
	s_delay_alu instid0(VALU_DEP_1)
	s_and_b32 vcc_lo, exec_lo, s2
	s_mov_b32 s2, -1
	s_cbranch_vccz .LBB0_3
.Ltmp5:
; %bb.2:                                ; %L119
	.loc	3 0 0 is_stmt 0                 ; int.jl:0:0
	s_load_b64 s[2:3], s[0:1], 0x78
	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
.Ltmp6:
	.file	4 "." "/home/pxl-th/.julia/packages/LLVM/Q3CgR/src/interop/base.jl"
	.loc	4 38 0 is_stmt 1                ; /home/pxl-th/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:38:0
	s_waitcnt vmcnt(0) lgkmcnt(0)
	s_waitcnt_vscnt null, 0x0
	global_atomic_add_f32 v0, v1, s[2:3]
	s_waitcnt_vscnt null, 0x0
	buffer_gl0_inv
	buffer_gl1_inv
	s_mov_b32 s2, 0
.Ltmp7:
.LBB0_3:                                ; %Flow
	.loc	4 0 0 is_stmt 0                 ; /home/pxl-th/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:0:0
	s_delay_alu instid0(SALU_CYCLE_1)
	s_and_not1_b32 vcc_lo, exec_lo, s2
	s_cbranch_vccz .LBB0_5
.LBB0_4:                                ; %UnifiedReturnBlock
	s_endpgm
.LBB0_5:                                ; %L115
	s_load_b64 s[0:1], s[0:1], 0x0
	v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1
	s_waitcnt lgkmcnt(0)
	v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
.Ltmp8:
	.file	5 "." "pointer.jl"
	.loc	5 146 0 is_stmt 1               ; pointer.jl:146:0
	s_clause 0x3
	flat_store_b8 v[0:1], v2 offset:3
	flat_store_b8 v[0:1], v2 offset:2
	flat_store_b8 v[0:1], v2 offset:1
	flat_store_b8 v[0:1], v3
.Ltmp9:
	.file	6 "." "/home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/execution_control.jl"
	.loc	6 52 0                          ; /home/pxl-th/.julia/dev/AMDGPU/src/device/gcn/execution_control.jl:52:0
	s_endpgm
	; divergent unreachable
	s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	s_endpgm

And here's the assembly for gfx90a; notice the regular `global_atomic_cmpswap` loop instead of a hardware atomic add:

click
       .text
        .amdgcn_target "amdgcn-amd-amdhsa--gfx90a:sramecc+"
        .globl  _Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE ; -- Begin function _Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE
        .p2align        8
        .type   _Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE,@function
_Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE: ; @_Z8gpu_ker_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float32Li1ELi1EE
.Lfunc_begin0:
        .file   1 "." "none"
        .loc    1 0 0                           ; none:0:0
        .cfi_sections .debug_frame
        .cfi_startproc
; %bb.0:                                ; %conversion
        s_load_dwordx2 s[0:1], s[4:5], 0x68
        s_load_dwordx2 s[2:3], s[4:5], 0x58
.Ltmp0:
        .file   2 "." "int.jl"
        .loc    2 87 0 prologue_end             ; int.jl:87:0
        v_add_u32_e32 v0, 1, v0
.Ltmp1:
        .file   3 "." "boot.jl"
        .loc    3 708 0                         ; boot.jl:708:0
        v_mov_b32_e32 v1, 0
.Ltmp2:
        .loc    2 87 0                          ; int.jl:87:0
        v_mov_b32_e32 v2, s6
        s_waitcnt lgkmcnt(0)
        s_mul_i32 s7, s1, s6
        v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
        v_add_u32_e32 v1, s7, v1
.Ltmp3:
        .loc    2 514 0                         ; int.jl:514:0
        v_cmp_lt_i64_e32 vcc, 0, v[0:1]
        v_cmp_ge_i64_e64 s[0:1], s[2:3], v[0:1]
        s_and_b64 s[0:1], vcc, s[0:1]
        s_and_saveexec_b64 s[2:3], s[0:1]
        s_cbranch_execz .LBB0_6
.Ltmp4:
; %bb.1:                                ; %L104
        .loc    2 0 0 is_stmt 0                 ; int.jl:0:0
        s_load_dwordx2 s[2:3], s[4:5], 0x70
        s_mov_b64 s[0:1], 0
.Ltmp5:
        .loc    2 513 0 is_stmt 1               ; int.jl:513:0
        s_waitcnt lgkmcnt(0)
        v_cmp_gt_i64_e64 s[6:7], s[2:3], 0
        s_mov_b64 s[2:3], -1
        s_and_b64 vcc, exec, s[6:7]
        s_cbranch_vccz .LBB0_5
.Ltmp6:
; %bb.2:                                ; %L119
        .loc    2 0 0 is_stmt 0                 ; int.jl:0:0
        s_load_dwordx2 s[2:3], s[4:5], 0x78
        v_mov_b32_e32 v2, 0
.Ltmp7:
        .file   4 "." "/users/antonsmi/.julia/packages/LLVM/Q3CgR/src/interop/base.jl"
        .loc    4 38 0 is_stmt 1                ; /users/antonsmi/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:38:0
        s_waitcnt lgkmcnt(0)
        s_load_dword s6, s[2:3], 0x0
        s_waitcnt lgkmcnt(0)
        v_mov_b32_e32 v1, s6
.LBB0_3:                                ; %atomicrmw.start
                                        ; =>This Inner Loop Header: Depth=1
        v_add_f32_e32 v0, 1.0, v1
        buffer_wbl2
        s_waitcnt vmcnt(0) lgkmcnt(0)
        global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
        s_waitcnt vmcnt(0)
        buffer_invl2
        buffer_wbinvl1_vol
        v_cmp_eq_u32_e32 vcc, v0, v1
        s_or_b64 s[0:1], vcc, s[0:1]
        v_mov_b32_e32 v1, v0
        s_andn2_b64 exec, exec, s[0:1]
        s_cbranch_execnz .LBB0_3
.Ltmp8:
; %bb.4:                                ; %Flow
        .loc    4 0 0 is_stmt 0                 ; /users/antonsmi/.julia/packages/LLVM/Q3CgR/src/interop/base.jl:0:0
        s_or_b64 exec, exec, s[0:1]
        s_mov_b64 s[2:3], 0
.LBB0_5:                                ; %Flow4
        s_and_b64 vcc, exec, s[2:3]
        s_cbranch_vccnz .LBB0_7
.LBB0_6:                                ; %UnifiedReturnBlock
        s_endpgm
.LBB0_7:                                ; %L115
        s_load_dwordx2 s[0:1], s[4:5], 0x0
        v_mov_b32_e32 v2, 0
        v_mov_b32_e32 v3, 1
        s_waitcnt lgkmcnt(0)
        v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
.Ltmp9:
        .file   5 "." "pointer.jl"
        .loc    5 146 0 is_stmt 1               ; pointer.jl:146:0
        flat_store_byte v[0:1], v2 offset:3
        flat_store_byte v[0:1], v2 offset:2
        flat_store_byte v[0:1], v2 offset:1
        flat_store_byte v[0:1], v3
.Ltmp10:
        .file   6 "." "/pfs/lustrep2/scratch/project_465000557/antonsmi/julia_depot/dev/AMDGPU/src/device/gcn/execution_control.jl"
        .loc    6 52 0                          ; /pfs/lustrep2/scratch/project_465000557/antonsmi/julia_depot/dev/AMDGPU/src/device/gcn/execution_control.jl:52:0
        s_endpgm
        ; divergent unreachable
        s_endpgm

Any help or advice is appreciated. Thanks!

Operating System

Ubuntu 22.04.3 LTS (Jammy Jellyfish)

CPU

AMD Ryzen 7 5800X 8-Core Processor

GPU

AMD Instinct MI250X, AMD Radeon RX 7900 XTX

ROCm Version

ROCm 6.0.0, ROCm 5.6.0

pxl-th avatar Mar 06 '24 11:03 pxl-th

@kzhuravl

lamb-j avatar Apr 18 '24 20:04 lamb-j

Hi @arsenm or Shilei (@shiltian), can you help with this?

kzhuravl avatar Apr 18 '24 22:04 kzhuravl

We're in the middle of overhauling the unsafe atomic handling (i.e. see https://github.com/llvm/llvm-project/pull/85052)

I'm not sure the handling here was ever updated properly for gfx11. I'm planning on fixing all of these cases soon.

For this particular case I think you need to use something stricter than system scope (e.g. add syncscope("agent") to the atomic)

arsenm avatar Apr 25 '24 09:04 arsenm

I'll close it as specifying syncscope fixes the issue. Thanks!

pxl-th avatar Jul 23 '24 16:07 pxl-th

> I'll close it as specifying syncscope fixes the issue. Thanks!

Note the attribute was just removed in edded8d7b5cb310524494cca317dd3582234b56f. You should now specify some combination of `!amdgpu.no.fine.grained.memory`, `!amdgpu.no.remote.memory`, and `!amdgpu.ignore.denormal.mode`, depending on what's appropriate for the situation.

arsenm avatar Aug 16 '24 18:08 arsenm