libcudacxx icon indicating copy to clipboard operation
libcudacxx copied to clipboard

cuda::barrier<thread_scope_thread> uses exponential backoff

Open gonzalobg opened this issue 3 years ago • 0 comments

Example (https://cuda.godbolt.org/z/W541eW):

#include <cooperative_groups.h>
#include <cuda/pipeline>
#include <cuda/barrier>
#include <cuda_pipeline.h>
#include <cuda_awbarrier.h>


using T = float4;
__global__ void test_pipe_intr(T* input, T volatile* out) {
   __shared__ T smem[32];
     __pipeline_memcpy_async(smem + threadIdx.x, input + threadIdx.x, sizeof(T));
     __pipeline_commit();
     __pipeline_wait_prior(0);
}

__global__ void test_bar(T* input, T volatile* out) {
   __shared__ T smem[32];
     cuda::barrier<cuda::thread_scope_thread> b{1};
     auto g = cooperative_groups::this_thread();
     cuda::memcpy_async(g, smem + threadIdx.x, input + threadIdx.x, cuda::aligned_size_t<16>(sizeof(T)), b);
     b.arrive_and_wait();
}

I expect test_pipe_intr and test_bar to generate similar (or the same) code (see #135 for test_pipe_intr codegen).

However, test_bar generates the code below.

One issue is that it does not generate an LDGSTS.E.BYPASS.

Another issue is that it uses exponential backoff. However, according to the thread_scope_thread docs:

A thread scope specifies the kind of threads that can synchronize with each other using a primitive such as an atomic or a barrier.

Each thread (CPU or GPU) is related to itself by the thread thread scope, specified with thread_scope_thread.

There is only one thread synchronizing with the barrier, and this should not be necessary.

test_bar(float4*, float4 volatile*):
 IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] 
 S2R R0, SR_EQMASK 
 IADD3 R1, R1, -0x8, RZ 
 IMAD.MOV.U32 R4, RZ, RZ, 0x7fffffff 
 ULDC.64 UR36, c[0x0][0x118] 
 S2R R3, SR_LTMASK 
 IMAD.MOV.U32 R5, RZ, RZ, 0x7fffffff 
 IADD3 R24, P1, R1, c[0x0][0x20], RZ 
 BSSY B0, `(.L_7) 
 STL.64 [R1], R4 
 IMAD.X R25, RZ, RZ, c[0x0][0x24], P1 
 LOP3.LUT R0, R3, R0, RZ, 0xc0, !PT 
 POPC R0, R0 
 IMAD.WIDE.U32 R2, R0, 0x10, RZ 
 ISETP.GT.U32.AND P0, PT, R2, 0xf, PT 
 ISETP.GT.U32.AND.EX P0, PT, R3, RZ, PT, P0 
 @P0 BRA `(.L_8) 
 S2R R5, SR_TID.X 
 IADD3 R4, P0, -R2, -0x10, RZ 
 IADD3.X R2, ~R3, -0x1, RZ, P0, !PT 
 ISETP.NE.U32.AND P0, PT, R4, RZ, PT 
 ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0 
 IADD3 R3, P1, R0.reuse, R5, RZ 
 IMAD.IADD R0, R0, 0x1, R5 
 IMAD.X R4, RZ, RZ, RZ, P1 
 LEA R2, P1, R3, c[0x0][0x160], 0x4 
 IMAD.SHL.U32 R5, R0, 0x10, RZ 
 LEA.HI.X R3, R3, c[0x0][0x164], R4, 0x4, P1 
 @!P0 LDGSTS.E.128 [R5], [R2.64] 
 @!P0 LDGSTS.E.128 [R5+0x10], [R2.64+0x10] 
 @P0 LDGSTS.E.128 [R5], [R2.64] 
.L_8:
 BSYNC B0 
.L_7:
 LDGDEPBAR 
 DEPBAR.LE SB0, 0x0 
 IMAD.MOV.U32 R4, RZ, RZ, 0x0 
 IMAD.MOV.U32 R5, RZ, RZ, 0x1 
 NOP
 NOP
 NOP
 NOP
 MEMBAR.ALL.CTA 
 ATOM.E.ADD.64.STRONG.SM P0, R2, [R24.64], R4 
 BSSY B0, `(.L_9) 
 IMAD.MOV.U32 R17, RZ, RZ, R3 
 @P0 BRA `(.L_10) 
 QSPC.E.S P0, RZ, [R24] 
 @!P0 BRA `(.L_11) 
 BSSY B1, `(.L_12) 
 LOP3.LUT R3, R24, 0xffffff, RZ, 0xc0, !PT 
.L_13:
 LDS.64 R16, [R3] 
 IADD3 R18, P0, RZ, R16, RZ 
 IADD3.X R19, R17, 0x1, RZ, P0, !PT 
 ATOMS.CAST.SPIN.64 R18, [R3], R16, R18 
 ISETP.EQ.U32.AND P0, PT, R18, 0x1, PT 
 ISETP.EQ.U32.AND.EX P0, PT, R19, RZ, PT, P0 
 @!P0 BRA `(.L_13) 
 BSYNC B1 
.L_12:
 IMAD.MOV.U32 R2, RZ, RZ, R16 
 BRA `(.L_10) 
.L_11:
 LD.E.64 R2, [R24.64] 
 IADD3 R4, P0, RZ, R2, RZ 
 IMAD.MOV.U32 R17, RZ, RZ, R3 
 IADD3.X R5, R3, 0x1, RZ, P0, !PT 
 ST.E.64 [R24.64], R4 
.L_10:
 BSYNC B0 
.L_9:
 IADD3 R3, P0, RZ, R2.reuse, RZ 
 BSSY B0, `(.L_14) 
 LOP3.LUT R0, R3, R2, RZ, 0x3c, !PT 
 IADD3.X R4, R17, 0x1, RZ, P0, !PT 
 ISETP.GT.U32.AND P0, PT, R0, -0x1, PT 
 LOP3.LUT R0, R4, R17, RZ, 0x3c, !PT 
 ISETP.GT.AND.EX P0, PT, R0, -0x1, PT, P0 
 @P0 BRA `(.L_15) 
 IMAD.MOV.U32 R4, RZ, RZ, RZ 
 IMAD.MOV.U32 R5, RZ, RZ, R2 
 ATOM.E.ADD.64.STRONG.SM P0, RZ, [R24.64], R4 
 @P0 BRA `(.L_15) 
 QSPC.E.S P0, RZ, [R24] 
 @!P0 BRA `(.L_16) 
 BSSY B1, `(.L_17) 
 LOP3.LUT R3, R24, 0xffffff, RZ, 0xc0, !PT 
.L_18:
 LDS.64 R4, [R3] 
 IADD3 R6, P0, RZ, R4, RZ 
 IMAD.X R7, R5, 0x1, R2, P0 
 ATOMS.CAST.SPIN.64 R6, [R3], R4, R6 
 ISETP.EQ.U32.AND P0, PT, R6, 0x1, PT 
 ISETP.EQ.U32.AND.EX P0, PT, R7, RZ, PT, P0 
 @!P0 BRA `(.L_18) 
 BSYNC B1 
.L_17:
 BRA `(.L_15) 
.L_16:
 LD.E.64 R4, [R24.64] 
 IADD3 R4, P0, RZ, R4, RZ 
 IMAD.X R5, R5, 0x1, R2, P0 
 ST.E.64 [R24.64], R4 
.L_15:
 BSYNC B0 
.L_14:
 CS2R R22, SR_GLOBALTIMERLO 
 LD.E.64.STRONG.SM R4, [R24.64] 
 LOP3.LUT R0, R4, R2, RZ, 0x3c, !PT 
 LOP3.LUT R4, R5, R17, RZ, 0x3c, !PT 
 ISETP.GE.U32.AND P0, PT, R0, RZ, PT 
 ISETP.GE.AND.EX P0, PT, R4, RZ, PT, P0 
 @!P0 EXIT 
 BSSY B7, `(.L_19) 
 IMAD.MOV.U32 R19, RZ, RZ, RZ 
.L_24:
 ISETP.GE.AND P0, PT, R19, 0x10, PT 
 YIELD 
 BSSY B6, `(.L_20) 
 @!P0 BRA `(.L_21) 
 CS2R R4, SR_GLOBALTIMERLO 
 IADD3 R18, P0, -R22, R4, RZ 
 IMAD.X R16, R5, 0x1, ~R23, P0 
 ISETP.GT.U32.AND P0, PT, R18, 0x3d08ff, PT 
 ISETP.GT.AND.EX P0, PT, R16, RZ, PT, P0 
 @P0 BRA `(.L_22) 
 ISETP.GE.U32.AND P0, PT, R18, 0x9c40, PT 
 ISETP.GE.AND.EX P0, PT, R16, RZ, PT, P0 
 @!P0 BRA `(.L_23) 
 ISETP.GE.U32.AND P0, PT, R18, -0x4, PT 
 ISETP.GE.AND.EX P0, PT, R16, 0x3, PT, P0 
 @!P0 BRA `(.L_6) 
 UMOV UR8, 32@lo($str) 
 IMAD.MOV.U32 R8, RZ, RZ, 0x128 
 UMOV UR9, 32@hi($str) 
 IMAD.MOV.U32 R12, RZ, RZ, 0x1 
 UMOV UR6, 32@lo($str$1) 
 IMAD.MOV.U32 R13, RZ, RZ, RZ 
 UMOV UR7, 32@hi($str$1) 
 IMAD.U32 R4, RZ, RZ, UR8 
 UMOV UR4, 32@lo(__unnamed_1) 
 IMAD.U32 R5, RZ, RZ, UR9 
 UMOV UR5, 32@hi(__unnamed_1) 
 IMAD.U32 R6, RZ, RZ, UR6 
 MOV R20, 32@lo((test_bar(float4*, float4 volatile*) + .L_6@srel)) 
 IMAD.U32 R7, RZ, RZ, UR7 
 MOV R21, 32@hi((test_bar(float4*, float4 volatile*) + .L_6@srel)) 
 IMAD.U32 R10, RZ, RZ, UR4 
 IMAD.U32 R11, RZ, RZ, UR5 
 CALL.ABS.NOINC `(__assertfail) 
.L_6:
 SHF.R.S32.HI R3, RZ, 0x1f, R16 
 LEA.HI R0, P0, R3, R18, RZ, 0x2 
 IMAD.X R3, RZ, RZ, R16, P0 
 SHF.R.U64 R0, R0, 0x2, R3 
 NANOSLEEP R0 
 BRA `(.L_23) 
.L_22:
 NANOSLEEP 0xf4240 
 BRA `(.L_23) 
.L_21:
 IADD3 R19, R19, 0x1, RZ 
.L_23:
 BSYNC B6 
.L_20:
 LD.E.64.STRONG.SM R4, [R24.64] 
 LOP3.LUT R0, R4, R2, RZ, 0x3c, !PT 
 LOP3.LUT R4, R5, R17, RZ, 0x3c, !PT 
 ISETP.GT.U32.AND P0, PT, R0, -0x1, PT 
 ISETP.GT.AND.EX P0, PT, R4, -0x1, PT, P0 
 @P0 BRA `(.L_24) 
 BSYNC B7 
.L_19:
 EXIT 
.L_25:
 BRA `(.L_25)
.L_67:

While this SASS was generated with CUDA 11.1, I can reproduce with CUDA 11.2.

gonzalobg avatar Mar 10 '21 12:03 gonzalobg