libcudacxx
libcudacxx copied to clipboard
cuda::barrier<thread_scope_thread> uses exponential backoff
Example (https://cuda.godbolt.org/z/W541eW):
#include <cooperative_groups.h>
#include <cuda/pipeline>
#include <cuda/barrier>
#include <cuda_pipeline.h>
#include <cuda_awbarrier.h>
using T = float4;
// Reference kernel: one 16-byte async global->shared copy per thread using the
// raw __pipeline_* intrinsics from <cuda_pipeline.h>, then a full drain
// (wait_prior(0)). Serves as the codegen baseline for the comparison below
// (see #135 for its SASS). `out` is deliberately unused; the kernel exists
// only to be compared against test_bar.
// NOTE(review): no bounds check on threadIdx.x — assumes the launch uses at
// most 32 threads per block so smem[threadIdx.x] stays in range.
__global__ void test_pipe_intr(T* input, T volatile* out) {
__shared__ T smem[32];
// sizeof(T) == sizeof(float4) == 16 bytes, a single LDGSTS-sized transfer.
__pipeline_memcpy_async(smem + threadIdx.x, input + threadIdx.x, sizeof(T));
__pipeline_commit();
__pipeline_wait_prior(0);
}
// Kernel under test: performs the same per-thread 16-byte copy, but through
// cuda::memcpy_async completing on a cuda::barrier with thread_scope_thread
// and an expected arrival count of 1 — i.e. only the calling thread ever
// synchronizes on it. arrive_and_wait() should therefore never need to spin
// or back off, yet the SASS below contains a NANOSLEEP-based backoff loop.
// NOTE(review): same implicit assumption as test_pipe_intr — at most 32
// threads per block, or smem[threadIdx.x] is out of bounds.
__global__ void test_bar(T* input, T volatile* out) {
__shared__ T smem[32];
cuda::barrier<cuda::thread_scope_thread> b{1};
auto g = cooperative_groups::this_thread();
// aligned_size_t<16> asserts 16-byte alignment, matching the float4 element.
cuda::memcpy_async(g, smem + threadIdx.x, input + threadIdx.x, cuda::aligned_size_t<16>(sizeof(T)), b);
b.arrive_and_wait();
}
I expect test_pipe_intr and test_bar to generate similar (or the same) code (see #135 for the test_pipe_intr codegen).
However, test_bar generates the code below. One issue is that it does not generate an LDGSTS.E.BYPASS instruction.
Another issue is that it uses exponential backoff, even though, according to the thread_scope_thread docs:
A thread scope specifies the kind of threads that can synchronize with each other using a primitive such as an atomic or a barrier.
Each thread (CPU or GPU) is related to itself by the thread thread scope, specified with thread_scope_thread.
Since only one thread ever synchronizes with this barrier, the backoff/wait loop should not be necessary.
test_bar(float4*, float4 volatile*):
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28]
S2R R0, SR_EQMASK
IADD3 R1, R1, -0x8, RZ
IMAD.MOV.U32 R4, RZ, RZ, 0x7fffffff
ULDC.64 UR36, c[0x0][0x118]
S2R R3, SR_LTMASK
IMAD.MOV.U32 R5, RZ, RZ, 0x7fffffff
IADD3 R24, P1, R1, c[0x0][0x20], RZ
BSSY B0, `(.L_7)
STL.64 [R1], R4
IMAD.X R25, RZ, RZ, c[0x0][0x24], P1
LOP3.LUT R0, R3, R0, RZ, 0xc0, !PT
POPC R0, R0
IMAD.WIDE.U32 R2, R0, 0x10, RZ
ISETP.GT.U32.AND P0, PT, R2, 0xf, PT
ISETP.GT.U32.AND.EX P0, PT, R3, RZ, PT, P0
@P0 BRA `(.L_8)
S2R R5, SR_TID.X
IADD3 R4, P0, -R2, -0x10, RZ
IADD3.X R2, ~R3, -0x1, RZ, P0, !PT
ISETP.NE.U32.AND P0, PT, R4, RZ, PT
ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0
IADD3 R3, P1, R0.reuse, R5, RZ
IMAD.IADD R0, R0, 0x1, R5
IMAD.X R4, RZ, RZ, RZ, P1
LEA R2, P1, R3, c[0x0][0x160], 0x4
IMAD.SHL.U32 R5, R0, 0x10, RZ
LEA.HI.X R3, R3, c[0x0][0x164], R4, 0x4, P1
@!P0 LDGSTS.E.128 [R5], [R2.64]
@!P0 LDGSTS.E.128 [R5+0x10], [R2.64+0x10]
@P0 LDGSTS.E.128 [R5], [R2.64]
.L_8:
BSYNC B0
.L_7:
LDGDEPBAR
DEPBAR.LE SB0, 0x0
IMAD.MOV.U32 R4, RZ, RZ, 0x0
IMAD.MOV.U32 R5, RZ, RZ, 0x1
NOP
NOP
NOP
NOP
MEMBAR.ALL.CTA
ATOM.E.ADD.64.STRONG.SM P0, R2, [R24.64], R4
BSSY B0, `(.L_9)
IMAD.MOV.U32 R17, RZ, RZ, R3
@P0 BRA `(.L_10)
QSPC.E.S P0, RZ, [R24]
@!P0 BRA `(.L_11)
BSSY B1, `(.L_12)
LOP3.LUT R3, R24, 0xffffff, RZ, 0xc0, !PT
.L_13:
LDS.64 R16, [R3]
IADD3 R18, P0, RZ, R16, RZ
IADD3.X R19, R17, 0x1, RZ, P0, !PT
ATOMS.CAST.SPIN.64 R18, [R3], R16, R18
ISETP.EQ.U32.AND P0, PT, R18, 0x1, PT
ISETP.EQ.U32.AND.EX P0, PT, R19, RZ, PT, P0
@!P0 BRA `(.L_13)
BSYNC B1
.L_12:
IMAD.MOV.U32 R2, RZ, RZ, R16
BRA `(.L_10)
.L_11:
LD.E.64 R2, [R24.64]
IADD3 R4, P0, RZ, R2, RZ
IMAD.MOV.U32 R17, RZ, RZ, R3
IADD3.X R5, R3, 0x1, RZ, P0, !PT
ST.E.64 [R24.64], R4
.L_10:
BSYNC B0
.L_9:
IADD3 R3, P0, RZ, R2.reuse, RZ
BSSY B0, `(.L_14)
LOP3.LUT R0, R3, R2, RZ, 0x3c, !PT
IADD3.X R4, R17, 0x1, RZ, P0, !PT
ISETP.GT.U32.AND P0, PT, R0, -0x1, PT
LOP3.LUT R0, R4, R17, RZ, 0x3c, !PT
ISETP.GT.AND.EX P0, PT, R0, -0x1, PT, P0
@P0 BRA `(.L_15)
IMAD.MOV.U32 R4, RZ, RZ, RZ
IMAD.MOV.U32 R5, RZ, RZ, R2
ATOM.E.ADD.64.STRONG.SM P0, RZ, [R24.64], R4
@P0 BRA `(.L_15)
QSPC.E.S P0, RZ, [R24]
@!P0 BRA `(.L_16)
BSSY B1, `(.L_17)
LOP3.LUT R3, R24, 0xffffff, RZ, 0xc0, !PT
.L_18:
LDS.64 R4, [R3]
IADD3 R6, P0, RZ, R4, RZ
IMAD.X R7, R5, 0x1, R2, P0
ATOMS.CAST.SPIN.64 R6, [R3], R4, R6
ISETP.EQ.U32.AND P0, PT, R6, 0x1, PT
ISETP.EQ.U32.AND.EX P0, PT, R7, RZ, PT, P0
@!P0 BRA `(.L_18)
BSYNC B1
.L_17:
BRA `(.L_15)
.L_16:
LD.E.64 R4, [R24.64]
IADD3 R4, P0, RZ, R4, RZ
IMAD.X R5, R5, 0x1, R2, P0
ST.E.64 [R24.64], R4
.L_15:
BSYNC B0
.L_14:
CS2R R22, SR_GLOBALTIMERLO
LD.E.64.STRONG.SM R4, [R24.64]
LOP3.LUT R0, R4, R2, RZ, 0x3c, !PT
LOP3.LUT R4, R5, R17, RZ, 0x3c, !PT
ISETP.GE.U32.AND P0, PT, R0, RZ, PT
ISETP.GE.AND.EX P0, PT, R4, RZ, PT, P0
@!P0 EXIT
BSSY B7, `(.L_19)
IMAD.MOV.U32 R19, RZ, RZ, RZ
.L_24:
ISETP.GE.AND P0, PT, R19, 0x10, PT
YIELD
BSSY B6, `(.L_20)
@!P0 BRA `(.L_21)
CS2R R4, SR_GLOBALTIMERLO
IADD3 R18, P0, -R22, R4, RZ
IMAD.X R16, R5, 0x1, ~R23, P0
ISETP.GT.U32.AND P0, PT, R18, 0x3d08ff, PT
ISETP.GT.AND.EX P0, PT, R16, RZ, PT, P0
@P0 BRA `(.L_22)
ISETP.GE.U32.AND P0, PT, R18, 0x9c40, PT
ISETP.GE.AND.EX P0, PT, R16, RZ, PT, P0
@!P0 BRA `(.L_23)
ISETP.GE.U32.AND P0, PT, R18, -0x4, PT
ISETP.GE.AND.EX P0, PT, R16, 0x3, PT, P0
@!P0 BRA `(.L_6)
UMOV UR8, 32@lo($str)
IMAD.MOV.U32 R8, RZ, RZ, 0x128
UMOV UR9, 32@hi($str)
IMAD.MOV.U32 R12, RZ, RZ, 0x1
UMOV UR6, 32@lo($str$1)
IMAD.MOV.U32 R13, RZ, RZ, RZ
UMOV UR7, 32@hi($str$1)
IMAD.U32 R4, RZ, RZ, UR8
UMOV UR4, 32@lo(__unnamed_1)
IMAD.U32 R5, RZ, RZ, UR9
UMOV UR5, 32@hi(__unnamed_1)
IMAD.U32 R6, RZ, RZ, UR6
MOV R20, 32@lo((test_bar(float4*, float4 volatile*) + .L_6@srel))
IMAD.U32 R7, RZ, RZ, UR7
MOV R21, 32@hi((test_bar(float4*, float4 volatile*) + .L_6@srel))
IMAD.U32 R10, RZ, RZ, UR4
IMAD.U32 R11, RZ, RZ, UR5
CALL.ABS.NOINC `(__assertfail)
.L_6:
SHF.R.S32.HI R3, RZ, 0x1f, R16
LEA.HI R0, P0, R3, R18, RZ, 0x2
IMAD.X R3, RZ, RZ, R16, P0
SHF.R.U64 R0, R0, 0x2, R3
NANOSLEEP R0
BRA `(.L_23)
.L_22:
NANOSLEEP 0xf4240
BRA `(.L_23)
.L_21:
IADD3 R19, R19, 0x1, RZ
.L_23:
BSYNC B6
.L_20:
LD.E.64.STRONG.SM R4, [R24.64]
LOP3.LUT R0, R4, R2, RZ, 0x3c, !PT
LOP3.LUT R4, R5, R17, RZ, 0x3c, !PT
ISETP.GT.U32.AND P0, PT, R0, -0x1, PT
ISETP.GT.AND.EX P0, PT, R4, -0x1, PT, P0
@P0 BRA `(.L_24)
BSYNC B7
.L_19:
EXIT
.L_25:
BRA `(.L_25)
.L_67:
While this SASS was generated with CUDA 11.1, I can reproduce the issue with CUDA 11.2.