hcc
Incorrectly compiled branching code with barriers
#include <hc.hpp>
#include <cstdio>

int main()
{
    hc::array_view<unsigned int, 2> g_mm(256, 8);
    // generate some low quality pseudo random data
    unsigned int s[4] = {0xA713B461, 0x7F1A1723, 0x162B712A, 0x91AB2673};
    for(int y = 0; y < 256; y++)
        for(int x = 0; x < 8; x++)
        {
            unsigned int e = ((s[0] << 5) | (s[1] >> 27));
            s[0] = (s[1] ^ s[2]) + s[3];
            s[1] = (s[3] << 3) + s[2];
            s[2] = e + s[2] + s[3];
            s[3] = (e << 15) | (s[0] >> 17);
            g_mm[y][x] = s[3];
        }
    hc::array_view<int> cnts(256);
    // 256 work items in a single tile, one per matrix row
    parallel_for_each(hc::tiled_extent<1>(256, 256), [=](hc::tiled_index<1> i) [[hc]]
    {
        tile_static unsigned int l_mm[256][8];
        unsigned int mm[8];
        for(int x = 0; x < 8; x++)
            mm[x] = l_mm[i.local[0]][x] = g_mm[i.local[0]][x];
        tile_static int sel[256];
        sel[i.local[0]] = -1;
        i.barrier.wait_with_tile_static_memory_fence();
        int cnt = 0;
        #pragma nounroll
        for(int k = 0; k < 256; k++)
        {
            bool one = mm[k / 32] & (1 << k);
            if(one)sel[k] = i.local[0];
            i.barrier.wait_with_tile_static_memory_fence();
            int y = sel[k];
            cnt = y != -1 ? cnt + 1 : cnt;
            if(one)
                for(int x = 0; x < 8; x++)
                    mm[x] ^= l_mm[y][x];
            i.barrier.wait_with_tile_static_memory_fence(); // *
            if(one) // *
                for(int x = 0; x < 8; x++)
                    l_mm[i.local[0]][x] = mm[x];
        }
        cnts[i.local[0]] = cnt;
    });
    for(int i = 0; i < 256; i += 16)
    {
        printf("%3i ", i);
        for(int j = 0; j < 16; j++)
            printf(" %i", cnts[i + j]);
        printf("\n");
    }
    // check with CPU version
    int cnt = 0;
    for(int k = 0; k < 256; k++)
    {
        int sel;
        for(sel = 0; sel < 256; sel++)
            if(g_mm[sel][k / 32] & (1 << k))
                break;
        if(sel == 256)continue;
        unsigned int row[8];
        for(int x = 0; x < 8; x++)
            row[x] = g_mm[sel][x];
        for(int y = 0; y < 256; y++)
            if(g_mm[y][k / 32] & (1 << k))
                for(int x = 0; x < 8; x++)
                    g_mm[y][x] ^= row[x];
        cnt++;
    }
    printf("\nCPU: %i\n", cnt);
    return 0;
}
This should produce 251 for all work items, but I randomly get outputs like this:
0 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
16 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
32 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
48 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
64 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
80 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
96 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
112 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
128 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
144 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
160 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
176 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
192 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
208 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
224 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
240 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
Interestingly, the result differs between individual wavefronts. The code prints correct results when I comment out either the "if(one)" line or the "i.barrier" line marked with "// *", even though commenting out the barrier should actually make the code incorrect. I suspect it has something to do with barriers placed inside a mix of mask branches and real branches, and an optimisation that merges the two "if(one)" blocks and turns the single "i.barrier.wait..." into two s_barrier instructions, one in each branch.
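Stripped down, the pattern I think triggers it is a single source-level barrier sandwiched between two branches on the same divergent condition. A hypothetical minimal sketch (I have not verified that this standalone form misbehaves; cond is just a stand-in for a data-dependent, per-lane condition like one above):

#include <hc.hpp>

int main()
{
    hc::array_view<int> out(256);
    parallel_for_each(hc::tiled_extent<1>(256, 256), [=](hc::tiled_index<1> i) [[hc]]
    {
        tile_static int buf[256];
        buf[i.local[0]] = 0;
        // stand-in for a condition that is uniform in some wavefronts and
        // mixed in others (assuming 64-wide wavefronts: wavefront 1 is mixed)
        bool cond = i.local[0] >= 96;
        if(cond)                                          // first branch on cond
            buf[i.local[0]] = 1;
        i.barrier.wait_with_tile_static_memory_fence();   // single barrier in the source
        if(cond)                                          // second branch on the same cond
            buf[i.local[0]] += 1;
        i.barrier.wait_with_tile_static_memory_fence();
        out[i.local[0]] = buf[i.local[0]];
    });
    return 0;
}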
Here is the problem. For some wavefronts within the workgroup, EXEC at the top will be zero, so the mask branch to BB0_7 is taken and only one s_barrier is executed, but wavefronts whose lanes diverge (non-zero EXEC in both regions) fall through the code below (mask branching) and execute both s_barrier instructions. Since all work items must execute exactly the same number of barriers, something goes wrong here:
s_and_saveexec_b64 s[10:11], s[6:7]
s_xor_b64 s[6:7], exec, s[10:11]
; mask branch BB0_7
s_cbranch_execz BB0_7
BB0_6: ; %for.inc62.critedge.i
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
s_barrier
v_mov_b32_e32 v29, v12
...
v_mov_b32_e32 v30, v13
BB0_7: ; %Flow
; in Loop: Header=BB0_1 Depth=1
s_or_saveexec_b64 s[6:7], s[6:7]
s_xor_b64 exec, exec, s[6:7]
; mask branch BB0_9
s_cbranch_execz BB0_9
BB0_8: ; %for.body49.i.preheader
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
...
ds_read2_b32 v[19:20], v19 offset0:4 offset1:5
s_waitcnt lgkmcnt(0)
s_barrier
v_xor_b32_e32 v29, v15, v12
...
v_mov_b32_e32 v37, v22
ds_write2_b32 v1, v27, v26 offset0:2 offset1:3
...
BB0_9: ; %for.inc62.i
; in Loop: Header=BB0_1 Depth=1
s_or_b64 exec, exec, s[6:7]
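For what it's worth, and consistent with the observation above that commenting out the second "if(one)" makes the results correct, making the write-back unconditional looks like a safe way to avoid the pattern in this kernel: for lanes where one is false, mm was never modified, so the store just writes back the values already in l_mm[i.local[0]], and the next read of l_mm only happens after the barrier at the top of the following iteration. A sketch of the changed tail of the k loop (my reasoning only, not verified against the compiler):

            if(one)
                for(int x = 0; x < 8; x++)
                    mm[x] ^= l_mm[y][x];
            i.barrier.wait_with_tile_static_memory_fence();
            for(int x = 0; x < 8; x++)          // unconditional write-back: a no-op
                l_mm[i.local[0]][x] = mm[x];    // for lanes where one == false

With that change the barrier is no longer sandwiched between two branches on the same divergent condition, so there is only one region left for the structurizer to place it in.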
Full ISA dump:
; %bb.0: ; %entry
s_load_dwordx2 s[6:7], s[4:5], 0x0
s_load_dword s1, s[4:5], 0x20
s_load_dword s9, s[4:5], 0x28
s_load_dword s8, s[4:5], 0x30
s_load_dword s10, s[4:5], 0x38
s_load_dwordx2 s[2:3], s[4:5], 0x40
s_load_dword s0, s[4:5], 0x58
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, s9, v0
v_mul_lo_i32 v1, v1, s1
s_ashr_i32 s9, s8, 31
v_mov_b32_e32 v3, s7
s_lshl_b64 s[8:9], s[8:9], 2
v_add_u32_e32 v1, s10, v1
v_ashrrev_i32_e32 v2, 31, v1
v_lshlrev_b64 v[1:2], 2, v[1:2]
v_mov_b32_e32 v4, s9
v_add_co_u32_e32 v1, vcc, s6, v1
v_addc_co_u32_e32 v2, vcc, v3, v2, vcc
v_add_co_u32_e32 v1, vcc, s8, v1
v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
global_load_dword v13, v[1:2], off
global_load_dword v14, v[1:2], off offset:4
global_load_dword v15, v[1:2], off offset:8
global_load_dword v16, v[1:2], off offset:12
global_load_dword v17, v[1:2], off offset:16
global_load_dword v18, v[1:2], off offset:20
global_load_dword v19, v[1:2], off offset:24
global_load_dword v20, v[1:2], off offset:28
v_lshlrev_b32_e32 v1, 5, v0
s_movk_i32 s1, 0x400
v_add_u32_e32 v1, s1, v1
v_lshlrev_b32_e32 v2, 2, v0
v_mov_b32_e32 v3, -1
ds_write_b32 v2, v3
s_load_dword s4, s[4:5], 0x60
s_mov_b32 s8, 0
v_mov_b32_e32 v4, 0
s_waitcnt vmcnt(7)
v_mov_b32_e32 v12, v13
s_waitcnt vmcnt(6)
v_mov_b32_e32 v11, v14
s_waitcnt vmcnt(5)
v_mov_b32_e32 v10, v15
s_waitcnt vmcnt(4)
v_mov_b32_e32 v9, v16
s_waitcnt vmcnt(3)
v_mov_b32_e32 v8, v17
s_waitcnt vmcnt(2)
ds_write2_b32 v1, v17, v18 offset0:4 offset1:5
ds_write2_b32 v1, v15, v16 offset0:2 offset1:3
ds_write2_b32 v1, v13, v14 offset1:1
s_waitcnt vmcnt(0)
ds_write2_b32 v1, v19, v20 offset0:6 offset1:7
s_waitcnt lgkmcnt(0)
s_barrier
v_mov_b32_e32 v5, v20
v_mov_b32_e32 v6, v19
v_mov_b32_e32 v7, v18
v_mov_b32_e32 v3, 0
v_mov_b32_e32 v2, 0
BB0_1: ; %for.body15.i
; =>This Loop Header: Depth=1
; Child Loop BB0_2 Depth 2
v_lshrrev_b32_e32 v21, 5, v4
s_mov_b64 s[6:7], exec
BB0_2: ; Parent Loop BB0_1 Depth=1
; => This Inner Loop Header: Depth=2
v_readfirstlane_b32 s5, v21
v_cmp_eq_u32_e32 vcc, s5, v21
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s5, src0
v_mov_b32_e32 v22, v13
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_2
; %bb.3: ; in Loop: Header=BB0_1 Depth=1
s_mov_b64 exec, s[6:7]
v_lshlrev_b32_e64 v21, v4, 1
v_and_b32_e32 v21, v22, v21
v_cmp_ne_u32_e32 vcc, 0, v21
s_and_saveexec_b64 s[6:7], vcc
; mask branch BB0_5
BB0_4: ; %if.then.i
; in Loop: Header=BB0_1 Depth=1
ds_write_b32 v3, v0
BB0_5: ; %if.end.i
; in Loop: Header=BB0_1 Depth=1
s_or_b64 exec, exec, s[6:7]
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b32 v21, v3
s_mov_b32 s9, s8
s_mov_b32 s10, s8
s_mov_b32 s11, s8
s_mov_b32 s12, s8
s_mov_b32 s13, s8
s_mov_b32 s14, s8
s_mov_b32 s15, s8
v_mov_b32_e32 v29, s15
v_mov_b32_e32 v28, s14
v_mov_b32_e32 v27, s13
v_mov_b32_e32 v26, s12
v_mov_b32_e32 v25, s11
v_mov_b32_e32 v24, s10
v_mov_b32_e32 v23, s9
v_mov_b32_e32 v22, s8
v_mov_b32_e32 v37, v29
s_and_b64 s[6:7], exec, vcc
s_xor_b64 s[6:7], s[6:7], -1
v_mov_b32_e32 v36, v28
v_mov_b32_e32 v35, v27
v_mov_b32_e32 v34, v26
v_mov_b32_e32 v33, v25
v_mov_b32_e32 v32, v24
v_mov_b32_e32 v31, v23
v_mov_b32_e32 v30, v22
s_and_saveexec_b64 s[10:11], s[6:7]
s_xor_b64 s[6:7], exec, s[10:11]
; mask branch BB0_7
s_cbranch_execz BB0_7
BB0_6: ; %for.inc62.critedge.i
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
s_barrier
v_mov_b32_e32 v29, v12
v_mov_b32_e32 v37, v20
v_mov_b32_e32 v28, v11
v_mov_b32_e32 v27, v10
v_mov_b32_e32 v26, v9
v_mov_b32_e32 v25, v8
v_mov_b32_e32 v24, v7
v_mov_b32_e32 v23, v6
v_mov_b32_e32 v22, v5
v_mov_b32_e32 v36, v19
v_mov_b32_e32 v35, v18
v_mov_b32_e32 v34, v17
v_mov_b32_e32 v33, v16
v_mov_b32_e32 v32, v15
v_mov_b32_e32 v31, v14
v_mov_b32_e32 v30, v13
BB0_7: ; %Flow
; in Loop: Header=BB0_1 Depth=1
s_or_saveexec_b64 s[6:7], s[6:7]
s_xor_b64 exec, exec, s[6:7]
; mask branch BB0_9
s_cbranch_execz BB0_9
BB0_8: ; %for.body49.i.preheader
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
v_lshlrev_b32_e32 v13, 5, v21
v_add_u32_e32 v19, s1, v13
ds_read2_b32 v[13:14], v19 offset0:6 offset1:7
ds_read2_b32 v[15:16], v19 offset1:1
ds_read2_b32 v[17:18], v19 offset0:2 offset1:3
ds_read2_b32 v[19:20], v19 offset0:4 offset1:5
s_waitcnt lgkmcnt(0)
s_barrier
v_xor_b32_e32 v29, v15, v12
v_xor_b32_e32 v28, v16, v11
v_xor_b32_e32 v27, v17, v10
v_xor_b32_e32 v26, v18, v9
v_xor_b32_e32 v25, v19, v8
v_xor_b32_e32 v24, v20, v7
v_xor_b32_e32 v23, v13, v6
v_xor_b32_e32 v22, v14, v5
v_mov_b32_e32 v30, v29
v_mov_b32_e32 v31, v28
v_mov_b32_e32 v32, v27
v_mov_b32_e32 v33, v26
v_mov_b32_e32 v34, v25
v_mov_b32_e32 v35, v24
v_mov_b32_e32 v36, v23
v_mov_b32_e32 v37, v22
ds_write2_b32 v1, v27, v26 offset0:2 offset1:3
ds_write2_b32 v1, v29, v28 offset1:1
ds_write2_b32 v1, v23, v22 offset0:6 offset1:7
ds_write2_b32 v1, v25, v24 offset0:4 offset1:5
BB0_9: ; %for.inc62.i
; in Loop: Header=BB0_1 Depth=1
s_or_b64 exec, exec, s[6:7]
s_waitcnt lgkmcnt(0)
v_cmp_ne_u32_e32 vcc, -1, v21
v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
v_add_u32_e32 v4, 1, v4
v_mov_b32_e32 v13, v30
v_mov_b32_e32 v5, v22
v_cmp_ne_u32_e32 vcc, 0x100, v4
v_add_u32_e32 v3, 4, v3
s_and_b64 vcc, exec, vcc
v_mov_b32_e32 v14, v31
v_mov_b32_e32 v15, v32
v_mov_b32_e32 v16, v33
v_mov_b32_e32 v17, v34
v_mov_b32_e32 v18, v35
v_mov_b32_e32 v19, v36
v_mov_b32_e32 v20, v37
v_mov_b32_e32 v6, v23
v_mov_b32_e32 v7, v24
v_mov_b32_e32 v8, v25
v_mov_b32_e32 v9, v26
v_mov_b32_e32 v10, v27
v_mov_b32_e32 v11, v28
v_mov_b32_e32 v12, v29
s_cbranch_vccnz BB0_1
; %bb.10: ; %"_ZZ4mainENK3$_0clEN2hc11tiled_indexILi1EEE.exit"
v_lshlrev_b32_e32 v0, 2, v0
v_mov_b32_e32 v1, s3
v_add_co_u32_e32 v0, vcc, s2, v0
s_ashr_i32 s5, s4, 31
s_lshl_b64 s[2:3], s[4:5], 2
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
s_ashr_i32 s1, s0, 31
v_mov_b32_e32 v3, s3
v_add_co_u32_e32 v0, vcc, s2, v0
v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
s_lshl_b64 s[0:1], s[0:1], 2
v_mov_b32_e32 v3, s1
v_add_co_u32_e32 v0, vcc, s0, v0
v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
global_store_dword v[0:1], v2, off
s_endpgm
.Lfunc_end0: