hcc
Incorrectly compiled branching code with barriers
#include <hc.hpp>
#include <cstdio>

int main()
{
    hc::array_view<unsigned int, 2> g_mm(256, 8);
    // generate some low quality pseudo random data
    unsigned int s[4] = {0xA713B461, 0x7F1A1723, 0x162B712A, 0x91AB2673};
    for(int y = 0; y < 256; y++)
        for(int x = 0; x < 8; x++)
        {
            unsigned int e = ((s[0] << 5) | (s[1] >> 27));
            s[0] = (s[1] ^ s[2]) + s[3];
            s[1] = (s[3] << 3) + s[2];
            s[2] = e + s[2] + s[3];
            s[3] = (e << 15) | (s[0] >> 17);
            g_mm[y][x] = s[3];
        }
    hc::array_view<int> cnts(256);
    // 256 work items in a single tile, one per matrix row
    parallel_for_each(hc::tiled_extent<1>(256, 256), [=](hc::tiled_index<1> i) [[hc]]
    {
        tile_static unsigned int l_mm[256][8];
        unsigned int mm[8];
        for(int x = 0; x < 8; x++)
            mm[x] = l_mm[i.local[0]][x] = g_mm[i.local[0]][x];
        tile_static int sel[256];
        sel[i.local[0]] = -1;
        i.barrier.wait_with_tile_static_memory_fence();
        int cnt = 0;
        #pragma nounroll
        for(int k = 0; k < 256; k++)
        {
            bool one = mm[k / 32] & (1 << k);
            if(one)sel[k] = i.local[0];
            i.barrier.wait_with_tile_static_memory_fence();
            int y = sel[k];
            cnt = y != -1 ? cnt + 1 : cnt;
            if(one)
                for(int x = 0; x < 8; x++)
                    mm[x] ^= l_mm[y][x];
            i.barrier.wait_with_tile_static_memory_fence(); // *
            if(one) // *
                for(int x = 0; x < 8; x++)
                    l_mm[i.local[0]][x] = mm[x];
        }
        cnts[i.local[0]] = cnt;
    });
    for(int i = 0; i < 256; i += 16)
    {
        printf("%3i ", i);
        for(int j = 0; j < 16; j++)
            printf(" %i", cnts[i + j]);
        printf("\n");
    }
    // check with CPU version
    int cnt = 0;
    for(int k = 0; k < 256; k++)
    {
        int sel;
        for(sel = 0; sel < 256; sel++)
            if(g_mm[sel][k / 32] & (1 << k))
                break;
        if(sel == 256)continue;
        unsigned int row[8];
        for(int x = 0; x < 8; x++)
            row[x] = g_mm[sel][x];
        for(int y = 0; y < 256; y++)
            if(g_mm[y][k / 32] & (1 << k))
                for(int x = 0; x < 8; x++)
                    g_mm[y][x] ^= row[x];
        cnt++;
    }
    printf("\nCPU: %i\n", cnt);
    return 0;
}
This should produce 251 for all work items, but I randomly get outputs like this:
0 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
16 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
32 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
48 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251
64 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
80 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
96 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
112 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129 129
128 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
144 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
160 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
176 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65
192 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
208 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
224 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
240 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225
Interestingly, the result differs between individual wavefronts. The code prints correct results when I comment out either the "if(one)" line or the "i.barrier" line marked with "// *", even though commenting out the barrier should actually make the code incorrect. I suspect it has something to do with barriers placed inside a mix of mask branches and real branches, and an optimisation that merges the two "if(one)" blocks and turns the single "i.barrier.wait..." into two s_barrier instructions, one in each branch.
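Stripped down, the pattern I think triggers it is a single source-level barrier sandwiched between two branches on the same divergent condition. A hypothetical minimal sketch (I have not verified that this standalone form misbehaves; cond is just a stand-in for a data-dependent, per-lane condition like one above):

#include <hc.hpp>

int main()
{
    hc::array_view<int> out(256);
    parallel_for_each(hc::tiled_extent<1>(256, 256), [=](hc::tiled_index<1> i) [[hc]]
    {
        tile_static int buf[256];
        buf[i.local[0]] = 0;
        // stand-in for a condition that is uniform in some wavefronts and
        // mixed in others (assuming 64-wide wavefronts: wavefront 1 is mixed)
        bool cond = i.local[0] >= 96;
        if(cond)                                          // first branch on cond
            buf[i.local[0]] = 1;
        i.barrier.wait_with_tile_static_memory_fence();   // single barrier in the source
        if(cond)                                          // second branch on the same cond
            buf[i.local[0]] += 1;
        i.barrier.wait_with_tile_static_memory_fence();
        out[i.local[0]] = buf[i.local[0]];
    });
    return 0;
}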
Here is the problem. For some wavefronts within the workgroup, EXEC at the top will be zero, so the mask branch to BB0_7 is taken and only one s_barrier is executed, but wavefronts whose lanes diverge (non-zero EXEC in both regions) fall through the code below (mask branching) and execute both s_barrier instructions. Since all work items must execute exactly the same number of barriers, something goes wrong here:
s_and_saveexec_b64 s[10:11], s[6:7]
s_xor_b64 s[6:7], exec, s[10:11]
; mask branch BB0_7
s_cbranch_execz BB0_7
BB0_6: ; %for.inc62.critedge.i
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
s_barrier
v_mov_b32_e32 v29, v12
...
v_mov_b32_e32 v30, v13
BB0_7: ; %Flow
; in Loop: Header=BB0_1 Depth=1
s_or_saveexec_b64 s[6:7], s[6:7]
s_xor_b64 exec, exec, s[6:7]
; mask branch BB0_9
s_cbranch_execz BB0_9
BB0_8: ; %for.body49.i.preheader
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
...
ds_read2_b32 v[19:20], v19 offset0:4 offset1:5
s_waitcnt lgkmcnt(0)
s_barrier
v_xor_b32_e32 v29, v15, v12
...
v_mov_b32_e32 v37, v22
ds_write2_b32 v1, v27, v26 offset0:2 offset1:3
...
BB0_9: ; %for.inc62.i
; in Loop: Header=BB0_1 Depth=1
s_or_b64 exec, exec, s[6:7]
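For what it's worth, and consistent with the observation above that commenting out the second "if(one)" makes the results correct, making the write-back unconditional looks like a safe way to avoid the pattern in this kernel: for lanes where one is false, mm was never modified, so the store just writes back the values already in l_mm[i.local[0]], and the next read of l_mm only happens after the barrier at the top of the following iteration. A sketch of the changed tail of the k loop (my reasoning only, not verified against the compiler):

            if(one)
                for(int x = 0; x < 8; x++)
                    mm[x] ^= l_mm[y][x];
            i.barrier.wait_with_tile_static_memory_fence();
            for(int x = 0; x < 8; x++)          // unconditional write-back: a no-op
                l_mm[i.local[0]][x] = mm[x];    // for lanes where one == false

With that change the barrier is no longer sandwiched between two branches on the same divergent condition, so there is only one region left for the structurizer to place it in.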
Full ISA dump:
; %bb.0: ; %entry
s_load_dwordx2 s[6:7], s[4:5], 0x0
s_load_dword s1, s[4:5], 0x20
s_load_dword s9, s[4:5], 0x28
s_load_dword s8, s[4:5], 0x30
s_load_dword s10, s[4:5], 0x38
s_load_dwordx2 s[2:3], s[4:5], 0x40
s_load_dword s0, s[4:5], 0x58
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, s9, v0
v_mul_lo_i32 v1, v1, s1
s_ashr_i32 s9, s8, 31
v_mov_b32_e32 v3, s7
s_lshl_b64 s[8:9], s[8:9], 2
v_add_u32_e32 v1, s10, v1
v_ashrrev_i32_e32 v2, 31, v1
v_lshlrev_b64 v[1:2], 2, v[1:2]
v_mov_b32_e32 v4, s9
v_add_co_u32_e32 v1, vcc, s6, v1
v_addc_co_u32_e32 v2, vcc, v3, v2, vcc
v_add_co_u32_e32 v1, vcc, s8, v1
v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
global_load_dword v13, v[1:2], off
global_load_dword v14, v[1:2], off offset:4
global_load_dword v15, v[1:2], off offset:8
global_load_dword v16, v[1:2], off offset:12
global_load_dword v17, v[1:2], off offset:16
global_load_dword v18, v[1:2], off offset:20
global_load_dword v19, v[1:2], off offset:24
global_load_dword v20, v[1:2], off offset:28
v_lshlrev_b32_e32 v1, 5, v0
s_movk_i32 s1, 0x400
v_add_u32_e32 v1, s1, v1
v_lshlrev_b32_e32 v2, 2, v0
v_mov_b32_e32 v3, -1
ds_write_b32 v2, v3
s_load_dword s4, s[4:5], 0x60
s_mov_b32 s8, 0
v_mov_b32_e32 v4, 0
s_waitcnt vmcnt(7)
v_mov_b32_e32 v12, v13
s_waitcnt vmcnt(6)
v_mov_b32_e32 v11, v14
s_waitcnt vmcnt(5)
v_mov_b32_e32 v10, v15
s_waitcnt vmcnt(4)
v_mov_b32_e32 v9, v16
s_waitcnt vmcnt(3)
v_mov_b32_e32 v8, v17
s_waitcnt vmcnt(2)
ds_write2_b32 v1, v17, v18 offset0:4 offset1:5
ds_write2_b32 v1, v15, v16 offset0:2 offset1:3
ds_write2_b32 v1, v13, v14 offset1:1
s_waitcnt vmcnt(0)
ds_write2_b32 v1, v19, v20 offset0:6 offset1:7
s_waitcnt lgkmcnt(0)
s_barrier
v_mov_b32_e32 v5, v20
v_mov_b32_e32 v6, v19
v_mov_b32_e32 v7, v18
v_mov_b32_e32 v3, 0
v_mov_b32_e32 v2, 0
BB0_1: ; %for.body15.i
; =>This Loop Header: Depth=1
; Child Loop BB0_2 Depth 2
v_lshrrev_b32_e32 v21, 5, v4
s_mov_b64 s[6:7], exec
BB0_2: ; Parent Loop BB0_1 Depth=1
; => This Inner Loop Header: Depth=2
v_readfirstlane_b32 s5, v21
v_cmp_eq_u32_e32 vcc, s5, v21
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s5, src0
v_mov_b32_e32 v22, v13
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_2
; %bb.3: ; in Loop: Header=BB0_1 Depth=1
s_mov_b64 exec, s[6:7]
v_lshlrev_b32_e64 v21, v4, 1
v_and_b32_e32 v21, v22, v21
v_cmp_ne_u32_e32 vcc, 0, v21
s_and_saveexec_b64 s[6:7], vcc
; mask branch BB0_5
BB0_4: ; %if.then.i
; in Loop: Header=BB0_1 Depth=1
ds_write_b32 v3, v0
BB0_5: ; %if.end.i
; in Loop: Header=BB0_1 Depth=1
s_or_b64 exec, exec, s[6:7]
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b32 v21, v3
s_mov_b32 s9, s8
s_mov_b32 s10, s8
s_mov_b32 s11, s8
s_mov_b32 s12, s8
s_mov_b32 s13, s8
s_mov_b32 s14, s8
s_mov_b32 s15, s8
v_mov_b32_e32 v29, s15
v_mov_b32_e32 v28, s14
v_mov_b32_e32 v27, s13
v_mov_b32_e32 v26, s12
v_mov_b32_e32 v25, s11
v_mov_b32_e32 v24, s10
v_mov_b32_e32 v23, s9
v_mov_b32_e32 v22, s8
v_mov_b32_e32 v37, v29
s_and_b64 s[6:7], exec, vcc
s_xor_b64 s[6:7], s[6:7], -1
v_mov_b32_e32 v36, v28
v_mov_b32_e32 v35, v27
v_mov_b32_e32 v34, v26
v_mov_b32_e32 v33, v25
v_mov_b32_e32 v32, v24
v_mov_b32_e32 v31, v23
v_mov_b32_e32 v30, v22
s_and_saveexec_b64 s[10:11], s[6:7]
s_xor_b64 s[6:7], exec, s[10:11]
; mask branch BB0_7
s_cbranch_execz BB0_7
BB0_6: ; %for.inc62.critedge.i
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
s_barrier
v_mov_b32_e32 v29, v12
v_mov_b32_e32 v37, v20
v_mov_b32_e32 v28, v11
v_mov_b32_e32 v27, v10
v_mov_b32_e32 v26, v9
v_mov_b32_e32 v25, v8
v_mov_b32_e32 v24, v7
v_mov_b32_e32 v23, v6
v_mov_b32_e32 v22, v5
v_mov_b32_e32 v36, v19
v_mov_b32_e32 v35, v18
v_mov_b32_e32 v34, v17
v_mov_b32_e32 v33, v16
v_mov_b32_e32 v32, v15
v_mov_b32_e32 v31, v14
v_mov_b32_e32 v30, v13
BB0_7: ; %Flow
; in Loop: Header=BB0_1 Depth=1
s_or_saveexec_b64 s[6:7], s[6:7]
s_xor_b64 exec, exec, s[6:7]
; mask branch BB0_9
s_cbranch_execz BB0_9
BB0_8: ; %for.body49.i.preheader
; in Loop: Header=BB0_1 Depth=1
s_waitcnt lgkmcnt(0)
v_lshlrev_b32_e32 v13, 5, v21
v_add_u32_e32 v19, s1, v13
ds_read2_b32 v[13:14], v19 offset0:6 offset1:7
ds_read2_b32 v[15:16], v19 offset1:1
ds_read2_b32 v[17:18], v19 offset0:2 offset1:3
ds_read2_b32 v[19:20], v19 offset0:4 offset1:5
s_waitcnt lgkmcnt(0)
s_barrier
v_xor_b32_e32 v29, v15, v12
v_xor_b32_e32 v28, v16, v11
v_xor_b32_e32 v27, v17, v10
v_xor_b32_e32 v26, v18, v9
v_xor_b32_e32 v25, v19, v8
v_xor_b32_e32 v24, v20, v7
v_xor_b32_e32 v23, v13, v6
v_xor_b32_e32 v22, v14, v5
v_mov_b32_e32 v30, v29
v_mov_b32_e32 v31, v28
v_mov_b32_e32 v32, v27
v_mov_b32_e32 v33, v26
v_mov_b32_e32 v34, v25
v_mov_b32_e32 v35, v24
v_mov_b32_e32 v36, v23
v_mov_b32_e32 v37, v22
ds_write2_b32 v1, v27, v26 offset0:2 offset1:3
ds_write2_b32 v1, v29, v28 offset1:1
ds_write2_b32 v1, v23, v22 offset0:6 offset1:7
ds_write2_b32 v1, v25, v24 offset0:4 offset1:5
BB0_9: ; %for.inc62.i
; in Loop: Header=BB0_1 Depth=1
s_or_b64 exec, exec, s[6:7]
s_waitcnt lgkmcnt(0)
v_cmp_ne_u32_e32 vcc, -1, v21
v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
v_add_u32_e32 v4, 1, v4
v_mov_b32_e32 v13, v30
v_mov_b32_e32 v5, v22
v_cmp_ne_u32_e32 vcc, 0x100, v4
v_add_u32_e32 v3, 4, v3
s_and_b64 vcc, exec, vcc
v_mov_b32_e32 v14, v31
v_mov_b32_e32 v15, v32
v_mov_b32_e32 v16, v33
v_mov_b32_e32 v17, v34
v_mov_b32_e32 v18, v35
v_mov_b32_e32 v19, v36
v_mov_b32_e32 v20, v37
v_mov_b32_e32 v6, v23
v_mov_b32_e32 v7, v24
v_mov_b32_e32 v8, v25
v_mov_b32_e32 v9, v26
v_mov_b32_e32 v10, v27
v_mov_b32_e32 v11, v28
v_mov_b32_e32 v12, v29
s_cbranch_vccnz BB0_1
; %bb.10: ; %"_ZZ4mainENK3$_0clEN2hc11tiled_indexILi1EEE.exit"
v_lshlrev_b32_e32 v0, 2, v0
v_mov_b32_e32 v1, s3
v_add_co_u32_e32 v0, vcc, s2, v0
s_ashr_i32 s5, s4, 31
s_lshl_b64 s[2:3], s[4:5], 2
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
s_ashr_i32 s1, s0, 31
v_mov_b32_e32 v3, s3
v_add_co_u32_e32 v0, vcc, s2, v0
v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
s_lshl_b64 s[0:1], s[0:1], 2
v_mov_b32_e32 v3, s1
v_add_co_u32_e32 v0, vcc, s0, v0
v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
global_store_dword v[0:1], v2, off
s_endpgm
.Lfunc_end0: