TornadoVM
Compilation error with nested for and if blocks
Describe the bug
The generated OpenCL code has a syntax error: the closing braces are unbalanced, so the OpenCL compiler rejects the kernel.
How To Reproduce
The following is my kernel in Java:
public static void scan(KernelContext context, IntArray input, IntArray sum) {
    // Work-group-local exclusive (Blelloch) scan: each work-group scans
    // n = 2 * localGroupSizeX elements in local memory.
    int[] temp = context.allocateIntLocalArray(4 * 128);
    int gid2 = context.globalIdx << 1;
    int group = context.groupIdx;
    int item = context.localIdx;
    int n = context.localGroupSizeX << 1;
    temp[2 * item] = input.get(gid2);
    temp[2 * item + 1] = input.get(gid2 + 1);
    int decale = 1;
    // Up-sweep (reduce) phase
    for (int d = n >> 1; d > 0; d = d >> 1) {
        context.localBarrier();
        if (item < d) {
            int ai = decale * ((item << 1) + 1) - 1;
            int bi = decale * ((item << 1) + 2) - 1;
            temp[bi] += temp[ai];
        }
        decale = decale << 1;
    }
    // Store the block total and clear the last element
    if (item == 0) {
        sum.set(group, temp[n - 1]);
        temp[n - 1] = 0;
    }
    // Down-sweep phase
    for (int d = 1; d < n; d = d << 1) {
        decale = decale >> 1;
        context.localBarrier();
        if (item < d) {
            int ai = decale * ((item << 1) + 1) - 1;
            int bi = decale * ((item << 1) + 2) - 1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    context.localBarrier();
    input.set(gid2, temp[item << 1]);
    input.set(gid2 + 1, temp[(item << 1) + 1]);
}
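For reference, this is what each work-group computes: an exclusive scan of its block of n elements, written back into input, with the block total written into sum. Here is a sequential Java sketch of the intended result (the scanReference helper is illustrative only, not part of the reproducer):

// Illustrative reference only: an exclusive (Blelloch-style) scan over blocks
// of n elements, mirroring what each work-group computes in the kernel above.
public static void scanReference(int[] input, int[] sum, int n) {
    for (int group = 0; group < input.length / n; group++) {
        int base = group * n;
        int running = 0;
        for (int i = 0; i < n; i++) {
            int value = input[base + i];
            input[base + i] = running; // exclusive: element i gets the sum of elements 0..i-1
            running += value;
        }
        sum[group] = running; // block total, as written by sum.set(group, ...) above
    }
}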
Expected behavior
The code compiles to a valid OpenCL kernel without any errors.
Computing system setup (please complete the following information):
- OS: Ubuntu 22.04
- TornadoVM version: v1.0.3
Additional context
This is the generated OpenCL code, which is invalid:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
__kernel void scan(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *input, __global uchar *sum)
{
  ulong ul_8, ul_43, ul_1, ul_0, ul_15;
  long l_14, l_13, l_42, l_41, l_7, l_6;
  int i_25, i_27, i_28, i_29, i_30, i_31, i_32, i_17, i_18, i_19, i_20, i_21, i_22, i_23, i_24, i_9, i_10, i_11, i_12, i_16, i_3, i_4, i_5, i_57, i_58, i_59, i_60, i_49, i_50, i_51, i_52, i_53, i_54, i_55, i_56, i_44, i_45, i_46, i_47, i_33, i_34, i_35, i_37, i_38, i_39, i_40;
  bool b_36, b_26, b_48;
  // BLOCK 0
  ul_0 = (ulong) input;
  ul_1 = (ulong) sum;
  __local int adi_2[512];
  i_3 = get_global_id(0);
  i_4 = i_3 << 1;
  i_5 = i_4 + 6;
  l_6 = (long) i_5;
  l_7 = l_6 << 2;
  ul_8 = ul_0 + l_7;
  i_9 = *((__global int *) ul_8);
  i_10 = get_local_id(0);
  i_11 = i_10 << 1;
  adi_2[i_11] = i_9;
  i_12 = i_4 + 7;
  l_13 = (long) i_12;
  l_14 = l_13 << 2;
  ul_15 = ul_0 + l_14;
  i_16 = *((__global int *) ul_15);
  i_17 = i_11 + 1;
  adi_2[i_17] = i_16;
  i_18 = i_11 + 2;
  i_19 = get_local_size(0);
  i_20 = i_19 << 1;
  i_21 = i_20 >> 1;
  // BLOCK 1 MERGES [0 5 ]
  i_22 = 1;
  i_23 = i_21;
  for(;i_23 >= 1;)
  {
    // BLOCK 2
    barrier(CLK_LOCAL_MEM_FENCE);
    i_24 = i_23 >> 1;
    i_25 = i_22 << 1;
    b_26 = i_10 < i_23;
    if(b_26)
    {
      // BLOCK 3
      i_27 = i_18 * i_22;
      i_28 = i_27 + -1;
      i_29 = adi_2[i_28];
      i_30 = i_22 * i_17;
      i_31 = i_30 + -1;
      i_32 = adi_2[i_31];
      i_33 = i_29 + i_32;
      adi_2[i_28] = i_33;
    } // B3
    else
    {
      // BLOCK 4
    } // B4
    // BLOCK 5 MERGES [4 3 ]
    i_34 = i_25;
    i_35 = i_24;
    i_22 = i_34;
    i_23 = i_35;
  } // B5
  // BLOCK 6
  b_36 = i_10 == 0;
  if(b_36)
  {
    // BLOCK 7
    i_37 = i_20 + -1;
    i_38 = adi_2[i_37];
    i_39 = get_group_id(0);
    i_40 = i_39 + 6;
    l_41 = (long) i_40;
    l_42 = l_41 << 2;
    ul_43 = ul_1 + l_42;
    *((__global int *) ul_43) = i_38;
    adi_2[i_37] = 0;
  } // B7
  else
  {
    // BLOCK 8
  } // B8
  // BLOCK 9 MERGES [8 7 ]
  // BLOCK 10 MERGES [9 14 ]
  i_44 = i_22;
  i_45 = 1;
  for(;i_45 < i_20;)
  {
    // BLOCK 11
    barrier(CLK_LOCAL_MEM_FENCE);
    i_46 = i_45 << 1;
    i_47 = i_44 >> 1;
    b_48 = i_10 < i_45;
    if(b_48)
    {
      // BLOCK 12
      i_49 = i_47 * i_17;
      i_50 = i_49 + -1;
      i_51 = adi_2[i_50];
      i_52 = i_18 * i_47;
      i_53 = i_52 + -1;
      i_54 = adi_2[i_53];
      adi_2[i_50] = i_54;
      i_55 = adi_2[i_53];
      i_56 = i_51 + i_55;
      adi_2[i_53] = i_56;
    } // B12
    else
    {
      // BLOCK 13
    } // B13
    // BLOCK 14 MERGES [13 12 ]
    i_57 = i_47;
    i_58 = i_46;
    i_44 = i_57;
    i_45 = i_58;
  } // B14
  // BLOCK 15
  barrier(CLK_LOCAL_MEM_FENCE);
  i_59 = adi_2[i_11];
  *((__global int *) ul_8) = i_59;
  i_60 = adi_2[i_17];
  *((__global int *) ul_15) = i_60;
  return;
} // B15
} // kernel
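The error is visible at the very end of the listing: BLOCK 15 is emitted without an opening brace, yet it is still closed with } // B15, so the file contains one closing brace more than it opens:

  return;
} // B15     <- this already closes the kernel body
} // kernel  <- extra closing brace with nothing left to close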
Thank you for the report. We will work on this.
@ShAlireza, can you also provide the exact input sizes that you use to run the scan kernel? Thanks.
Sure.
IntArray input = new IntArray(256 * 128 * 4);
IntArray sum = new IntArray(256);
And here is my GridScheduler:
int totalLocalScanItems = 256 * 128 * 4 / 2;
int localItems = totalLocalScanItems / 256;
WorkerGrid scanWorker = new WorkerGrid1D(totalLocalScanItems);
scanWorker.setLocalWork(localItems, 1, 1);
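For completeness, here is a minimal sketch of the full host-side wiring (the class names ScanRepro and MyKernels and the graph/task ids "s0"/"t0" are placeholders; the call sequence follows the standard TornadoVM KernelContext setup):

import uk.ac.manchester.tornado.api.GridScheduler;
import uk.ac.manchester.tornado.api.KernelContext;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.WorkerGrid;
import uk.ac.manchester.tornado.api.WorkerGrid1D;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
import uk.ac.manchester.tornado.api.types.arrays.IntArray;

public class ScanRepro {
    public static void main(String[] args) {
        IntArray input = new IntArray(256 * 128 * 4);
        IntArray sum = new IntArray(256);
        input.init(1); // any values work; the failure happens at kernel compile time

        int totalLocalScanItems = 256 * 128 * 4 / 2;
        int localItems = totalLocalScanItems / 256;
        WorkerGrid scanWorker = new WorkerGrid1D(totalLocalScanItems);
        scanWorker.setLocalWork(localItems, 1, 1);

        KernelContext context = new KernelContext();
        TaskGraph taskGraph = new TaskGraph("s0")
                .transferToDevice(DataTransferMode.FIRST_EXECUTION, input)
                .task("t0", MyKernels::scan, context, input, sum)
                .transferToHost(DataTransferMode.EVERY_EXECUTION, input, sum);

        GridScheduler gridScheduler = new GridScheduler("s0.t0", scanWorker);
        new TornadoExecutionPlan(taskGraph.snapshot())
                .withGridScheduler(gridScheduler)
                .execute();
    }
}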
Thank you @ShAlireza, I managed to reproduce it locally. We will work on the issue and we will get back to you.