[BUG] Vector Reduce Example using Shared Memory is Wrongly Executed
The following code doesn't output correct result, while the output from CUDA is correct.
#include <hip/hip_runtime.h>
#include <stdio.h>
__global__ void __KernelReduce(float *d_z, float *d_x, float *d_y, int N) {
int x_id = blockIdx.y;
int z_id = blockIdx.x;
int y_id = threadIdx.x;
HIP_DYNAMIC_SHARED( float, sdata)
sdata[y_id] = d_x[y_id] * d_y[y_id];
__syncthreads();
{ int s = 1; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 2; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 4; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 8; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 16; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 32; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 64; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 128; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 256; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
{ int s = 512; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); }
if (y_id == 0) {
d_z[0] = sdata[0];
}
}
#define N 1024
float glob[N];
int main() {
hipSetDevice(0);
for (int i = 0; i < N; ++i) glob[i] = 1.0f;
float *d_z, *d_x, *d_y;
hipMalloc((void**)&d_z, sizeof(float));
hipMalloc((void**)&d_x, sizeof(float) * N);
hipMalloc((void**)&d_y, sizeof(float) * N);
hipMemcpy(d_x, glob, sizeof(glob), hipMemcpyHostToDevice);
hipMemcpy(d_y, glob, sizeof(glob), hipMemcpyHostToDevice);
hipLaunchKernelGGL(__KernelReduce, dim3(1), dim3(N), sizeof(float) * N, NULL, d_z, d_x, d_y, N);
float h_z;
hipMemcpy(&h_z, d_z, sizeof(float), hipMemcpyDeviceToHost);
hipDeviceSynchronize();
printf("sdot = %g\n", h_z);
return 0;
}
The code is very simple, which computes Z(1, 1) = X(1, 1024) * Y(1024, 1), where all elements in X and Y are filled with 1.0f, so the output answer should be 1024.0f, while ROCm outputs 512.0f.
The error happens on this line, which doesn't take effect, so the finally answer is half of the correct sum:
..
{ int s = 256; if (y_id % (2 * s) == 0) sdata[y_id] += sdata[y_id + s]; __syncthreads(); } // This line makes wrong answer.
..
@sunway513 Any attention for this compiler bug? Hope to fix it soon~
cc @david-salinas @scchan , I can repro the result sdot = 512 with ROCm2.3.
@sunway513 Yeah, this compiler bug has a seriously bad impact on ROCm stability.
@ghostplant : thanks. Just to keep you updated, we've opened an internal issue/bug report for this problem. We're investigating and will update this Issue.