Incorrect execution result from CUDA to OpenMP compilation?
I have a simple CUDA program here that performs a sum reduction:
#include <stdio.h>
#include <cuda_runtime.h>

// CUDA kernel for performing reduction (sum) of an array
__global__ void reduceSum(int *g_input, int *g_output, int n) {
    extern __shared__ int s_data[];

    // Each thread loads one element from global to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) s_data[tid] = g_input[i];
    else s_data[tid] = 0;
    __syncthreads();

    // Do reduction in shared memory
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] += s_data[tid + s];
        }
        __syncthreads();
    }

    // Write result for this block to global memory
    if (tid == 0) g_output[blockIdx.x] = s_data[0];
}

int main() {
    int n = 1024;
    int size = n * sizeof(int);
    int *h_input, *h_output;
    int *d_input, *d_output;

    // Allocate host memory
    h_input = (int*)malloc(size);
    h_output = (int*)malloc(sizeof(int));

    // Initialize input array
    for (int i = 0; i < n; i++) {
        h_input[i] = 1; // Example: fill with 1 for simplicity
    }

    // Allocate device memory
    cudaMalloc((void **)&d_input, size);
    cudaMalloc((void **)&d_output, sizeof(int));

    // Copy from host to device
    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    reduceSum<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_output, n);

    // Copy result back to host
    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Sum is %d\n", *h_output);

    // Cleanup
    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);
    return 0;
}
Then I compile it using this command (which I believe is the correct one to generate OpenMP with all the optimizations enabled):
cgeist --cuda-gpu-arch=sm_75 --cuda-lower --cpuify="distribute.mincut" -scal-rep=0 -raise-scf-to-affine --inner-serialize=1 --function=* -O2 -I/home/ericxu233/CUDAtoX/Polygeist/build/projects/openmp/src/runtime -L/home/ericxu233/CUDAtoX/Polygeist/build/projects/openmp/libomptarget/ -resource-dir=/home/ericxu233/CUDAtoX/Polygeist/build/lib/clang/18 simple.cu -o simple
When I run the executable ./simple, it prints:
Sum is 0
but according to the source code the sum should be 256, and a native CUDA GPU run confirms this. Moreover, when I remove the optimization options "-scal-rep=0 -raise-scf-to-affine --inner-serialize=1 -O2", running the executable results in a segmentation fault.
I was wondering what I am doing wrong here. Is the CUDA to OpenMP flow no longer properly supported?
Managed to solve this by explicitly declaring the shared memory (a statically sized __shared__ array instead of the dynamic extern __shared__ declaration).
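For reference, the change looks roughly like this (a sketch assuming the block size is fixed at 256 to match threadsPerBlock; BLOCK_SIZE is just a name I use here). The shared-memory array gets a compile-time size, and the dynamic shared-memory size argument is dropped from the kernel launch:

#define BLOCK_SIZE 256

__global__ void reduceSum(int *g_input, int *g_output, int n) {
    // Statically sized shared memory instead of "extern __shared__ int s_data[];"
    __shared__ int s_data[BLOCK_SIZE];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) s_data[tid] = g_input[i];
    else s_data[tid] = 0;
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] += s_data[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) g_output[blockIdx.x] = s_data[0];
}

// Launch without the third (dynamic shared memory) parameter:
reduceSum<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);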