compute-runtime icon indicating copy to clipboard operation
compute-runtime copied to clipboard

Memory operations (clEnqueueSVMMemcpy/clEnqueueReadBuffer) hang after event callbacks complete

Open pvelesko opened this issue 1 month ago • 3 comments

Works on PVC (driver version 25.18.33578). Fails on A770 (driver version 25.27.34303.5). System: Linux cupcake 6.8.0-65-generic #68~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue Jul 15 18:06:34 UTC 2 x86_64 x86_64 x86_64 GNU/Linux

#include <CL/cl.h>
#include <iostream>
#include <atomic>
#include <cassert>
#include <vector>

// Number of pfn_notify invocations that got past the null-user-data check.
// Atomic because it is written by the callback and read by main().
std::atomic<int> callbackCount(0);
// Ticket counter: each pfn_notify invocation takes the next value (1, 2, ...)
// and reports it as its execution order.
std::atomic<int> executionOrder(0);

// Per-registration context passed to pfn_notify as the clSetEventCallback
// user_data pointer. Instances are heap-allocated by main() and deleted by
// pfn_notify once it has run.
struct CallbackData {
  // User event that pfn_notify sets to CL_COMPLETE; skipped when null.
  cl_event CallbackFinishEvent;
  // Optional output slot that receives the order in which the callback ran;
  // skipped when null.
  int* Order;
  // Order this callback is expected to run in. Stored by main(); pfn_notify
  // does not read it.
  int ExpectedOrder;
};

/// Event callback registered with clSetEventCallback for CL_COMPLETE.
///
/// Records the order in which it ran, logs progress, and then signals the
/// user event stored in the CallbackData so that commands waiting on that
/// user event can proceed. Takes ownership of @p UserData and deletes it.
///
/// @param Event             Event the callback was registered on (unused).
/// @param CommandExecStatus Status of the triggering command: CL_COMPLETE, or
///                          a negative error code if the command was
///                          abnormally terminated.
/// @param UserData          Heap-allocated CallbackData; may be null, in
///                          which case an error is logged and nothing else
///                          happens.
void CL_CALLBACK pfn_notify(cl_event Event, cl_int CommandExecStatus, void *UserData) {
  (void)Event;
  CallbackData *Cb = static_cast<CallbackData *>(UserData);
  if (Cb == nullptr) {
    std::cerr << "ERROR: Callback data is null" << std::endl;
    return;
  }

  // Use the values returned by fetch_add rather than re-reading the atomics,
  // so the numbers logged by one invocation stay consistent even if another
  // callback runs concurrently.
  const int order = executionOrder.fetch_add(1) + 1;
  const int count = callbackCount.fetch_add(1) + 1;

  std::cout << "testHostFunc called" << std::endl;
  std::cout << "order: " << order << std::endl;
  std::cout << "hostFuncCallCount: " << count << std::endl;
  std::cout << "executionOrder: " << order << std::endl;

  if (Cb->Order) {
    *Cb->Order = order;
  }

  if (Cb->CallbackFinishEvent != nullptr) {
    // A CL_COMPLETE callback is also invoked when the command terminated
    // abnormally; in that case the status is a negative error code. Forward
    // that error to the user event instead of reporting success, so dependent
    // commands are not released as if the trigger had completed normally.
    const cl_int finishStatus = (CommandExecStatus < 0) ? CommandExecStatus : CL_COMPLETE;
    if (CommandExecStatus < 0) {
      std::cerr << "ERROR: triggering command terminated with " << CommandExecStatus << std::endl;
    }
    cl_int status = clSetUserEventStatus(Cb->CallbackFinishEvent, finishStatus);
    if (status != CL_SUCCESS) {
      std::cerr << "ERROR: clSetUserEventStatus failed with " << status << std::endl;
    }
  }

  delete Cb;
}

int main() {
  cl_int err;
  cl_uint numPlatforms = 0;
  cl_platform_id platform = nullptr;
  cl_device_id device = nullptr;
  cl_context context = nullptr;
  cl_command_queue queue = nullptr;
  
  err = clGetPlatformIDs(0, nullptr, &numPlatforms);
  if (err != CL_SUCCESS || numPlatforms == 0) {
    std::cerr << "Failed to get platforms" << std::endl;
    return 1;
  }
  
  std::vector<cl_platform_id> platforms(numPlatforms);
  err = clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to get platform IDs" << std::endl;
    return 1;
  }
  
  platform = platforms[0];
  
  cl_uint numDevices = 0;
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices);
  if (err != CL_SUCCESS || numDevices == 0) {
    std::cerr << "Failed to get GPU devices" << std::endl;
    return 1;
  }
  
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to get device ID" << std::endl;
    return 1;
  }
  
  context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to create context" << std::endl;
    return 1;
  }
  
  queue = clCreateCommandQueueWithProperties(context, device, 0, &err);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to create command queue" << std::endl;
    clReleaseContext(context);
    return 1;
  }
  
  callbackCount = 0;
  executionOrder = 0;
  
  int order1 = 0, order2 = 0;
  
  cl_event barrier1 = nullptr;
  err = clEnqueueBarrierWithWaitList(queue, 0, nullptr, &barrier1);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to enqueue barrier 1" << std::endl;
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  cl_event callbackEvent1 = clCreateUserEvent(context, &err);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to create user event 1" << std::endl;
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  CallbackData *cbData1 = new CallbackData{callbackEvent1, &order1, 1};
  
  err = clSetEventCallback(barrier1, CL_COMPLETE, pfn_notify, cbData1);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to set event callback 1" << std::endl;
    delete cbData1;
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  cl_event waitEvents1[] = {callbackEvent1};
  cl_event barrier2 = nullptr;
  err = clEnqueueBarrierWithWaitList(queue, 1, waitEvents1, &barrier2);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to enqueue barrier 2" << std::endl;
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  cl_event barrier3 = nullptr;
  err = clEnqueueBarrierWithWaitList(queue, 0, nullptr, &barrier3);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to enqueue barrier 3" << std::endl;
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  cl_event callbackEvent2 = clCreateUserEvent(context, &err);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to create user event 2" << std::endl;
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  CallbackData *cbData2 = new CallbackData{callbackEvent2, &order2, 2};
  
  err = clSetEventCallback(barrier3, CL_COMPLETE, pfn_notify, cbData2);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to set event callback 2" << std::endl;
    delete cbData2;
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  cl_event waitEvents2[] = {callbackEvent2};
  cl_event barrier4 = nullptr;
  err = clEnqueueBarrierWithWaitList(queue, 1, waitEvents2, &barrier4);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to enqueue barrier 4" << std::endl;
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  std::cout << "About to call clFinish immediately - this may hang..." << std::endl;
  std::cout << "Callback count before clFinish: " << callbackCount.load() << std::endl;
  std::cout << "NOTE: If callbacks haven't executed yet, clFinish will wait for barrier4," << std::endl;
  std::cout << "      which waits for callbackEvent2, which is only set by the callback." << std::endl;
  std::cout << "      If the callback doesn't execute, this will hang!" << std::endl;
  
  err = clFinish(queue);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to finish queue" << std::endl;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  std::cout << "clFinish completed (callbacks should have executed)" << std::endl;
  std::cout << "Final callback count: " << callbackCount.load() << std::endl;
  
  if (callbackCount.load() != 2) {
    std::cerr << "FAIL: Expected 2 callbacks, got " << callbackCount.load() << std::endl;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  std::cout << "Callbacks completed successfully. Now attempting memory copy..." << std::endl;
  std::cout << "This is where the hang occurs in chipStar!" << std::endl;
  
  cl_int eventStatus1, eventStatus2, eventStatus3, eventStatus4;
  clGetEventInfo(barrier1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus1, nullptr);
  clGetEventInfo(barrier2, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus2, nullptr);
  clGetEventInfo(barrier3, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus3, nullptr);
  clGetEventInfo(barrier4, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus4, nullptr);
  
  std::cout << "Event statuses:" << std::endl;
  std::cout << "  barrier1 (callback trigger): " << eventStatus1 << " (CL_COMPLETE=" << CL_COMPLETE << ")" << std::endl;
  std::cout << "  barrier2 (waits for callbackEvent1): " << eventStatus2 << " (CL_COMPLETE=" << CL_COMPLETE << ")" << std::endl;
  std::cout << "  barrier3 (callback trigger): " << eventStatus3 << " (CL_COMPLETE=" << CL_COMPLETE << ")" << std::endl;
  std::cout << "  barrier4 (waits for callbackEvent2): " << eventStatus4 << " (CL_COMPLETE=" << CL_COMPLETE << ")" << std::endl;
  
  cl_int userEvent1Status, userEvent2Status;
  clGetEventInfo(callbackEvent1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &userEvent1Status, nullptr);
  clGetEventInfo(callbackEvent2, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &userEvent2Status, nullptr);
  std::cout << "  callbackEvent1 (user event): " << userEvent1Status << " (CL_COMPLETE=" << CL_COMPLETE << ")" << std::endl;
  std::cout << "  callbackEvent2 (user event): " << userEvent2Status << " (CL_COMPLETE=" << CL_COMPLETE << ")" << std::endl;
  
  void* dev_ptr = clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(int), 0);
  if (dev_ptr == nullptr) {
    std::cerr << "Failed to allocate SVM memory" << std::endl;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  int* host_ptr = new int;
  *host_ptr = 0;
  int* dev_data = static_cast<int*>(dev_ptr);
  *dev_data = 42;
  
  std::cout << "About to enqueue memory copy from device to host..." << std::endl;
  std::cout << "This may hang if barriers (barrier2/barrier4) haven't properly completed!" << std::endl;
  
  cl_event memcpyEvent = nullptr;
  cl_event waitForBarriers[] = {barrier2, barrier4};
  err = clEnqueueSVMMemcpy(queue, CL_FALSE, host_ptr, dev_data, sizeof(int), 
                           2, waitForBarriers, &memcpyEvent);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to enqueue memory copy" << std::endl;
    clSVMFree(context, dev_ptr);
    delete host_ptr;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  std::cout << "Memory copy enqueued. About to call clFinish - THIS MAY HANG!" << std::endl;
  
  err = clFinish(queue);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to finish queue after memory copy" << std::endl;
    clReleaseEvent(memcpyEvent);
    clSVMFree(context, dev_ptr);
    delete host_ptr;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  std::cout << "Memory copy completed successfully!" << std::endl;
  
  if (*host_ptr != 42) {
    std::cerr << "FAIL: Memory copy failed. Expected 42, got " << *host_ptr << std::endl;
    clReleaseEvent(memcpyEvent);
    clSVMFree(context, dev_ptr);
    delete host_ptr;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  clReleaseEvent(memcpyEvent);
  clSVMFree(context, dev_ptr);
  delete host_ptr;
  
  if (callbackCount.load() != 2) {
    std::cerr << "FAIL: Expected 2 callbacks, got " << callbackCount.load() << std::endl;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  if (order1 != 1 || order2 != 2) {
    std::cerr << "FAIL: Execution order incorrect. order1=" << order1 << ", order2=" << order2 << std::endl;
    clReleaseEvent(barrier4);
    clReleaseEvent(callbackEvent2);
    clReleaseEvent(barrier3);
    clReleaseEvent(barrier2);
    clReleaseEvent(callbackEvent1);
    clReleaseEvent(barrier1);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 1;
  }
  
  clReleaseEvent(barrier4);
  clReleaseEvent(callbackEvent2);
  clReleaseEvent(barrier3);
  clReleaseEvent(barrier2);
  clReleaseEvent(callbackEvent1);
  clReleaseEvent(barrier1);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  
  std::cout << "PASS" << std::endl;
  return 0;
}

pvelesko avatar Nov 19 '25 13:11 pvelesko

Hi @pvelesko,

Thank you for your contribution. We have attempted to reproduce your reported issue, but on our side, the test is working as expected - no hangs or other problems were observed on the A770 GPU. Could you please try testing it again with the latest release? You can find it here: https://github.com/intel/compute-runtime/releases/tag/25.44.36015.5.

kgibala avatar Nov 24 '25 08:11 kgibala

@kgibala

Updated the driver as per your instructions. It now gives incorrect result. The same code works fine on PoCL:

╭─pvelesko@cupcake ~/reproducers/neo-callback-hang 
╰─$ ./main 
About to call clFinish immediately - this may hang...testHostFunc called
Callback count before clFinish: 1
NOTE: If callbacks haven't executed yet, clFinish will wait for barrier4,
      which waits for callbackEvent2, which is only set by the callback.
      If the callback doesn't execute, this will hang!

order: 1
hostFuncCallCount: 1
executionOrder: 1
testHostFunc called
order: 2
hostFuncCallCount: 2
executionOrder: 2
clFinish completed (callbacks should have executed)
Final callback count: 2
Callbacks completed successfully. Now attempting memory copy...
This is where the hang occurs in chipStar!
Event statuses:
  barrier1 (callback trigger): 0 (CL_COMPLETE=0, err=0)
  barrier2 (waits for callbackEvent1): 0 (CL_COMPLETE=0, err=0)
  barrier3 (callback trigger): 0 (CL_COMPLETE=0, err=0)
  barrier4 (waits for callbackEvent2): 0 (CL_COMPLETE=0, err=0)
  callbackEvent1 (user event): 0 (CL_COMPLETE=0, err=0)
  callbackEvent2 (user event): 0 (CL_COMPLETE=0, err=0)
SVM memory value before copy: 42
SVM memory address: 0x5c2b0ed36f80, host_ptr address: 0x5c2b0ed95460
Explicitly waiting on barrier2 and barrier4 events...
Events barrier2 and barrier4 are confirmed complete
SVM memory value after waiting: 42
About to enqueue memory copy from device to host...
This may hang if barriers (barrier2/barrier4) haven't properly completed!
NOTE: Using clEnqueueSVMMemcpy - if this fails, it may indicate a chipStar SVM bug
Memory copy enqueued. About to call clFinish - THIS MAY HANG!
Memory copy completed successfully!
SVM memory value after copy: 42
Host memory value after copy: 42
PASS
╭─pvelesko@cupcake ~/reproducers/neo-callback-hang 
╰─$ module unload pocl/7.1 
╭─pvelesko@cupcake ~/reproducers/neo-callback-hang 
╰─$ ./main 
testHostFunc called
order: 1
hostFuncCallCount: 1
executionOrder: 1
About to call clFinish immediately - this may hang...
Callback count before clFinish: 1
NOTE: If callbacks haven't executed yet, clFinish will wait for barrier4,
      which waits for callbackEvent2, which is only set by the callback.
      If the callback doesn't execute, this will hang!
testHostFunc called
order: 2
hostFuncCallCount: 2
executionOrder: 2
clFinish completed (callbacks should have executed)
Final callback count: 2
Callbacks completed successfully. Now attempting memory copy...
This is where the hang occurs in chipStar!
Event statuses:
  barrier1 (callback trigger): 0 (CL_COMPLETE=0, err=0)
  barrier2 (waits for callbackEvent1): 0 (CL_COMPLETE=0, err=0)
  barrier3 (callback trigger): 0 (CL_COMPLETE=0, err=0)
  barrier4 (waits for callbackEvent2): 0 (CL_COMPLETE=0, err=0)
  callbackEvent1 (user event): 0 (CL_COMPLETE=0, err=0)
  callbackEvent2 (user event): 0 (CL_COMPLETE=0, err=0)
SVM memory value before copy: 42
SVM memory address: 0x7ba13d000000, host_ptr address: 0x58b04e9c7e90
Explicitly waiting on barrier2 and barrier4 events...
Events barrier2 and barrier4 are confirmed complete
SVM memory value after waiting: 42
About to enqueue memory copy from device to host...
This may hang if barriers (barrier2/barrier4) haven't properly completed!
NOTE: Using clEnqueueSVMMemcpy - if this fails, it may indicate a chipStar SVM bug
Memory copy enqueued. About to call clFinish - THIS MAY HANG!
Memory copy completed successfully!
SVM memory value after copy: 42
Host memory value after copy: 0
FAIL: Memory copy failed. Expected 42, got 0

pvelesko avatar Nov 25 '25 13:11 pvelesko

Hi @pvelesko, thank you for your response.

Could you please highlight the differences between the working and non-working runs? For example, is a different driver being used?

Please attach the output logs from both runs for comparison when each driver is loaded from:

sudo dpkg --list | grep -iE "igc|gmm|opencl|level-zero|fc|level_zero|ocloc|libze"

kgibala avatar Dec 03 '25 11:12 kgibala

Not sure what you mean by working and non-working runs. None of them work.

ii  clinfo                                     3.0.23.01.25-1build1                    amd64        Query OpenCL system information
ii  intel-igc-core-2                           2.22.2                                  amd64        Intel(R) Graphics Compiler for OpenCL(TM)
ii  intel-igc-opencl-2                         2.22.2                                  amd64        Intel(R) Graphics Compiler for OpenCL(TM)
ii  intel-ocloc                                25.44.36015.5-0                         amd64        Tool for managing Intel Compute GPU device binary format
ii  intel-opencl-icd                           25.44.36015.5-0                         amd64        Intel graphics compute runtime for OpenCL
ii  libcbor0.10:amd64                          0.10.2-1.2ubuntu2                       amd64        library for parsing and generating CBOR (RFC 7049)
ii  libclc-dev                                 0.2.0+git20190827-8                     all          OpenCL C language implementation - development files
ii  libdebconfclient0:amd64                    0.271ubuntu3                            amd64        Debian Configuration Management System (C-implementation library)
ii  libfile-fcntllock-perl                     0.22-3build7                            amd64        Perl module for file locking with fcntl(2)
ii  libigdgmm12:amd64                          22.8.2                                  amd64        Intel Graphics Memory Management Library -- shared library
ii  libsigc++-2.0-0v5:amd64                    2.12.1-2                                amd64        type-safe Signal Framework for C++ - runtime
ii  libze-intel-gpu1                           25.44.36015.5-0                         amd64        Intel(R) Graphics Compute Runtime for oneAPI Level Zero.
ii  ocl-icd-libopencl1:amd64                   2.3.2-1build1                           amd64        Generic OpenCL ICD Loader
ii  python3-idna                               3.6-2ubuntu0.1                          all          Python IDNA2008 (RFC 5891) handling (Python 3)

pvelesko avatar Dec 11 '25 19:12 pvelesko

Hi @pvelesko, thank you for your response.

I meant the two runs shown in your previous comment: the working one, where PASS is visible, and the non-working one, where FAIL is visible.

Could you please collect the API call logs using ClIntercept? We are currently unable to reproduce the issue on our side; the test is working as expected.

Additionally, please provide the complete setup information as reported by XPUManager, following the example shown below:

  • https://github.com/intel/xpumanager/blob/master/doc/smi_user_guide.md

Show the detailed info of one device. The device info includes the model, frequency, driver/firmware info, PCI info, memory info and tile/execution unit info.

xpu-smi discovery -d 0000:4d:00.0
+-----------+--------------------------------------------------------------------------------------+
| Device ID | Device Information                                                                   |
+-----------+--------------------------------------------------------------------------------------+
| 0         | Device Type: GPU                                                                     |
|           | Device Name: Intel(R) Graphics [0x020a]                                              |
...

kgibala avatar Dec 12 '25 14:12 kgibala

I used THAPI for getting the OpenCL trace, let me know if that is ok:

╭─pvelesko@cupcake ~/reproducers/opencl-hang-after-callback ‹main●› 
╰─$ iprof -t ./main 
About to call clFinish immediately - this may hang...
Callback count before clFinish: 0
NOTE: If callbacks haven't executed yet, clFinish will wait for barrier4,
      which waits for callbackEvent2, which is only set by the callback.
      If the callback doesn't execute, this will hang!
testHostFunc called
order: 1
hostFuncCallCount: 1
executionOrder: 1
testHostFunc called
order: 2
hostFuncCallCount: 2
executionOrder: 2
clFinish completed (callbacks should have executed)
Final callback count: 2
Callbacks completed successfully. Now attempting memory copy...
This is where the hang occurs in chipStar!
Event statuses:
  barrier1 (callback trigger): 0 (CL_COMPLETE=0)
  barrier2 (waits for callbackEvent1): 0 (CL_COMPLETE=0)
  barrier3 (callback trigger): 0 (CL_COMPLETE=0)
  barrier4 (waits for callbackEvent2): 0 (CL_COMPLETE=0)
  callbackEvent1 (user event): 0 (CL_COMPLETE=0)
  callbackEvent2 (user event): 0 (CL_COMPLETE=0)
About to enqueue memory copy from device to host...
This may hang if barriers (barrier2/barrier4) haven't properly completed!
Memory copy enqueued. About to call clFinish - THIS MAY HANG!
FAIL: Memory copy failed. Expected 42, got Memory copy completed successfully!
0
THAPI: Trace location: /space/pvelesko/thapi-traces/thapi--2025-12-15T05:52:51+02:00
05:52:50.708939687 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetPlatformIDs_entry: num_entries: 0, platforms: 0x0, num_platforms: 0x7fff4b18b388
05:52:50.708942872 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetPlatformIDs_exit: errcode_ret_val: CL_SUCCESS, num_platforms_val: 3, platforms_vals: []
05:52:50.708944200 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetPlatformIDs_entry: num_entries: 3, platforms: 0x64dd44c726c0, num_platforms: 0x0
05:52:50.708944622 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetPlatformIDs_exit: errcode_ret_val: CL_SUCCESS, num_platforms_val: 0, platforms_vals: [0x64dd44b05a40, 0x64dd44b1e780, 0x64dd44b2d640]
05:52:50.708945660 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetDeviceIDs_entry: platform: 0x64dd44b05a40, device_type: [CL_DEVICE_TYPE_ALL], num_entries: 0, devices: 0x0, num_devices: 0x7fff4b18b38c
05:52:50.708947984 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetDeviceIDs_exit: errcode_ret_val: CL_SUCCESS, num_devices_val: 1, devices_vals: []
05:52:50.708948295 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetDeviceIDs_entry: platform: 0x64dd44b05a40, device_type: [CL_DEVICE_TYPE_ALL], num_entries: 1, devices: 0x7fff4b18b3b0, num_devices: 0x0
05:52:50.708952300 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl_devices:device_name: device: 0x64dd44b1cde0, name: "Intel(R) Arc(TM) A770 Graphics"
05:52:50.708953174 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetDeviceIDs_exit: errcode_ret_val: CL_SUCCESS, num_devices_val: 1, devices_vals: [0x64dd44b1cde0]
05:52:50.708954376 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateContext_entry: properties: 0, num_devices: 1, devices: 0x7fff4b18b3b0, pfn_notify: 0x0, user_data: 0x0, errcode_ret: 0x7fff4b18b384, properties_vals: [], devices_vals: [0x64dd44b1cde0]
05:52:50.709006691 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateContext_exit: context: 0x64dd44c79270, errcode_ret_val: CL_SUCCESS
05:52:50.709007610 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateCommandQueueWithProperties_entry: context: 0x64dd44c79270, device: 0x64dd44b1cde0, properties: 0x0, errcode_ret: 0x7fff4b18b384, properties_vals: []
05:52:50.709017295 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateCommandQueueWithProperties_exit: command_queue: 0x64dd43595c10, errcode_ret_val: CL_SUCCESS
05:52:50.715585079 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_entry: command_queue: 0x64dd43595c10, num_events_in_wait_list: 0, event_wait_list: 0x0, event: 0x7fff4b18b3b8, event_wait_list_vals: []
05:52:50.716709319 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl_profiling:event_profiling: status: 0, event: 0x64dd43598e00
05:52:50.716710252 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_exit: errcode_ret_val: CL_SUCCESS, event_val: 0x64dd43598e00
05:52:50.716713871 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateUserEvent_entry: context: 0x64dd44c79270, errcode_ret: 0x7fff4b18b384
05:52:50.716715893 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateUserEvent_exit: event: 0x64dd44c7d180, errcode_ret_val: CL_SUCCESS
05:52:50.716717297 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSetEventCallback_entry: event: 0x64dd43598e00, command_exec_callback_type: 0, pfn_notify: 0x64dd324c9280, user_data: 0x64dd44c7d3b0
05:52:50.716721574 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSetEventCallback_exit: errcode_ret_val: CL_SUCCESS
05:52:50.716722033 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_entry: command_queue: 0x64dd43595c10, num_events_in_wait_list: 1, event_wait_list: 0x7fff4b18b400, event: 0x7fff4b18b3c0, event_wait_list_vals: [0x64dd44c7d180]
05:52:50.716770203 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl_profiling:event_profiling: status: 0, event: 0x64dd44c7d400
05:52:50.716770450 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_exit: errcode_ret_val: CL_SUCCESS, event_val: 0x64dd44c7d400
05:52:50.716770699 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_entry: command_queue: 0x64dd43595c10, num_events_in_wait_list: 0, event_wait_list: 0x0, event: 0x7fff4b18b3c8, event_wait_list_vals: []
05:52:50.716802283 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl_profiling:event_profiling: status: 0, event: 0x64dd44c82ed0
05:52:50.716802498 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_exit: errcode_ret_val: CL_SUCCESS, event_val: 0x64dd44c82ed0
05:52:50.716802716 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateUserEvent_entry: context: 0x64dd44c79270, errcode_ret: 0x7fff4b18b384
05:52:50.716803156 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clCreateUserEvent_exit: event: 0x64dd44c88910, errcode_ret_val: CL_SUCCESS
05:52:50.716803475 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSetEventCallback_entry: event: 0x64dd44c82ed0, command_exec_callback_type: 0, pfn_notify: 0x64dd324c9280, user_data: 0x64dd44c888c0
05:52:50.716803786 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSetEventCallback_exit: errcode_ret_val: CL_SUCCESS
05:52:50.716803961 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_entry: command_queue: 0x64dd43595c10, num_events_in_wait_list: 1, event_wait_list: 0x7fff4b18b408, event: 0x7fff4b18b3d0, event_wait_list_vals: [0x64dd44c88910]
05:52:50.716835165 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl_profiling:event_profiling: status: 0, event: 0x64dd44c88b50
05:52:50.716835318 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueBarrierWithWaitList_exit: errcode_ret_val: CL_SUCCESS, event_val: 0x64dd44c88b50
05:52:50.716861759 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clFinish_entry: command_queue: 0x64dd43595c10
05:52:50.717025774 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetEventCallback_callback_entry: event: 0x64dd43598e00, type: 0, user_data: 0x64dd44c7d3b0
05:52:50.717038098 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetUserEventStatus_entry: event: 0x64dd44c7d180, execution_status: 0
05:52:50.717343530 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetUserEventStatus_exit: errcode_ret_val: CL_SUCCESS
05:52:50.717344198 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetEventCallback_callback_exit: 
05:52:50.717351106 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl_profiling:event_profiling_results: event: 0x64dd43598e00, event_command_exec_status: 0, queued_status: 0, queued: 1100762739416, submit_status: 0, submit: 1100763022812, start_status: 0, start: 1100763889062, end_status: 0, end: 1100763889114
05:52:50.717561601 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl_profiling:event_profiling_results: event: 0x64dd44c7d400, event_command_exec_status: 0, queued_status: 0, queued: 1100763622751, submit_status: 0, submit: 1100763939531, start_status: 0, start: 1100764430104, end_status: 0, end: 1100764430156
05:52:50.717567465 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetEventCallback_callback_entry: event: 0x64dd44c82ed0, type: 0, user_data: 0x64dd44c888c0
05:52:50.717571817 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetUserEventStatus_entry: event: 0x64dd44c88910, execution_status: 0
05:52:50.717635311 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetUserEventStatus_exit: errcode_ret_val: CL_SUCCESS
05:52:50.717635557 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl:clSetEventCallback_callback_exit: 
05:52:50.717640281 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl_profiling:event_profiling_results: event: 0x64dd44c82ed0, event_command_exec_status: 0, queued_status: 0, queued: 1100763777935, submit_status: 0, submit: 1100764184010, start_status: 0, start: 1100764442604, end_status: 0, end: 1100764442656
05:52:50.717970166 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clFinish_exit: errcode_ret_val: CL_SUCCESS
05:52:50.717971753 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl_profiling:event_profiling_results: event: 0x64dd44c88b50, event_command_exec_status: 0, queued_status: 0, queued: 1100763699369, submit_status: 0, submit: 1100764467239, start_status: 0, start: 1100764840260, end_status: 0, end: 1100764840260
05:52:50.717974767 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_entry: event: 0x64dd43598e00, param_name: CL_EVENT_COMMAND_EXECUTION_STATUS, param_value_size: 4, param_value: 0x7fff4b18b398, param_value_size_ret: 0x0
05:52:50.717975898 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_exit: errcode_ret_val: CL_SUCCESS, param_value_size_ret_val: 4, param_value_vals: ""
05:52:50.717976366 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_entry: event: 0x64dd44c7d400, param_name: CL_EVENT_COMMAND_EXECUTION_STATUS, param_value_size: 4, param_value: 0x7fff4b18b39c, param_value_size_ret: 0x0
05:52:50.717976621 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_exit: errcode_ret_val: CL_SUCCESS, param_value_size_ret_val: 4, param_value_vals: ""
05:52:50.717976830 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_entry: event: 0x64dd44c82ed0, param_name: CL_EVENT_COMMAND_EXECUTION_STATUS, param_value_size: 4, param_value: 0x7fff4b18b3a0, param_value_size_ret: 0x0
05:52:50.717977083 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_exit: errcode_ret_val: CL_SUCCESS, param_value_size_ret_val: 4, param_value_vals: ""
05:52:50.717977261 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_entry: event: 0x64dd44c88b50, param_name: CL_EVENT_COMMAND_EXECUTION_STATUS, param_value_size: 4, param_value: 0x7fff4b18b3a4, param_value_size_ret: 0x0
05:52:50.717977529 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_exit: errcode_ret_val: CL_SUCCESS, param_value_size_ret_val: 4, param_value_vals: ""
05:52:50.717980776 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_entry: event: 0x64dd44c7d180, param_name: CL_EVENT_COMMAND_EXECUTION_STATUS, param_value_size: 4, param_value: 0x7fff4b18b3a8, param_value_size_ret: 0x0
05:52:50.717981020 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_exit: errcode_ret_val: CL_SUCCESS, param_value_size_ret_val: 4, param_value_vals: ""
05:52:50.717981201 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_entry: event: 0x64dd44c88910, param_name: CL_EVENT_COMMAND_EXECUTION_STATUS, param_value_size: 4, param_value: 0x7fff4b18b3ac, param_value_size_ret: 0x0
05:52:50.717981383 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clGetEventInfo_exit: errcode_ret_val: CL_SUCCESS, param_value_size_ret_val: 4, param_value_vals: ""
05:52:50.717983631 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSVMAlloc_entry: context: 0x64dd44c79270, flags: [CL_MEM_READ_WRITE], size: 4, alignment: 0
05:52:50.718012124 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSVMAlloc_exit: _retval: 0x7f3754400000
05:52:50.718468538 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueSVMMemcpy_entry: command_queue: 0x64dd43595c10, blocking_copy: CL_FALSE, dst_ptr: 0x64dd44c8b3b0, src_ptr: 0x7f3754400000, size: 4, num_events_in_wait_list: 2, event_wait_list: 0x7fff4b18b410, event: 0x7fff4b18b3d8, event_wait_list_vals: [0x64dd44c7d400, 0x64dd44c88b50]
05:52:50.719392128 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl_profiling:event_profiling: status: 0, event: 0x64dd44c91f70
05:52:50.719392938 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clEnqueueSVMMemcpy_exit: errcode_ret_val: CL_SUCCESS, event_val: 0x64dd44c91f70
05:52:50.719396424 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clFinish_entry: command_queue: 0x64dd43595c10
05:52:50.719546267 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clFinish_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719552039 - cupcake - vpid: 110763, vtid: 110780 - lttng_ust_opencl_profiling:event_profiling_results: event: 0x64dd44c91f70, event_command_exec_status: 0, queued_status: 0, queued: 1100765661521, submit_status: 0, submit: 1100765970208, start_status: 0, start: 1100766417135, end_status: 0, end: 1100766419947
05:52:50.719557706 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_entry: event: 0x64dd44c91f70
05:52:50.719560039 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719560589 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSVMFree_entry: context: 0x64dd44c79270, svm_pointer: 0x7f3754400000
05:52:50.719579192 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clSVMFree_exit: 
05:52:50.719579686 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_entry: event: 0x64dd44c88b50
05:52:50.719581940 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719582110 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_entry: event: 0x64dd44c88910
05:52:50.719583236 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719583399 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_entry: event: 0x64dd44c82ed0
05:52:50.719584626 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719584762 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_entry: event: 0x64dd44c7d400
05:52:50.719585951 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719586083 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_entry: event: 0x64dd44c7d180
05:52:50.719586474 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719586607 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_entry: event: 0x64dd43598e00
05:52:50.719587462 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseEvent_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719587922 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseCommandQueue_entry: command_queue: 0x64dd43595c10
05:52:50.719590971 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseCommandQueue_exit: errcode_ret_val: CL_SUCCESS
05:52:50.719591724 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseContext_entry: context: 0x64dd44c79270
05:52:50.719597638 - cupcake - vpid: 110763, vtid: 110763 - lttng_ust_opencl:clReleaseContext_exit: errcode_ret_val: CL_SUCCESS

@kgibala

pvelesko avatar Dec 15 '25 03:12 pvelesko

+-----------+--------------------------------------------------------------------------------------+
| Device ID | Device Information                                                                   |
+-----------+--------------------------------------------------------------------------------------+
| 0         | Device Type: GPU                                                                     |
|           | Device Name: Intel(R) Arc(TM) A770 Graphics                                          |
|           | PCI Device ID: 0x56a0                                                                |
|           | Vendor Name: Intel(R) Corporation                                                    |
|           | SOC UUID: 00000000-0000-0003-0000-000856a08086                                       |
|           | Serial Number: unknown                                                               |
|           | Core Clock Rate: 2400 MHz                                                            |
|           | Stepping: C0                                                                         |
|           | SKU Type: N/A                                                                        |
|           |                                                                                      |
|           | Driver Version: C03A312045F977B142A57A4                                              |
|           | Kernel Version: 6.8.0-65-generic                                                     |
|           | GFX Firmware Name: GFX                                                               |
|           | GFX Firmware Version: unknown                                                        |
|           | GFX Firmware Status: normal                                                          |
|           |                                                                                      |
|           | PCI BDF Address: 0000:03:00.0                                                        |
|           | PCI Slot: N/A                                                                        |
|           | PCIe Generation: -1                                                                  |
|           | PCIe Max Link Width: -1                                                              |
|           | PCIe Max Bandwidth: -0.00 GB/s                                                       |
|           |                                                                                      |
|           | Memory Physical Size: 16288.00 MiB                                                   |
|           | Max Mem Alloc Size: 4095.99 MiB                                                      |
|           | ECC State: N/A                                                                       |
|           | Number of Memory Channels: N/A                                                       |
|           | Memory Bus Width: N/A                                                                |
|           | Max Hardware Contexts: 65536                                                         |
|           | Max Command Queue Priority: 0                                                        |
|           |                                                                                      |
|           | Number of EUs: 512                                                                   |
|           | Number of Tiles: 1                                                                   |
|           | Number of Slices: 1                                                                  |
|           | Number of Sub Slices per Slice: 32                                                   |
|           | Number of Threads per EU: 8                                                          |
|           | Physical EU SIMD Width: 8                                                            |
|           | Number of Media Engines: 0                                                           |
|           | Number of Media Enhancement Engines: 0                                               |
|           |                                                                                      |
|           | Number of Xe Link ports: N/A                                                         |
|           | Max Tx/Rx Speed per Xe Link port: N/A                                                |
|           | Number of Lanes per Xe Link port: N/A                                                |
+-----------+--------------------------------------------------------------------------------------+

pvelesko avatar Dec 15 '25 04:12 pvelesko

Hi @pvelesko,

Thank you for providing the additional logs. We were able to reproduce the issue.

In your example, if the device does not support CL_DEVICE_SVM_FINE_GRAIN_BUFFER, a synchronization point is required before the host accesses SVM memory directly, as the following code from your reproducer does:

 int* dev_data = static_cast<int*>(dev_ptr);
 *dev_data = 42;
  • https://github.com/pvelesko/reproducers/commit/c1a0f2d03b4aff04250e202fafa1b28db1566bea#diff-6aa888a8a819d4bda563be6e4ede80a66af8894a2e3b970a9ef79b48748d1d1aR272-R273

To check if your device supports CL_DEVICE_SVM_FINE_GRAIN_BUFFER, use clGetDeviceInfo with the CL_DEVICE_SVM_CAPABILITIES parameter:

  • https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#clGetDeviceInfo

If fine-grain buffer support is not available, you should explicitly map and unmap the SVM memory to ensure proper synchronization. Here’s a proposed solution:

  void* dev_ptr = clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(int), 0);
  if (dev_ptr == nullptr) {
    std::cerr << "Failed to allocate SVM memory" << std::endl;
...
    return 1;
  }
  
  // Map the SVM memory for writing
  err = clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, dev_ptr, sizeof(int), 0, nullptr, nullptr);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to map SVM memory" << std::endl;
...
    return 1;
  }
  
  int* dev_data = static_cast<int*>(dev_ptr);
  *dev_data = 42;
  
  // Unmap the SVM memory before copying
  err = clEnqueueSVMUnmap(queue, dev_ptr, 0, nullptr, nullptr);
  if (err != CL_SUCCESS) {
    std::cerr << "Failed to unmap SVM memory" << std::endl;
...
    return 1;
  }

Results:

...
Device type: GPU
About to call clFinish immediately - this may hang...
Callback count before clFinish: 0
NOTE: If callbacks haven't executed yet, clFinish will wait for barrier4,
      which waits for callbackEvent2, which is only set by the callback.
      If the callback doesn't execute, this will hang!
testHostFunc called
order: 1
hostFuncCallCount: 1
executionOrder: 1
testHostFunc called
order: 2
hostFuncCallCount: 2
executionOrder: 2
clFinish completed (callbacks should have executed)
Final callback count: 2
Callbacks completed successfully. Now attempting memory copy...
This is where the hang occurs in chipStar!
Event statuses:
  barrier1 (callback trigger): 0 (CL_COMPLETE=0)
  barrier2 (waits for callbackEvent1): 0 (CL_COMPLETE=0)
  barrier3 (callback trigger): 0 (CL_COMPLETE=0)
  barrier4 (waits for callbackEvent2): 0 (CL_COMPLETE=0)
  callbackEvent1 (user event): 0 (CL_COMPLETE=0)
  callbackEvent2 (user event): 0 (CL_COMPLETE=0)
About to enqueue memory copy from device to host...
This may hang if barriers (barrier2/barrier4) haven't properly completed!
Memory copy enqueued. About to call clFinish - THIS MAY HANG!
Memory copy completed successfully!
PASS

References:

  • https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#CL_DEVICE_SVM_COARSE_GRAIN_BUFFER

CL_DEVICE_SVM_COARSE_GRAIN_BUFFER - Support for coarse-grain buffer sharing using clSVMAlloc. Memory consistency is guaranteed at synchronization points and the host must use calls to clEnqueueMapBuffer and clEnqueueUnmapMemObject.

  • https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#CL_DEVICE_SVM_FINE_GRAIN_BUFFER

CL_DEVICE_SVM_FINE_GRAIN_BUFFER - Support for fine-grain buffer sharing using clSVMAlloc. Memory consistency is guaranteed at synchronization points without need for clEnqueueMapBuffer and clEnqueueUnmapMemObject.

kgibala avatar Dec 16 '25 11:12 kgibala