Segfault from non-blocking SVM (or USM) copy from an unmanaged host buffer

Open al42and opened this issue 2 months ago • 0 comments

A copy from an unpinned host buffer to a device will, in practice, most likely always be blocking, but as far as I know it should still be legal to set blocking_copy to CL_FALSE.

However, with recent compute-runtime (tested 25.40 and 25.35), it crashes on my TigerLake-LP GT2:

$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 24.04.3 LTS
Release:        24.04
Codename:       noble

$ uname -a
Linux aland-dell 6.14.0-35-generic #35~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue Oct 14 13:55:17 UTC 2 x86_64 x86_64 x86_64 GNU/Linux

$ clinfo -l
Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics

$ apt list --installed | grep -i -e "intel" -e "libze" -e "libigdmm" -e "opencl" | grep -v intel-oneapi

WARNING: apt does not have a stable CLI interface. Use with caution in scripts.

intel-basekit/all,now 2025.3.0-361 amd64 [installed]
intel-gmmlib/now 22.0.2 amd64 [installed,local]
intel-igc-core-2/now 2.20.3 amd64 [installed,local]
intel-igc-core/now 1.0.17791.9 amd64 [installed,local]
intel-igc-opencl-2/now 2.20.3 amd64 [installed,local]
intel-igc-opencl/now 1.0.17791.9 amd64 [installed,local]
intel-microcode/noble-updates,noble-security,now 3.20250512.0ubuntu0.24.04.1 amd64 [installed]
intel-ocloc-dbgsym/now 25.40.35563.4-0 amd64 [installed,local]
intel-ocloc/now 25.40.35563.4-0 amd64 [installed,local]
intel-opencl-icd-dbgsym/now 25.40.35563.4-0 amd64 [installed,local]
intel-opencl-icd/now 25.40.35563.4-0 amd64 [installed,local]
libdrm-intel1/noble,now 2.4.124+git2501180500.a7eb2c~oibaf~n amd64 [installed,automatic]
libze-intel-gpu1-dbgsym/now 25.40.35563.4-0 amd64 [installed,local]
libze-intel-gpu1/now 25.40.35563.4-0 amd64 [installed,local]
ocl-icd-libopencl1/noble,noble-updates,now 2.3.2-1build1 amd64 [installed]
ocl-icd-opencl-dev/noble,noble-updates,now 2.3.2-1build1 amd64 [installed,automatic]
opencl-c-headers/noble,noble,now 3.0~2023.12.14-1 all [installed]
opencl-clhpp-headers/noble,noble,now 3.0~2023.12.14-1ubuntu1 all [installed,automatic]
xserver-xorg-video-intel/noble,now 2:2.99.917+git20210115-1build1 amd64 [installed,automatic]


$ gcc -Wall svm.c -g -lOpenCL

$ PrintDebugMessages=1  NEOReadDebugKeys=1 gdb --args ./a.out 
Reading symbols from ./a.out...
(gdb) r
Starting program: /home/aland/sycl_tests/a.out 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
INFO: System Info query failed!
WARNING: Failed to request OCL Turbo Boost
[New Thread 0x7ffff53ff6c0 (LWP 120059)]
NEO_CACHE_PERSISTENT is enabled. Cache is located in: /home/aland/.cache/neo_compiler_cache

computeUnitsUsedForScratch: 768
hwInfo: {96, 672}: (16, 1, 6)
Platform: Intel(R) OpenCL Graphics
Device: Intel(R) Iris(R) Xe Graphics
Device Version: OpenCL 3.0 NEO 
Created context and command queue.
Created and populated unpinned host buffer.
Allocated coarse-grain SVM buffer.

Thread 1 "a.out" received signal SIGSEGV, Segmentation fault.
0x0000000000001910 in ?? ()
(gdb) bt
#0  0x0000000000001910 in ?? ()
#1  0x00007ffff64b45a5 in NEO::GmmClientContext::cachePolicyGetPATIndex (this=<optimized out>, gmmResourceInfo=gmmResourceInfo@entry=0x0, usage=usage@entry=GMM_RESOURCE_USAGE_OCL_SYSTEM_MEMORY_BUFFER, 
    compressed=compressed@entry=false, cacheable=cacheable@entry=true) at ../../neo/shared/source/gmm_helper/client_context/gmm_client_context.cpp:92
#2  0x00007ffff660b9b6 in NEO::DrmMemoryManager::allocateGraphicsMemoryForNonSvmHostPtr (this=0x5555569b8980, allocationData=...) at ../../neo/shared/source/os_interface/linux/drm_memory_manager.cpp:698
#3  0x00007ffff64dc033 in NEO::MemoryManager::allocateGraphicsMemory (this=0x5555569b8980, allocationData=...) at ../../neo/shared/source/memory_manager/memory_manager.cpp:859
#4  0x00007ffff64de913 in NEO::MemoryManager::allocateGraphicsMemoryInPreferredPool (this=this@entry=0x5555569b8980, properties=..., hostPtr=0x55555555a8a0) at ../../neo/shared/source/memory_manager/memory_manager.cpp:784
#5  0x00007ffff639168e in NEO::MemoryManager::allocateGraphicsMemoryWithProperties (ptr=<optimized out>, properties=..., this=0x5555569b8980) at ../../neo/shared/source/memory_manager/memory_manager.h:135
#6  NEO::CommandStreamReceiver::createAllocationForHostSurface (this=0x555556aada20, surface=..., requiresL3Flush=requiresL3Flush@entry=false) at ../../neo/shared/source/command_stream/command_stream_receiver.cpp:995
#7  0x00007ffff5f8fa92 in NEO::CommandQueueHw<NEO::Gen12LpFamily>::enqueueSVMMemcpy (this=0x5555555592a0, blockingCopy=0, dstPtr=<optimized out>, srcPtr=<optimized out>, size=<optimized out>, numEventsInWaitList=0, eventWaitList=0x0, 
    event=0x7fffffffd3c8, csrParam=0x0) at ../../neo/opencl/source/command_queue/enqueue_svm.h:410
#8  0x00007ffff5dfc5e5 in clEnqueueSVMMemcpy (commandQueue=<optimized out>, blockingCopy=<optimized out>, dstPtr=<optimized out>, srcPtr=<optimized out>, size=<optimized out>, numEventsInWaitList=<optimized out>, 
    eventWaitList=<optimized out>, event=<optimized out>) at ../../neo/opencl/source/api/api.cpp:4888
#9  0x0000555555555aca in main () at svm.c:121

The original issue was encountered with a SYCL application using USM, and can be reproduced with both the OpenCL and Level Zero backends (both crash in NEO::GmmClientContext::cachePolicyGetPATIndex).

Below is a smaller, vibe-coded reproducer. If I use a blocking copy, or do the non-blocking copy between two SVM allocations, things work fine.

Source code of svm.c

#define CL_TARGET_OPENCL_VERSION 200

#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Terminate the program if an OpenCL call failed.
//   err       - status code returned by (or through) the OpenCL call.
//   operation - human-readable name of the call, used in the diagnostic.
// On any status other than CL_SUCCESS the numeric code is written to stderr
// and the process exits with status 1; on success this is a no-op.
void check_error(cl_int err, const char* operation) {
    if (err == CL_SUCCESS) {
        return;
    }

    fprintf(stderr, "Error during %s: %d\n", operation, err);
    exit(1);
}

int main() {
    cl_int err;
    cl_device_id device = NULL;
    cl_context context = NULL;
    cl_command_queue queue = NULL;

    const int DATA_SIZE = 1024;
    const size_t buffer_size = sizeof(int) * DATA_SIZE;

    // --- 1. Find an OpenCL 2.0+ Platform with SVM Support ---
    cl_uint num_platforms;
    err = clGetPlatformIDs(0, NULL, &num_platforms);
    check_error(err, "clGetPlatformIDs (count)");

    if (num_platforms == 0) {
        fprintf(stderr, "No OpenCL platforms found.\n");
        return 1;
    }

    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id));
    err = clGetPlatformIDs(num_platforms, platforms, NULL);
    check_error(err, "clGetPlatformIDs (list)");

    for (cl_uint i = 0; i < num_platforms; i++) {
        cl_uint num_devices;
        err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
        if (err != CL_SUCCESS || num_devices == 0) continue;

        cl_device_id* devices = (cl_device_id*)malloc(num_devices * sizeof(cl_device_id));
        err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
        if (err != CL_SUCCESS) {
            free(devices);
            continue;
        }

        for (cl_uint j = 0; j < num_devices; j++) {
            char version_string[128];
            err = clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, sizeof(version_string), version_string, NULL);
            if (err != CL_SUCCESS) continue;

            // Check for "OpenCL 2." or "OpenCL 3."
            if (strstr(version_string, "OpenCL 2.") || strstr(version_string, "OpenCL 3.")) {
                cl_device_svm_capabilities svm_caps;
                err = clGetDeviceInfo(devices[j], CL_DEVICE_SVM_CAPABILITIES, sizeof(svm_caps), &svm_caps, NULL);
                if (err == CL_SUCCESS && (svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)) {
                    device = devices[j];
                    char platform_name[128];
                    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
                    char device_name[128];
                    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
                    printf("Platform: %s\n", platform_name);
                    printf("Device: %s\n", device_name);
                    printf("Device Version: %s\n", version_string);
                    goto device_found; // Found a good device, exit loops
                }
            }
        }
        free(devices);
    }
    free(platforms);

device_found:
    if (device == NULL) {
        fprintf(stderr, "Could not find an OpenCL 2.0+ device with coarse-grain SVM support.\n");
        return 1;
    }

    // --- 2. Create Context and Queue ---
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    check_error(err, "clCreateContext");

    queue = clCreateCommandQueueWithProperties(context, device, 0, &err);
    check_error(err, "clCreateCommandQueueWithProperties");
    printf("Created context and command queue.\n");

    // --- 3. Allocate Unpinned Host Memory ---
    // Standard `malloc` memory is unpinned.
    int* host_src_data = (int*)malloc(buffer_size);
    if (host_src_data == NULL) {
        fprintf(stderr, "Failed to allocate host source memory.\n");
        return 1;
    }
    for (int i = 0; i < DATA_SIZE; i++) {
        host_src_data[i] = i;
    }
    printf("Created and populated unpinned host buffer.\n");

    // --- 4. Allocate Coarse-Grain SVM Memory ---
    // Use read/write flags. The allocation will be coarse-grain
    // because that's what the device supports.
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    // Note: No alignment specified (set to 0)
    int* svm_mem = (int*)clSVMAlloc(context, flags, buffer_size, 0);
    if (svm_mem == NULL) {
        fprintf(stderr, "Failed to allocate SVM memory.\n");
        clReleaseCommandQueue(queue);
        clReleaseContext(context);
        free(host_src_data);
        return 1;
    }
    printf("Allocated coarse-grain SVM buffer.\n");

    // --- 5. Copy Host -> SVM (Non-blocking) ---
    cl_event h2d_event;
    // We copy from the unpinned host_src_data to the svm_mem buffer.
    err = clEnqueueSVMMemcpy(queue, CL_FALSE, svm_mem, host_src_data, buffer_size, 0, NULL, &h2d_event);
    check_error(err, "clEnqueueSVMMemcpy (Host -> SVM)");
    printf("Enqueued non-blocking copy from Host -> SVM.\n");

    // --- 6. Copy SVM -> Host (Non-blocking) ---
    // Allocate a new unpinned host buffer for readback
    int* host_dst_data = (int*)malloc(buffer_size);
    if (host_dst_data == NULL) {
        fprintf(stderr, "Failed to allocate host destination memory.\n");
        return 1;
    }

    cl_event d2h_event;
    // Copy from svm_mem to the new host_dst_data buffer
    // This copy must wait for the first (h2d_event) to complete.
    err = clEnqueueSVMMemcpy(queue, CL_FALSE, host_dst_data, svm_mem, buffer_size, 1, &h2d_event, &d2h_event);
    check_error(err, "clEnqueueSVMMemcpy (SVM -> Host)");
    printf("Enqueued non-blocking copy from SVM -> Host (waits on H2D).\n");

    // --- 7. Verify and Clean Up ---
    
    // Explicitly wait for the final copy (SVM -> Host) to complete before reading the data.
    // Since d2h_event already waits on h2d_event, we only need to wait for d2h_event.
    err = clWaitForEvents(1, &d2h_event);
    check_error(err, "clWaitForEvents");
    printf("All non-blocking copies have completed.\n");

    int mismatches = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        if (host_src_data[i] != host_dst_data[i]) {
            mismatches++;
        }
    }

    if (mismatches > 0) {
        printf("\nVERIFICATION FAILED! %d mismatches found.\n", mismatches);
    } else {
        printf("\nVERIFICATION PASSED! Data copied successfully.\n");
    }

    // Cleanup
    printf("Cleaning up resources...\n");
    clReleaseEvent(h2d_event);
    clReleaseEvent(d2h_event);
    clSVMFree(context, svm_mem);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free(host_src_data);
    free(host_dst_data);

    return 0;
}

Nov 07 '25 16:11 al42and