
[QUESTION] How to create an nvcv::Tensor from existing GPU memory by zero-copy?

Open kywish opened this issue 1 year ago • 3 comments

How can I create an nvcv::Tensor from a fixed GPU memory buffer by zero-copy?

By the way, when I use TensorWrapData, releasing the tensor releases the original GPU memory too 😂

float* pGpuData= ...
...
{
    nvcv::TensorDataStridedCuda::Buffer inBuf;
    inBuf.basePtr = pGpuData;
    ...
    nvcv::TensorDataStridedCuda inData(... , inBuf);
    nvcv::Tensor inTensor = TensorWrapData(inData);
}
...
When I use pGpuData afterwards, it has already been released.

kywish avatar Mar 28 '24 11:03 kywish

@kywish Hi, any updates, or have you solved the problem?

lazylazypig avatar Nov 10 '24 11:11 lazylazypig

@lazylazypig @kywish Does something like the following work for you?

#include <cuda_runtime.h>
#include <nvcv/DataType.hpp>
#include <nvcv/Tensor.hpp>
#include <nvcv/TensorData.hpp>
#include <nvcv/TensorLayout.hpp>

#include <cstdint>
#include <iostream>
#include <vector>

// Utility function to check CUDA errors
#define CHECK_CUDA_ERROR(call)                                               \
    do {                                                                     \
        cudaError_t error = call;                                            \
        if (error != cudaSuccess) {                                          \
            std::cerr << "CUDA error in " << __FILE__ << " line " << __LINE__ \
                    << ": " << cudaGetErrorString(error) << std::endl;       \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Simple kernel to fill a tensor with values
__global__ void fillTensorKernel(float* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        data[idx] = static_cast<float>(idx) / 10.0f;
    }
}

// Another kernel to process tensor data
__global__ void processTensorKernel(const float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx] * 2.0f; // Just double the values
    }
}

int main() {
    // Create CUDA stream
    cudaStream_t stream;
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream));

    try {
        // Define tensor dimensions (NCHW format)
        const int batch_size = 2;
        const int channels = 3;
        const int height = 4;
        const int width = 5;
        const int tensor_size = batch_size * channels * height * width;
        
        std::cout << "Creating a tensor with dimensions: [" 
                  << batch_size << ", " << channels << ", " 
                  << height << ", " << width << "]" << std::endl;

        // Allocate GPU memory manually
        float* device_data;
        size_t pitch_bytes;
        CHECK_CUDA_ERROR(cudaMallocPitch(&device_data, &pitch_bytes, 
                                          width * sizeof(float), 
                                          height * channels * batch_size));
        
        std::cout << "Allocated GPU memory with pitch: " << pitch_bytes << " bytes" << std::endl;

        // Fill the tensor with data
        int threadsPerBlock = 256;
        int blocksPerGrid = (tensor_size + threadsPerBlock - 1) / threadsPerBlock;
        fillTensorKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(device_data, tensor_size);
        CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
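
        // Note: the fill (and process) kernels index the allocation linearly and
        // ignore any pitch padding, while the tensor strides set up below describe
        // the pitched layout; for this self-contained demo, which also reads the
        // results back linearly, that is fine.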

        // Create strides for the NCHW layout
        // Calculate strides based on pitch
        int64_t stride_w = sizeof(float);                // stride between elements in a row
        int64_t stride_h = pitch_bytes;                  // stride between rows
        int64_t stride_c = pitch_bytes * height;         // stride between channels
        int64_t stride_n = pitch_bytes * height * channels; // stride between batches

        // Set up the tensor buffer with our GPU memory pointer and strides
        nvcv::TensorDataStridedCuda::Buffer tensor_buffer;
        // The strides array follows the NCHW order from slowest to fastest dimension
        tensor_buffer.strides[0] = stride_n; 
        tensor_buffer.strides[1] = stride_c;
        tensor_buffer.strides[2] = stride_h;
        tensor_buffer.strides[3] = stride_w;
        
        // Set the base pointer to our allocated GPU memory
        tensor_buffer.basePtr = reinterpret_cast<NVCVByte*>(device_data);

        // Create the tensor shape with NCHW layout
        nvcv::TensorShape tensor_shape({batch_size, channels, height, width}, "NCHW");
        
        // Create the tensor data with our GPU memory
        nvcv::TensorDataStridedCuda tensor_data(
            tensor_shape,
            nvcv::TYPE_F32,  // FP32 data type
            tensor_buffer
        );

        // Create NVCV Tensor by wrapping the tensor data (zero-copy)
        nvcv::Tensor tensor = nvcv::TensorWrapData(tensor_data);
        std::cout << "Successfully created NVCV Tensor from existing GPU memory" << std::endl;

        // Demonstrate using the tensor by allocating output tensor and performing an operation
        // Allocate output tensor with the same shape
        nvcv::Tensor output_tensor(tensor_shape, nvcv::TYPE_F32);
        
        // Get device data pointer for the output tensor
        auto output_data = output_tensor.exportData<nvcv::TensorDataStridedCuda>();
        if (!output_data) {
            throw std::runtime_error("Failed to export output tensor data");
        }

        // Process the tensor data
        processTensorKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
            device_data, 
            reinterpret_cast<float*>(output_data->basePtr()), 
            tensor_size
        );
        CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));

        // Copy a small portion of the results back to verify
        const int sample_size = 10;
        std::vector<float> input_results(sample_size);
        std::vector<float> output_results(sample_size);
        
        CHECK_CUDA_ERROR(cudaMemcpy(input_results.data(), device_data,
                                     sample_size * sizeof(float),
                                     cudaMemcpyDeviceToHost));
        
        CHECK_CUDA_ERROR(cudaMemcpy(output_results.data(), 
                                     output_data->basePtr(),
                                     sample_size * sizeof(float),
                                     cudaMemcpyDeviceToHost));

        // Print results
        std::cout << "\nSample results comparison:" << std::endl;
        std::cout << "Index\tInput\tOutput" << std::endl;
        for (int i = 0; i < sample_size; ++i) {
            std::cout << i << "\t" << input_results[i] << "\t" << output_results[i] << std::endl;
            // Verify output is double the input
            if (std::abs(output_results[i] - (input_results[i] * 2.0f)) > 1e-5) {
                std::cerr << "Error: Output data doesn't match expected result at index " << i << std::endl;
            }
        }

        std::cout << "\nTensor zero-copy wrapping successful!" << std::endl;

        // Free the GPU memory
        CHECK_CUDA_ERROR(cudaFree(device_data));
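        // The wrapping tensor is still in scope here but is not used again; when it
        // is destroyed at the end of this block it does not free device_data.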
        
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
        return EXIT_FAILURE;
    }

    // Clean up
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
    return EXIT_SUCCESS;
} 

dsuthar-nvidia avatar Apr 03 '25 22:04 dsuthar-nvidia

@lazylazypig @kywish Does something like the following work for you?


        // Create the tensor data with our GPU memory
        nvcv::TensorDataStridedCuda tensor_data(
            tensor_shape,
            nvcv::TYPE_F32,  // FP32 data type
            tensor_buffer
        );

        // Create NVCV Tensor by wrapping the tensor data (zero-copy)
        nvcv::Tensor tensor = nvcv::TensorWrapData(tensor_data);
        std::cout << "Successfully created NVCV Tensor from existing GPU memory" << std::endl;

The tensor is created via tensor_data from device_data. When the tensor object is released, will device_data be released as well?

kywish avatar Apr 08 '25 07:04 kywish
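
For completeness, here is a minimal sketch of the two ownership options when wrapping caller-owned GPU memory. It assumes the nvcv::TensorWrapData overload that accepts an optional cleanup callback (the C API exposes the same idea via the cleanup argument of nvcvTensorWrapDataConstruct); the function names below are illustrative, so please check the details against your CV-CUDA version.

#include <cuda_runtime.h>

#include <nvcv/Tensor.hpp>
#include <nvcv/TensorData.hpp>

// Sketch: wrap caller-owned memory without handing over ownership. No cleanup
// callback is registered, so destroying the returned tensor does not free the
// underlying allocation; the caller must still cudaFree it afterwards.
nvcv::Tensor wrapWithoutOwnership(const nvcv::TensorDataStridedCuda &tensorData)
{
    return nvcv::TensorWrapData(tensorData);
}

// Sketch: hand ownership to the tensor by registering a cleanup callback
// (assumed overload). The callback runs when the wrapping tensor is destroyed,
// so the caller must NOT cudaFree deviceData afterwards.
nvcv::Tensor wrapWithOwnership(const nvcv::TensorDataStridedCuda &tensorData, void *deviceData)
{
    return nvcv::TensorWrapData(tensorData,
                                [deviceData](const nvcv::TensorData &) { cudaFree(deviceData); });
}

Note that the full example above also calls cudaFree(device_data) itself after using the wrapped tensor, which corresponds to the first, non-owning variant.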