CV-CUDA
[QUESTION] How to create an nvcv::Tensor from existing GPU memory with zero-copy?
How can I create an nvcv::Tensor from a pre-allocated, fixed GPU buffer with zero-copy?
By the way, when I use TensorWrapData, releasing the tensor releases the original GPU memory too 😂
float* pGpuData = ...;
...
{
    nvcv::TensorDataStridedCuda::Buffer inBuf;
    inBuf.basePtr = reinterpret_cast<NVCVByte*>(pGpuData);
    ...
    nvcv::TensorDataStridedCuda inData(..., inBuf);
    nvcv::Tensor inTensor = TensorWrapData(inData);
}
...
After the scope ends, when I use pGpuData again, it has already been released.
@kywish Hi, any updates? Have you solved the problem?
@lazylazypig @kywish Does something like the following work for you?
#include <cuda_runtime.h>
#include <nvcv/DataType.hpp>
#include <nvcv/Tensor.hpp>
#include <nvcv/TensorData.hpp>
#include <nvcv/TensorLayout.hpp>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>
// Utility function to check CUDA errors
#define CHECK_CUDA_ERROR(call) \
do { \
cudaError_t error = call; \
if (error != cudaSuccess) { \
std::cerr << "CUDA error in " << __FILE__ << " line " << __LINE__ \
<< ": " << cudaGetErrorString(error) << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
// Simple kernel to fill a tensor with values
__global__ void fillTensorKernel(float* data, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
data[idx] = static_cast<float>(idx) / 10.0f;
}
}
// Another kernel to process tensor data
__global__ void processTensorKernel(const float* input, float* output, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
output[idx] = input[idx] * 2.0f; // Just double the values
}
}
int main() {
// Create CUDA stream
cudaStream_t stream;
CHECK_CUDA_ERROR(cudaStreamCreate(&stream));
try {
// Define tensor dimensions (NCHW format)
const int batch_size = 2;
const int channels = 3;
const int height = 4;
const int width = 5;
const int tensor_size = batch_size * channels * height * width;
std::cout << "Creating a tensor with dimensions: ["
<< batch_size << ", " << channels << ", "
<< height << ", " << width << "]" << std::endl;
// Allocate GPU memory manually (densely packed, no row padding)
float* device_data;
const size_t size_bytes = tensor_size * sizeof(float);
CHECK_CUDA_ERROR(cudaMalloc(&device_data, size_bytes));
std::cout << "Allocated " << size_bytes << " bytes of GPU memory" << std::endl;
// Fill the tensor with data
int threadsPerBlock = 256;
int blocksPerGrid = (tensor_size + threadsPerBlock - 1) / threadsPerBlock;
fillTensorKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(device_data, tensor_size);
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
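// NOTE: both kernels index the buffer as one flat float array, which is
// consistent with the densely packed strides computed below.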
// Compute byte strides for the densely packed NCHW layout.
// (With a pitched allocation from cudaMallocPitch, stride_h would be the
// returned pitch instead, and the kernels would have to index row by row.)
int64_t stride_w = sizeof(float);       // stride between elements in a row
int64_t stride_h = width * stride_w;    // stride between rows
int64_t stride_c = height * stride_h;   // stride between channels
int64_t stride_n = channels * stride_c; // stride between batches
// Set up the tensor buffer with our GPU memory pointer and strides
nvcv::TensorDataStridedCuda::Buffer tensor_buffer;
// The strides array follows the NCHW order from slowest to fastest dimension
tensor_buffer.strides[0] = stride_n;
tensor_buffer.strides[1] = stride_c;
tensor_buffer.strides[2] = stride_h;
tensor_buffer.strides[3] = stride_w;
// Set the base pointer to our allocated GPU memory
tensor_buffer.basePtr = reinterpret_cast<NVCVByte*>(device_data);
// Create the tensor shape with NCHW layout
nvcv::TensorShape tensor_shape({batch_size, channels, height, width}, nvcv::TENSOR_NCHW);
// Create the tensor data with our GPU memory
nvcv::TensorDataStridedCuda tensor_data(
tensor_shape,
nvcv::TYPE_F32, // FP32 data type
tensor_buffer
);
// Create NVCV Tensor by wrapping the tensor data (zero-copy)
nvcv::Tensor tensor = nvcv::TensorWrapData(tensor_data);
std::cout << "Successfully created NVCV Tensor from existing GPU memory" << std::endl;
// Demonstrate using the tensor by allocating output tensor and performing an operation
// Allocate output tensor with the same shape
nvcv::Tensor output_tensor(tensor_shape, nvcv::TYPE_F32);
// Get device data pointer for the output tensor
auto output_data = output_tensor.exportData<nvcv::TensorDataStridedCuda>();
if (!output_data) {
throw std::runtime_error("Failed to export output tensor data");
}
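// Sanity check: the kernel below writes the output as a flat array, which
// is only valid if NVCV allocated the output tensor densely packed.
// (Assumption: TensorDataStrided exposes a stride(d) accessor in bytes.)
if (output_data->stride(3) != stride_w || output_data->stride(2) != stride_h) {
throw std::runtime_error("Output tensor is not densely packed");
}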
// Process the tensor data
processTensorKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
device_data,
reinterpret_cast<float*>(output_data->basePtr()),
tensor_size
);
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
// Copy a small portion of the results back to verify
const int sample_size = 10;
std::vector<float> input_results(sample_size);
std::vector<float> output_results(sample_size);
CHECK_CUDA_ERROR(cudaMemcpy(input_results.data(), device_data,
sample_size * sizeof(float),
cudaMemcpyDeviceToHost));
CHECK_CUDA_ERROR(cudaMemcpy(output_results.data(),
output_data->basePtr(),
sample_size * sizeof(float),
cudaMemcpyDeviceToHost));
// Print results
std::cout << "\nSample results comparison:" << std::endl;
std::cout << "Index\tInput\tOutput" << std::endl;
for (int i = 0; i < sample_size; ++i) {
std::cout << i << "\t" << input_results[i] << "\t" << output_results[i] << std::endl;
// Verify output is double the input
if (std::abs(output_results[i] - (input_results[i] * 2.0f)) > 1e-5) {
std::cerr << "Error: Output data doesn't match expected result at index " << i << std::endl;
}
}
std::cout << "\nTensor zero-copy wrapping successful!" << std::endl;
// Free the GPU memory ourselves: TensorWrapData does not take ownership
// of the wrapped buffer, so releasing the tensor leaves device_data intact.
CHECK_CUDA_ERROR(cudaFree(device_data));
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
return EXIT_FAILURE;
}
// Clean up
CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
return EXIT_SUCCESS;
}
The tensor is created from tensor_data, which wraps device_data. When the tensor object is released, will device_data be released as well?
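From the example above it should not be: device_data is freed explicitly with cudaFree after the tensor has gone out of use, because a tensor created with plain TensorWrapData does not take ownership of memory it did not allocate. If you want the tensor to free the buffer when it is destroyed, the wrap call accepts an optional cleanup callback (the C API nvcvTensorWrapDataConstruct takes one, and as far as I can tell the C++ TensorWrapData does as well). A minimal sketch, reusing tensor_data and device_data from the example above and assuming that overload:

// Assumption: nvcv::TensorWrapData accepts an optional cleanup callback
// that NVCV invokes when the last reference to the tensor is dropped.
nvcv::Tensor owning_tensor = nvcv::TensorWrapData(
    tensor_data,
    [device_data](const nvcv::TensorData &)
    {
        // The tensor no longer needs the buffer; release it here instead
        // of calling cudaFree manually later.
        cudaFree(device_data);
    });

With no callback, the caller keeps ownership and must free the buffer itself, as the full example does.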