TensorRT
TensorRT copied to clipboard
How to use tensorrt with torch tensor on cuda
The code below shows that the numpy part works perfectly, but using torch's gpu tensor will report an error. My actual usage scenario is to decode video using vpf first, which can directly decode into a torch cuda tensor. Therefore, I hope that the input to tensorrt is a torch cuda tensor.
python=3.8.10 torch=2.2.0 tensorrt=8.6.1.2-1+cuda12.0 Driver Version: 525.125.06 CUDA Version: 12.0
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
from pycuda.tools import make_default_context
batch_size = 1
cuda.init()
cuda_context = make_default_context()
# import pycuda.autoinit
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
with open("./yolov8.trt", "rb") as f:
serialized_engine = f.read()
engine = runtime.deserialize_cuda_engine(serialized_engine)
context = engine.create_execution_context()
context.set_binding_shape(0, (batch_size, 3, 640, 640))
input_memory_image = cuda.mem_alloc(cuda.pagelocked_empty(batch_size * 3 * 640 * 640, np.float32).nbytes)
output_buffer = cuda.pagelocked_empty(batch_size * 12 * 8400, np.float32)
output = cuda.mem_alloc(output_buffer.nbytes)
stream = cuda.Stream()
# from numpy
img0 = np.random.rand(batch_size, 3, 640, 640).astype(np.float32)
cuda.memcpy_htod_async(input_memory_image, np.ascontiguousarray(img0), stream)
context.execute_async_v2(bindings=[input_memory_image, output], stream_handle=stream.handle)
# True
cuda.memcpy_dtoh_async(output_buffer, output, stream)
stream.synchronize()
print("numpy output_buffer.shape=",output_buffer.shape)
# from torch gpu
import torch
class PyTorchTensorHolder(cuda.PointerHolderBase):
def __init__(self, tensor):
super(PyTorchTensorHolder, self).__init__()
self.tensor = tensor
def get_pointer(self):
return self.tensor.data_ptr()
img_tensor = torch.from_numpy(np.random.rand(batch_size, 3, 640, 640).astype(np.float32)).cuda()
cuda.memcpy_dtod_async(input_memory_image, PyTorchTensorHolder(img_tensor), img_tensor.nelement() * img_tensor.element_size(), stream)
context.execute_async_v2(bindings=[input_memory_image, output], stream_handle=stream.handle)
# [02/18/2024-14:13:38] [TRT] [E] 1: [reformat.cpp::executeCutensor::329] Error Code 1: CuTensor (Internal cuTensor permutate execute failed)
# [02/18/2024-14:13:38] [TRT] [E] 1: [checkMacros.cpp::catchCudaError::203] Err
# False
cuda.memcpy_dtoh_async(output_buffer, output, stream)
stream.synchronize()
print(output_buffer.shape)
Try searching in issues, you'll find some example code.
Hi @chenj133 , have you resolved your issue?
closing since no activity for more than 3 weeks per our policy, thanks all!