TensorRT icon indicating copy to clipboard operation
TensorRT copied to clipboard

How to use tensorrt with torch tensor on cuda

Open chenj133 opened this issue 1 year ago • 1 comments

The code below shows that the NumPy path works perfectly, but feeding a torch GPU tensor reports an error. In my actual use case I first decode video with VPF, which can decode directly into a torch CUDA tensor, so I would like the input to TensorRT to be a torch CUDA tensor.

Environment: python=3.8.10, torch=2.2.0, tensorrt=8.6.1.2-1+cuda12.0, Driver Version: 525.125.06, CUDA Version: 12.0

import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
from pycuda.tools import make_default_context
import atexit

batch_size = 1

cuda.init()
# Work inside the device's *primary* context instead of a private one from
# make_default_context().  The primary context is the one the CUDA runtime
# API uses, so buffers allocated here, the TensorRT engine, and any tensors
# torch allocates later on device 0 all live in the same context.
cuda_context = cuda.Device(0).retain_primary_context()
cuda_context.push()
# Pop the context again at interpreter exit so pycuda's context stack is
# empty during cleanup (the same thing pycuda.autoinit arranges).
atexit.register(cuda_context.pop)

logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
with open("./yolov8.trt", "rb") as f:
    serialized_engine = f.read()

engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    # Fail here rather than with an AttributeError on the next line.
    raise RuntimeError("Failed to deserialize TensorRT engine from ./yolov8.trt")
context = engine.create_execution_context()
context.set_binding_shape(0, (batch_size, 3, 640, 640))

# Device input buffer: size computed directly, no throw-away pinned host
# allocation just to read its .nbytes.
input_nbytes = batch_size * 3 * 640 * 640 * np.dtype(np.float32).itemsize
input_memory_image = cuda.mem_alloc(input_nbytes)
# Page-locked host buffer that receives the result, plus its device twin.
output_buffer = cuda.pagelocked_empty(batch_size * 12 * 8400, np.float32)
output = cuda.mem_alloc(output_buffer.nbytes)
stream = cuda.Stream()

# from numpy
img0 = np.random.rand(batch_size, 3, 640, 640).astype(np.float32)
cuda.memcpy_htod_async(input_memory_image, np.ascontiguousarray(img0), stream)
# execute_async_v2 reports failure through its boolean return value.
if not context.execute_async_v2(bindings=[input_memory_image, output], stream_handle=stream.handle):
    raise RuntimeError("TensorRT execute_async_v2 failed for the NumPy input")
cuda.memcpy_dtoh_async(output_buffer, output, stream)
# Wait for the copy back to the host before reading output_buffer.
stream.synchronize()
print("numpy output_buffer.shape=",output_buffer.shape)


# from torch gpu
import torch
class PyTorchTensorHolder(cuda.PointerHolderBase):
    """Expose a torch tensor's data pointer through pycuda's pointer-holder API.

    The wrapped tensor is stored on ``self.tensor``; ``get_pointer`` returns
    whatever ``tensor.data_ptr()`` reports at the time it is called.
    """

    def __init__(self, tensor: "torch.Tensor") -> None:
        super().__init__()
        # Holding the tensor here keeps it referenced for as long as this
        # holder object exists.
        self.tensor = tensor

    def get_pointer(self) -> int:
        """Return the address given by ``self.tensor.data_ptr()``."""
        return self.tensor.data_ptr()


# Build a random float32 image on the host, then move it to the GPU via torch.
host_image = np.random.rand(batch_size, 3, 640, 640).astype(np.float32)
img_tensor = torch.from_numpy(host_image).cuda()

# Device-to-device copy from the torch tensor into the TensorRT input buffer.
tensor_nbytes = img_tensor.nelement() * img_tensor.element_size()
cuda.memcpy_dtod_async(
    input_memory_image,
    PyTorchTensorHolder(img_tensor),
    tensor_nbytes,
    stream,
)
context.execute_async_v2(
    bindings=[input_memory_image, output],
    stream_handle=stream.handle,
)
# Output reported in the issue for the call above:
# [02/18/2024-14:13:38] [TRT] [E] 1: [reformat.cpp::executeCutensor::329] Error Code 1: CuTensor (Internal cuTensor permutate execute failed)
# [02/18/2024-14:13:38] [TRT] [E] 1: [checkMacros.cpp::catchCudaError::203] Err
# False

# Copy the result back to the host and wait for the stream to drain.
cuda.memcpy_dtoh_async(output_buffer, output, stream)
stream.synchronize()
print(output_buffer.shape)

chenj133 avatar Feb 18 '24 14:02 chenj133

Try searching in issues, you'll find some example code.

zerollzeng avatar Feb 20 '24 09:02 zerollzeng

Hi @chenj133 , have you resolved your issue?

trantuankhoi avatar Mar 18 '24 06:03 trantuankhoi

Closing since there has been no activity for more than 3 weeks, per our policy. Thanks all!

ttyio avatar May 07 '24 18:05 ttyio