run resnet50 trt in python multiprocessing is error
when i run resnet50 trt in python multiprocessing, something goes wrong! error is:
[01/26/2024-17:16:44] [TRT] [E] 1: [caskUtils.cpp::createCaskHardwareInfo::852] Error Code 1: Cuda Runtime (initialization error)
(1, 1000)
my env: cuda: 11.6 ; TensorRT-8.6.0.12; T4
cuda-python==12.3.0
my code is below:
import tensorrt as trt
from cuda import cudart
import torch
import numpy as np
import timeit
import multiprocessing as mp
# 1. resnet trt推理模型
class TRT():
def __init__(self, trt_path):
self.trt_path = trt_path
trt.init_libnvinfer_plugins(None, "")
self.logger = trt.Logger(trt.Logger.ERROR)
with open(self.trt_path, "rb") as f:
resnet_engine_string = f.read()
if resnet_engine_string == None:
print("Failed getting serialized engine!")
return
self.resnet_engine = trt.Runtime(self.logger).deserialize_cuda_engine(resnet_engine_string)
# 创建self.context && 分配内存
self.resnet_nIO = self.resnet_engine.num_io_tensors
self.resnet_lTensorName = [self.resnet_engine.get_tensor_name(i) for i in range(self.resnet_nIO)]
self.resnet_nInput = [self.resnet_engine.get_tensor_mode(self.resnet_lTensorName[i]) for i in range(self.resnet_nIO)].count(trt.TensorIOMode.INPUT)
self.resnet_context = self.resnet_engine.create_execution_context()
# 设置输入的尺寸
self.resnet_context.set_input_shape(self.resnet_lTensorName[0], [1, 3, 224, 224])
for i in range(self.resnet_nIO):
print("[%2d]%s->" % (i, "Input " if i < self.resnet_nInput else "Output"), self.resnet_engine.get_tensor_dtype(self.resnet_lTensorName[i]), \
self.resnet_engine.get_tensor_shape(self.resnet_lTensorName[i]), self.resnet_context.get_tensor_shape(self.resnet_lTensorName[i]), self.resnet_lTensorName[i])
## for sync infer
self.resnet_input_device = torch.zeros(1, 3, 224, 224, dtype=torch.float32).to("cuda")
self.resnet_output = torch.zeros(1, 1000, dtype=torch.float32).to("cuda")
## for async infer
_, self.resnet_stream = cudart.cudaStreamCreate()
dummy_input_data = np.random.rand(1 * 3 * 224 * 224).astype(np.int32).reshape(1, 3, 224, 224)
dummy_input_data = np.ascontiguousarray(dummy_input_data.reshape(-1))
self.output_host = np.empty(self.resnet_context.get_binding_shape(1), \
dtype=trt.nptype(self.resnet_engine.get_binding_dtype(1)))
_, self.input_ids_device = cudart.cudaMallocAsync(dummy_input_data.nbytes, self.resnet_stream)
_, self.ouput_device = cudart.cudaMallocAsync(self.output_host.nbytes, self.resnet_stream)
# 异步推理
def process_async(self, input_ids_host):
# do a complete inference
input_ids_host = input_ids_host.astype(np.int32)
cudart.cudaMemcpyAsync(self.input_ids_device, input_ids_host.ctypes.data, \
input_ids_host.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.resnet_stream)
self.resnet_context.execute_async_v2([int(self.input_ids_device), int(self.ouput_device)], self.resnet_stream)
cudart.cudaMemcpyAsync(self.output_host.ctypes.data, self.ouput_device, \
self.output_host.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, self.resnet_stream)
cudart.cudaStreamSynchronize(self.resnet_stream)
return self.output_host
# 同步推理
def process(self, input_data):
input_data = torch.from_numpy(input_data).float().to("cuda")
buffer_deivces = []
buffer_deivces.append(input_data.reshape(-1).data_ptr())
buffer_deivces.append(self.resnet_output.reshape(-1).data_ptr())
self.resnet_context.execute_v2(buffer_deivces)
return self.resnet_output.cpu().numpy()
# 多线程测试样例
class TestMP:
def __init__(self):
self.model_path = "../models/resnet50_fp16.trt"
self.trt = TRT(self.model_path)
# self.data = np.random.randn(1,3,224,224)
self.mp_trt = mp.Process(target=self.infer)
self.mp_trt.start()
def __del__(self):
self.mp_trt.join()
def infer(self):
data = np.random.randn(1,3,224,224)
output = self.trt.process_async(data)
print(output.shape)
if __name__ == "__main__":
test = TestMP()
Does the above code work if you don't use mp? looks more like a usage issue to me.
Does the above code work if you don't use mp? looks more like a usage issue to me.
The above code alse work without mp. the code is below:
import tensorrt as trt
from cuda import cudart
import torch
import numpy as np
import timeit
import multiprocessing as mp
# 1. resnet trt推理模型
class TRT():
def __init__(self, trt_path):
self.trt_path = trt_path
trt.init_libnvinfer_plugins(None, "")
self.logger = trt.Logger(trt.Logger.ERROR)
with open(self.trt_path, "rb") as f:
resnet_engine_string = f.read()
if resnet_engine_string == None:
print("Failed getting serialized engine!")
return
self.resnet_engine = trt.Runtime(self.logger).deserialize_cuda_engine(resnet_engine_string)
# 创建self.context && 分配内存
self.resnet_nIO = self.resnet_engine.num_io_tensors
self.resnet_lTensorName = [self.resnet_engine.get_tensor_name(i) for i in range(self.resnet_nIO)]
self.resnet_nInput = [self.resnet_engine.get_tensor_mode(self.resnet_lTensorName[i]) for i in range(self.resnet_nIO)].count(trt.TensorIOMode.INPUT)
self.resnet_context = self.resnet_engine.create_execution_context()
# 设置输入的尺寸
self.resnet_context.set_input_shape(self.resnet_lTensorName[0], [1, 3, 224, 224])
for i in range(self.resnet_nIO):
print("[%2d]%s->" % (i, "Input " if i < self.resnet_nInput else "Output"), self.resnet_engine.get_tensor_dtype(self.resnet_lTensorName[i]), \
self.resnet_engine.get_tensor_shape(self.resnet_lTensorName[i]), self.resnet_context.get_tensor_shape(self.resnet_lTensorName[i]), self.resnet_lTensorName[i])
## for sync infer
self.resnet_input_device = torch.zeros(1, 3, 224, 224, dtype=torch.float32).to("cuda")
self.resnet_output = torch.zeros(1, 1000, dtype=torch.float32).to("cuda")
## for async infer
_, self.resnet_stream = cudart.cudaStreamCreate()
dummy_input_data = np.random.rand(1 * 3 * 224 * 224).astype(np.int32).reshape(1, 3, 224, 224)
dummy_input_data = np.ascontiguousarray(dummy_input_data.reshape(-1))
self.output_host = np.empty(self.resnet_context.get_binding_shape(1), \
dtype=trt.nptype(self.resnet_engine.get_binding_dtype(1)))
_, self.input_ids_device = cudart.cudaMallocAsync(dummy_input_data.nbytes, self.resnet_stream)
_, self.ouput_device = cudart.cudaMallocAsync(self.output_host.nbytes, self.resnet_stream)
# 异步推理
def process_async(self, input_ids_host):
# do a complete inference
input_ids_host = input_ids_host.astype(np.int32)
cudart.cudaMemcpyAsync(self.input_ids_device, input_ids_host.ctypes.data, \
input_ids_host.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.resnet_stream)
self.resnet_context.execute_async_v2([int(self.input_ids_device), int(self.ouput_device)], self.resnet_stream)
cudart.cudaMemcpyAsync(self.output_host.ctypes.data, self.ouput_device, \
self.output_host.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, self.resnet_stream)
cudart.cudaStreamSynchronize(self.resnet_stream)
return self.output_host
# 同步推理
def process(self, input_data):
input_data = torch.from_numpy(input_data).float().to("cuda")
buffer_deivces = []
buffer_deivces.append(input_data.reshape(-1).data_ptr())
buffer_deivces.append(self.resnet_output.reshape(-1).data_ptr())
self.resnet_context.execute_v2(buffer_deivces)
return self.resnet_output.cpu().numpy()
def test_speed(model_path = "../models/resnet50_fp16.trt"):
model = TRT(model_path)
# data = torch.randn(1,3,224,224).float()
data = np.random.randn(1,3,224,224)
# 1. 同步推理
for i in range(10):
output = model.process(data)
# 测试推理时间
# onnx_fp32: 5ms
# trt_fp32: 6.5ms
# trt_fp16: 2.1ms
all_time = timeit.timeit(lambda:model.process(data), number = 10)
print("execut time = {:.2}ms".format(all_time / 10 * 1000))
# print(output.cpu())
# 2. 异步推理(比同步要快)
for i in range(10):
output = model.process_async(data)
# trt_fp32: 6.0ms
# trt_fp16: 2.0ms
all_time = timeit.timeit(lambda:model.process_async(data), number = 10)
print("async execut time = {:.2}ms".format(all_time / 10 * 1000))
if __name__ == "__main__":
test_speed()
and the result below:
[ 0]Input -> DataType.FLOAT (1, 3, 224, 224) (1, 3, 224, 224) input
[ 1]Output-> DataType.FLOAT (1, 1000) (1, 1000) output
trt_demo.py:40: DeprecationWarning: Use get_tensor_shape instead.
self.output_host = np.empty(self.resnet_context.get_binding_shape(1), \
trt_demo.py:41: DeprecationWarning: Use get_tensor_dtype instead.
dtype=trt.nptype(self.resnet_engine.get_binding_dtype(1)))
execut time = 2.4ms
async execut time = 2.3ms
The above code alse work without mp.
So there is no problem if you don't use mp. Could you please try don't use mp package but open several terminal and launch multiple processes?
closing since no activity for more than 3 weeks, pls reopen if you still have question. Thanks!