mmpose
ONNX inference speed is too slow
I used pytorch2onnx.py to convert res50_coco_256x192.pth to ONNX, and I also converted it to TorchScript. I then ran inference with dynamic-batch inputs for both models, all on GPU with CUDA. The ONNX model was slow: about 530 ms on average at batch size 10 (about 55 ms at batch size 1), while the TorchScript model was fast: about 40 ms at batch size 10 and about 8 ms at batch size 1.
I wonder why ONNX inference is so slow.
pth2pt code:
traces_script_module = torch.jit.trace(model, one_img, check_trace=False)
traces_script_module.save('res50_coco_256x192.pt')
pth2onnx code:
torch.onnx.export(
    model,
    one_img,
    output_file,
    export_params=True,
    keep_initializers_as_inputs=True,
    verbose=show,
    opset_version=opset_version,
    input_names=['input.1'],
    output_names=['output'],
    dynamic_axes={"input.1": {0: "batch_size"}, "output": {0: "batch_size"}},
)
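A minimal sketch for sanity-checking the exported dynamic-batch model with onnxruntime on GPU (assuming onnxruntime-gpu is installed and output_file is the .onnx path passed to torch.onnx.export above):

import numpy as np
import onnxruntime as ort

# 'output_file' is assumed to be the path the model was exported to above.
sess = ort.InferenceSession(output_file, providers=['CUDAExecutionProvider'])
input_name = sess.get_inputs()[0].name
for batch in (1, 10):
    dummy = np.random.rand(batch, 3, 256, 192).astype(np.float32)
    heatmaps = sess.run(None, {input_name: dummy})[0]
    print(batch, heatmaps.shape)  # expected: (batch, 17, 64, 48)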
@liuxufenfeiya Can you please share the inference code used for torchscript?
model = torch.jit.load('res50_coco_256x192.pt', map_location='cuda:0')
output = model.forward(img.cuda())
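One thing to keep in mind when comparing these numbers: CUDA kernels run asynchronously, so wrapping model.forward with time.time() alone can return before the GPU has actually finished. A minimal timing sketch with warm-up and explicit synchronization, assuming the same res50_coco_256x192.pt file and a dummy batch of 10:

import time
import torch

model = torch.jit.load('res50_coco_256x192.pt', map_location='cuda:0')
img = torch.randn(10, 3, 256, 192, device='cuda:0')  # dummy input for illustration

# Warm up so one-time initialization is not counted.
for _ in range(5):
    model(img)
torch.cuda.synchronize()

start = time.time()
output = model(img)
torch.cuda.synchronize()  # wait for the GPU before stopping the clock
print(f'elapsed: {(time.time() - start) * 1000:.1f} ms, output shape: {tuple(output.shape)}')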
@liuxufenfeiya Your shared code will give heatmaps as output, right? Have you used the keypoints_from_heatmap function from mmpose to get the final results?
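For context, decoding keypoints from the heatmap output boils down to a per-channel argmax followed by mapping the peak back to image coordinates; mmpose's post-processing additionally handles the center/scale transform and sub-pixel refinement. A bare-bones NumPy sketch of just the argmax step, assuming heatmaps shaped (N, 17, 64, 48):

import numpy as np

def decode_heatmaps(heatmaps):
    # heatmaps: (N, K, H, W) -> keypoints (N, K, 2) in heatmap coords, plus scores (N, K)
    n, k, h, w = heatmaps.shape
    flat = heatmaps.reshape(n, k, -1)
    idx = flat.argmax(axis=2)
    scores = flat.max(axis=2)
    xs = (idx % w).astype(np.float32)
    ys = (idx // w).astype(np.float32)
    return np.stack([xs, ys], axis=2), scores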
I have tested Lite-HRNet with both batch size 10 and batch size 1.
- Ubuntu 20.04.3 LTS
- Intel(R) Core(TM) i9-10900K CPU @ 3.70GHz
- NVIDIA GeForce RTX 3070
- N batch: litehrnet_18_coco_Nx256x192.onnx.zip
- 1 batch: litehrnet_18_coco_256x192.onnx.zip
- Test Code - TensorRT vs CUDA vs CPU
import onnxruntime as ort
import numpy as np
import time
MODEL_FILE1 = 'litehrnet_18_coco_Nx256x192.onnx'
MODEL_FILE2 = 'litehrnet_18_coco_256x192.onnx'
#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ TensorRT test')
session_option = ort.SessionOptions()
session_option.log_severity_level = 4
# Batch inference x1
onnx_session = ort.InferenceSession(
    MODEL_FILE1,
    sess_options=session_option,
    providers=[
        (
            'TensorrtExecutionProvider', {
                'trt_engine_cache_enable': True,
                'trt_fp16_enable': True,
            }
        ),
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Batch inference x1')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([10, 3, 256, 192]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
results = onnx_session.run(
    None,
    {input_name: np.ones(([10, 3, 256, 192]), dtype=np.float32)},
)
print(f'results.shape: {results[0].shape}')
print(f'elapsed time: {(time.time()-start)*1000} ms')
print()
# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        (
            'TensorrtExecutionProvider', {
                'trt_engine_cache_enable': True,
                'trt_fp16_enable': True,
            }
        ),
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([1, 3, 256, 192]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, 256, 192]), dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
print(f'elapsed time: {(time.time()-start)*1000} ms')
print()
#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CUDA test')
session_option1 = ort.SessionOptions()
session_option1.log_severity_level = 4
session_option1.optimized_model_filepath = f"{MODEL_FILE1}_cudaopt.onnx"
session_option1.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
session_option2 = ort.SessionOptions()
session_option2.log_severity_level = 4
session_option2.optimized_model_filepath = f"{MODEL_FILE2}_cudaopt.onnx"
session_option2.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
# Batch inference x1
onnx_session = ort.InferenceSession(
    MODEL_FILE1,
    sess_options=session_option1,
    providers=[
        'CUDAExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Batch inference x1')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([10, 3, 256, 192]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
results = onnx_session.run(
    None,
    {input_name: np.ones(([10, 3, 256, 192]), dtype=np.float32)},
)
print(f'results.shape: {results[0].shape}')
print(f'elapsed time: {(time.time()-start)*1000} ms')
print()
# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option2,
    providers=[
        'CUDAExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([1, 3, 256, 192]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, 256, 192]), dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
print(f'elapsed time: {(time.time()-start)*1000} ms')
print()
#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CPU test')
session_option = ort.SessionOptions()
session_option.log_severity_level = 4
# Batch inference x1
onnx_session = ort.InferenceSession(
    MODEL_FILE1,
    sess_options=session_option,
    providers=[
        'CPUExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Batch inference x1')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([10, 3, 256, 192]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
results = onnx_session.run(
    None,
    {input_name: np.ones(([10, 3, 256, 192]), dtype=np.float32)},
)
print(f'results.shape: {results[0].shape}')
print(f'elapsed time: {(time.time()-start)*1000} ms')
print()
# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        'CPUExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([1, 3, 256, 192]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, 256, 192]), dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
print(f'elapsed time: {(time.time()-start)*1000} ms')
print()
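A note on the timing: each elapsed time is measured from a single timed run after one warm-up call, so individual numbers can be a bit noisy; averaging over repeated runs gives a steadier estimate. A small sketch, reusing the onnx_session and input_name already in scope:

N_RUNS = 50
start = time.time()
for _ in range(N_RUNS):
    onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, 256, 192]), dtype=np.float32)},
    )
print(f'avg elapsed time: {(time.time() - start) * 1000 / N_RUNS} ms')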
- Results
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ TensorRT test
@@@@@@@@@@ Batch inference x1
input.shape: ['-1', 3, 256, 192]
results.shape: (10, 17, 64, 48)
elapsed time: 9.397268295288086 ms
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 192]
results.shape: (1, 17, 64, 48)
elapsed time: 34.825801849365234 ms
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CUDA test
@@@@@@@@@@ Batch inference x1
input.shape: ['-1', 3, 256, 192]
results.shape: (10, 17, 64, 48)
elapsed time: 24.973392486572266 ms
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 192]
results.shape: (1, 17, 64, 48)
elapsed time: 79.75530624389648 ms
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CPU test
@@@@@@@@@@ Batch inference x1
input.shape: ['-1', 3, 256, 192]
results.shape: (10, 17, 64, 48)
elapsed time: 239.66598510742188 ms
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 192]
results.shape: (1, 17, 64, 48)
elapsed time: 213.9906883239746 ms
@liuxufenfeiya Can you please share the code you used to convert it into torchscript?