super-gradients copied to clipboard
TensorRT doesn't show predictions YOLO-NAS-POSE
💡 Your Question
I'm trying to run inference with a TensorRT (TRT) build of the YOLO-NAS-POSE model. I exported the model to ONNX as shown in the documentation:
export_result = yolo_nas_pose_s.export("yolo_nas_pose_s.onnx")
and then I got the .engine model with this command:
trtexec --explicitBatch --onnx=yolo_nas_pose_s.onnx --saveEngine=yolo_nas_pose_s_batch.engine
The ONNX model works fine, but with the .engine model I only get the input image back, with no predictions drawn on it.
This is my python code:
import tensorrt as trt
import numpy as np
import torch
from PIL import Image
import cv2
import time
from super_gradients.training.utils.visualization.pose_estimation import PoseVisualization
from collections import namedtuple
def iterate_over_batch_predictions(predictions, batch_size):
    """Yield the valid detections of each image from a batched model output.

    Args:
        predictions: 4-tuple ``(num_detections, batch_boxes, batch_scores,
            batch_joints)``. ``num_detections`` is indexed as
            ``[image_index, 0]``; the other three are indexed by image on
            axis 0 and by detection on axis 1.
        batch_size: Number of images (leading-axis entries) to iterate over.

    Yields:
        ``(image_index, pred_boxes, pred_scores, pred_joints)`` where each
        prediction array is truncated to the detections reported for that
        image and ``pred_joints`` has shape ``(detections, joints, 3)``.
    """
    num_detections, batch_boxes, batch_scores, batch_joints = predictions
    print(num_detections.shape, batch_boxes.shape, batch_scores.shape, batch_joints.shape)

    # Work out the joint count from the batch tensor itself instead of asking
    # reshape() to infer it with -1: inference of -1 fails on an empty array,
    # which is exactly what an image with zero detections produces.
    values_per_detection = 1
    for dim in batch_joints.shape[2:]:
        values_per_detection *= int(dim)
    num_joints = values_per_detection // 3

    for image_index in range(batch_size):
        num_detection_in_image = int(num_detections[image_index, 0])
        pred_scores = batch_scores[image_index, :num_detection_in_image]
        pred_boxes = batch_boxes[image_index, :num_detection_in_image]
        pred_joints = batch_joints[image_index, :num_detection_in_image].reshape(
            (len(pred_scores), num_joints, 3)
        )
        yield image_index, pred_boxes, pred_scores, pred_joints
def get_predictions_from_batch_format(image, predictions):
    """Draw the pose predictions of the first batch element onto ``image``.

    Args:
        image: Image passed through to ``PoseVisualization.draw_poses``.
            NOTE(review): the predicted coordinates are presumably expressed
            in the frame of the tensor fed to the model, so ``image`` should
            have that same size — confirm against the caller.
        predictions: Batched model output accepted by
            ``iterate_over_batch_predictions``.

    Returns:
        Whatever ``PoseVisualization.draw_poses`` returns for the first image.
    """
    # Batch size is 1 here, so only the first element of the batch is consumed.
    # The helper is a generator, so next() can be applied to it directly.
    _, pred_boxes, pred_scores, pred_joints = next(iterate_over_batch_predictions(predictions, 1))

    image = PoseVisualization.draw_poses(
        image=image,
        poses=pred_joints,
        scores=pred_scores,
        boxes=pred_boxes,
        edge_links=None,
        edge_colors=None,
        keypoint_colors=None,
        is_crowd=None,
    )
    return image
def load_engine(engine_file):
    """Deserialize a TensorRT engine from a serialized ``.engine`` file.

    Args:
        engine_file: Path to the serialized engine on disk.

    Returns:
        The deserialized TensorRT engine.

    Raises:
        RuntimeError: If TensorRT could not deserialize the file (for example
            an engine built with a different TensorRT version or GPU).
    """
    with open(engine_file, "rb") as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # deserialize_cuda_engine signals failure by returning None; fail here
    # rather than with an AttributeError at first use.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine from {engine_file!r}")
    return engine
def inference(engine, image, device):
    """Run a single image through a TensorRT engine and return its outputs.

    The image is resized to 640x640 (aspect ratio is not preserved), converted
    to a ``uint8`` tensor laid out as ``(1, channels, 640, 640)`` and bound to
    every input tensor of the engine. One device buffer is allocated per
    output tensor.

    Args:
        engine: Deserialized TensorRT engine.
        image: ``HxWxC`` image array, as produced by ``cv2.imread``.
        device: Torch device string for the buffers (e.g. ``"cuda"``).

    Returns:
        List of NumPy arrays, one per engine output, in engine tensor order.
        Coordinates in the outputs refer to the 640x640 resized image.

    Raises:
        RuntimeError: If the engine reports that execution failed.
    """
    resized = cv2.resize(image, (640, 640))
    chw = np.transpose(resized, (2, 0, 1)).astype(np.uint8)
    batch = np.expand_dims(chw, axis=0)
    # TensorRT only receives a raw pointer and reads it as densely packed
    # NCHW. astype() keeps the memory layout of its (transposed) input by
    # default, so the data must be made C-contiguous explicitly; otherwise
    # the engine sees the pixels in HWC order and produces no detections.
    input_tensor = torch.from_numpy(np.ascontiguousarray(batch)).to(device)

    Binding = namedtuple("Binding", ["data", "ptr"])
    bindings = {}
    output_names = []

    start = time.perf_counter()
    with engine.create_execution_context() as context:
        # Use the tensor-name API throughout (the original mixed it with the
        # deprecated binding-index calls).
        for index in range(engine.num_io_tensors):
            name = engine.get_tensor_name(index)
            if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                data = input_tensor
            else:
                dtype = trt.nptype(engine.get_tensor_dtype(name))
                shape = tuple(engine.get_tensor_shape(name))
                # Zero-initialised so a skipped write reads as "0 detections"
                # rather than uninitialised memory.
                data = torch.from_numpy(np.zeros(shape, dtype=dtype)).to(device)
                output_names.append(name)
            # Memory address handed to TensorRT for this tensor.
            bindings[name] = Binding(data, data.data_ptr())

        # Dict order equals engine tensor order, which execute_v2 expects.
        ptrs = [binding.ptr for binding in bindings.values()]
        if not context.execute_v2(ptrs):
            raise RuntimeError("TensorRT execution failed")

    # Copy every output buffer back to host memory.
    predictions = [bindings[name].data.cpu().numpy() for name in output_names]

    exec_cost = time.perf_counter() - start
    print(f"Execution time: {exec_cost:.2f} s")
    return predictions
# Script entry: load the engine, run one image through it and save the result.
engine_file = "yolo_nas_pose_s_batch.engine"
device = "cuda"
engine = load_engine(engine_file)

image_path = "image.jpg"
image = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

predictions = inference(engine, image, device)

# inference() resizes the image to 640x640 before feeding the engine, so the
# predicted coordinates live in that frame; draw on an image of the same size
# so boxes and joints land on the right pixels.
img = get_predictions_from_batch_format(cv2.resize(image, (640, 640)), predictions)
img = Image.fromarray(img, mode='RGB')
img.save('myimage.png')
Collecting environment information... PyTorch version: 2.2.0+cu118 Is debug build: False CUDA used to build PyTorch: 11.8 ROCM used to build PyTorch: N/A
OS: Microsoft Windows 10 Enterprise LTSC GCC version: Could not collect Clang version: Could not collect CMake version: Could not collect Libc version: N/A
Python version: 3.8.0 (tags/v3.8.0:fa919fd, Oct 14 2019, 19:37:50) [MSC v.1916 64 bit (AMD64)] (64-bit runtime) Python platform: Windows-10-10.0.17763-SP0 Is CUDA available: True CUDA runtime version: 11.8.89 CUDA_MODULE_LOADING set to: LAZY GPU models and configuration: GPU 0: NVIDIA GeForce GTX 1650 Nvidia driver version: 551.52 cuDNN version: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\cudnn_ops_train64_8.dll HIP runtime version: N/A MIOpen runtime version: N/A Is XNNPACK available: True
CPU: Architecture=9 CurrentClockSpeed=1992 DeviceID=CPU0 Family=198 L2CacheSize=2048 L2CacheSpeed= Manufacturer=GenuineIntel MaxClockSpeed=1992 Name=Intel(R) Core(TM) i7-10700TE CPU @ 2.00GHz ProcessorType=3 Revision=
Versions of relevant libraries: [pip3] numpy==1.23.0 [pip3] onnx==1.13.0 [pip3] onnx-graphsurgeon==0.3.12 [pip3] onnxruntime==1.13.1 [pip3] onnxsim==0.4.35 [pip3] super-gradients==3.6.0 [pip3] torch==2.2.0+cu118 [pip3] torchaudio==2.2.0+cu118 [pip3] torchmetrics==0.8.0 [pip3] torchvision==0.17.0 [conda] Could not collect