
(pytorch) onnx is slower than pytorch

SoraJung opened this issue 1 year ago • 12 comments

Describe the bug: I trained on my custom dataset with rtdetr_r101vd_6x_coco_custom.yml. However, I found that ONNX inference is about three times slower than PyTorch. I simply ran export_onnx.py from this repo and saved model.onnx. Please review my inference code below, which is adapted from an earlier issue.

result

  1. onnx
python ./tools/predict_onnx.py -i ./images/D16030_196_Add00407.jpg

torch.Size([1, 3, 640, 640])
Inference time = 0.421980619430542 s
FPS = 2.3697770796902677
  2. pytorch
python ./tools/predict_pytorch.py -c ./configs/rtdetr/rtdetr_r101vd_6x_coco_custom.yml -w ../output/rtdetr_r101vd_6x_coco_custom/checkpoint0004.pth -i ./images/D16030_196_Add00407.jpg

Load PResNet101 state_dict
Inference time = 0.15229344367980957 s
FPS = 6.566270850782369
  3. pytorch inference code
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import argparse
from pathlib import Path
import time

import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image, ImageDraw

from src.core import YAMLConfig  # YAMLConfig from the repo's src package

class ImageReader:
    def __init__(self, resize=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
        self.transform = transforms.Compose([
            # transforms.Resize((resize, resize)) if isinstance(resize, int) else transforms.Resize(
            #     (resize[0], resize[1])),
            transforms.ToTensor(),
            # transforms.Normalize(mean=mean, std=std),
        ])
        self.resize = resize
        self.pil_img = None   

    def __call__(self, image_path, *args, **kwargs):
        self.pil_img = Image.open(image_path).convert('RGB').resize((self.resize, self.resize))
        return self.transform(self.pil_img).unsqueeze(0)


class Model(nn.Module):
    def __init__(self, config=None, ckpt="") -> None:
        super().__init__()
        self.cfg = YAMLConfig(config, resume=ckpt)
        if ckpt:
            checkpoint = torch.load(ckpt, map_location='cpu') 
            if 'ema' in checkpoint:
                state = checkpoint['ema']['module']
            else:
                state = checkpoint['model']
        else:
            raise AttributeError('only support resume to load model.state_dict by now.')

        # NOTE load train mode state -> convert to deploy mode
        self.cfg.model.load_state_dict(state)

        self.model = self.cfg.model.deploy()
        self.postprocessor = self.cfg.postprocessor.deploy()
        # print(self.postprocessor.deploy_mode)
        
    def forward(self, images, orig_target_sizes):
        outputs = self.model(images)
        return self.postprocessor(outputs, orig_target_sizes)



def get_argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", '-c', type=str, )
    parser.add_argument("--ckpt", '-w', type=str, ) # pth
    parser.add_argument("--image", '-i', type=str, ) 
    parser.add_argument("--device", default="cuda:1")

    return parser


def main(args):
    img_path = Path(args.image)
    device = torch.device(args.device)
    reader = ImageReader(resize=640)
    model = Model(config=args.config, ckpt=args.ckpt)
    model.to(device=device)

    img = reader(img_path).to(device)
    size = torch.tensor([[img.shape[2], img.shape[3]]]).to(device)
    
    start_time = time.time()
    output = model(img, size)
    inf_time = time.time() - start_time
    fps = float(1/inf_time)
    print("Inferece time = {} s".format(inf_time, '.4f'))
    print("FPS = {} ".format(fps, '.1f') )
    
    labels, boxes, scores = output
    
    im = reader.pil_img
    draw = ImageDraw.Draw(im)
    thrh = 0.6

    for i in range(img.shape[0]):

        scr = scores[i]
        lab = labels[i][scr > thrh]
        box = boxes[i][scr > thrh]

        for j, b in enumerate(box):
            draw.rectangle(list(b), outline='red')
            draw.text((b[0], b[1]), text=str(lab[j].item()), fill='blue')

    # save_path = Path(args.output_dir) / img_path.name
    file_dir = os.path.dirname(args.image)
    new_file_name = os.path.basename(args.image).split('.')[0] + '_torch'+ os.path.splitext(args.image)[1]
    new_file_path = file_dir + '/' + new_file_name
    print('new_file_path: ', new_file_path)
    im.save(new_file_path)
 

if __name__ == "__main__":
    main(get_argparser().parse_args())
  4. onnx inference code
import os 
import sys
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import torch
import onnxruntime as ort
from PIL import Image, ImageDraw, ImageFont
from torchvision.transforms import ToTensor
import argparse
import time

def main(args, ):
    im = Image.open(args.img).convert('RGB')
    im = im.resize((640, 640))
    im_data = ToTensor()(im)[None]
    # (width, height) = im.size
    print(im_data.shape)
    # print(width, height)
    # size = torch.tensor([[width, height]])
    size = torch.tensor([[640, 640]])
    sess = ort.InferenceSession(args.model)
    
    start_time = time.time()
    output = sess.run(
        # output_names=['labels', 'boxes', 'scores'],
        output_names=None,
        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()}        
    )
    end_time = time.time()
    # inf_time = time.time() - start_time
    inf_time = end_time - start_time
    fps = float(1/inf_time)
    print("Inferece time = {} s".format(inf_time, '.4f'))
    print("FPS = {} ".format(fps, '.1f') )
    #print(type(output))
    #print([out.shape for out in output])

    labels, boxes, scores = output
    
    draw = ImageDraw.Draw(im)  # Draw on the original image
    thrh = 0.6

    for i in range(im_data.shape[0]):

        scr = scores[i]
        lab = labels[i][scr > thrh]
        box = boxes[i][scr > thrh]

        #print(i, sum(scr > thrh))

        for j, b in enumerate(box):
            draw.rectangle(list(b), outline='red')
            # font = ImageFont.truetype("Arial.ttf", 15)
            draw.text((b[0], b[1]), text=str(lab[j].item()), fill='yellow')

    # Save the original image with bounding boxes
    file_dir = os.path.dirname(args.img)
    new_file_name = os.path.basename(args.img).split('.')[0] + '_onnx'+ os.path.splitext(args.img)[1]
    new_file_path = file_dir + '/' + new_file_name
    print('new_file_path: ', new_file_path)
    im.save(new_file_path)
 

if __name__ == '__main__':
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--img', '-i', type=str, )
    parser.add_argument('--model', '-m', type=str, default='model.onnx')

    args = parser.parse_args()

    main(args)

SoraJung avatar Jun 20 '24 01:06 SoraJung

Please check that your onnxruntime is using GPU.

# pip install onnxruntime-gpu

import onnxruntime as ort

print(ort.get_device())
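
It may also be worth checking which providers the session itself registered, since ort.get_device() only reports what the installed build supports. A minimal sketch of that extra check (assuming the exported file is model.onnx):

import onnxruntime as ort

# Ask for CUDA first and fall back to CPU if it is unavailable.
sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# If "CUDAExecutionProvider" is missing here, the session silently fell back to CPU.
print(sess.get_providers())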

lyuwenyu avatar Jun 20 '24 03:06 lyuwenyu

Thank you for your prompt response. I tried your suggestion, but it is much slower than before. Please help...

  1. updated code
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    sess_options = ort.SessionOptions()
    sess = ort.InferenceSession(args.model, sess_options=sess_options, providers=providers)
    
    start_time = time.time()
    output = sess.run(
        # output_names=['labels', 'boxes', 'scores'],
        output_names=None,
        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()}        
    )
    end_time = time.time()
    # inf_time = time.time() - start_time
    inf_time = end_time - start_time
    fps = float(1/inf_time)
    print("Inferece time = {:.4f} s".format(inf_time))
    print("FPS = {:.2f} ".format(fps))
  2. result
python ./tools/predict_onnx.py -i ./images/D16030_196_Add00407.jpg

ort.get_device() GPU
torch.Size([1, 3, 640, 640])
Inference time = 19.2355 s
FPS = 0.05

SoraJung avatar Jun 20 '24 05:06 SoraJung

Now I found that different provider options affect FPS. FPS increased by 2 (6.5 -> 8.5). But I'm not sure whether this is right. How about Paddle? Is ONNX exported from Paddle faster than ONNX exported from PyTorch?

  1. code
providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}), "CPUExecutionProvider"]
sess_options = ort.SessionOptions()
sess_options.enable_profiling = True
sess = ort.InferenceSession(args.model, sess_options=sess_options, providers=providers)
  2. result
python ./tools/predict_onnx.py -i ./images/D16030_196_Add00407.jpg

ort.get_device() GPU
torch.Size([1, 3, 640, 640])
Inference time = 0.1174 s
FPS = 8.52
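
One further thing that sometimes reduces per-call overhead on GPU is IOBinding, which gives explicit control over where inputs and outputs are allocated and lets buffers be reused across runs. A rough, self-contained sketch (the I/O names match the model exported above; the dtypes are my assumption: float32 images and int64 sizes, as produced by ToTensor and torch.tensor):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# Dummy inputs with the shapes/dtypes the exported model expects.
images = np.random.rand(1, 3, 640, 640).astype(np.float32)
orig_sizes = np.array([[640, 640]], dtype=np.int64)

binding = sess.io_binding()
binding.bind_cpu_input("images", images)
binding.bind_cpu_input("orig_target_sizes", orig_sizes)
for name in ("labels", "boxes", "scores"):
    binding.bind_output(name)  # let ORT allocate and reuse the output buffers

sess.run_with_iobinding(binding)
labels, boxes, scores = binding.copy_outputs_to_cpu()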

SoraJung avatar Jun 20 '24 06:06 SoraJung

    start_time = time.time()
    output = sess.run(
        # output_names=['labels', 'boxes', 'scores'],
        output_names=None,
        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()}        
    )
    end_time = time.time()

I think you can run this piece of code several times, then compute average time.

tic = time.time()
for _ in range(N):
    # code

average_time = (time.time() - tic) / N
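
A rough, self-contained version of that idea for the ONNX session, with a few warm-up runs excluded from the average (model.onnx and the input names/dtypes from the script above are assumed):

import time

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
feed = {
    "images": np.random.rand(1, 3, 640, 640).astype(np.float32),
    "orig_target_sizes": np.array([[640, 640]], dtype=np.int64),
}

# Warm-up: the first calls pay for CUDA context setup, allocations, and algorithm search.
for _ in range(3):
    sess.run(None, feed)

N = 50
tic = time.time()
for _ in range(N):
    sess.run(None, feed)
average_time = (time.time() - tic) / N
print("average time = {:.4f} s, FPS = {:.1f}".format(average_time, 1.0 / average_time))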

lyuwenyu avatar Jun 20 '24 06:06 lyuwenyu

Thanks for your advice! I solved the problem. I modified the code to loop over an image directory instead of a single image. For 10 images, PyTorch averages 23.45 FPS and ONNX averages 28.32 FPS!

  1. pytorch
python ./tools/predict_pytorch.py -c ./configs/rtdetr/rtdetr_r101vd_6x_coco_custom.yml -w ../output/rtdetr_r101vd_6x_coco_custom/checkpoint0004.pth -i ./images/input

img_path: images/input/D16030_196_Add00407.jpg, inf_time: 0.1581, FPS: 6.32
new_file_path:  images/output/D16030_196_Add00407_torch.jpg
================================================================================
Load PResNet101 state_dict
img_path: images/input/aihub3.jpg, inf_time: 0.0446, FPS: 22.40
new_file_path:  images/output/aihub3_torch.jpg
================================================================================
Load PResNet101 state_dict
img_path: images/input/D16030_196_Add00407_1.jpg, inf_time: 0.0430, FPS: 23.26
new_file_path:  images/output/D16030_196_Add00407_1_torch.jpg
================================================================================
.
.
All images count: 10
Average Inference time = 0.0426 s
Average FPS = 23.45
  2. onnx
python ./tools/predict_onnx.py -i ./images/input/

img_path: ./images/input//D16030_196_Add00407.jpg, inf_time: 0.1257, FPS: 7.95
new_file_path:  images/output/D16030_196_Add00407_onnx.jpg
================================================================================
img_path: ./images/input//aihub3.jpg, inf_time: 0.0415, FPS: 24.09
new_file_path:  images/output/aihub3_onnx.jpg
================================================================================
img_path: ./images/input//D16030_196_Add00407_1.jpg, inf_time: 0.0414, FPS: 24.13
new_file_path:  images/output/D16030_196_Add00407_1_onnx.jpg
================================================================================
img_path: ./images/input//ytb_SterlingT_Suwon_0_000044_1.jpg, inf_time: 0.0415, FPS: 24.12
new_file_path:  images/output/ytb_SterlingT_Suwon_0_000044_1_onnx.jpg
================================================================================
img_path: ./images/input//aihub1.jpg, inf_time: 0.0354, FPS: 28.25
new_file_path:  images/output/aihub1_onnx.jpg
================================================================================
img_path: ./images/input//aihub.jpg, inf_time: 0.0352, FPS: 28.44
new_file_path:  images/output/aihub_onnx.jpg
.
.
All images count: 10
Average Inference time = 0.0353 s
Average FPS = 28.32

SoraJung avatar Jun 20 '24 08:06 SoraJung

@SoraJung Could you please send me the complete modified prediction codes again? I may need your codes. Thanks.

DaCheng1823 avatar Jun 21 '24 06:06 DaCheng1823

@SoraJung Could you please send me the complete modified prediction codes again? I may need your codes. Thanks.

Here is the final code. Please check it ^__^

import os 
import sys
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import torch
import onnxruntime as ort
from PIL import Image, ImageDraw, ImageFont
from torchvision.transforms import ToTensor
import argparse
import time
from pathlib import Path

def read_img(img_path):
    im = Image.open(img_path).convert('RGB')
    im = im.resize((640, 640))
    im_data = ToTensor()(im)[None]
    # (width, height) = im.size
    # print(im_data.shape)
    # print(width, height)
    # size = torch.tensor([[width, height]])
    size = torch.tensor([[640, 640]])
    return im, im_data, size

def createDirectory(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print("Error: Failed to create the directory.")


def main(args, ):
    
    print("ort.get_device()", ort.get_device())
    providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}), "CPUExecutionProvider"]
    sess_options = ort.SessionOptions()
    sess_options.enable_profiling = True
    sess = ort.InferenceSession(args.model, sess_options=sess_options, providers=providers)
    
    img_path_list = []
    possible_img_extension = ['.jpg', '.jpeg', '.JPG', '.bmp', '.png']  # allowed image extensions
    for (root, dirs, files) in os.walk(args.img):
        if len(files) > 0:
            for file_name in files:
                if os.path.splitext(file_name)[1] in possible_img_extension:
                    img_path = root + '/' + file_name     
                    img_path_list.append(img_path)
    
    all_inf_time = []
    for img_path in img_path_list:
        im, im_data, size = read_img(img_path) 
        
        tic = time.time()
        output = sess.run(
            # output_names=['labels', 'boxes', 'scores'],
            output_names=None,
            input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()}        
        )
        inf_time = time.time() - tic
        fps = float(1/inf_time)
        print('img_path: {}, inf_time: {:.4f}, FPS: {:.2f}'.format(img_path, inf_time, fps))
        all_inf_time.append(inf_time)
        
        #print(type(output))
        #print([out.shape for out in output])

        labels, boxes, scores = output
    
        draw = ImageDraw.Draw(im)  # Draw on the original image
        thrh = 0.6

        for i in range(im_data.shape[0]):

            scr = scores[i]
            lab = labels[i][scr > thrh]
            box = boxes[i][scr > thrh]

            #print(i, sum(scr > thrh))

            for j, b in enumerate(box):
                draw.rectangle(list(b), outline='red')
                # font = ImageFont.truetype("Arial.ttf", 15)
                draw.text((b[0], b[1]), text=str(lab[j].item()), fill='yellow')

        # Save the original image with bounding boxes
        file_dir = Path(img_path).parent.parent / 'output'
        createDirectory(file_dir)
        new_file_name = os.path.basename(img_path).split('.')[0] + '_onnx'+ os.path.splitext(img_path)[1]
        new_file_path = file_dir / new_file_name
        print('new_file_path: ', new_file_path)
        print("================================================================================")
        im.save(new_file_path)
    
    avr_time = sum(all_inf_time) / len(img_path_list)
    avr_fps = float(1/avr_time)
    print('All images count: {}'.format(len(img_path_list)))
    print("Average Inferece time = {:.4f} s".format(inf_time))
    print("Average FPS = {:.2f} ".format(fps))
 
if __name__ == '__main__':
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--img', '-i', type=str, )  # dir 
    parser.add_argument('--model', '-m', type=str, default='model.onnx')

    args = parser.parse_args()

    main(args)

SoraJung avatar Jun 22 '24 03:06 SoraJung

 root@a306fea1701c:/RT-DETR/rtdetr_pytorch# python tools/predict_onnx.py -i configs/dataset/opixray/train/train_image/ -m model.onnx
ort.get_device() GPU
img_path: configs/dataset/opixray/train/train_image//009000.jpg, inf_time: 0.4232, FPS: 2.36
img_path: configs/dataset/opixray/train/train_image//009002.jpg, inf_time: 0.1311, FPS: 7.63
img_path: configs/dataset/opixray/train/train_image//009003.jpg, inf_time: 0.0675, FPS: 14.82
img_path: configs/dataset/opixray/train/train_image//009004.jpg, inf_time: 0.1354, FPS: 7.38
img_path: configs/dataset/opixray/train/train_image//009005.jpg, inf_time: 0.0704, FPS: 14.21
img_path: configs/dataset/opixray/train/train_image//009007.jpg, inf_time: 0.1265, FPS: 7.91
img_path: configs/dataset/opixray/train/train_image//009008.jpg, inf_time: 0.1261, FPS: 7.93
img_path: configs/dataset/opixray/train/train_image//009009.jpg, inf_time: 0.0719, FPS: 13.92
img_path: configs/dataset/opixray/train/train_image//009012.jpg, inf_time: 0.1228, FPS: 8.14
img_path: configs/dataset/opixray/train/train_image//009013.jpg, inf_time: 0.1387, FPS: 7.21
img_path: configs/dataset/opixray/train/train_image//009014.jpg, inf_time: 0.0662, FPS: 15.11
img_path: configs/dataset/opixray/train/train_image//009016.jpg, inf_time: 0.1388, FPS: 7.20
img_path: configs/dataset/opixray/train/train_image//009017.jpg, inf_time: 0.1059, FPS: 9.44
img_path: configs/dataset/opixray/train/train_image//009018.jpg, inf_time: 0.1300, FPS: 7.6

Why, when I use predict_onnx.py to calculate FPS on my dataset, is the FPS unstable and lower than the benchmark?

xyb1314 avatar Jul 18 '24 06:07 xyb1314


@xyb1314 You are using the GPU; it needs a longer warm-up before timings stabilize. When comparing against the benchmark, use the same environment and device settings. I would suggest installing Paddle and converting to TensorRT to get the real FPS. ONNX timings are often confusing, since internal ops are not fully optimized.
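
For reference, a hedged sketch of using the TensorRT execution provider directly from onnxruntime instead of going through Paddle (this assumes an onnxruntime-gpu build that includes the TensorRT EP and that the exported graph is TensorRT-compatible), plus a few warm-up runs before timing:

import onnxruntime as ort

providers = [
    (
        "TensorrtExecutionProvider",
        {
            "trt_fp16_enable": True,          # FP16 engines are usually much faster
            "trt_engine_cache_enable": True,  # cache built engines so the long warm-up is paid once
            "trt_engine_cache_path": "./trt_cache",
        },
    ),
    "CUDAExecutionProvider",
    "CPUExecutionProvider",
]
sess = ort.InferenceSession("model.onnx", providers=providers)

# Warm up before measuring: the first runs include engine building and algorithm search.
for _ in range(5):
    sess.run(None, feed)  # `feed` is the same input dict used in the scripts above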

jyang68sh avatar Aug 08 '24 07:08 jyang68sh

Hello @SoraJung and @lyuwenyu,

I used the inference_onnx code provided above to calculate the inference time. I used rtdetrv2_onnxruntime.py as a reference and added the inference-time calculation for running over a batch of images in a given image directory. The code is as follows:

import torch
import torchvision.transforms as T
import numpy as np 
import onnxruntime as ort 
from PIL import Image, ImageDraw, ImageFont
import time
import os
import glob

# Disable CPU affinity if needed (for ONNX performance on CPU)
os.environ["ORT_DISABLE_CPU_AFFINITY"] = "1"

label_dict = {
    1: "pedestrian",
    2: "people",
    3: "bicycle",
    4: "car",
    5: "van",
    6: "truck",
    7: "tricycle",
    8: "awning-tricycle",
    9: "bus",
    10: "motor",
    11: "others",
}

def draw(images, labels, boxes, scores, thrh=0.6, output_filename=None):
    """Draws bounding boxes and labels on the image."""
    for i, im in enumerate(images):
        draw = ImageDraw.Draw(im)

        # Filter by score threshold
        scr = scores[i]
        lab = labels[i][scr > thrh]
        box = boxes[i][scr > thrh]
        scrs = scores[i][scr > thrh]

        # Draw each bounding box and label
        for j, b in enumerate(box):
            label_id = lab[j].item()
            label_name = label_dict.get(label_id, f"Unknown({label_id})")
            draw.rectangle(list(b), outline='red')
            draw.text((b[0], b[1]), text=f"{label_name}: {round(scrs[j].item(), 2)}", fill='blue')

        # Save the image with a unique filename
        if output_filename:
            im.save(output_filename)

def main(args):
    """Main function to load ONNX model, perform inference, and measure inference time."""
    
    # Load the ONNX model
    sess = ort.InferenceSession(args.onnx_file)

    print("***************Summary*******************")
    print("DEVICE USED: ", ort.get_device())
    #providers = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}), "CPUExecutionProvider"]
    #sess_options = ort.SessionOptions()
    #sess_options.enable_profiling = True
    #sess = ort.InferenceSession(args.onnx_file, providers=["CUDAExecutionProvider"])
    
    # Loop through all image files in the input folder
    image_files = glob.glob(os.path.join(args.images_folder, '*.[jp][pn][g]*'))  # Looks for .jpg, .jpeg, .png files

    #total_inference_time = 0
    all_inf_time = []
    total_images = len(image_files)
    
    # Ensure the output directory exists
    os.makedirs(args.output_dir, exist_ok=True)
    
    for image_path in image_files:
        # Open image
        im_pil = Image.open(image_path).convert('RGB')
        w, h = im_pil.size
        orig_size = torch.tensor([w, h])[None]

        # Define transformations
        transforms = T.Compose([T.Resize((640, 640)), T.ToTensor()])
        im_data = transforms(im_pil)[None]

        # Measure inference time
        #start_time = time.perf_counter()
        tic = time.time()
        
        # Run inference
        output = sess.run(
            output_names=['labels', 'boxes', 'scores'],
            input_feed={'images': im_data.data.numpy(), "orig_target_sizes": orig_size.data.numpy()}
        )

        inf_time = time.time() - tic
        fps = float(1/inf_time)
        all_inf_time.append(inf_time)
        print('img_path: {}, inf_time: {:.4f}, FPS: {:.2f}'.format(image_path, inf_time, fps))
        
        # Get the outputs (labels, boxes, scores)
        labels, boxes, scores = output

        # Generate unique output filename based on the image filename
        base_filename = os.path.basename(image_path)
        output_filename = os.path.join(args.output_dir, f'{os.path.splitext(base_filename)[0]}_inference.jpg')

        # Draw the results and save images
        draw([im_pil], labels, boxes, scores, output_filename=output_filename)

        
        # Measure end time
        #end_time = time.perf_counter()
        
        # Calculate inference time for this image in milliseconds
        #inference_time_ms = (end_time - start_time) * 1000
        #total_inference_time += inference_time_ms
        #print(f"Inference time for {image_path}: {inference_time_ms:.2f} ms")
        
    # Calculate and print the average inference time for all images
    #average_inference_time = total_inference_time / total_images if total_images > 0 else 0
    #print(f"\nAverage Inference Time for {total_images} images: {average_inference_time:.2f} ms")

    #Summary of Inference
    avr_time = sum(all_inf_time) / total_images
    avr_fps = total_images/sum(all_inf_time)
    #print("Total Inference Time: {}".format(sum(all_inf_time)))
    print('All images count: {}'.format(total_images))
    print("Average Inference time: {:.4f}s".format(avr_time))
    #print("Average FPS: {:.2f} ".format(avr_fps))
    print("******************************************************")

if __name__ == '__main__':
    # Parse command-line arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--onnx-file', type=str, required=True, help='Path to ONNX model file')
    parser.add_argument('--images-folder', type=str, required=True, help='Path to folder containing images')
    parser.add_argument('--output-dir', type=str, required=True, help='Path to folder where output images will be saved')
    args = parser.parse_args()

    # Run the main function
    main(args)

It runs successfully; however, the inference time is high and the FPS is very low. A result snippet is below.

(screenshot of the inference results)

Could you please review the code and see what the issue is?

Your help is appreciated.

Regards, Bijay

bshakya77 avatar Jan 29 '25 05:01 bshakya77


@SoraJung Can you share the PyTorch-version inference code with us as well? Thanks!

xsa12345 avatar Sep 09 '25 07:09 xsa12345


@SoraJung Could you please send me the complete modified PyTorch prediction code? I may need it. Thanks.

xsa12345 avatar Sep 09 '25 07:09 xsa12345