Observing higher D-FINE (nano) latency compared to YOLOv8n
I am trying to verify the lower latency of D-FINE models relative to YOLO models, which is what I understand the left-most figure on the paper's first page to show: lower latency for D-FINE than for the corresponding YOLO models (v8 included), at least for the points plotted.
My results instead show D-FINE running several times slower. Am I wrong to assume, based on the paper and its tables, that I should expect lower inference times than YOLOv8 (possibly including pre-processing and post-processing) with the corresponding nano or small checkpoint?
Measurements (on an RTX 3060)
To obtain these measurements, I first perform several warmup inferences using a COCO validation set image. Then, I measure the inference time on a different image, ensuring that the model is already loaded and initialized. I observed a discrepancy in the inference times between D-FINE-N and YOLOv8n when measured using custom benchmarking scripts (detailed later) on both GPU and CPU. Indicative inference times are as follows:
| Model | GPU | CPU |
|---|---|---|
| D-FINE-N | 15 ms | 77 ms |
| YOLOv8n | 6 ms | 29 ms |
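For reference, a more controlled way to time both models would be to average over many runs and synchronize the GPU before reading the clock. The helper below is just a sketch and is not part of the scripts that follow; `run_once` would wrap a single call to whichever model is being measured:

import time
import torch


def average_latency(run_once, device, n_runs=100, n_warmup=10):
    """Average wall-clock latency of run_once() over n_runs iterations."""
    for _ in range(n_warmup):
        run_once()
    if str(device).startswith("cuda"):
        torch.cuda.synchronize()  # make sure all warmup kernels have finished
    start = time.perf_counter()
    for _ in range(n_runs):
        run_once()
    if str(device).startswith("cuda"):
        torch.cuda.synchronize()  # wait for the last kernel before stopping the clock
    return (time.perf_counter() - start) / n_runs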
The scripts that are used for each case are presented below:
YOLOv8:
- ultralytics==8.3.153
- pillow==11.2.1
- opencv-python==4.11.0.86
- numpy==2.2.6
from ultralytics import YOLO
import cv2
import time
import requests
import numpy as np
from PIL import Image
import argparse


# Download image from URL
def download_image_from_url(url):
    """Download image from URL and convert to OpenCV format"""
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Convert to PIL Image
        pil_image = Image.open(requests.get(url, stream=True).raw)
        # Convert PIL to OpenCV format (BGR)
        opencv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        return opencv_image
    except Exception as e:
        print(f"Error downloading image: {e}")
        return None


def main():
    parser = argparse.ArgumentParser(description="YOLO Inference Script")
    parser.add_argument("-d", "--device", type=str, default="cuda",
                        help="Device to run inference on (default: cuda)")
    args = parser.parse_args()

    # Load pretrained model
    model = YOLO('yolov8n.pt')

    img_warmup_url = 'http://images.cocodataset.org/train2017/000000262145.jpg'
    img_warmup = download_image_from_url(img_warmup_url)

    # Download the COCO image
    img_url = 'http://images.cocodataset.org/train2017/000000179950.jpg'
    img = download_image_from_url(img_url)

    if img_warmup is None or img is None:
        print("Failed to download images!")
        return

    # Warmup runs
    print(f"Running warmup on {args.device}...")
    for _ in range(3):
        _ = model(img_warmup, conf=0.3, device=args.device)

    print(f"Running inference on {args.device}...")
    start_time_inference = time.time()
    # Run inference
    results = model(img, conf=0.3, device=args.device)
    end_time_inference = time.time()

    results[0].plot(save=True, filename='./yolo_inference.jpg')  # save to disk

    print(f"YOLO Inference Time: {end_time_inference - start_time_inference:.6f} seconds")
    print(f"Device: {args.device}")


if __name__ == "__main__":
    main()
To run the script, use the following command:
python yolo_inference.py -d [cuda:0 | cpu]
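For what it's worth, Ultralytics also tracks its own per-stage timings, which can be read from the result object (the Results.speed dict, values in milliseconds per image), e.g.:

results = model(img, conf=0.3, device=args.device)
speed = results[0].speed  # dict with per-image timings in ms
print(f"preprocess:  {speed['preprocess']:.2f} ms")
print(f"inference:   {speed['inference']:.2f} ms")
print(f"postprocess: {speed['postprocess']:.2f} ms")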
For D-FINE (a script based on tools/inference/torch_inf.py, with a measure flag added to the process_image function to enable timing):
- SHA: 4a1f73a8bcfac736a88abde9596d87f116d780a7
import os
import sys
import requests
import cv2  # Added for video processing
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image, ImageDraw
import time

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
from src.core import YAMLConfig


# Download image from URL
def download_image_from_url(url):
    """Download image from URL and return as PIL Image"""
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Convert to PIL Image and ensure RGB format
        pil_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        return pil_image
    except Exception as e:
        print(f"Error downloading image: {e}")
        return None


img_warmup_url = 'http://images.cocodataset.org/train2017/000000522418.jpg'
img_warmup = download_image_from_url(img_warmup_url)

# Download the COCO image
img_url = 'http://images.cocodataset.org/train2017/000000179950.jpg'
img = download_image_from_url(img_url)


def draw(images, labels, boxes, scores, thrh=0.4):
    for i, im in enumerate(images):
        draw = ImageDraw.Draw(im)
        scr = scores[i]
        lab = labels[i][scr > thrh]
        box = boxes[i][scr > thrh]
        scrs = scr[scr > thrh]
        for j, b in enumerate(box):
            draw.rectangle(list(b), outline="red")
            draw.text(
                (b[0], b[1]),
                text=f"{lab[j].item()} {round(scrs[j].item(), 2)}",
                fill="blue",
            )
        im.save("torch_results.jpg")


def process_image(model, device, image_input, measure=False):
    # Check if input is a file path (string) or PIL image
    if isinstance(image_input, str):
        # It's a file path
        im_pil = Image.open(image_input).convert("RGB")
    elif isinstance(image_input, Image.Image):
        # It's already a PIL image
        im_pil = image_input
    else:
        raise ValueError("image_input must be either a file path (string) or PIL Image")

    if measure:
        start_inference = time.time()

    w, h = im_pil.size
    orig_size = torch.tensor([[w, h]]).to(device)

    transforms = T.Compose(
        [
            T.Resize((640, 640)),
            T.ToTensor(),
        ]
    )
    im_data = transforms(im_pil).unsqueeze(0).to(device)

    output = model(im_data, orig_size)
    labels, boxes, scores = output

    if measure:
        end_inference = time.time()
        print(f"D-FINE-N Inference Time: {end_inference - start_inference} seconds")

    draw([im_pil], labels, boxes, scores)


def process_video(model, device, file_path):
    cap = cv2.VideoCapture(file_path)

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter("torch_results.mp4", fourcc, fps, (orig_w, orig_h))

    transforms = T.Compose(
        [
            T.Resize((640, 640)),
            T.ToTensor(),
        ]
    )

    frame_count = 0
    print("Processing video frames...")
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame to PIL image
        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        w, h = frame_pil.size
        orig_size = torch.tensor([[w, h]]).to(device)
        im_data = transforms(frame_pil).unsqueeze(0).to(device)

        output = model(im_data, orig_size)
        labels, boxes, scores = output

        # Draw detections on the frame
        draw([frame_pil], labels, boxes, scores)

        # Convert back to OpenCV image
        frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

        # Write the frame
        out.write(frame)
        frame_count += 1

        if frame_count % 10 == 0:
            print(f"Processed {frame_count} frames...")

    cap.release()
    out.release()
    print("Video processing complete. Result saved as 'torch_results.mp4'.")


def main(args):
    """Main function"""
    cfg = YAMLConfig(args.config, resume=args.resume)

    if "HGNetv2" in cfg.yaml_cfg:
        cfg.yaml_cfg["HGNetv2"]["pretrained"] = False

    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        if "ema" in checkpoint:
            state = checkpoint["ema"]["module"]
        else:
            state = checkpoint["model"]
    else:
        raise AttributeError("Only support resume to load model.state_dict by now.")

    # Load train mode state and convert to deploy mode
    cfg.model.load_state_dict(state)

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.model = cfg.model.deploy()
            self.postprocessor = cfg.postprocessor.deploy()

        def forward(self, images, orig_target_sizes):
            outputs = self.model(images)
            outputs = self.postprocessor(outputs, orig_target_sizes)
            return outputs

    device = args.device
    model = Model().to(device)

    # Warmup inferences
    for _ in range(3):
        _ = process_image(model, device, img_warmup, measure=False)

    # Process as image
    process_image(model, device, img, measure=True)
    print("Image processing complete.")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", type=str, required=True)
    parser.add_argument("-r", "--resume", type=str, required=True)
    parser.add_argument("-d", "--device", type=str, default="cpu")
    args = parser.parse_args()
    main(args)
To run the script, use the following command:
python tools/inference/dfine-inference.py -c configs/dfine/dfine_hgnetv2_n_coco.yml -r /path/to/dfine_n_coco.pth -d [cuda:0 | cpu]
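Note that the measured window in process_image includes the PIL Resize/ToTensor preprocessing as well as the model forward and postprocessor. As a sanity check, the two parts could be timed separately with something like the hypothetical helper below (not part of torch_inf.py; the synchronize calls only matter when device is a GPU):

import time
import torch
import torchvision.transforms as T


def timed_forward(model, device, im_pil):
    """Time preprocessing and the model forward (incl. postprocessor) separately."""
    transforms = T.Compose([T.Resize((640, 640)), T.ToTensor()])
    use_cuda = str(device).startswith("cuda")

    t0 = time.time()
    w, h = im_pil.size
    orig_size = torch.tensor([[w, h]]).to(device)
    im_data = transforms(im_pil).unsqueeze(0).to(device)
    if use_cuda:
        torch.cuda.synchronize()
    t1 = time.time()

    with torch.no_grad():
        labels, boxes, scores = model(im_data, orig_size)
    if use_cuda:
        torch.cuda.synchronize()
    t2 = time.time()

    print(f"preprocess: {(t1 - t0) * 1000:.1f} ms, model: {(t2 - t1) * 1000:.1f} ms")
    return labels, boxes, scores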
Hi! I got different results with C++ and TensorRT inference. D-FINE is much faster than YOLOv8.
Hi @Nuzhny007, could you please share your code or comparison method - for example, what tools did you use to export your models?
Export to ONNX?
python.exe tools/deployment/export_onnx.py --check -c configs/dfine/dfine_hgnetv2_s_coco.yml -r weights/dfine_s_obj2coco.pth --simplify
Then I use the standard TensorRT API to parse the ONNX and create the engine: https://github.com/Smorodov/Multitarget-tracker/blob/master/src/Detector/tensorrt_yolo/YoloONNX.cpp#L204 This project encapsulates the functionality of trtexec. You can see the result here: https://www.youtube.com/watch?v=Pb_HnejRpY4&t=29s
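For anyone who prefers to stay in Python, the same ONNX-to-engine step can be sketched roughly with the TensorRT Python API (TensorRT 8.x-style calls; the file names are placeholders, and this is not the code from the linked project):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("dfine_s_obj2coco.onnx", "rb") as f:  # placeholder path to the exported ONNX
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("ONNX parsing failed")

config = builder.create_builder_config()
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)  # optional FP16 build

serialized_engine = builder.build_serialized_network(network, config)
with open("dfine_s_obj2coco.engine", "wb") as f:
    f.write(serialized_engine)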
I'm using OpenVINO for C++ CPU inference; the nano model is way faster than 77 ms this way.
It is pretty straightforward to convert the model to OpenVINO format:
import openvino as ov

model = ov.convert_model(
    onnx_file,
    input=[("images", [1, 3, dim, dim]), ("orig_target_sizes", [1, 2])],
)
ov.save_model(model, output_file, compress_to_fp16=args.fp16)
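For a quick Python sanity check of the converted model on CPU, something like this should work (I actually run it from C++; the model path, dim value, and dummy inputs below are placeholders):

import numpy as np
import openvino as ov

core = ov.Core()
compiled = core.compile_model("dfine_n.xml", "CPU")  # placeholder path to the converted IR

# Dummy inputs matching the shapes passed to ov.convert_model above (dim = 640 assumed)
images = np.random.rand(1, 3, 640, 640).astype(np.float32)
orig_sizes = np.array([[640, 640]], dtype=np.int64)

result = compiled([images, orig_sizes])  # returns outputs keyed by output port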