
The tag name is always person

3083156185 opened this issue 1 year ago · 2 comments

When I run video inference, I pass a prompt such as `airplane` and save the resulting video. The model does detect the airplane, but the label name is always `person`. I tried printing the inference result, and the mapped labels are always `tensor([0])`. What causes this? The detection itself is spot on, but the tag name is always `person`.

python video_demo.py /root/autodl-tmp/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py /root/autodl-tmp/YOLO-World/pretrained_weights/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth /root/autodl-tmp/YOLO-World/source_data/9401.mp4 'airplane' --out /root/autodl-tmp/YOLO-World/source_data/5.mp4
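
For reference, the predicted labels are indices, and the name that gets drawn comes from whatever class list the model carries in its metadata. Below is a minimal diagnostic sketch to see what name index 0 resolves to, using the same config and checkpoint as the command above; the `dataset_meta` lookup is my assumption about where the stock demo gets its names:

```python
from mmdet.apis import init_detector

# Same config/checkpoint as the command above.
config = '/root/autodl-tmp/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py'
checkpoint = '/root/autodl-tmp/YOLO-World/pretrained_weights/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth'

model = init_detector(config, checkpoint, device='cuda:0')

# The checkpoint ships with a pretraining vocabulary in dataset_meta.
# If the demo never rewires the model to the prompt texts, a predicted
# label of 0 is rendered with whatever name sits at index 0 here,
# rather than with index 0 of the prompt list ('airplane').
print(model.dataset_meta['classes'][0])
```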

3083156185 · Dec 26 '24

It didn't solve the problem.

3083156185 · Dec 26 '24

I modified `video_demo.py` by mimicking `image_demo.py`, and it now labels the video correctly:

```python
import argparse
import os
import sys

import cv2
import mmcv
import supervision as sv
import torch
from mmdet.apis import init_detector
from mmengine.dataset import Compose
from mmengine.utils import track_iter_progress

from mmyolo.registry import VISUALIZERS

BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
MASK_ANNOTATOR = sv.MaskAnnotator()


class LabelAnnotator(sv.LabelAnnotator):

    @staticmethod
    def resolve_text_background_xyxy(
        center_coordinates,
        text_wh,
        position,
    ):
        center_x, center_y = center_coordinates
        text_w, text_h = text_wh
        return center_x, center_y, center_x + text_w, center_y + text_h


LABEL_ANNOTATOR = LabelAnnotator(text_padding=4, text_scale=0.5, text_thickness=1)


def parse_args():
    parser = argparse.ArgumentParser(description='YOLO-World video demo')
    parser.add_argument(
        '--config',
        default='configs/pretrain/'
        'yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py',
        help='test config file path')
    parser.add_argument(
        '--checkpoint',
        default='weights/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth',
        help='checkpoint file')
    parser.add_argument('--video',
                        default='demo/sample_images/car.mp4',
                        help='video file path')
    parser.add_argument(
        '--text',
        default='cat,dog,pig,car',
        help='text prompts: categories separated by commas, or a txt file '
        'with one prompt per line.')
    parser.add_argument('--device',
                        default='cuda:0',
                        help='device used for inference')
    parser.add_argument('--score-thr',
                        default=0.2,
                        type=float,
                        help='confidence score threshold for predictions.')
    parser.add_argument('--out',
                        default='demo_outputs/cat.mp4',
                        type=str,
                        help='output video file')
    parser.add_argument('--frame-output-dir',
                        default='demo_outputs/frames',
                        type=str,
                        help='directory to save frames')
    args = parser.parse_args()
    return args


def inference_detector(model, image, texts, test_pipeline, score_thr=0.3):
    data_info = dict(img_id=0, img=image, texts=texts)
    data_info = test_pipeline(data_info)
    data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
                      data_samples=[data_info['data_samples']])

    with torch.no_grad():
        output = model.test_step(data_batch)[0]
        pred_instances = output.pred_instances
        pred_instances = pred_instances[pred_instances.scores.float() > score_thr]
    output.pred_instances = pred_instances
    return output


def prepare_frame_output_dir(frame_output_dir):
    """Prepare the directory to save frames."""
    if os.path.exists(frame_output_dir):
        # Clear all files in the directory
        for filename in os.listdir(frame_output_dir):
            file_path = os.path.join(frame_output_dir, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    os.rmdir(file_path)
            except Exception as e:
                print(f'Failed to delete {file_path}. Reason: {e}')
    else:
        # Create the directory if it does not exist
        os.makedirs(frame_output_dir)


def main():
    args = parse_args()

    model = init_detector(args.config, args.checkpoint, device=args.device)

    model.cfg.test_dataloader.dataset.pipeline[0].type = 'mmdet.LoadImageFromNDArray'
    test_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline)

    if args.text.endswith('.txt'):
        with open(args.text) as f:
            lines = f.readlines()
        texts = [[t.rstrip('\r\n')] for t in lines] + [[' ']]
    else:
        texts = [[t.strip()] for t in args.text.split(',')] + [[' ']]

    print("model.dataset_meta['classes'] before:", model.dataset_meta['classes'])
    # Rewire the detector to the prompt vocabulary so that predicted class
    # indices (and dataset_meta) refer to the prompts, not the pretraining
    # classes.
    model.reparameterize(texts)
    print("model.dataset_meta['classes'] after:", model.dataset_meta['classes'])

    visualizer = VISUALIZERS.build(model.cfg.visualizer)
    visualizer.dataset_meta = model.dataset_meta

    video_reader = mmcv.VideoReader(args.video)
    video_writer = None
    if args.out:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video_writer = cv2.VideoWriter(
            args.out, fourcc, video_reader.fps,
            (video_reader.width, video_reader.height))

    # Prepare the directory to save frames
    prepare_frame_output_dir(args.frame_output_dir)

    frames = [frame for frame in video_reader]
    for idx, frame in enumerate(track_iter_progress(frames, file=sys.stdout)):
        result = inference_detector(model,
                                    frame,
                                    texts,
                                    test_pipeline,
                                    score_thr=args.score_thr)
        # result.pred_instances holds one row per detected object.
        pred_instances = result.pred_instances.cpu().numpy()

        if 'masks' in pred_instances:
            masks = pred_instances['masks']
        else:
            masks = None

        # Convert the predictions to a Detections object and attach labels
        # taken directly from the prompt texts.
        detections = sv.Detections(xyxy=pred_instances['bboxes'],
                                   class_id=pred_instances['labels'],
                                   confidence=pred_instances['scores'],
                                   mask=masks)
        labels = [
            f"{texts[class_id][0]} {confidence:0.2f}"
            for class_id, confidence in zip(detections.class_id,
                                            detections.confidence)
        ]
        image = BOUNDING_BOX_ANNOTATOR.annotate(frame, detections)
        image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)

        # Save each frame as an image
        frame_filename = os.path.join(args.frame_output_dir,
                                      f'frame_{idx:06d}.jpg')
        cv2.imwrite(frame_filename, image)

        # Alternatively, the mmyolo visualizer could draw the result:
        # visualizer.add_datasample(name='video', image=frame,
        #                           data_sample=result, draw_gt=False,
        #                           show=False, pred_score_thr=args.score_thr)
        # image = visualizer.get_image()

        if args.out:
            video_writer.write(image)

    if video_writer:
        video_writer.release()


if __name__ == '__main__':
    main()
```
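
For reference, a hypothetical invocation of the modified script (the paths are just the argparse defaults; substitute your own):

```bash
python video_demo.py \
    --config configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py \
    --checkpoint weights/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth \
    --video demo/sample_images/car.mp4 \
    --text 'airplane' \
    --out demo_outputs/out.mp4
```

The two changes that matter relative to the stock `video_demo.py` are the `model.reparameterize(texts)` call, which switches the detector (and its `dataset_meta`) to the prompt vocabulary, and building the drawn labels directly from `texts[class_id]`, so the names can no longer fall back to the pretraining classes.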

ycyg8 · Feb 25 '25