GLEE How to Run Inference on a Single Custom Video?

Hi, thank you for your great work!

I’m currently working on a project where your method would be highly beneficial. However, as someone new to the video tracking field, I’m unsure how to perform inference on a single custom video using your code. The instructions in test.md mainly cover evaluation on entire datasets.

Could you kindly provide some guidance or example code for running inference on a single video clip? Any help would be greatly appreciated!

Thanks again for your amazing contribution!

Apr 20 '25 12:04 xiao10ma

I have now put together a working version. Hope it helps others:

import os
import torch
import numpy as np
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
from detectron2.config import get_cfg
from detectron2.projects.glee import add_glee_config, build_detection_train_loader, build_detection_test_loader
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from PIL import Image

# Category table for this script. Each entry carries a three-integer color,
# an "isthing" flag, a numeric id and a display name.
OVIS_CATEGORIES = [
    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "Truck"},
    {"color": [0, 82, 0], "isthing": 1, "id": 2, "name": "Car"},
    {"color": [119, 11, 32], "isthing": 1, "id": 3, "name": "Bus"},
]

def setup(args):
    """
    Build the frozen GLEE config for this run and apply detectron2's default setup.

    ``args`` must expose ``config_file`` (path to the config file to merge) and
    ``opts`` (a list of command-line overrides merged on top of the file).
    Returns the frozen config object.
    """
    config = get_cfg()
    add_glee_config(config)

    # Values from the config file are merged first; command-line overrides
    # are merged afterwards so they take precedence.
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()

    default_setup(config, args)
    return config

def _load_frames(img_dir, augmentations):
    """Read and resize every frame file found directly inside ``img_dir``.

    Files are visited in sorted file-name order; entries that are not regular
    files (e.g. sub-directories) are skipped.

    Returns:
        ``(file_names, img_list, ori_height, ori_width)`` where ``file_names``
        are the frame paths, ``img_list`` holds one channel-first tensor per
        resized frame, and ``ori_height`` / ``ori_width`` are the pre-resize
        dimensions of the last frame read.

    Raises:
        FileNotFoundError: if ``img_dir`` contains no files.
    """
    file_names = []
    img_list = []
    ori_height = ori_width = 0
    for frame_name in sorted(os.listdir(img_dir)):
        img_path = os.path.join(img_dir, frame_name)
        if not os.path.isfile(img_path):
            continue
        image = utils.read_image(img_path, format='RGB')
        ori_height, ori_width = image.shape[:2]
        aug_input = T.AugInput(image)
        # Applying the augmentation list updates ``aug_input.image`` in place.
        augmentations(aug_input)
        resized = aug_input.image
        # HWC -> CHW, made contiguous before wrapping as a tensor.
        img_list.append(torch.as_tensor(np.ascontiguousarray(resized.transpose(2, 0, 1))))
        file_names.append(img_path)

    if not img_list:
        raise FileNotFoundError(f'no frame files found in {img_dir!r}')
    return file_names, img_list, ori_height, ori_width


def _to_bool_array(mask):
    """Return ``mask`` as a boolean NumPy array, moving tensors to the CPU first."""
    if torch.is_tensor(mask):
        mask = mask.detach().cpu().numpy()
    return np.asarray(mask, dtype=bool)


def main(args):
    """Run GLEE on the frames of one video directory and save masked frames.

    Builds the model from the config described by ``args``, loads the
    checkpoint, feeds all frames of the directory to the model as one video,
    and for every predicted instance that passes the score and label filter
    writes one PNG per frame in which that instance has a non-empty mask.
    Pixels outside the instance mask are set to 0.

    Output files are named ``{idx}_{frame}.png`` (instance index, frame index)
    in the current working directory, so different instances never overwrite
    each other's frames.
    """
    cfg = setup(args)
    model = build_model(cfg)
    DetectionCheckpointer(model).load('GLEE_Plus_joint.pth')

    img_dir = './CAM_FRONT_LEFT'
    score_threshold = 0.3
    # NOTE(review): 24 is presumably the index of the vehicle class in the
    # vocabulary the model uses for task 'ovis' -- confirm against the model.
    target_label = 24

    augmentations = T.AugmentationList(
        [T.ResizeShortestEdge(cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, "choice")]
    )
    file_names, img_list, ori_height, ori_width = _load_frames(img_dir, augmentations)

    inputs = [{
        'height': ori_height,
        'width': ori_width,
        'image': img_list,
        'task': 'ovis',  # TODO: for debug, current use task as 'ovis'
        'file_names': file_names,
        # No custom prompt is passed; the names in OVIS_CATEGORIES are not
        # forwarded to the model.
        'prompt': None
    }]

    model.eval()
    with torch.no_grad():
        outputs = model(inputs)
        print(outputs)

    for idx, score in enumerate(outputs['pred_scores']):
        if not (score > score_threshold and outputs['pred_labels'][idx] == target_label):
            continue
        for frame, img_path in enumerate(file_names):
            mask = _to_bool_array(outputs['pred_masks'][idx][frame])
            if not mask.any():
                continue
            # Context manager closes the underlying file once pixels are copied.
            with Image.open(img_path) as frame_image:
                img = np.array(frame_image)
            img[~mask] = 0
            # Include the instance index so that several instances visible in
            # the same frame do not overwrite one another's output.
            Image.fromarray(img).save(f'{idx}_{frame}.png')




if __name__ == "__main__":
    # Parse detectron2's standard command-line arguments, echo them, then run.
    parsed_args = default_argument_parser().parse_args()
    print("Command Line Args:", parsed_args)
    main(parsed_args)

Apr 21 '25 15:04 xiao10ma

Are there any bugs? Over the whole sequence, the prediction for a single idx matches two different cars. Here are the results:

mp4.zip

Apr 21 '25 15:04 xiao10ma