
image demo question

ZhouXiner opened this issue on May 3, 2024

Hi team, I failed when running `image_demo.py` with my own prompts: `cone,car,trafficsign,truck,pillar`.
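For context, the demo splits the comma-separated prompt string into one text list per class and appends a blank padding entry, roughly like this (a sketch of how `image_demo.py` parses the prompts; the exact code may differ):

```python
prompt = 'cone,car,trafficsign,truck,pillar'

# One text list per class, plus a blank padding entry at the end,
# which is why the text features have 6 entries rather than 5.
texts = [[t.strip()] for t in prompt.split(',')] + [[' ']]
print(len(texts))  # 6
```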

I checked the code in `yolo_world.py` and found this line: `self.bbox_head.num_classes = txt_feats[0].shape[0]`. In my run, `txt_feats` has shape `[6, 1, 512]`, so `self.bbox_head.num_classes` gets set to 1 instead of 6. I changed the line to `self.bbox_head.num_classes = txt_feats.shape[0]` and it ran successfully.
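A minimal sketch of the indexing difference, assuming `txt_feats` has the shape `[6, 1, 512]` reported above (6 class prompts, each with one 512-d text embedding):

```python
import torch

# Hypothetical stand-in for the text features from my run: shape [6, 1, 512].
txt_feats = torch.randn(6, 1, 512)

# Original line: indexes the first class first, so it reads the size of the
# per-class dimension and always yields 1 here.
num_classes_before = txt_feats[0].shape[0]  # txt_feats[0] is [1, 512] -> 1

# Changed line: reads the leading (per-class) dimension instead.
num_classes_after = txt_feats.shape[0]      # -> 6

print(num_classes_before, num_classes_after)  # 1 6
```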

Here are my config and cmd:

config:

```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_x_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
text_model_name = '../pretrain/clip-vit-base-patch32'
img_scale = (1280, 1280)

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [
    *_base_.pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=True,
        pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform
]

train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[],
                                     ignore_keys=['classes', 'palette']))

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(bias_decay_mult=0.0,
                       norm_decay_mult=0.0,
                       custom_keys={
                           'backbone.text_model': dict(lr_mult=0.01),
                           'logit_scale': dict(weight_decay=0.0)
                       }),
    constructor='YOLOWv5OptimizerConstructor')
```

cmd:

```bash
python3 image_demo.py \
    /mnt/training_yrfs/p8n/camera/personal/xin.zhou/open-det/code/YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_1280_only_model.py \
    /mnt/training_yrfs/p8n/camera/personal/xin.zhou/open-det/code/YOLO-World/pretrain/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth \
    /mnt/training_yrfs/p8n/camera/personal/xin.zhou/open-det/dataset/pinhole/cone \
    cone,car,trafficsign,truck,pillar \
    --threshold 0.05 \
    --output-dir /mnt/training_yrfs/p8n/camera/personal/xin.zhou/open-det/code/YOLO-World/results
```

Thanks!
