YOLO-World
Fine-tuning problem
After fine-tuning, the detector fails to detect anything: the output image has no detected boxes, even though the loss keeps decreasing during training. I want to know whether the issue lies with my method or my data. Could anyone help me? Thank you.
Inference command used after training:

```
python .\image_demo.py D:\YOLO-World-master\configs\pretrain\custom_yolo_world_l_clip.py D:\YOLO-World-master\log_200\epoch_220.pth D:\YOLO-World-master\datasets\images\train\0000001.jpg "Book"
```
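To rule out a silently broken checkpoint, the saved weights can be inspected directly (a minimal sketch using plain PyTorch; `log_200/epoch_220.pth` is the checkpoint from the command above):

```python
import torch
from collections import Counter

# Load the fine-tuned checkpoint on CPU; mmengine checkpoints keep the
# model weights under 'state_dict'.
ckpt = torch.load(r'D:\YOLO-World-master\log_200\epoch_220.pth',
                  map_location='cpu')
state = ckpt.get('state_dict', ckpt)

# Count parameters per top-level module: 'backbone', 'neck' and
# 'bbox_head' should all be present after fine-tuning.
print(Counter(key.split('.')[0] for key in state))

# NaN weights would also explain empty detections despite a falling loss.
nan_keys = [k for k, v in state.items()
            if v.dtype.is_floating_point and torch.isnan(v).any()]
print('NaN tensors:', nan_keys[:5] or 'none')
```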
metainfo:

```python
metainfo = dict(
    classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
             'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
             'PencilCase', 'Laptop', 'NonEducationalItems',
             'BlackboardWriting', 'Notes'))
```
class_text_path (data/texts/custom.json):

```json
[["Chalk"], ["Microphone"], ["MobilePhone"], ["Tablet"],
 ["OtherTeachingTools"], ["Book"], ["Pen"], ["RulerTools"], ["Eraser"],
 ["PencilCase"], ["Laptop"], ["NonEducationalItems"], ["BlackboardWriting"],
 ["Notes"]]
```
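Since YOLO-World looks up text embeddings by category id, the order in custom.json has to match the categories in the annotation file exactly. A small check one can run (a sketch using only the standard library, assuming the paths from the config below):

```python
import json

# Class texts: one list of synonyms per class, ordered by category id.
with open('data/texts/custom.json', encoding='utf-8') as f:
    texts = [t[0] for t in json.load(f)]

# Categories as declared in the COCO annotation file.
with open('datasets/annotations/train.json', encoding='utf-8') as f:
    cats = sorted(json.load(f)['categories'], key=lambda c: c['id'])

print('texts:', len(texts), 'categories:', len(cats))
for cat, text in zip(cats, texts):
    status = 'OK' if cat['name'] == text else 'MISMATCH'
    print(f"{cat['id']:2d} {cat['name']:<20} {text:<20} {status}")
```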
Config file:

```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

import os
os.chdir('D:/YOLO-World-master')

# hyper-parameters
num_classes = 14
num_training_classes = 14
max_epochs = 500  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 1e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = 'pretrained_weights/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
persistent_workers = False

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114))
]
train_pipeline = [
    *_base_.pre_transform,
    *mosaic_affine_transform,
    dict(type='YOLOv5MultiModalMixUp',
         prob=_base_.mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1],
    *text_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]

metainfo = dict(
    classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
             'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
             'PencilCase', 'Laptop', 'NonEducationalItems',
             'BlackboardWriting', 'Notes'))
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=metainfo,
        data_root='datasets',
        ann_file='annotations/train.json',
        data_prefix=dict(img='images'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=metainfo,
        data_root='datasets/',
        ann_file='annotations/val.json',
        data_prefix=dict(img='images'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(_delete_=True,
                   type='AdamW',
                   lr=base_lr,
                   weight_decay=weight_decay,
                   batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        custom_keys={'backbone.text_model': dict(lr_mult=0.01),
                     'logit_scale': dict(weight_decay=0.0)}),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='datasets/annotations/val.json',
                     metric='bbox')
```
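One way to check whether the config itself is the problem is to parse it and build the training dataset outside of a full training run (a sketch, assuming an environment where mmengine, mmyolo, and yolo_world are importable; `custom_imports` in the config registers the yolo_world modules when the file is parsed):

```python
from mmengine.config import Config
from mmengine.registry import init_default_scope

# Parsing already catches _base_ resolution and syntax problems.
cfg = Config.fromfile('configs/pretrain/custom_yolo_world_l_clip.py')
init_default_scope('mmyolo')

from mmyolo.registry import DATASETS

# Build the training dataset and pull one sample through the pipeline.
dataset = DATASETS.build(cfg.train_dataloader.dataset)
print('num samples:', len(dataset))

sample = dataset[0]
data_sample = sample['data_samples']
# 'texts' is packed into the meta keys by mmdet.PackDetInputs above,
# and the gt labels should index into that list.
print('texts:', data_sample.texts)
print('labels:', data_sample.gt_instances.labels)
```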
datasets/annotations/train.json (excerpt):

```json
{
"images": [
{
"file_name": "val\\0000002.jpg",
"id": 0,
"width": 1920,
"height": 1080
},
...
],
"annotations": [
{
"image_id": 0,
"id": 0,
"category_id": 9,
"bbox": [
342.47200000000004,
610.78652,
95.72999999999996,
72.80948000000001
],
"area": 6970.051520399998,
"segmentation": [
[
342.47200000000004,
610.78652,
438.202,
610.78652,
438.202,
683.596,
342.47200000000004,
683.596
]
],
"iscrowd": 0
},
{
"image_id": 1,
"id": 1,
"category_id": 9,
"bbox": [
542.02231,
690.3370000000001,
115.95522000000005,
76.85399999999993
],
"area": 8911.622477879995,
"segmentation": [
[
542.02231,
690.3370000000001,
657.97753,
690.3370000000001,
657.97753,
767.191,
542.02231,
767.191
]
],
"iscrowd": 0
},
...
],
"categories": [
{
"id": 0,
"name": "Chalk"
},
{
"id": 1,
"name": "Microphone"
},
{
"id": 2,
"name": "MobilePhone"
},
{
"id": 3,
"name": "Tablet"
},
{
"id": 4,
"name": "OtherTeachingTools"
},
{
"id": 5,
"name": "Book"
},
{
"id": 6,
"name": "Pen"
},
{
"id": 7,
"name": "RulerTools"
},
{
"id": 8,
"name": "Eraser"
},
{
"id": 9,
"name": "PencilCase"
},
{
"id": 10,
"name": "Laptop"
},
{
"id": 11,
"name": "NonEducationalItems"
},
{
"id": 12,
"name": "BlackboardWriting"
},
{
"id": 13,
"name": "Notes"
}
]
}
```
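Since I am unsure whether the data is at fault, here is a small validation script one can run over the annotation file (a sketch using only the standard library; it checks the basic invariants a COCO-style loader relies on):

```python
import json
import os

root = 'datasets'
with open(os.path.join(root, 'annotations', 'train.json'),
          encoding='utf-8') as f:
    coco = json.load(f)

img_ids = {img['id'] for img in coco['images']}
cat_ids = {cat['id'] for cat in coco['categories']}

problems = []
for ann in coco['annotations']:
    x, y, w, h = ann['bbox']
    if ann['image_id'] not in img_ids:
        problems.append((ann['id'], 'unknown image_id'))
    if ann['category_id'] not in cat_ids:
        problems.append((ann['id'], 'unknown category_id'))
    if w <= 0 or h <= 0:
        problems.append((ann['id'], 'non-positive bbox size'))

# Images live under datasets/images, and file_name uses Windows-style
# separators ("val\\0000002.jpg"), so normalize before checking.
for img in coco['images']:
    path = os.path.join(root, 'images', *img['file_name'].split('\\'))
    if not os.path.isfile(path):
        problems.append((img['id'], f'missing file {path}'))

print(problems or 'annotations look consistent')
```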