
Fine-tuning problem

LLH-Harward opened this issue 6 months ago • 14 comments

After fine-tuning, the detector fails to detect any objects: the output image has no detected boxes, even though the loss keeps decreasing during training. I want to know whether the issue lies with my method or my data. Could anyone help me? Thank you. (screenshot of the empty output attached)
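One quick check before digging into the data: both the demo and the evaluation stages filter predictions by confidence score, so a model that only produces low-confidence boxes can look as if it outputs nothing. A minimal debugging sketch, assuming the test_cfg layout inherited from the YOLOv8 base config (score_thr / max_per_img); the file name is hypothetical:

```python
# debug_low_thresh.py -- hypothetical child config, for debugging only.
# Inherits the fine-tuning config and lowers the test-time score threshold,
# assuming the YOLOv8 base config's test_cfg fields (score_thr / max_per_img).
_base_ = './custom_yolo_world_l_clip.py'

model = dict(
    test_cfg=dict(
        score_thr=0.001,   # temporarily surface low-confidence predictions
        max_per_img=300))
```

If boxes appear at a very low threshold, the model is learning but under-confident; if nothing appears at all, the labels or the class texts are more likely at fault.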

Inference command used after training:

python .\image_demo.py D:\YOLO-World-master\configs\pretrain\custom_yolo_world_l_clip.py D:\YOLO-World-master\log_200\epoch_220.pth D:\YOLO-World-master\datasets\images\train\0000001.jpg "Book"
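It is also worth confirming that epoch_220.pth really contains fine-tuned weights rather than an untouched copy of the pretrained ones, for example by comparing the two checkpoints. A small sketch, assuming both files store their weights under the usual mmengine 'state_dict' key (paths are illustrative):

```python
import torch

# Compare the fine-tuned checkpoint against the pretrained starting point.
finetuned = torch.load('log_200/epoch_220.pth', map_location='cpu')['state_dict']
pretrained = torch.load(
    'pretrained_weights/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_'
    'o365_goldg_train_pretrained-0e566235.pth', map_location='cpu')['state_dict']

# Count how many tensors actually changed during fine-tuning.
changed = sum(
    not torch.equal(v, pretrained[k])
    for k, v in finetuned.items() if k in pretrained)
print(f'{changed} / {len(finetuned)} shared tensors differ from the pretrained weights')
```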

metainfo:

metainfo = dict(
    classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
             'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
             'PencilCase', 'Laptop', 'NonEducationalItems',
             'BlackboardWriting', 'Notes'))

class_text_path (data/texts/custom.json):

[["Chalk"], ["Microphone"], ["MobilePhone"], ["Tablet"], ["OtherTeachingTools"],
 ["Book"], ["Pen"], ["RulerTools"], ["Eraser"], ["PencilCase"], ["Laptop"],
 ["NonEducationalItems"], ["BlackboardWriting"], ["Notes"]]
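Two things commonly go wrong here: the class texts not matching the order of the category ids in the COCO json, and the texts file not being valid JSON (a list of lists of strings). A quick sketch that writes custom.json directly from the class tuple so the order cannot drift (path is illustrative, standard library only):

```python
import json

# Keep this tuple identical to metainfo['classes'] and to the category order in train.json.
classes = ('Chalk', 'Microphone', 'MobilePhone', 'Tablet', 'OtherTeachingTools',
           'Book', 'Pen', 'RulerTools', 'Eraser', 'PencilCase', 'Laptop',
           'NonEducationalItems', 'BlackboardWriting', 'Notes')

# Write one text entry per class, in class-id order.
with open('data/texts/custom.json', 'w') as f:
    json.dump([[name] for name in classes], f, indent=2)
```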

Config file:

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

import os
os.chdir('D:/YOLO-World-master')

# hyper-parameters
num_classes = 14
num_training_classes = 14
max_epochs = 500  # maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 1e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = 'pretrained_weights/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
persistent_workers = False

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5RandomAffine',
         max_rotate_degree=0.0,
         max_shear_degree=0.0,
         max_aspect_ratio=100.,
         scaling_ratio_range=(1 - _base_.affine_scale,
                              1 + _base_.affine_scale),
         # img_scale is (width, height)
         border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
         border_val=(114, 114, 114))
]
train_pipeline = [
    *_base_.pre_transform,
    *mosaic_affine_transform,
    dict(type='YOLOv5MultiModalMixUp',
         prob=_base_.mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1],
    *text_transform
]
train_pipeline_stage2 = [
    *_base_.train_pipeline_stage2[:-1],
    *text_transform
]

metainfo = dict(
    classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
             'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
             'PencilCase', 'Laptop', 'NonEducationalItems',
             'BlackboardWriting', 'Notes'))
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=metainfo,
        data_root='datasets',
        ann_file='annotations/train.json',
        data_prefix=dict(img='images'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=train_pipeline)

train_dataloader = dict(
    persistent_workers=persistent_workers,
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=metainfo,
        data_root='datasets/',
        ann_file='annotations/val.json',
        data_prefix=dict(img='images'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(
    param_scheduler=dict(scheduler_type='linear',
                         lr_factor=0.01,
                         max_epochs=max_epochs),
    checkpoint=dict(max_keep_ckpts=-1,
                    save_best=None,
                    interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=5,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        custom_keys={'backbone.text_model': dict(lr_mult=0.01),
                     'logit_scale': dict(weight_decay=0.0)}),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(
    _delete_=True,
    type='mmdet.CocoMetric',
    proposal_nums=(100, 1, 10),
    ann_file='datasets/annotations/val.json',
    metric='bbox')
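Since a missing _delete_ flag or _base_ reference changes how this file is merged with the YOLOv8 base config, it can help to load the config once and print a few resolved fields before training or running the demo. A minimal sketch using mmengine (the config path is a placeholder):

```python
from mmengine.config import Config

# Load the fine-tuning config and check a few resolved values.
cfg = Config.fromfile('configs/pretrain/custom_yolo_world_l_clip.py')
print(cfg.model.bbox_head.head_module.num_classes)   # expect 14
print(cfg.train_dataloader.dataset.class_text_path)  # expect data/texts/custom.json
print(cfg.load_from)                                 # pretrained checkpoint used as init
```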

Datasets: (screenshot attached)

train.json (excerpt):

{
  "images": [
    {
      "file_name": "val\\0000002.jpg",
      "id": 0,
      "width": 1920,
      "height": 1080
    },
    ...
  ],
  "annotations": [
    {
      "image_id": 0,
      "id": 0,
      "category_id": 9,
      "bbox": [
        342.47200000000004,
        610.78652,
        95.72999999999996,
        72.80948000000001
      ],
      "area": 6970.051520399998,
      "segmentation": [
        [
          342.47200000000004,
          610.78652,
          438.202,
          610.78652,
          438.202,
          683.596,
          342.47200000000004,
          683.596
        ]
      ],
      "iscrowd": 0
    },
    {
      "image_id": 1,
      "id": 1,
      "category_id": 9,
      "bbox": [
        542.02231,
        690.3370000000001,
        115.95522000000005,
        76.85399999999993
      ],
      "area": 8911.622477879995,
      "segmentation": [
        [
          542.02231,
          690.3370000000001,
          657.97753,
          690.3370000000001,
          657.97753,
          767.191,
          542.02231,
          767.191
        ]
      ],
      "iscrowd": 0
    },
   ...
  ],
  "categories": [
    {
      "id": 0,
      "name": "Chalk"
    },
    {
      "id": 1,
      "name": "Microphone"
    },
    {
      "id": 2,
      "name": "MobilePhone"
    },
    {
      "id": 3,
      "name": "Tablet"
    },
    {
      "id": 4,
      "name": "OtherTeachingTools"
    },
    {
      "id": 5,
      "name": "Book"
    },
    {
      "id": 6,
      "name": "Pen"
    },
    {
      "id": 7,
      "name": "RulerTools"
    },
    {
      "id": 8,
      "name": "Eraser"
    },
    {
      "id": 9,
      "name": "PencilCase"
    },
    {
      "id": 10,
      "name": "Laptop"
    },
    {
      "id": 11,
      "name": "NonEducationalItems"
    },
    {
      "id": 12,
      "name": "BlackboardWriting"
    },
    {
      "id": 13,
      "name": "Notes"
    }
  ]
}
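A quick consistency check on train.json can rule out label problems such as category ids that do not line up with the categories list, or images that end up with no annotations. A small sketch using only the standard library (the path is illustrative):

```python
import json
from collections import Counter

with open('datasets/annotations/train.json') as f:
    coco = json.load(f)

cat_ids = {c['id'] for c in coco['categories']}
img_ids = {im['id'] for im in coco['images']}

# Annotations referring to unknown categories or missing images.
bad_cats = [a['id'] for a in coco['annotations'] if a['category_id'] not in cat_ids]
bad_imgs = [a['id'] for a in coco['annotations'] if a['image_id'] not in img_ids]
per_class = Counter(a['category_id'] for a in coco['annotations'])

print('annotations with unknown category_id:', len(bad_cats))
print('annotations pointing at missing images:', len(bad_imgs))
print('instances per category id:', dict(sorted(per_class.items())))
print('images without any annotations:',
      len(img_ids - {a['image_id'] for a in coco['annotations']}))
```

It is also worth checking that file_name entries such as "val\\0000002.jpg" inside train.json resolve correctly under data_root='datasets' and data_prefix=dict(img='images').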

LLH-Harward · Aug 03 '24 04:08