mmfewshot
mmfewshot copied to clipboard
Discrepancy in AP50 while Re-implementing MetaRCNN
I encountered an issue while re-implementing MetaRCNN. When fine-tuning on all classes, I observed a difference of up to 6% in the AP50 values for the novel classes. I used 2 GPUs for the implementation. Because of limited GPU access, I set the batch size to 2 (samples_per_gpu=2) and increased the number of iterations to 36,000 so that base training remained manageable.
Here is the base training configuration for split1:
"img_norm_cfg = dict( mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) train_multi_pipelines = dict( query=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ], support=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='GenerateMask', target_size=(224, 224)), dict(type='RandomFlip', flip_ratio=0.0), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ]) test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1000, 600), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data_root = 'data/VOCdevkit/' data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='NWayKShotDataset', num_support_ways=15, num_support_shots=1, one_support_shot_per_image=True, num_used_support_shots=200, save_dataset=False, dataset=dict( type='FewShotVOCDataset', ann_cfg=[ dict( type='ann_file', ann_file= 'data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt'), dict( type='ann_file', ann_file= 'data/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt') ], img_prefix='data/VOCdevkit/', multi_pipelines=dict( query=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict( type='Resize', img_scale=(1000, 600), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='Normalize', mean=[103.53, 116.28, 123.675], 
std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ], support=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='GenerateMask', target_size=(224, 224)), dict(type='RandomFlip', flip_ratio=0.0), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ]), classes='BASE_CLASSES_SPLIT1', use_difficult=True, instance_wise=False, dataset_name='query_dataset'), support_dataset=dict( type='FewShotVOCDataset', ann_cfg=[ dict( type='ann_file', ann_file= 'data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt'), dict( type='ann_file', ann_file= 'data/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt') ], img_prefix='data/VOCdevkit/', multi_pipelines=dict( query=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict( type='Resize', img_scale=(1000, 600), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ], support=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='GenerateMask', target_size=(224, 224)), dict(type='RandomFlip', flip_ratio=0.0), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ]), classes='BASE_CLASSES_SPLIT1', use_difficult=False, instance_wise=False, dataset_name='support_dataset')), val=dict( type='FewShotVOCDataset', ann_cfg=[ dict( type='ann_file', ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/test.txt') ], img_prefix='data/VOCdevkit/', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', 
img_scale=(1000, 600), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ], classes='BASE_CLASSES_SPLIT1'), test=dict( type='FewShotVOCDataset', ann_cfg=[ dict( type='ann_file', ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/test.txt') ], img_prefix='data/VOCdevkit/', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1000, 600), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ], test_mode=True, classes='BASE_CLASSES_SPLIT1'), model_init=dict( copy_from_train_dataset=True, samples_per_gpu=16, workers_per_gpu=1, type='FewShotVOCDataset', ann_cfg=None, img_prefix='data/VOCdevkit/', pipeline=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict( type='Normalize', mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False), dict(type='GenerateMask', target_size=(224, 224)), dict(type='RandomFlip', flip_ratio=0.0), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ], use_difficult=False, instance_wise=True, classes='BASE_CLASSES_SPLIT1', dataset_name='model_init_dataset')) evaluation = dict(interval=12000, metric='mAP') optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) lr_config = dict( policy='step', warmup='linear', warmup_iters=100, warmup_ratio=0.001, step=[16000]) runner = dict(type='IterBasedRunner', max_iters=36000) norm_cfg = dict(type='BN', requires_grad=False) pretrained = 'open-mmlab://detectron2/resnet101_caffe' model = dict( type='MetaRCNN', 
pretrained='open-mmlab://detectron2/resnet101_caffe', backbone=dict( type='ResNetWithMetaConv', depth=101, num_stages=3, strides=(1, 2, 2), dilations=(1, 1, 1), out_indices=(2, ), frozen_stages=2, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), rpn_head=dict( type='RPNHead', in_channels=1024, feat_channels=512, anchor_generator=dict( type='AnchorGenerator', scales=[2, 4, 8, 16, 32], ratios=[0.5, 1.0, 2.0], scale_major=False, strides=[16]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='MetaRCNNRoIHead', shared_head=dict( type='MetaRCNNResLayer', pretrained='open-mmlab://detectron2/resnet101_caffe', depth=50, stage=3, stride=2, dilation=1, style='caffe', norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=1024, featmap_strides=[16]), bbox_head=dict( type='MetaBBoxHead', with_avg_pool=False, roi_feat_size=1, in_channels=2048, num_classes=15, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0), num_meta_classes=15, meta_cls_in_channels=2048, with_meta_cls_loss=True, loss_meta=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), aggregation_layer=dict( type='AggregationLayer', aggregator_cfgs=[ dict( type='DotProductAggregator', in_channels=2048, with_fc=False) ])), train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', 
num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=12000, max_per_img=2000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=128, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_pre=6000, max_per_img=300, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.3), max_per_img=100))) checkpoint_config = dict(interval=12000) log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')]) custom_hooks = [dict(type='NumClassCheckHook')] dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] use_infinite_sampler = True seed = 42 work_dir = './work_dirs/meta-rcnn_r101_c4_8xb4_voc-split1_base-training' gpu_ids = range(0, 2)"
Could you please help me resolve this issue?