DAVAR-Lab-OCR

question about maskrcnn config and trie config

Open — ZHEGG opened this issue 1 year ago • 1 comment

您好!我在使用maskrcnn的config作用于wildreceipt数据集时,config文件如下 """ ####################################################################################################

Copyright Info : Copyright (c) Davar Lab @ Hikvision Research Institute. All rights reserved.

Filename : mask_rcnn_r50_r32_e2e_finetune_ic13.py

Abstract : Model settings for mask rcnn spotter end-to-end finetune on realdata.

Current Version: 1.0.0

Date : 2021-06-24

###################################################################################################### """ _base_ = "./base.py" batch_max_length = 60

model = dict( rcg_roi_extractor=dict( type='MaskRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=(32, 200), sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32], _delete_=True, ), rcg_backbone=dict( type='ResNet32', input_channel=256, output_channel=256, _delete_=True, ), rcg_sequence_module=dict( rnn_modules=[ dict( type='BidirectionalLSTM', input_size=256, hidden_size=256, output_size=256, with_linear=True, bidirectional=True,), dict( type='BidirectionalLSTM', input_size=256, hidden_size=256, output_size=256, with_linear=True, bidirectional=True,), ]), rcg_sequence_head=dict( input_size=256, converter=dict( type='AttnLabelConverter', with_unknown=True, ), ), )

File prefix path of the training dataset

img_prefixes = [ '/home/mdisk3/bianzhewu/dataset/wildreceipt/', ]

Dataset Name

ann_files = [ '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_train_without_iob.json', ]

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

train_pipeline = [ dict(type='DavarResize', img_scale=[(512, 512)], multiscale_mode='range', keep_ratio=True), ]

test_pipeline = [ dict(type='DavarLoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(512, 512), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='DavarCollect', keys=['img']), ]) ]

data = dict( samples_per_gpu=2, workers_per_gpu=0, sampler=dict( type='DistBatchBalancedSampler', # BatchBalancedSampler and DistBatchBalancedSampler mode=1, # model 0: Balance in batch, calculate the epoch according to the first iterative data set # model 1: Balance in batch, calculate the epoch according to the last iterative data set # model 2: Balance in batch, record unused data # model -1: Each dataset is directly connected and shuffled ), train=dict( batch_ratios=['1.0'], dataset=dict( ann_file=ann_files, img_prefix=img_prefixes, ) ), val=dict( ann_file='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json', img_prefix='/home/mdisk3/bianzhewu/dataset/wildreceipt/', ), test=dict( ann_file='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json', img_prefix='/home/mdisk3/bianzhewu/dataset/wildreceipt/', pipeline=test_pipeline ) )

optimizer = dict(type='Adadelta', lr=1.0, weight_decay=1e-5) lr_config = dict(step=[40, 80, 120]) runner = dict(max_epochs=150) checkpoint_config = dict(interval=10, filename_tmpl='checkpoint/mask_rcnn_r50_r32_e2e_finetune_epoch_{}.pth') work_dir = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_spotting/mask_rcnn_spot/workspace/wildreceipt_img512_32_200_len60/log/' load_from = None evaluation = dict( interval=1, )

得到的精度如下: 2023-02-06 12:45:04,094 - davarocr - INFO - Detection evaluation results: Precision: 0.8855601146647181, Recall: 0.8610230626297956, hmean: 0.8731192330072876 2023-02-06 12:45:04,094 - davarocr - INFO - Spotting evaluation results: Precision: 0.6957450396267776, Recall: 0.6764673734834408, hmean: 0.6859707944248941

但是当我想在trie中使用maskrcnn,即将maskrcnn的config参数copy至trie中复现其检测识别的结果时(先取消信息抽取模块),其config文件如下: character = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/flatten_dict.txt' batch_max_length = 60 type="SPOTTER"

model = dict( type='MaskRCNN_Trie', pretrained=None, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=-1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.1, 0.2, 0.4, 0.8, 1.6, 3.2], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=1, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=1, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),

rcg_roi_extractor=dict(
    type='MaskRoIExtractor',
    roi_layer=dict(type='RoIAlign', output_size=(32, 200), sampling_ratio=0),
    out_channels=256,
    featmap_strides=[4, 8, 16, 32],
    # _delete_=True,
    ),
rcg_backbone=dict(
    type='ResNet32',
    input_channel=256,
    output_channel=512,
    # _delete_=True,
    ),
rcg_transformation=None,
rcg_neck=None,
rcg_sequence_module=dict(
    type='CascadeRNN',
    rnn_modules=[
        dict(
            type='BidirectionalLSTM',
            input_size=512,
            hidden_size=256,
            output_size=256,
            with_linear=True,
            bidirectional=True,),
        dict(
            type='BidirectionalLSTM',
            input_size=256,
            hidden_size=256,
            output_size=512,
            with_linear=True,
            bidirectional=True,), ]),
rcg_sequence_head=dict(
    type='AttentionHead',
    input_size=512,
    hidden_size=256,
    batch_max_length=batch_max_length,
    converter=dict(type='AttnLabelConverter',character=character,use_cha_eos=True,with_unknown=True),
    loss_att=dict(type='StandardCrossEntropyLoss',ignore_index=0,loss_weight=1.0,reduction='mean'),
),
# infor_context_module=dict(
#     type='MultiModalContextModule',
#     textual_embedding=dict(
#         type='NodeEmbedding',
#         dropout_ratio=0.1,
#         merge_type='Sum',
#         pos_embedding=dict(
#             type='PositionEmbedding2D',
#             max_position_embeddings=64,
#             embedding_dim=256,
#             width_embedding=False,
#             height_embedding=False,
#         ),
#         sentence_embedding=dict(
#             type='SentenceEmbeddingCNN',
#             embedding_dim=256,
#             kernel_sizes=[3, 5, 7, 9]
#         ),
#     ),
#     multimodal_fusion_module=dict(
#         type='MultiModalFusion',
#         merge_type='Weighted',
#         visual_dim=[256],
#         semantic_dim=[256],
#     ),
#     textual_relation_module=dict(
#         type='BertEncoder',
#         config=dict(
#             hidden_size=256,
#             num_hidden_layers=2,
#             num_attention_heads=16,
#             intermediate_size=512,  # 4 x hidden_size
#             hidden_act="gelu",
#             hidden_dropout_prob=0.1,
#             attention_probs_dropout_prob=0.1,
#             layer_norm_eps=1e-12,
#             output_attentions=False,
#             output_hidden_states=False,
#             is_decoder=False, )
#     )),
# infor_node_cls_head=dict(
#     type='ClsHead',
#     input_size=256,
#     num_classes=26,  #
#     loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=False)
# ),

# model training and testing settings
train_cfg=dict(
    # rcg
    keep_dim=False,
    sequence=(),
    # det
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            match_low_quality=True,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    rpn_proposal=dict(
        nms_pre=2000,
        max_per_img=1000,
        nms=dict(type='nms', iou_threshold=0.7),
        min_bbox_size=0),
    rcnn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.5,
            neg_iou_thr=0.5,
            min_pos_iou=0.5,
            match_low_quality=True,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=512,
            pos_fraction=0.25,
            neg_pos_ub=-1,
            add_gt_as_proposals=True),
        mask_size=28,
        pos_weight=-1,
        debug=False),),
test_cfg=dict(
    # rcg
    keep_dim=False,
    sequence=dict(),
    batch_max_length=batch_max_length,
    # det
    rpn=dict(
        nms_pre=2000,
        max_per_img=2000,
        nms=dict(type='nms', iou_threshold=0.7),
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.5,
        nms=dict(type='nms', iou_threshold=0.3),
        max_per_img=100,
        mask_thr_binary=0.5),
    postprocess=dict(
        type="PostMaskRCNNTrie",
        entity_pred = False
    )),

)

training and testing settings

train_cfg = dict() test_cfg = dict()

dataset settings

train_dataset_type = 'DavarMultiDataset' test_dataset_type = 'TextSpotDataset'

File prefix path of the training dataset

train_img_prefixes = [ '/home/mdisk3/bianzhewu/dataset/wildreceipt/' ]

test_img_prefixes='/home/mdisk3/bianzhewu/dataset/wildreceipt/'

Dataset Name

train_ann_files = [ '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_train_without_iob.json' ]

test_ann_files = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json'

img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='DavarLoadImageFromFile',), dict(type='DavarLoadAnnotations', with_bbox=True, # Bounding Rect with_poly_mask=True, # Mask with_poly_bbox=True, # bouding poly with_label=True, # Bboxes' labels with_entity_label=True, with_care=True, # Ignore or not with_text=True, # Transcription with_cbbox=False, # Character bounding text_profile=dict(text_max_length=batch_max_length, sensitive='same', filtered=False) ), dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='DavarRandomCrop', instance_key='gt_bboxes'), dict(type='RandomRotate', angles=[-15, 15], borderValue=(0, 0, 0)), dict(type='DavarResize', img_scale=[(512, 512)], multiscale_mode='value', keep_ratio=True), dict(type='Pad', size_divisor=32), dict(type='DavarDefaultFormatBundle'), dict(type='DavarCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_texts', 'gt_masks','gt_entity_labels']), ] test_pipeline = [ dict(type='DavarLoadImageFromFile',), dict( type='MultiScaleFlipAug', img_scale=(512, 512), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='DavarCollect', keys=['img',]), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=0, sampler=dict( type='DistBatchBalancedSampler', # BatchBalancedSampler and DistBatchBalancedSampler mode=1, # model 0: Balance in batch, calculate the epoch according to the first iterative data set # model 1: Balance in batch, calculate the epoch according to the last iterative data set # model 2: Balance in batch, record unused data # model -1: Each dataset is directly connected and shuffled ), train=dict( type=train_dataset_type, batch_ratios=['1.0'], dataset=dict( type=test_dataset_type, ann_file=train_ann_files, img_prefix=train_img_prefixes, test_mode=False, 
pipeline=train_pipeline) ), val=dict( type=test_dataset_type, ann_file=test_ann_files, img_prefix=test_img_prefixes, pipeline=test_pipeline, # classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt' ), test=dict( type=test_dataset_type, ann_file=test_ann_files, img_prefix=test_img_prefixes, pipeline=test_pipeline, # classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt' ))

optimizer

find_unused_parameters = True

optimizer = dict(type='AdamW', betas=(0.9, 0.999), eps=1e-8, lr=1e-3, weight_decay=0)

optimizer = dict(type='Adadelta', lr=1.0, weight_decay=1e-5) optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=200, warmup_ratio=1.0 / 3, step=[40, 80, 120]) runner = dict(type='EpochBasedRunner', max_epochs=150)

checkpoint_config = dict(type='DavarCheckpointHook', interval=10, save_mode='general', metric='hmean', filename_tmpl='checkpoint/wildreceipt_{}.pth', save_last=False)

yapf:disable

log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ])

yapf:enable

runtime settings

dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/trie/log/wildreceipt_maskrcnn_e2e_img512_roi_32_200_checkspotter' load_from = None resume_from = None workflow = [('train', 1)]

evaluation = dict( model_type=type, type="DavarEvalHook", interval=1, eval_func_params=dict( # SPECIAL_CHARACTERS="[]+-#$()@=_!?,:;/.%&'">*|<`{~}^\ ", IOU_CONSTRAINT=0.5, AREA_PRECISION_CONSTRAINT=0.5, WORD_SPOTTING=False ), by_epoch=True, eval_mode="general", #eval_mode="lightweight", save_best="hmean", rule='greater', ) 其精度如下: 2023-02-06 12:50:39,579 - davarocr - INFO - Detection evaluation results: Precision: 0.9122613326406027, Recall: 0.7676795278172478, hmean: 0.8337488129154795 2023-02-06 12:50:39,579 - davarocr - INFO - Spotting evaluation results: Precision: 0.5935186387842577, Recall: 0.49945349218493823, hmean: 0.5424382716049382 识别部分差距很大,请问maskrcnn的config文件做过特殊处理吗,为什么近乎相同的config文件会得出差距如此大的结果

谢谢!

ZHEGG avatar Feb 07 '23 11:02 ZHEGG