DAVAR-Lab-OCR
DAVAR-Lab-OCR copied to clipboard
question about maskrcnn config and trie config
您好!我在使用maskrcnn的config作用于wildreceipt数据集时,config文件如下 """ ####################################################################################################
Copyright Info : Copyright (c) Davar Lab @ Hikvision Research Institute. All rights reserved.
Filename : mask_rcnn_r50_r32_e2e_finetune_ic13.py
Abstract : Model settings for mask rcnn spotter end-to-end finetune on realdata.
Current Version: 1.0.0
Date : 2021-06-24
###################################################################################################### """ _base_ = "./__base__.py" batch_max_length = 60
model = dict( rcg_roi_extractor=dict( type='MaskRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=(32, 200), sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32], _delete_=True, ), rcg_backbone=dict( type='ResNet32', input_channel=256, output_channel=256, _delete_=True, ), rcg_sequence_module=dict( rnn_modules=[ dict( type='BidirectionalLSTM', input_size=256, hidden_size=256, output_size=256, with_linear=True, bidirectional=True,), dict( type='BidirectionalLSTM', input_size=256, hidden_size=256, output_size=256, with_linear=True, bidirectional=True,), ]), rcg_sequence_head=dict( input_size=256, converter=dict( type='AttnLabelConverter', with_unknown=True, ), ), )
# File prefix path of the training dataset
img_prefixes = [ '/home/mdisk3/bianzhewu/dataset/wildreceipt/', ]
# Dataset Name
ann_files = [ '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_train_without_iob.json', ]
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [ dict(type='DavarResize', img_scale=[(512, 512)], multiscale_mode='range', keep_ratio=True), ]
test_pipeline = [ dict(type='DavarLoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(512, 512), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='DavarCollect', keys=['img']), ]) ]
data = dict( samples_per_gpu=2, workers_per_gpu=0, sampler=dict( type='DistBatchBalancedSampler', # BatchBalancedSampler and DistBatchBalancedSampler mode=1, # model 0: Balance in batch, calculate the epoch according to the first iterative data set # model 1: Balance in batch, calculate the epoch according to the last iterative data set # model 2: Balance in batch, record unused data # model -1: Each dataset is directly connected and shuffled ), train=dict( batch_ratios=['1.0'], dataset=dict( ann_file=ann_files, img_prefix=img_prefixes, ) ), val=dict( ann_file='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json', img_prefix='/home/mdisk3/bianzhewu/dataset/wildreceipt/', ), test=dict( ann_file='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json', img_prefix='/home/mdisk3/bianzhewu/dataset/wildreceipt/', pipeline=test_pipeline ) )
optimizer = dict(type='Adadelta', lr=1.0, weight_decay=1e-5) lr_config = dict(step=[40, 80, 120]) runner = dict(max_epochs=150) checkpoint_config = dict(interval=10, filename_tmpl='checkpoint/mask_rcnn_r50_r32_e2e_finetune_epoch_{}.pth') work_dir = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_spotting/mask_rcnn_spot/workspace/wildreceipt_img512_32_200_len60/log/' load_from = None evaluation = dict( interval=1, )
得到的精度如下: 2023-02-06 12:45:04,094 - davarocr - INFO - Detection evaluation results: Precision: 0.8855601146647181, Recall: 0.8610230626297956, hmean: 0.8731192330072876 2023-02-06 12:45:04,094 - davarocr - INFO - Spotting evaluation results: Precision: 0.6957450396267776, Recall: 0.6764673734834408, hmean: 0.6859707944248941
但是当我想在trie中使用maskrcnn,即将maskrcnn的config参数copy至trie中复现其检测识别的结果时(先取消信息抽取模块),其config文件如下: character = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/flatten_dict.txt' batch_max_length = 60 type="SPOTTER"
model = dict( type='MaskRCNN_Trie', pretrained=None, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=-1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.1, 0.2, 0.4, 0.8, 1.6, 3.2], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=1, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=1, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
rcg_roi_extractor=dict(
type='MaskRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=(32, 200), sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32],
# _delete_=True,
),
rcg_backbone=dict(
type='ResNet32',
input_channel=256,
output_channel=512,
# _delete_=True,
),
rcg_transformation=None,
rcg_neck=None,
rcg_sequence_module=dict(
type='CascadeRNN',
rnn_modules=[
dict(
type='BidirectionalLSTM',
input_size=512,
hidden_size=256,
output_size=256,
with_linear=True,
bidirectional=True,),
dict(
type='BidirectionalLSTM',
input_size=256,
hidden_size=256,
output_size=512,
with_linear=True,
bidirectional=True,), ]),
rcg_sequence_head=dict(
type='AttentionHead',
input_size=512,
hidden_size=256,
batch_max_length=batch_max_length,
converter=dict(type='AttnLabelConverter',character=character,use_cha_eos=True,with_unknown=True),
loss_att=dict(type='StandardCrossEntropyLoss',ignore_index=0,loss_weight=1.0,reduction='mean'),
),
# infor_context_module=dict(
# type='MultiModalContextModule',
# textual_embedding=dict(
# type='NodeEmbedding',
# dropout_ratio=0.1,
# merge_type='Sum',
# pos_embedding=dict(
# type='PositionEmbedding2D',
# max_position_embeddings=64,
# embedding_dim=256,
# width_embedding=False,
# height_embedding=False,
# ),
# sentence_embedding=dict(
# type='SentenceEmbeddingCNN',
# embedding_dim=256,
# kernel_sizes=[3, 5, 7, 9]
# ),
# ),
# multimodal_fusion_module=dict(
# type='MultiModalFusion',
# merge_type='Weighted',
# visual_dim=[256],
# semantic_dim=[256],
# ),
# textual_relation_module=dict(
# type='BertEncoder',
# config=dict(
# hidden_size=256,
# num_hidden_layers=2,
# num_attention_heads=16,
# intermediate_size=512, # 4 x hidden_size
# hidden_act="gelu",
# hidden_dropout_prob=0.1,
# attention_probs_dropout_prob=0.1,
# layer_norm_eps=1e-12,
# output_attentions=False,
# output_hidden_states=False,
# is_decoder=False, )
# )),
# infor_node_cls_head=dict(
# type='ClsHead',
# input_size=256,
# num_classes=26, #
# loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=False)
# ),
# model training and testing settings
train_cfg=dict(
# rcg
keep_dim=False,
sequence=(),
# det
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=2000,
max_per_img=1000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),),
test_cfg=dict(
# rcg
keep_dim=False,
sequence=dict(),
batch_max_length=batch_max_length,
# det
rpn=dict(
nms_pre=2000,
max_per_img=2000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
score_thr=0.5,
nms=dict(type='nms', iou_threshold=0.3),
max_per_img=100,
mask_thr_binary=0.5),
postprocess=dict(
type="PostMaskRCNNTrie",
entity_pred = False
)),
)
# training and testing settings
train_cfg = dict() test_cfg = dict()
# dataset settings
train_dataset_type = 'DavarMultiDataset' test_dataset_type = 'TextSpotDataset'
# File prefix path of the training dataset
train_img_prefixes = [ '/home/mdisk3/bianzhewu/dataset/wildreceipt/' ]
test_img_prefixes='/home/mdisk3/bianzhewu/dataset/wildreceipt/'
# Dataset Name
train_ann_files = [ '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_train_without_iob.json' ]
test_ann_files = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json'
img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='DavarLoadImageFromFile',), dict(type='DavarLoadAnnotations', with_bbox=True, # Bounding Rect with_poly_mask=True, # Mask with_poly_bbox=True, # bouding poly with_label=True, # Bboxes' labels with_entity_label=True, with_care=True, # Ignore or not with_text=True, # Transcription with_cbbox=False, # Character bounding text_profile=dict(text_max_length=batch_max_length, sensitive='same', filtered=False) ), dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='DavarRandomCrop', instance_key='gt_bboxes'), dict(type='RandomRotate', angles=[-15, 15], borderValue=(0, 0, 0)), dict(type='DavarResize', img_scale=[(512, 512)], multiscale_mode='value', keep_ratio=True), dict(type='Pad', size_divisor=32), dict(type='DavarDefaultFormatBundle'), dict(type='DavarCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_texts', 'gt_masks','gt_entity_labels']), ] test_pipeline = [ dict(type='DavarLoadImageFromFile',), dict( type='MultiScaleFlipAug', img_scale=(512, 512), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='DavarCollect', keys=['img',]), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=0, sampler=dict( type='DistBatchBalancedSampler', # BatchBalancedSampler and DistBatchBalancedSampler mode=1, # model 0: Balance in batch, calculate the epoch according to the first iterative data set # model 1: Balance in batch, calculate the epoch according to the last iterative data set # model 2: Balance in batch, record unused data # model -1: Each dataset is directly connected and shuffled ), train=dict( type=train_dataset_type, batch_ratios=['1.0'], dataset=dict( type=test_dataset_type, ann_file=train_ann_files, img_prefix=train_img_prefixes, test_mode=False, 
pipeline=train_pipeline) ), val=dict( type=test_dataset_type, ann_file=test_ann_files, img_prefix=test_img_prefixes, pipeline=test_pipeline, # classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt' ), test=dict( type=test_dataset_type, ann_file=test_ann_files, img_prefix=test_img_prefixes, pipeline=test_pipeline, # classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt' ))
# optimizer
find_unused_parameters = True
# optimizer = dict(type='AdamW', betas=(0.9, 0.999), eps=1e-8, lr=1e-3, weight_decay=0)
optimizer = dict(type='Adadelta', lr=1.0, weight_decay=1e-5) optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=200, warmup_ratio=1.0 / 3, step=[40, 80, 120]) runner = dict(type='EpochBasedRunner', max_epochs=150)
checkpoint_config = dict(type='DavarCheckpointHook', interval=10, save_mode='general', metric='hmean', filename_tmpl='checkpoint/wildreceipt_{}.pth', save_last=False)
# yapf:disable
log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ])
# yapf:enable
# runtime settings
dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/trie/log/wildreceipt_maskrcnn_e2e_img512_roi_32_200_checkspotter' load_from = None resume_from = None workflow = [('train', 1)]
evaluation = dict( model_type=type, type="DavarEvalHook", interval=1, eval_func_params=dict( # SPECIAL_CHARACTERS="[]+-#$()@=_!?,:;/.%&'">*|<`{~}^\ ", IOU_CONSTRAINT=0.5, AREA_PRECISION_CONSTRAINT=0.5, WORD_SPOTTING=False ), by_epoch=True, eval_mode="general", #eval_mode="lightweight", save_best="hmean", rule='greater', ) 其精度如下: 2023-02-06 12:50:39,579 - davarocr - INFO - Detection evaluation results: Precision: 0.9122613326406027, Recall: 0.7676795278172478, hmean: 0.8337488129154795 2023-02-06 12:50:39,579 - davarocr - INFO - Spotting evaluation results: Precision: 0.5935186387842577, Recall: 0.49945349218493823, hmean: 0.5424382716049382 识别部分差距很大,请问maskrcnn的config文件做过特殊处理吗,为什么近乎相同的config文件会得出差距如此大的结果
谢谢!