Thank you for releasing the nice code! I tried e2e_trie without using GT_bboxes for inference and ran into this problem when computing the F1-score. I think the number of predict_boxes and GT_bboxes is not equal, which leads to this problem. I wonder if you have an example of running inference without GT_bboxes. Looking forward to your reply, thank you!

Sep 07 '22 17:09 ZHEGG

We are sorry, but the colleague who was responsible for maintaining this algorithm has left. Other colleagues will take over the algorithm in the future, but responses may be delayed. We will improve the demo for this part as soon as possible.

Sep 14 '22 08:09 qiaoliang6

Thank you for releasing the nice code! I tried e2e_trie without using GT_bboxes for inference and ran into this problem when computing the F1-score. I think the number of predict_boxes and GT_bboxes is not equal, which leads to this problem. I wonder if you have an example of running inference without GT_bboxes. Looking forward to your reply, thank you!

Hello, have you fixed this problem? I have some problems when trying e2e_trie without GT. Would you mind if I refer to your model and config file?

May 10 '23 14:05 Extra-rich

Thank you for releasing the nice code! I tried e2e_trie without using GT_bboxes for inference and ran into this problem when computing the F1-score. I think the number of predict_boxes and GT_bboxes is not equal, which leads to this problem. I wonder if you have an example of running inference without GT_bboxes. Looking forward to your reply, thank you!

Hello, have you fixed this problem? I have some problems when trying e2e_trie without GT. Would you mind if I refer to your model and config file?

Yes, I reimplemented the model based on Mask R-CNN, and I can share my config file.

# Path to the character dictionary; passed to the recognition head's label converter.
character = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/flatten_dict.txt'
# Maximum text length; reused by the recognition head, test_cfg and the annotation loader.
batch_max_length = 60
# Model type tag, later passed as `model_type` to the evaluation hook.
# NOTE: this intentionally keeps the original name even though it shadows the builtin `type`,
# because the evaluation config further down refers to it by this name.
type = "SPOTTER"

# Model definition for the 'MaskRCNN_Trie' spotter.
# Reads the module-level `character` and `batch_max_length` values.
model = dict(
    type='MaskRCNN_Trie',
    pretrained=None,

    # ---- detection: backbone / neck / RPN / RoI heads ----
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.1, 0.2, 0.4, 0.8, 1.6, 3.2],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=1,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),

    # ---- recognition branch (rcg_* keys) ----
    rcg_roi_extractor=dict(
        type='MaskRoIExtractor',
        roi_layer=dict(type='RoIAlign', output_size=(32, 200), sampling_ratio=0),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32],
        # _delete_=True,
    ),
    rcg_backbone=dict(
        type='ResNet32',
        input_channel=256,
        output_channel=256,
        # _delete_=True,
    ),
    rcg_transformation=None,
    rcg_neck=None,
    rcg_sequence_module=dict(
        type='CascadeRNN',
        # Two stacked BiLSTM layers with identical settings.
        rnn_modules=[
            dict(
                type='BidirectionalLSTM',
                input_size=256,
                hidden_size=256,
                output_size=256,
                with_linear=True,
                bidirectional=True,
            ),
            dict(
                type='BidirectionalLSTM',
                input_size=256,
                hidden_size=256,
                output_size=256,
                with_linear=True,
                bidirectional=True,
            ),
        ]),
    rcg_sequence_head=dict(
        type='AttentionHead',
        input_size=256,
        hidden_size=256,
        batch_max_length=batch_max_length,
        converter=dict(
            type='AttnLabelConverter',
            character=character,
            use_cha_eos=True,
            with_unknown=True),
        loss_att=dict(
            type='StandardCrossEntropyLoss',
            ignore_index=0,
            loss_weight=1.0,
            reduction='mean'),
    ),

    # ---- context module and node classification head (infor_* keys) ----
    infor_context_module=dict(
        type='MultiModalContextModule',
        textual_embedding=dict(
            type='NodeEmbedding',
            dropout_ratio=0.1,
            merge_type='Sum',
            pos_embedding=dict(
                type='PositionEmbedding2D',
                max_position_embeddings=64,
                embedding_dim=256,
                width_embedding=False,
                height_embedding=False,
            ),
            sentence_embedding=dict(
                type='SentenceEmbeddingCNN',
                embedding_dim=256,
                kernel_sizes=[3, 5, 7, 9],
            ),
        ),
        multimodal_fusion_module=dict(
            type='MultiModalFusion',
            merge_type='Weighted',
            visual_dim=[256],
            semantic_dim=[256],
        ),
        textual_relation_module=dict(
            type='BertEncoder',
            config=dict(
                hidden_size=256,
                num_hidden_layers=2,
                num_attention_heads=16,
                intermediate_size=512,  # 2 x hidden_size
                hidden_act="gelu",
                hidden_dropout_prob=0.1,
                attention_probs_dropout_prob=0.1,
                layer_norm_eps=1e-12,
                output_attentions=False,
                output_hidden_states=False,
                is_decoder=False,
            ),
        ),
    ),
    infor_node_cls_head=dict(
        type='ClsHead',
        input_size=256,
        num_classes=26,
        loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=False),
    ),

    # ---- model training and testing settings ----
    train_cfg=dict(
        # rcg
        keep_dim=False,
        sequence=(),
        # det
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False),
    ),
    test_cfg=dict(
        # rcg
        keep_dim=False,
        sequence=dict(),
        batch_max_length=batch_max_length,
        # det
        rpn=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.5,
            nms=dict(type='nms', iou_threshold=0.3),
            max_per_img=100,
            mask_thr_binary=0.5),
        postprocess=dict(
            type="PostMaskRCNNTrie",
            entity_pred=True,
        ),
    ),
)

# training and testing settings
train_cfg = dict()
test_cfg = dict()

# dataset settings
train_dataset_type = 'DavarMultiDataset'
test_dataset_type = 'E2E_IE_Dataset'

# File prefix path of the training dataset
train_img_prefixes = [
    '/home/mdisk3/bianzhewu/dataset/wildreceipt/',
]

test_img_prefixes = '/home/mdisk3/bianzhewu/dataset/wildreceipt/'

# Dataset Name
train_ann_files = [
    '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_train_without_iob.json',
]

test_ann_files = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json'

# Image normalization parameters shared by the train and test pipelines.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)

# Training pipeline: load image and annotations, augment, resize, pad, then collect.
train_pipeline = [
    dict(type='DavarLoadImageFromFile',),
    dict(
        type='DavarLoadAnnotations',
        with_bbox=True,             # Bounding Rect
        with_poly_mask=True,        # Mask
        with_poly_bbox=True,        # Bounding poly
        with_label=True,            # Bboxes' labels
        with_entity_label=True,
        with_care=True,             # Ignore or not
        with_text=True,             # Transcription
        with_cbbox=False,           # Character bounding
        text_profile=dict(text_max_length=batch_max_length, sensitive='same', filtered=False)
    ),
    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DavarRandomCrop', instance_key='gt_bboxes'),
    dict(type='RandomRotate', angles=[-15, 15], borderValue=(0, 0, 0)),
    dict(type='DavarResize', img_scale=[(768, 768)], multiscale_mode='value', keep_ratio=True),
    dict(type='Pad', size_divisor=32),
    dict(type='DavarDefaultFormatBundle'),
    dict(type='DavarCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_texts', 'gt_masks', 'gt_entity_labels']),
]

# Test pipeline: only the image is collected, so no ground-truth boxes are fed to the model.
test_pipeline = [
    dict(type='DavarLoadImageFromFile',),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1350, 950),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='DavarCollect', keys=['img',]),
        ])
]

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=0,
    sampler=dict(
        type='DistBatchBalancedSampler',  # BatchBalancedSampler and DistBatchBalancedSampler
        mode=1,
        # mode 0: Balance in batch, calculate the epoch according to the first iterative data set
        # mode 1: Balance in batch, calculate the epoch according to the last iterative data set
        # mode 2: Balance in batch, record unused data
        # mode -1: Each dataset is directly connected and shuffled
    ),
    train=dict(
        type=train_dataset_type,
        batch_ratios=['1.0'],
        dataset=dict(
            type=test_dataset_type,
            ann_file=train_ann_files,
            img_prefix=train_img_prefixes,
            test_mode=False,
            pipeline=train_pipeline)
    ),
    val=dict(
        type=test_dataset_type,
        ann_file=test_ann_files,
        img_prefix=test_img_prefixes,
        pipeline=test_pipeline,
        classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt'
    ),
    test=dict(
        type=test_dataset_type,
        ann_file=test_ann_files,
        img_prefix=test_img_prefixes,
        pipeline=test_pipeline,
        classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt'
    ))

# optimizer
find_unused_parameters = True

# Alternative optimizer, kept disabled:
# optimizer = dict(type='AdamW', betas=(0.9, 0.999), eps=1e-8, lr=1e-3, weight_decay=0)
optimizer = dict(type='Adadelta', lr=1.0, weight_decay=1e-5)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=1.0 / 3,
    step=[40, 80, 120])
runner = dict(type='EpochBasedRunner', max_epochs=150)

checkpoint_config = dict(
    type='DavarCheckpointHook',
    interval=10,
    save_mode='general',
    metric='hmean',
    filename_tmpl='checkpoint/wildreceipt_{}.pth',
    save_last=False)

# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
    ])
# yapf:enable

# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/trie/log/wildreceipt_maskrcnn_e2e_train768test1350_90_roi_32_200_len_60_withie'
load_from = None
resume_from = None
workflow = [('train', 1)]

# Alternative evaluation setting based on macro F1, kept disabled:
# evaluation = dict(
#     save_best='macro_f1',
#     model_type='SPOTTER',
#     interval=1,
#     metric='macro_f1',
#     rule="greater",
#     metric_options=dict(
#         macro_f1=dict(
#             ignores=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25])),
#     priority='HIGH')

# Active evaluation setting: DavarEvalHook run every epoch, keeping the best 'hmean'.
evaluation = dict(
    # `type` here is the module-level variable assigned near the top of this config
    # ("SPOTTER"), not the Python builtin.
    model_type=type,
    type="DavarEvalHook",
    interval=1,
    eval_func_params=dict(
        # SPECIAL_CHARACTERS="[]+-#$()@=_!?,:;/.%&'">*|<`{~}^\ ",
        IOU_CONSTRAINT=0.5,
        AREA_PRECISION_CONSTRAINT=0.5,
        WORD_SPOTTING=False
    ),
    by_epoch=True,
    eval_mode="general",
    # eval_mode="lightweight",
    save_best="hmean",
    rule='greater',
)

May 10 '23 15:05 ZHEGG

我已收到。

May 10 '23 15:05 johnson-magic

DAVAR-Lab-OCR
DAVAR-Lab-OCR copied to clipboard

TRIE test error, not use GT

training and testing settings

dataset settings

File prefix path of the traning dataset

Dataset Name

optimizer

optimizer = dict(type='AdamW', betas=(0.9, 0.999), eps=1e-8, lr=1e-3, weight_decay=0)

yapf:disable

yapf:enable

runtime settings

evaluation = dict(

save_best='macro_f1',

model_type='SPOTTER',

interval=1,

metric='macro_f1',

rule="greater",

metric_options=dict(

macro_f1=dict(

ignores=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25])),

priority='HIGH')

DAVAR-Lab-OCR DAVAR-Lab-OCR copied to clipboard

TRIE test error, not use GT

training and testing settings

dataset settings

File prefix path of the traning dataset

Dataset Name

optimizer

optimizer = dict(type='AdamW', betas=(0.9, 0.999), eps=1e-8, lr=1e-3, weight_decay=0)

yapf:disable

yapf:enable

runtime settings

evaluation = dict(

save_best='macro_f1',

model_type='SPOTTER',

interval=1,

metric='macro_f1',

rule="greater",

metric_options=dict(

macro_f1=dict(

ignores=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25])),

priority='HIGH')

DAVAR-Lab-OCR
DAVAR-Lab-OCR copied to clipboard