DAVAR-Lab-OCR
DAVAR-Lab-OCR copied to clipboard
TRIE test error when not using GT
Thank you for releasing the nice code!
I tried e2e_trie without using GT_bboxes for inference and ran into this problem when computing the F1-score.
I think the number of predicted boxes and the number of GT_bboxes are not equal, which leads to this problem.
I wonder if you have an example of running inference without GT_bboxes.
Hoping for your reply, thank you!
We are sorry that the colleague who was responsible for maintaining the relevant algorithm has left. Other colleagues will be responsible for the algorithm in the future, but there may be some delay. We will improve the demo for this part as soon as possible.
Thank you for releasing the nice code! I try e2e_trie but not use GT_bboxes for inference and have this problem when computing the f1-score
I think the number of predict_boxes and GT_bboxes is not equal that lead to this problem. I wonder if you have an example of not using GT_bboxes to infer. Hope your reply, thank you!
Hello, have you fixed this problem? I have some problems trying e2e_trie without GT; would you mind if I refer to your model and config file?
Thank you for releasing the nice code! I try e2e_trie but not use GT_bboxes for inference and have this problem when computing the f1-score
I think the number of predict_boxes and GT_bboxes is not equal that lead to this problem. I wonder if you have an example of not using GT_bboxes to infer. Hope your reply, thank you!
hello,have you fixed this problem? I have some problem try e2e_trie without gt, would you mind if I refer to your model and config file?
Yes, I reimplemented the model based on Mask R-CNN, and I can share my config file.
character = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/flatten_dict.txt' batch_max_length = 60 type="SPOTTER"
model = dict( type='MaskRCNN_Trie', pretrained=None, backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=-1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.1, 0.2, 0.4, 0.8, 1.6, 3.2], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=1, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=1, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
rcg_roi_extractor=dict(
type='MaskRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=(32, 200), sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32],
# _delete_=True,
),
rcg_backbone=dict(
type='ResNet32',
input_channel=256,
output_channel=256,
# _delete_=True,
),
rcg_transformation=None,
rcg_neck=None,
rcg_sequence_module=dict(
type='CascadeRNN',
rnn_modules=[
dict(
type='BidirectionalLSTM',
input_size=256,
hidden_size=256,
output_size=256,
with_linear=True,
bidirectional=True,),
dict(
type='BidirectionalLSTM',
input_size=256,
hidden_size=256,
output_size=256,
with_linear=True,
bidirectional=True,), ]),
rcg_sequence_head=dict(
type='AttentionHead',
input_size=256,
hidden_size=256,
batch_max_length=batch_max_length,
converter=dict(type='AttnLabelConverter',character=character,use_cha_eos=True,with_unknown=True),
loss_att=dict(type='StandardCrossEntropyLoss',ignore_index=0,loss_weight=1.0,reduction='mean'),
),
infor_context_module=dict(
type='MultiModalContextModule',
textual_embedding=dict(
type='NodeEmbedding',
dropout_ratio=0.1,
merge_type='Sum',
pos_embedding=dict(
type='PositionEmbedding2D',
max_position_embeddings=64,
embedding_dim=256,
width_embedding=False,
height_embedding=False,
),
sentence_embedding=dict(
type='SentenceEmbeddingCNN',
embedding_dim=256,
kernel_sizes=[3, 5, 7, 9]
),
),
multimodal_fusion_module=dict(
type='MultiModalFusion',
merge_type='Weighted',
visual_dim=[256],
semantic_dim=[256],
),
textual_relation_module=dict(
type='BertEncoder',
config=dict(
hidden_size=256,
num_hidden_layers=2,
num_attention_heads=16,
intermediate_size=512, # 4 x hidden_size
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
layer_norm_eps=1e-12,
output_attentions=False,
output_hidden_states=False,
is_decoder=False, )
)),
infor_node_cls_head=dict(
type='ClsHead',
input_size=256,
num_classes=26, #
loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=False)
),
# model training and testing settings
train_cfg=dict(
# rcg
keep_dim=False,
sequence=(),
# det
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=2000,
max_per_img=1000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),),
test_cfg=dict(
# rcg
keep_dim=False,
sequence=dict(),
batch_max_length=batch_max_length,
# det
rpn=dict(
nms_pre=2000,
max_per_img=2000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
score_thr=0.5,
nms=dict(type='nms', iou_threshold=0.3),
max_per_img=100,
mask_thr_binary=0.5),
postprocess=dict(
type="PostMaskRCNNTrie",
entity_pred = True
)),
)
training and testing settings
train_cfg = dict() test_cfg = dict()
dataset settings
train_dataset_type = 'DavarMultiDataset' test_dataset_type = 'E2E_IE_Dataset'
File prefix path of the training dataset
train_img_prefixes = [ '/home/mdisk3/bianzhewu/dataset/wildreceipt/' ]
test_img_prefixes='/home/mdisk3/bianzhewu/dataset/wildreceipt/'
Dataset Name
train_ann_files = [ '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_train_without_iob.json' ]
test_ann_files = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/convert_test_without_iob.json'
img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='DavarLoadImageFromFile',), dict(type='DavarLoadAnnotations', with_bbox=True, # Bounding Rect with_poly_mask=True, # Mask with_poly_bbox=True, # bouding poly with_label=True, # Bboxes' labels with_entity_label=True, with_care=True, # Ignore or not with_text=True, # Transcription with_cbbox=False, # Character bounding text_profile=dict(text_max_length=batch_max_length, sensitive='same', filtered=False) ), dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='DavarRandomCrop', instance_key='gt_bboxes'), dict(type='RandomRotate', angles=[-15, 15], borderValue=(0, 0, 0)), dict(type='DavarResize', img_scale=[(768, 768)], multiscale_mode='value', keep_ratio=True), dict(type='Pad', size_divisor=32), dict(type='DavarDefaultFormatBundle'), dict(type='DavarCollect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_texts', 'gt_masks','gt_entity_labels']), ] test_pipeline = [ dict(type='DavarLoadImageFromFile',), dict( type='MultiScaleFlipAug', img_scale=(1350, 950), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='DavarCollect', keys=['img',]), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=0, sampler=dict( type='DistBatchBalancedSampler', # BatchBalancedSampler and DistBatchBalancedSampler mode=1, # model 0: Balance in batch, calculate the epoch according to the first iterative data set # model 1: Balance in batch, calculate the epoch according to the last iterative data set # model 2: Balance in batch, record unused data # model -1: Each dataset is directly connected and shuffled ), train=dict( type=train_dataset_type, batch_ratios=['1.0'], dataset=dict( type=test_dataset_type, ann_file=train_ann_files, img_prefix=train_img_prefixes, test_mode=False, 
pipeline=train_pipeline) ), val=dict( type=test_dataset_type, ann_file=test_ann_files, img_prefix=test_img_prefixes, pipeline=test_pipeline, classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt' ), test=dict( type=test_dataset_type, ann_file=test_ann_files, img_prefix=test_img_prefixes, pipeline=test_pipeline, classes='/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/datalist/wildreceipt/class_list.txt' ))
optimizer
find_unused_parameters = True
optimizer = dict(type='AdamW', betas=(0.9, 0.999), eps=1e-8, lr=1e-3, weight_decay=0)
optimizer = dict(type='Adadelta', lr=1.0, weight_decay=1e-5) optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=200, warmup_ratio=1.0 / 3, step=[40,80,120]) runner = dict(type='EpochBasedRunner', max_epochs=150)
checkpoint_config = dict(type='DavarCheckpointHook', interval=10, save_mode='general', metric='hmean', filename_tmpl='checkpoint/wildreceipt_{}.pth', save_last=False)
yapf:disable
log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ])
yapf:enable
runtime settings
dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = '/home/mdisk3/bianzhewu/DAVAR-Lab-OCR-dev/demo/text_ie/trie/log/wildreceipt_maskrcnn_e2e_train768test1350_90_roi_32_200_len_60_withie' load_from = None resume_from = None workflow = [('train', 1)]
evaluation = dict(
save_best='macro_f1',
model_type='SPOTTER',
interval=1,
metric='macro_f1',
rule="greater",
metric_options=dict(
macro_f1=dict(
ignores=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25])),
priority='HIGH')
evaluation = dict( model_type=type, type="DavarEvalHook", interval=1, eval_func_params=dict( # SPECIAL_CHARACTERS="[]+-#$()@=_!?,:;/.%&'">*|<`{~}^\ ", IOU_CONSTRAINT=0.5, AREA_PRECISION_CONSTRAINT=0.5, WORD_SPOTTING=False ), by_epoch=True, eval_mode="general", #eval_mode="lightweight", save_best="hmean", rule='greater', )
我已收到。