Transformers-Tutorials
LayoutLMv3 true inference: return corresponding text for detected labels
Hi,
In LayoutLMv3, after true inference, labels and boxes are generated. How can I get the text (word) for a particular label? Do I need to apply OCR inside each detected box, or is there an easier way?
Thanks in advance.
An easy workaround is to keep track of the input data associated with your prediction. Then, given any predicted bounding box, search the input data for a matching input bounding box; when you find a match, the input data gives you the word contained in that box.
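In code, that idea could look roughly like this (just a sketch; input_words and input_boxes stand for whatever word and box lists you fed to the processor):

def word_for_box(predicted_box, input_words, input_boxes):
    # return the input word whose bounding box matches the predicted one
    for word, box in zip(input_words, input_boxes):
        if list(box) == list(predicted_box):  # exact match; an IoU/tolerance check is more robust
            return word
    return None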
@aditya11ad Hi, were you able to figure it out? I'd also like to know how you ran true inference with the LayoutLMv3 model, because I could only run inference on the train and test documents for which we already have labels.
@Malisha15 Did you find a solution, please?
Hi @Malisha15, sorry for the late reply.
Once you have your trained model ready, you can use this inference code:
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3TokenizerFast, LayoutLMv3Processor, LayoutLMv3ForTokenClassification

labels = ['date', 'invoice_num', 'total', 'others']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}
# the feature extractor keeps its default apply_ocr=True and runs OCR itself;
# apply_ocr=False on the processor means we feed those words and boxes back in ourselves
feature_extractor = LayoutLMv3FeatureExtractor.from_pretrained("microsoft/layoutlmv3-base")
tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("checkpoint-1000")  # the trained model checkpoint
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# --------------------------
def unnormalize_box(bbox, width, height):
return [
width * (bbox[0] / 1000),
height * (bbox[1] / 1000),
width * (bbox[2] / 1000),
height * (bbox[3] / 1000),
]
def prediction(im_path):
image = Image.open(im_path)
image = image.convert("RGB")
encoding_feature_extractor = feature_extractor(image, return_tensors="pt")
words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
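# these words and boxes come from the feature extractor's built-in OCR; the boxes are normalized to the 0-1000 scale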
encoding = processor(image, words, boxes=boxes, return_offsets_mapping=True, return_tensors="pt", truncation=True)
offset_mapping = encoding.pop('offset_mapping')
for k, v in encoding.items():
encoding[k] = v.to(device)
outputs = model(**encoding)
predictions = outputs.logits.argmax(-1).squeeze().tolist()
token_boxes = encoding.bbox.squeeze().tolist()
inp_ids = encoding.input_ids.squeeze().tolist()
inp_words = [tokenizer.decode(i) for i in inp_ids]
width, height = image.size
is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0
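# a token is a subword continuation when its offset mapping does not start at 0; keep predictions only for the first token of each word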
true_predictions = [id2label[pred]
for idx, pred in enumerate(predictions) if not is_subword[idx]]
true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(
token_boxes) if not is_subword[idx]]
true_words = []
for id, i in enumerate(inp_words):
if not is_subword[id]:
true_words.append(i)
else:
true_words[-1] = true_words[-1]+i
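# drop the first and last entries, which correspond to the <s> and </s> special tokens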
true_predictions = true_predictions[1:-1]
true_boxes = true_boxes[1:-1]
true_words = true_words[1:-1]
preds = []
l_words = []
bboxes = []
for i, j in enumerate(true_predictions):
if j != 'others':
preds.append(true_predictions[i])
l_words.append(true_words[i])
bboxes.append(true_boxes[i])
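# merge all words that share the same predicted label into a single string per label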
d = {}
for id, i in enumerate(preds):
if i not in d.keys():
d[i] = l_words[id]
else:
d[i] = d[i]+l_words[id]
d = {k: v.strip() for (k, v) in d.items()}
draw = ImageDraw.Draw(image, "RGBA")
font = ImageFont.load_default()
label2color = {"invoice_num": 'red', "date": 'red',
"total": 'red', "others": 'green'}
#if len(fields) == 0:
# dict_filtered = d
#else:
# dict_filtered = dict((k, d[k]) for k in fields if k in d)
# preds = [i for i in preds if i in fields]
# bboxes = [bboxes[id] for id, i in enumerate(preds) if i in fields]
for prediction, box in zip(preds, bboxes):
draw.rectangle(box, outline=label2color[prediction], fill=(
255, 255, 0, int(0.4 * 255)))
draw.text((box[0]+10, box[1]-10), text=prediction,
fill=label2color[prediction], font=font)
return d, image
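To use it, something like this should work (the path below is just a placeholder for one of your own documents):

result, annotated_image = prediction("invoices/sample_invoice.jpg")  # hypothetical example path
print(result)            # e.g. {'date': '...', 'invoice_num': '...', 'total': '...'}
annotated_image.show()   # the image with the predicted boxes drawn on it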
@aditya11ad I get NameError: name 'fields' is not defined. Can you help me with this error, please?
Hi @Monta79, you don't need that 'fields' (I have updated the code above). It was only useful when you explicitly pass the desired fields to extract, e.g. fields=['invoice_num', 'total']; otherwise it isn't needed. If you do want that behaviour, see the sketch below.
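For reference, a small sketch of that filtering (the function name and argument order are just illustrative; it would be applied before building the dictionary d):

def filter_fields(preds, l_words, bboxes, fields):
    # keep only the predictions whose label is in the requested fields
    keep = [i for i, p in enumerate(preds) if p in fields]
    return ([preds[i] for i in keep],
            [l_words[i] for i in keep],
            [bboxes[i] for i in keep])

# e.g. preds, l_words, bboxes = filter_fields(preds, l_words, bboxes, ['invoice_num', 'total'])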
Hello @aditya11ad, this is my code:
id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}
feature_extractor = LayoutLMv3FeatureExtractor.from_pretrained( "microsoft/layoutlmv3-base")
tokenizer = LayoutLMv3TokenizerFast.from_pretrained( "microsoft/layoutlmv3-base", apply_ocr=False)
processor = LayoutLMv3Processor.from_pretrained( "microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("/content/drive/MyDrive/SSCL/layoutlm/model")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# --------------------------
def unnormalize_box(bbox, width, height):
return [
width * (bbox[0] / 1000),
height * (bbox[1] / 1000),
width * (bbox[2] / 1000),
height * (bbox[3] / 1000),
]
def prediction(im_path):
image = Image.open(im_path)
image = image.convert("RGB")
encoding_feature_extractor = feature_extractor(image, return_tensors="pt")
words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
# Assuming processor is defined and example contains the necessary data
encoding = processor(
image, words, boxes=boxes, return_tensors="pt",
truncation=True, stride=128, padding="max_length",
max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True
)
offset_mapping = encoding.pop('offset_mapping')
overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
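# overflow_to_sample_mapping tells which original sample each 512-token window came from (all zeros here, since there is a single image)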
# Convert pixel_values to a PyTorch tensor
# Convert the list of numpy.ndarrays to a single numpy.ndarray and then to a PyTorch tensor
pixel_values_list = encoding['pixel_values']
pixel_values_array = np.stack(pixel_values_list) # Use np.stack() to convert list of arrays to a single array
encoding['pixel_values'] = torch.tensor(pixel_values_array)
print(encoding['pixel_values'].shape)
# change the shape of pixel values
x = []
for i in range(0, len(encoding['pixel_values'])):
x.append(encoding['pixel_values'][i])
x = torch.stack(x)
encoding['pixel_values'] = x
print(encoding['pixel_values'].shape)
# Convert non-tensor items to PyTorch tensors if necessary
for k, v in encoding.items():
if not isinstance(v, torch.Tensor):
encoding[k] = torch.tensor(v)
# Print the shapes of all items in the encoding dictionary
for k, v in encoding.items():
print(f"{k}: {v.shape}")
with torch.no_grad():
outputs = model(**encoding)
predictions = outputs.logits.argmax(-1).squeeze().tolist()
token_boxes = encoding.bbox.squeeze().tolist()
inp_ids = encoding.input_ids.squeeze().tolist()
inp_words = [tokenizer.decode(i) for i in inp_ids]
width, height = image.size
is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0
true_predictions = [id2label[pred]
for idx, pred in enumerate(predictions) if not is_subword[idx]]
true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(
token_boxes) if not is_subword[idx]]
true_words = []
for id, i in enumerate(inp_words):
if not is_subword[id]:
true_words.append(i)
else:
true_words[-1] = true_words[-1]+i
true_predictions = true_predictions[1:-1]
true_boxes = true_boxes[1:-1]
true_words = true_words[1:-1]
preds = []
l_words = []
bboxes = []
for i, j in enumerate(true_predictions):
if j != 'others':
preds.append(true_predictions[i])
l_words.append(true_words[i])
bboxes.append(true_boxes[i])
d = {}
for id, i in enumerate(preds):
if i not in d.keys():
d[i] = l_words[id]
else:
d[i] = d[i]+l_words[id]
d = {k: v.strip() for (k, v) in d.items()}
draw = ImageDraw.Draw(image, "RGBA")
font = ImageFont.load_default()
label2color = {
"other": "black",
"name_key": "red",
"ben_name": "orange",
"add_key": "brown",
"ben_add": "yellow",
"invoice_key": "blue",
"invoice_no": "violet",
"amount_key": "green",
"total_amount": "pink",
}
for prediction, box in zip(preds, bboxes):
draw.rectangle(box, outline=label2color[prediction], fill=(
255, 255, 0, int(0.4 * 255)))
draw.text((box[0]+10, box[1]-10), text=prediction,
fill=label2color[prediction], font=font)
return d, image
im_path = "/content/drive/MyDrive/SSCL/test/1045_pdf-375.jpg"
# Run prediction on the image
result, annotated_image = prediction(im_path)
# Save the annotated image
annotated_image.save("/content/drive/MyDrive/SSCL/layoutlm/annotated_1045_pdf-375.jpg")
# Display the annotated image
annotated_image.show()
But I am getting the error below. Could you please help me with it?
/usr/local/lib/python3.10/dist-packages/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py:30: FutureWarning: The class LayoutLMv3FeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use LayoutLMv3ImageProcessor instead.
warnings.warn(
torch.Size([2, 3, 224, 224])
torch.Size([2, 3, 224, 224])
input_ids: torch.Size([2, 512])
attention_mask: torch.Size([2, 512])
bbox: torch.Size([2, 512, 4])
pixel_values: torch.Size([2, 3, 224, 224])
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:1052: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.
warnings.warn(
ValueError Traceback (most recent call last)
1 frames
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()