Transformers-Tutorials
Transformers-Tutorials copied to clipboard
Table transformer : Extract information from detected table
Hi bro @NielsRogge, the information extracted from the detected table is not accurate.
The table itself is found correctly:
The output I get for the above table:
Max number of columns: 4
['OEAMCGOAEAR', 'CaQ .', 'Darosit', '']
['Balect', '0.0d', '0 . 00', '119.08']
['0i-Jan 2019', 'PRCR/843', '', '']
['0_-Jan-2019', 'DRCR 444', '100-00', '219']
['2019', 'ERcdeay', '100-00', '']
['ane 2019', '@Rcn/aas', '100_00', '1.9 03']
['2019', 'EFTAIC BaK RSINrRD BE;', 'J0j', '100122 .']
['02-Jan-2019', 'PRCR 095', '100.00', '10,022-']
['20| 0', 'PRCR/9', '100-00', '922 .74']
['Jen-2019', 'CrdR /Yokxa-MODIR Tower', '500 _', '']
['07-Jan-?019', 'EET/IC/BANK AS14/FRD BEET;', '913 .6.', '']
['Di-Jan-7019', 'Cade *orie #odit Totrd', '0oo-00', '"413']
['10-Jan-2019', 'CD? /Kor KODIA TC3r3', 'S0o.0q', '']
['', '', '', '']
As you can see, the words in the extracted information differ from the words in the table image.
The code I used for this:
import numpy as np
import csv
import easyocr
from tqdm.auto import tqdm
# Build the English OCR reader a single time; creating it loads the model into memory.
reader = easyocr.Reader(["en"])
def apply_ocr(cell_coordinates):
    """OCR every table cell and return the recognised text, row by row.

    Relies on two module-level names: ``cropped_table`` (the table image,
    cropped per cell via ``.crop(...)``) and ``reader`` (the OCR reader whose
    ``readtext`` is called on each cell crop).

    Args:
        cell_coordinates: Iterable of rows. Each row is indexed with
            ``"cells"`` to get its cells, and each cell is indexed with
            ``"cell"`` to get the crop box passed to ``cropped_table.crop``.

    Returns:
        A dict mapping the row index to a list of cell strings. Every row is
        padded with ``""`` so that all rows have the same number of columns.
    """
    # let's OCR row by row
    data = {}
    max_num_columns = 0
    for idx, row in enumerate(tqdm(cell_coordinates)):
        row_text = []
        for cell in row["cells"]:
            # crop cell out of image
            cell_image = np.array(cropped_table.crop(cell["cell"]))
            # apply OCR
            result = reader.readtext(cell_image)
            # Always record one entry per cell. Joining an empty result gives
            # "", which keeps the following cells in their own columns instead
            # of letting them slide left when a cell has no recognised text.
            row_text.append(" ".join(fragment[1] for fragment in result))
        max_num_columns = max(max_num_columns, len(row_text))
        data[idx] = row_text

    print("Max number of columns:", max_num_columns)

    # pad rows which don't have max_num_columns elements
    # to make sure all rows have the same number of columns
    for row_text in data.values():
        row_text.extend([""] * (max_num_columns - len(row_text)))

    return data
# Run OCR over the detected cells and print the extracted rows, one per line.
data = apply_ocr(cell_coordinates)
for row_data in data.values():
    print(row_data)