Not able to fetch all tables and figures when converting pdf into images
Hi Team,
I am using layoutparser to detect tables and figures. When I run the code on an individual PNG image file, the model detects tables and figures correctly. However, when I use the code below to convert a PDF into images and detect tables in each page image, I either do not get the full image/table or sometimes get duplicate tables as well.
Can you please advise on how to refine the code below and what I can try to resolve this issue? Thank you!
!pip install layoutparser
!pip install opencv-python numpy matplotlib
install detectron2:
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip3 install pdf2image
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!apt-get install poppler-utils
import os
import re
import shutil

import cv2
import layoutparser as lp
from pdf2image import convert_from_path
PubLayNet
# Layout detector built from the PubLayNet Faster R-CNN (R50-FPN, 3x) config.
# The extra_config pair sets detectron2's MODEL.ROI_HEADS.SCORE_THRESH_TEST
# option to 0.81; label_map translates class indices 0-4 into readable names.
model = lp.models.Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.81],
    label_map=dict(enumerate(["Text", "Title", "List", "Table", "Figure"])),
)
def save_detections(table_blocks, image, image_name, save_dir='/content/'):
    """Crop every detected block out of ``image`` and save each crop as a JPEG.

    Args:
        table_blocks: Iterable of detected layout blocks. Each element must
            expose ``block.x_1``, ``block.y_1``, ``block.x_2`` and
            ``block.y_2`` pixel coordinates.
        image: Page image as an array indexed ``[row, column, ...]``.
        image_name: Base name (without extension) used for the output files.
        save_dir: Directory the crops are written to.

    Each crop is displayed with ``cv2_imshow`` and written to
    ``<save_dir>/<image_name>_<index>.jpg``.
    """
    for j, detection in enumerate(table_blocks):
        # Read the coordinates of the j-th block. Indexing block 0 on every
        # iteration saved the first table once per detection, which produced
        # duplicate crops and dropped every other table.
        box = detection.block
        # Clamp to zero: a negative start index would make the slice wrap
        # around to the far edge of the image.
        x_1, y_1 = max(int(box.x_1), 0), max(int(box.y_1), 0)
        x_2, y_2 = max(int(box.x_2), 0), max(int(box.y_2), 0)

        cropped = image[y_1:y_2, x_1:x_2]
        if cropped.size == 0:
            # Degenerate box: there is nothing to show or encode.
            print("Skipped empty crop ", j)
            continue

        cv2_imshow(cropped)
        file_name = image_name + '_' + str(j) + '.jpg'
        status = cv2.imwrite(os.path.join(save_dir, file_name), cropped)
        if status:
            print("Saved ", file_name)
        else:
            print("Failed to save ", file_name)
def inference(images_dir):
    """Detect tables on every ``.jpg`` page image found in ``images_dir``.

    For each image the layout model is run, "Table" blocks that lie inside a
    "Figure" block are dropped, and the remaining tables are ordered left
    column first, then right column, each top to bottom. The crops are
    handed to ``save_detections`` for display and saving.

    Args:
        images_dir: Directory containing the page images.

    Returns:
        A list with one ``lp.Layout`` of table blocks per processed image,
        in natural file-name order (``page_2`` before ``page_10``).
    """

    def _natural_key(name):
        # Split into alternating text/number tokens so numeric parts compare
        # as integers instead of character by character.
        return [int(tok) if tok.isdigit() else tok
                for tok in re.split(r'(\d+)', name)]

    table_blocks_list = []

    # os.listdir returns entries in arbitrary order; sort so pages are
    # processed (and results returned) in page order.
    for file in sorted(os.listdir(images_dir), key=_natural_key):
        if not file.endswith(".jpg"):
            continue

        # Image name without the ".jpg" extension.
        image_name = file[:-4]

        bgr_image = cv2.imread(os.path.join(images_dir, file))
        if bgr_image is None:
            # cv2.imread signals an unreadable file by returning None.
            print("Could not read ", file)
            continue

        # OpenCV loads BGR; reverse the channel axis to feed the model RGB.
        image = bgr_image[..., ::-1]
        layout = model.detect(image)

        # Keep tables, discarding any that sit inside a detected figure.
        figure_blocks = lp.Layout([b for b in layout if b.type == 'Figure'])
        table_blocks = lp.Layout([
            b for b in layout
            if b.type == "Table"
            and not any(b.is_in(b_fig) for b_fig in figure_blocks)
        ])

        # Blocks whose center falls in the left ~half of the page (with a 5%
        # margin past the midline) form the left column.
        w = image.shape[1]
        left_interval = lp.Interval(0, w / 2 * 1.05, axis='x').put_on_canvas(image)
        left_blocks = table_blocks.filter_by(left_interval, center=True)

        # sorted() always yields a new ordered list.
        # NOTE(review): Layout.sort without ``inplace=True`` appears not to
        # sort in place on newer layoutparser releases - confirm.
        left_sorted = sorted(left_blocks, key=lambda b: b.coordinates[1])
        right_sorted = sorted(
            (b for b in table_blocks if b not in left_blocks),
            key=lambda b: b.coordinates[1],
        )

        # Combine both columns and number the blocks in reading order.
        table_blocks = lp.Layout([
            b.set(id=idx) for idx, b in enumerate(left_sorted + right_sorted)
        ])

        # Pass the unflipped BGR array: save_detections writes crops with
        # cv2.imwrite, which expects BGR channel order.
        save_detections(table_blocks, bgr_image, image_name)
        table_blocks_list.append(table_blocks)

    return table_blocks_list
def pdf_inference(pdfName, dpi=100, grayscale=True):
    """Render each page of a PDF to a JPEG and run table detection on them.

    The PDF is resolved relative to the current working directory. Page
    images are written to a ``pdf_images`` sub-directory of the working
    directory, which is deleted and recreated on every call.

    Args:
        pdfName: File name (or path) of the PDF to process.
        dpi: Rendering resolution passed to ``convert_from_path``.
            NOTE(review): 100 dpi is fairly coarse; a higher value may help
            the detector find complete tables - verify on your documents.
        grayscale: Whether pages are rendered in grayscale.

    Returns:
        The list of per-page table layouts produced by ``inference``.
    """
    path = os.getcwd()
    PDF_file = os.path.join(path, pdfName)

    # Start from an empty output directory so stale pages from a previous
    # run are never picked up by inference().
    images_dir = os.path.join(path, 'pdf_images')
    if os.path.exists(images_dir):
        shutil.rmtree(images_dir)
    os.mkdir(images_dir)

    # Convert each page of the PDF to an image.
    pages = convert_from_path(PDF_file, dpi=dpi, grayscale=grayscale)

    # Save the pages as page_1.jpg, page_2.jpg, ...
    for image_counter, page in enumerate(pages, start=1):
        filename = "page_" + str(image_counter) + ".jpg"
        page.save(os.path.join(images_dir, filename), 'JPEG')

    # Run inference on the rendered pages and hand the result back to the
    # caller (the return was previously commented out, so callers got None).
    return inference(images_dir)
# Run the PDF -> page images -> table detection pipeline on a sample PDF,
# which pdf_inference looks up in the current working directory.
test = pdf_inference('abc-Datasheet.pdf')
Thanks Reema Jain