Not able to fetch all tables and figures when converting pdf into images

Open reema93jain opened this issue 2 years ago • 0 comments

Hi Team,

I am using layoutparser to detect tables and figures. When I run the code on an individual PNG image file, the model detects tables and figures correctly. However, when I use the code below to convert a PDF into images and detect tables on each page image, I either do not get the full image/table or sometimes get duplicate tables as well.

Can you please advise on how to refine the code below and what I can try to resolve this issue? Thank you!

!pip install layoutparser !pip install opencv-python numpy matplotlib

install detectron2:

!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip3 install pdf2image
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!apt-get install poppler-utils

import os
import re
import shutil

import cv2
import layoutparser as lp
from pdf2image import convert_from_path

PubLayNet

# Layout detector built from the PubLayNet Faster R-CNN (R50-FPN, 3x) config.
# extra_config sets detectron2's MODEL.ROI_HEADS.SCORE_THRESH_TEST to 0.81;
# label_map translates the model's integer class ids into readable names.
model = lp.models.Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.81],
    label_map={
        0: "Text",
        1: "Title",
        2: "List",
        3: "Table",
        4: "Figure",
    },
)

def save_detections(table_blocks, image, image_name, save_dir='/content/'):
    """Crop every detected block out of ``image`` and save each crop as a JPEG.

    Args:
        table_blocks: Sequence of layout blocks. Each element must expose
            ``block.x_1``, ``block.y_1``, ``block.x_2`` and ``block.y_2``.
        image: Page image, indexable as ``image[y1:y2, x1:x2]``.
        image_name: Base name of the output files (without extension).
        save_dir: Prefix that is concatenated verbatim with the file name,
            so it has to end with a path separator.

    Each crop is written to ``save_dir + image_name + '_' + index + '.jpg'``
    and a confirmation is printed when ``cv2.imwrite`` reports success.
    """
    for j, detection in enumerate(table_blocks):
        # Read the coordinates of the j-th block. The previous code always
        # used table_blocks[0], so every saved file was a copy of the first
        # table on the page and the remaining tables were never written.
        box = detection.block
        x_1, y_1, x_2, y_2 = box.x_1, box.y_1, box.x_2, box.y_2
        cropped = image[int(y_1):int(y_2), int(x_1):int(x_2)]

        # NOTE(review): cv2_imshow is not imported anywhere in the visible
        # code; presumably it comes from google.colab.patches -- confirm.
        cv2_imshow(cropped)

        file_name = image_name + '_' + str(j) + '.jpg'
        status = cv2.imwrite(save_dir + file_name, cropped)
        if status:
            print("Saved ", file_name)

def _page_sort_key(file_name):
    """Return a sort key that orders 'page_2.jpg' before 'page_10.jpg'."""
    # re.split with a capturing group alternates text / digit runs, so the
    # element types line up position by position and never get compared
    # across str and int.
    return [int(tok) if tok.isdigit() else tok
            for tok in re.split(r'(\d+)', file_name)]


def inference(images_dir):
    """Detect tables on every ``.jpg`` image in ``images_dir`` and save crops.

    For each image the module-level ``model`` is run, "Table" blocks that lie
    inside a "Figure" block are dropped, the remaining tables are ordered
    left column first (top to bottom) and then the rest (top to bottom),
    numbered, cropped via ``save_detections`` and collected.

    Args:
        images_dir: Directory containing the page images.

    Returns:
        A list with one ``lp.Layout`` of table blocks per processed image,
        in natural file-name order.
    """
    table_blocks_list = []

    # os.listdir returns entries in arbitrary order; sort so that the result
    # follows the page order (page_1, page_2, ..., page_10).
    for file in sorted(os.listdir(images_dir), key=_page_sort_key):
        if not file.endswith(".jpg"):
            continue

        # Image name without the ".jpg" extension.
        image_name = file[:-4]

        bgr_image = cv2.imread(images_dir + '/' + file)
        if bgr_image is None:
            # cv2.imread signals an unreadable file by returning None;
            # skip it instead of failing on the slicing below.
            print("Could not read ", file)
            continue

        # OpenCV loads BGR; the detector is fed the RGB view.
        image = bgr_image[..., ::-1]
        layout = model.detect(image)

        table_blocks = lp.Layout([b for b in layout if b.type == "Table"])
        figure_blocks = lp.Layout([b for b in layout if b.type == 'Figure'])

        # Discard tables that are located inside a detected figure.
        table_blocks = lp.Layout([
            b for b in table_blocks
            if not any(b.is_in(b_fig) for b_fig in figure_blocks)
        ])

        w = image.shape[1]
        left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image)
        left_blocks = table_blocks.filter_by(left_interval, center=True)
        right_blocks = [b for b in table_blocks if b not in left_blocks]

        # Layout.sort() only sorts in place when inplace=True is passed, so
        # the former bare .sort(key=...) calls left the blocks unsorted on
        # current layoutparser releases. sorted() works on every version.
        def top_edge(b):
            return b.coordinates[1]

        ordered = sorted(left_blocks, key=top_edge) + sorted(right_blocks, key=top_edge)

        # Combine both columns and number the blocks in reading order.
        table_blocks = lp.Layout([b.set(id=idx) for idx, b in enumerate(ordered)])

        # Hand the BGR array to the cv2-based saver; passing the RGB view
        # would swap the red and blue channels in the written crops.
        save_detections(table_blocks, bgr_image, image_name)

        table_blocks_list.append(table_blocks)

    return table_blocks_list

def pdf_inference(pdfName, dpi=100, grayscale=True):
    """Render each page of a PDF to a JPEG and run table detection on them.

    The PDF is looked up relative to the current working directory. A
    ``pdf_images`` directory is (re)created there, one ``page_<n>.jpg`` is
    written per page, and ``inference`` is run on that directory.

    Args:
        pdfName: File name of the PDF, relative to the working directory.
        dpi: Rendering resolution passed to ``convert_from_path``.
        grayscale: Whether pages are rendered in grayscale.

    Returns:
        Whatever ``inference`` returns for the rendered pages. The previous
        version had its ``return`` commented out and always yielded ``None``.

    NOTE(review): the defaults keep the original 100 dpi grayscale rendering.
    Rendering at a higher dpi and/or in colour may improve detection of
    complete tables and figures -- to be confirmed experimentally.
    """
    path = os.getcwd()
    PDF_file = os.path.join(path, pdfName)
    images_dir = os.path.join(path, 'pdf_images')

    # Start from an empty output directory so that pages left over from an
    # earlier run are not picked up by inference().
    if os.path.exists(images_dir):
        shutil.rmtree(images_dir)
    os.mkdir(images_dir)

    pages = convert_from_path(PDF_file, dpi=dpi, grayscale=grayscale)

    # Pages are numbered from 1 to match the usual page numbering.
    for page_number, page in enumerate(pages, start=1):
        filepath = os.path.join(images_dir, "page_" + str(page_number) + ".jpg")
        page.save(filepath, 'JPEG')

    return inference(images_dir)

# Run the whole pipeline on a sample PDF; pdf_inference resolves the file
# name against the current working directory.
test = pdf_inference('abc-Datasheet.pdf')

Thanks Reema Jain

Jan 25 '24 00:01 reema93jain