what

Extract the images from pdf, and then generate the QA list

implement

direct

Jan 23 '24 02:01 ggservice007

This is an optimization of the current PDF QA generation.

Use case

When a user asks a question against a knowledge base, our chat server can respond with extra images that were extracted from the PDF and additionally indexed in our pg.

Jan 23 '24 05:01 bjwswang

@ggservice007 @wangxinbiao Please provide more details on how you are gonna implement this.

Jan 23 '24 05:01 bjwswang

Use Qwen VL to recognize the content in the image. If it's in English, provide the translated Chinese content.

@bjwswang

Mar 08 '24 03:03 ggservice007

direct

Use the PDF library to extract the images directly.

reference

https://mp.weixin.qq.com/s/4mg59Sb7TzaoXVctEMJVWw

source code

def get_image_direct(pdf_path='aa.pdf'):
    """
    Crop every sufficiently large figure out of a PDF and save it as a PNG.

    For each page, the pdfminer layout is scanned for ``LTFigure`` elements.
    Every figure whose bounding box is at least 64 layout units wide and high
    is written first as a one-page cropped PDF under ``./images/pdf/`` and
    then rendered to ``./images/img/`` as a PNG via pdf2image.

    pdf2image needs the poppler command line tools:

        apt install poppler-utils

    Args:
        pdf_path: Path of the PDF to scan. Defaults to ``'aa.pdf'``, the
            value that used to be hard-coded.
    """
    import os

    # Reads the PDF and writes the cropped single-page PDFs.
    import pypdf
    # Analyses the page layout to locate figures.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTFigure
    # Renders a PDF page to an image.
    from pdf2image import convert_from_path

    print("get image direct")

    # Figures smaller than this on either side are skipped.
    min_side = 64

    pdf_dir = "./images/pdf"
    img_dir = "./images/img"
    # Both output folders must exist before the first file is written.
    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(img_dir, exist_ok=True)

    def crop_image(element, pageObj, file_name):
        """Save ``pageObj`` cropped to the bounding box of ``element``."""
        # pdfminer reports (x0, y0) as the lower-left and (x1, y1) as the
        # upper-right corner, which is the order the media box expects.
        pageObj.mediabox.lower_left = (element.x0, element.y0)
        pageObj.mediabox.upper_right = (element.x1, element.y1)

        cropped_pdf_writer = pypdf.PdfWriter()
        cropped_pdf_writer.add_page(pageObj)
        with open(file_name, 'wb') as cropped_pdf_file:
            cropped_pdf_writer.write(cropped_pdf_file)

    def convert_to_images(input_file, output_file):
        """Render the first page of ``input_file`` to ``output_file`` as PNG."""
        images = convert_from_path(input_file)
        image = images[0]
        image.save(output_file, "PNG")

    # The context manager closes the file even if a page fails to process.
    with open(pdf_path, 'rb') as pdf_file_obj:
        pdf_reader = pypdf.PdfReader(pdf_file_obj)

        for pagenum, page in enumerate(extract_pages(pdf_path)):
            page_obj = pdf_reader.pages[pagenum]
            # Visit the layout elements from the top of the page downwards.
            elements = sorted(page, key=lambda el: el.y1, reverse=True)

            image_index = 0
            for element in elements:
                if not isinstance(element, LTFigure):
                    continue

                width = element.x1 - element.x0
                height = element.y1 - element.y0
                if width < min_side or height < min_side:
                    continue

                print(f"{width} x {height}")
                print(f"第{pagenum + 1}页: 第{image_index + 1}个图片")
                # Crop the figure out of the page into its own PDF.
                file_name = f"{pdf_dir}/pdf_{pagenum + 1}_{image_index + 1}.pdf"
                crop_image(element, page_obj, file_name)
                # Render the cropped PDF to an image.
                output_file = f"{img_dir}/img_{pagenum + 1}_{image_index + 1}.png"
                convert_to_images(file_name, output_file)
                image_index += 1

            # image_index already equals the number of figures saved.
            print(f"第{pagenum + 1}页: 包含{image_index}个图片")




# Script entry point: run the direct image extraction when executed as a program.
if __name__ == '__main__':
    get_image_direct()

result

But the WPS can work well.

resolve it

Use the following libraries:

pypdf==4.1.0
Pillow==10.3.0

use the simple code

def extract_image_002(pdf_path='pdf/aa.pdf', page_index=2, output_dir='img'):
    """
    Save every embedded image of one PDF page to ``output_dir``.

    Each image is written as ``<output_dir>/<n><name>``, where ``n`` is the
    zero-based position of the image on the page and ``name`` is the name
    pypdf reports for it.

    Args:
        pdf_path: Path of the PDF to read. Defaults to ``'pdf/aa.pdf'``.
        page_index: Zero-based index of the page to extract from. Defaults
            to ``2``, the page that used to be hard-coded.
        output_dir: Folder that receives the image files; created if it
            does not exist. Defaults to ``'img'``.
    """
    import os

    from pypdf import PdfReader

    print("extract image 002")

    reader = PdfReader(pdf_path)
    page = reader.pages[page_index]

    # Writing would fail with FileNotFoundError if the folder were missing.
    os.makedirs(output_dir, exist_ok=True)

    for count, image_file_object in enumerate(page.images):
        target = f"{output_dir}/{count}{image_file_object.name}"
        with open(target, "wb") as fp:
            fp.write(image_file_object.data)

image is ok

Mar 14 '24 08:03 ggservice007

arcadia arcadia copied to clipboard

Extract the images from pdf, and then generate the QA list

what

implement

direct

Use case

direct

reference

source code

result

resolve it

use the following library

use the simple code

image is ok

arcadia
arcadia copied to clipboard