arcadia
arcadia copied to clipboard
Extract the images from pdf, and then generate the QA list
This is an optimization of the current PDF QA generation.
Use case
When a user asks a question against a knowledge base, our chat server can respond with extra images that were extracted from the PDF and additionally indexed in our pg.
@ggservice007 @wangxinbiao Please provide more details on how you are going to implement this.
Use Qwen VL to recognize the content in the image. If it's in English, provide the translated Chinese content.
@bjwswang
direct
Use the pdf library to extract image directly.
reference
https://mp.weixin.qq.com/s/4mg59Sb7TzaoXVctEMJVWw
source code
def get_image_direct(pdf_path='aa.pdf', pdf_dir='./images/pdf',
                     img_dir='./images/img', min_size=64):
    """Extract every sufficiently large figure of a PDF as a PNG image.

    Each page is analysed with pdfminer. Every ``LTFigure`` whose width and
    height are both at least *min_size* is cropped out of the page with
    pypdf, saved as a one-page PDF in *pdf_dir* and then rendered to a PNG
    in *img_dir* with pdf2image. Progress is reported with ``print``.

    pdf2image needs the poppler command line tools::

        apt install poppler-utils

    Args:
        pdf_path: Path of the PDF to read.
        pdf_dir: Directory receiving the cropped one-page PDFs
            (``pdf_<page>_<n>.pdf``). Created if missing.
        img_dir: Directory receiving the rendered PNGs
            (``img_<page>_<n>.png``). Created if missing.
        min_size: Minimum width and height, in layout units, a figure must
            have to be exported.
    """
    import os

    # pypdf reads the PDF and writes the cropped pages.
    import pypdf
    # pdf2image renders a PDF page to a PIL image.
    from pdf2image import convert_from_path
    # pdfminer analyses the page layout and locates the figures.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTFigure

    print("get image direct")

    def crop_image(element, page_obj, file_name):
        """Shrink *page_obj* to the bounding box of *element* and save it."""
        # (x0, y0) is stored as the lower-left corner and (x1, y1) as the
        # upper-right one, so the media box is written in canonical order.
        page_obj.mediabox.lower_left = (element.x0, element.y0)
        page_obj.mediabox.upper_right = (element.x1, element.y1)
        # Save the cropped page as a new single-page PDF.
        writer = pypdf.PdfWriter()
        writer.add_page(page_obj)
        with open(file_name, 'wb') as cropped_pdf_file:
            writer.write(cropped_pdf_file)

    def convert_to_images(input_file, output_file):
        """Render the first page of *input_file* to a PNG at *output_file*."""
        images = convert_from_path(input_file)
        images[0].save(output_file, "PNG")

    # The output directories must exist before the first file is written.
    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(img_dir, exist_ok=True)

    # The context manager closes the file even if a page fails to process.
    with open(pdf_path, 'rb') as pdf_file:
        reader = pypdf.PdfReader(pdf_file)
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            page_obj = reader.pages[pagenum]
            # Figures of this page, ordered by their y1 coordinate, largest
            # first (the sort is stable, so ties keep layout order).
            figures = sorted(
                (element for element in page
                 if isinstance(element, LTFigure)),
                key=lambda element: element.y1,
                reverse=True,
            )
            image_index = 0
            for element in figures:
                width = element.x1 - element.x0
                height = element.y1 - element.y0
                if width < min_size or height < min_size:
                    continue
                print(f"{width} x {height}")
                print(f"第{pagenum + 1}页: 第{image_index + 1}个图片")
                # Crop the figure out of the page into its own PDF ...
                file_name = os.path.join(
                    pdf_dir, f"pdf_{pagenum + 1}_{image_index + 1}.pdf")
                crop_image(element, page_obj, file_name)
                # ... and render that cropped PDF to an image.
                output_file = os.path.join(
                    img_dir, f"img_{pagenum + 1}_{image_index + 1}.png")
                convert_to_images(file_name, output_file)
                image_index += 1
            # image_index already equals the number of exported figures.
            print(f"第{pagenum + 1}页: 包含{image_index}个图片")


if __name__ == '__main__':
    get_image_direct()
result
But the WPS can work well.
resolve it
Use the following libraries:
pypdf==4.1.0
Pillow==10.3.0
use the simple code
def extract_image_002(pdf_path='pdf/aa.pdf', page_index=2, output_dir='img'):
    """Write every image embedded in one page of a PDF to *output_dir*.

    The images are taken from pypdf's ``page.images``. Each one is written
    unchanged (its ``data`` bytes) to ``<output_dir>/<n><name>``, where
    ``n`` counts from 0 and ``name`` is the name pypdf reports for it.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Zero-based index of the page whose images are exported.
            An index outside the document is not caught here; the error
            from ``reader.pages[...]`` propagates to the caller.
        output_dir: Directory receiving the image files. Created if missing.
    """
    import os

    from pypdf import PdfReader

    print("extract image 002")
    reader = PdfReader(pdf_path)
    page = reader.pages[page_index]
    # The output directory must exist before the first image is written.
    os.makedirs(output_dir, exist_ok=True)
    for count, image_file_object in enumerate(page.images):
        target = os.path.join(output_dir, f"{count}{image_file_object.name}")
        with open(target, "wb") as fp:
            fp.write(image_file_object.data)