Improvements to MinerU for PDF recognition; please test it and share your results (Dockerfile included)
Routine checks
- [ ] I have confirmed that no similar feature request exists
- [X] I have confirmed that I have upgraded to the latest version
- [X] I have read the project README in full and confirmed the current version cannot meet my needs
- [X] I understand and am willing to follow up on this feature request, helping with testing and providing feedback
- [x] I understand and accept the above, and I understand that the maintainers have limited time; feature requests that do not follow the rules may be ignored or closed outright
Feature description
An improvement to MinerU for PDF recognition that correctly parses fully image-based (scanned) PDFs. It was built by modifying the API sample provided in MinerU's GitHub repository, and the output is essentially on par with pdf-marker. Because it uses the latest models, its recognition quality is better than the pdf-marker currently provided officially.
Use case
OCR for PDFs
Known issues
For PDFs with complex tables, the recognized tables come out as HTML, which FastGPT's preview cannot render correctly. I don't know how to solve this yet; if you have a solution, please leave a comment.
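One possible workaround (a sketch I have not verified; the helper names below are made up for illustration) is to post-process the returned markdown and convert simple HTML tables into GitHub-style pipe tables before FastGPT renders them. It only handles flat tables without rowspan/colspan:

```python
# Hypothetical post-processing sketch: convert flat HTML tables in the
# MinerU markdown output into GitHub-style pipe tables. Untested; tables
# with rowspan/colspan or nested markup are left as-is.
import re
from html.parser import HTMLParser


class _TableParser(HTMLParser):
    """Collects cell text from a flat HTML table."""

    def __init__(self):
        super().__init__()
        self.rows = []
        self._row = []
        self._cell = []
        self._in_cell = False

    def handle_starttag(self, tag, attrs):
        if tag in ("td", "th"):
            self._in_cell = True
            self._cell = []
        elif tag == "tr":
            self._row = []

    def handle_endtag(self, tag):
        if tag in ("td", "th"):
            self._row.append("".join(self._cell).strip())
            self._in_cell = False
        elif tag == "tr" and self._row:
            self.rows.append(self._row)

    def handle_data(self, data):
        if self._in_cell:
            self._cell.append(data)


def html_table_to_md(html: str) -> str:
    """Convert one <table> block to a pipe table; fall back to the input."""
    parser = _TableParser()
    parser.feed(html)
    if not parser.rows:
        return html  # leave tables we cannot parse untouched
    width = max(len(r) for r in parser.rows)
    rows = [r + [""] * (width - len(r)) for r in parser.rows]
    lines = ["| " + " | ".join(rows[0]) + " |",
             "| " + " | ".join(["---"] * width) + " |"]
    lines += ["| " + " | ".join(r) + " |" for r in rows[1:]]
    return "\n".join(lines)


def convert_tables(md_content: str) -> str:
    """Replace every HTML table in the markdown with a pipe table."""
    return re.sub(r"<table\b.*?</table>",
                  lambda m: html_table_to_md(m.group(0)),
                  md_content, flags=re.S | re.I)
```

Calling something like `convert_tables(md_content)` right before app.py builds its response would be the natural hook; tables with merged cells would still need a proper solution.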
Examples
Recognition result from the original MinerU (screenshot)
Recognition result from the new MinerU (screenshot)
The code is as follows (app.py):

```python
import json
import os
from base64 import b64encode
from glob import glob
from io import StringIO
import tempfile
from typing import Tuple, Union
import shutil
import re
import imghdr
import uvicorn
from fastapi import FastAPI, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
import base64
from magic_pdf.data.read_api import read_local_images, read_local_office
import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
from magic_pdf.operators.pipes import PipeResult
model_config.__use_inside_model__ = True
app = FastAPI()
pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg", ".jpeg"]
class MemoryDataWriter(DataWriter):
    def __init__(self):
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        if isinstance(data, str):
            self.buffer.write(data)
        else:
            self.buffer.write(data.decode("utf-8"))

    def write_string(self, path: str, data: str) -> None:
        self.buffer.write(data)

    def get_value(self) -> str:
        return self.buffer.getvalue()

    def close(self):
        self.buffer.close()
def init_writers(
    file_path: str = None,
    file: UploadFile = None,
    output_path: str = None,
    output_image_path: str = None,
) -> Tuple[
    Union[S3DataWriter, FileBasedDataWriter],
    Union[S3DataWriter, FileBasedDataWriter],
    bytes,
    str,
]:
    """
    Initialize writers based on path type

    Args:
        file_path: file path (local path or S3 path)
        file: Uploaded file object
        output_path: Output directory path
        output_image_path: Image output directory path

    Returns:
        Tuple[writer, image_writer, file_bytes, file_extension]: Returns the
        initialized writers, the file content, and the file extension
    """
    file_extension: str = None
    if file_path:
        is_s3_path = file_path.startswith("s3://")
        if is_s3_path:
            bucket = get_bucket_name(file_path)
            ak, sk, endpoint = get_s3_config(bucket)

            writer = S3DataWriter(
                output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            image_writer = S3DataWriter(
                output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            # Create a temporary reader to fetch the file content
            temp_reader = S3DataReader(
                "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            file_bytes = temp_reader.read(file_path)
            file_extension = os.path.splitext(file_path)[1]
        else:
            writer = FileBasedDataWriter(output_path)
            image_writer = FileBasedDataWriter(output_image_path)
            os.makedirs(output_image_path, exist_ok=True)
            with open(file_path, "rb") as f:
                file_bytes = f.read()
            file_extension = os.path.splitext(file_path)[1]
    else:
        # Handle the uploaded file
        file_bytes = file.file.read()
        file_extension = os.path.splitext(file.filename)[1]
        writer = FileBasedDataWriter(output_path)
        image_writer = FileBasedDataWriter(output_image_path)
        os.makedirs(output_image_path, exist_ok=True)

    return writer, image_writer, file_bytes, file_extension
def process_file(
    file_bytes: bytes,
    file_extension: str,
    parse_method: str,
    image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
    """
    Process PDF file content

    Args:
        file_bytes: Binary content of file
        file_extension: file extension
        parse_method: Parse method ('ocr', 'txt', 'auto')
        image_writer: Image writer

    Returns:
        Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
    """
    ds: Union[PymuDocDataset, ImageDataset] = None
    if file_extension in pdf_extensions:
        ds = PymuDocDataset(file_bytes)
    elif file_extension in office_extensions:
        # Office documents need to be converted before parsing
        temp_dir = tempfile.mkdtemp()
        # file_extension already contains the leading dot
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_office(temp_dir)[0]
    elif file_extension in image_extensions:
        # Images are parsed with OCR
        temp_dir = tempfile.mkdtemp()
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_images(temp_dir)[0]
    else:
        # Guard against unsupported extensions instead of failing later with ds=None
        raise HTTPException(
            status_code=400, detail=f"Unsupported file extension: {file_extension}"
        )

    infer_result: InferenceResult = None
    pipe_result: PipeResult = None
    if parse_method == "ocr":
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    elif parse_method == "txt":
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)
    else:  # auto
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result
def encode_image(image_path):
    """Encode an image as base64"""
    with open(image_path, "rb") as image_file:  # read in binary mode
        image_data = image_file.read()
    return base64.b64encode(image_data).decode("utf-8").replace("\n", "")  # decode to str


def get_mime_type(image_path):
    """Detect the image MIME type dynamically"""
    img_type = imghdr.what(image_path)
    return f"image/{img_type}" if img_type else "application/octet-stream"
@app.post(
    "/v2/parse/file",
    tags=["projects"],
    summary="Parse files (supports local files and S3)",
)
async def file_parse(
    file: UploadFile = None,
    # file_path: str = None,
    # parse_method: str = "auto",
    # is_json_md_dump: bool = False,
    # output_dir: str = "output",
    # return_layout: bool = False,
    # return_info: bool = False,
    # return_content_list: bool = False,
    # return_images: bool = False,
):
    """
    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
    to the specified directory.

    Args:
        file: The PDF file to be parsed. Must not be specified together with
            `file_path`
        file_path: The path to the PDF file to be parsed. Must not be specified
            together with `file`
        parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
            results are not satisfactory, try ocr
        is_json_md_dump: Whether to write parsed data to .json and .md files. Default
            to False. Different stages of data will be written to different .json files
            (3 in total), md content will be saved to .md file
        output_dir: Output directory for results. A folder named after the PDF file
            will be created to store all results
        return_layout: Whether to return parsed PDF layout. Default to False
        return_info: Whether to return parsed PDF info. Default to False
        return_content_list: Whether to return parsed PDF content list. Default to False
    """
    try:
        output_dir: str = "output"
        parse_method: str = "auto"

        # Get the file name without its extension
        file_name = os.path.basename(file.filename).split(".")[0]
        output_path = f"{output_dir}/{file_name}"
        output_image_path = f"{output_path}/images"
        # total_pages = pdf_document.page_count

        # Initialize readers/writers and get the file content
        writer, image_writer, file_bytes, file_extension = init_writers(
            # file_path=file_path,
            file=file,
            output_path=output_path,
            output_image_path=output_image_path,
        )

        # Process the file
        infer_result, pipe_result = process_file(
            file_bytes, file_extension, parse_method, image_writer
        )

        # Use MemoryDataWriter to get results
        content_list_writer = MemoryDataWriter()
        md_content_writer = MemoryDataWriter()
        middle_json_writer = MemoryDataWriter()

        # Use PipeResult's dump methods to get the data
        pipe_result.dump_content_list(content_list_writer, "", "images")
        pipe_result.dump_md(md_content_writer, "", "images")
        pipe_result.dump_middle_json(middle_json_writer, "")

        # Get content
        content_list = json.loads(content_list_writer.get_value())
        md_content = md_content_writer.get_value()
        middle_json = json.loads(middle_json_writer.get_value())
        model_json = infer_result.get_infer_res()

        # If results need to be saved
        # if is_json_md_dump:
        #     writer.write_string(
        #         f"{file_name}_content_list.json", content_list_writer.get_value()
        #     )
        #     writer.write_string(f"{file_name}.md", md_content)
        #     writer.write_string(
        #         f"{file_name}_middle.json", middle_json_writer.get_value()
        #     )
        #     writer.write_string(
        #         f"{file_name}_model.json",
        #         json.dumps(model_json, indent=4, ensure_ascii=False),
        #     )
        #     # Save visualization results
        #     pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
        #     pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
        #     pipe_result.draw_line_sort(
        #         os.path.join(output_path, f"{file_name}_line_sort.pdf")
        #     )
        #     infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))

        # Build return data
        data = {}
        # if return_layout:
        #     data["layout"] = model_json
        # if return_info:
        #     data["info"] = middle_json
        # if return_content_list:
        #     data["content_list"] = content_list
        # if return_images:
        image_paths = glob(f"{output_image_path}/*.jpg")
        data["images"] = {}
        for image_path in image_paths:
            filename = os.path.basename(image_path)  # extract the file name
            mime_type = get_mime_type(image_path)
            base64_str = encode_image(image_path)
            data["images"][filename] = f"data:{mime_type};base64,{base64_str}"  # full data URL

        # Replace image paths in the markdown with inline data URLs
        def replace_image(match):
            original_path = match.group(1)
            filename = original_path.split("/")[-1]
            if filename in data["images"]:
                # Use the full data URL directly
                return f"![]({data['images'][filename]})"
            else:
                return f"![]({original_path})"

        pattern = re.compile(r"!\[\]\((images/.*?)\)")
        data["md_content"] = pattern.sub(replace_image, md_content)

        # Clean up memory writers
        content_list_writer.close()
        md_content_writer.close()
        middle_json_writer.close()

        return {
            "success": True,
            "message": "",
            "markdown": data.get("md_content", ""),
            "pages": len(model_json),
        }
        # shutil.rmtree(output_path)
    except Exception as e:
        logger.exception(e)
        return JSONResponse(
            content={
                "success": False,
                "message": "",
                "error": f"Internal server error: {str(e)}",
            },
            status_code=500,
        )


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7333)
```
Dependencies (requirements.txt; note the extra wheel index):

```
--extra-index-url https://myhloli.github.io/wheels/
magic-pdf[full]
fastapi
uvicorn
python-multipart
```
Dockerfile:

```dockerfile
FROM python:3.10-slim-bookworm AS base
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1
FROM base AS build
# Update the package list and install necessary packages
# LibreOffice is needed to convert Office documents
RUN apt-get update && \
    apt-get install -y --no-install-recommends build-essential && \
    apt-get install -y libreoffice && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Build Python dependencies
COPY requirements.txt .
RUN python -m venv /app/venv && \
    . /app/venv/bin/activate && \
    pip install -r requirements.txt && \
    pip uninstall -y paddlepaddle && \
    pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
        paddlepaddle-gpu==3.0.0rc1 && \
    pip3 install PyMuPDF tqdm paddleocr python-docx -i https://pypi.tuna.tsinghua.edu.cn/simple

# Download models
COPY download_models.py .
RUN . /app/venv/bin/activate && \
    python download_models.py
FROM base AS prod
# Copy Python dependencies and models from the build stage
COPY --from=build /app/venv /app/venv
COPY --from=build /opt/models /opt/models
COPY --from=build /opt/layoutreader /opt/layoutreader
# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Create volume for paddleocr models
RUN mkdir -p /root/.paddleocr
VOLUME [ "/root/.paddleocr" ]
# Copy the app and its configuration file
COPY entrypoint.sh /app/entrypoint.sh
COPY magic-pdf.json /root/magic-pdf.json
COPY app.py /app/app.py
# Expose the port that FastAPI will run on
EXPOSE 8000
# Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000
ENTRYPOINT [ "/app/entrypoint.sh" ]
CMD ["--host", "0.0.0.0", "--port", "8000"]
```

magic-pdf.json:

```json
{
    "bucket_info": {
        "bucket-name-1": ["ak", "sk", "endpoint"],
        "bucket-name-2": ["ak", "sk", "endpoint"]
    },
    "models-dir": "/opt/models",
    "layoutreader-model-dir": "/opt/layoutreader",
    "device-mode": "cuda",
    "layout-config": {
        "model": "doclayout_yolo"
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "sub_model": "slanet_plus",
        "enable": true,
        "max_time": 400
    },
    "llm-aided-config": {
        "formula_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "text_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "title_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-32b-instruct",
            "enable": false
        }
    },
    "config_version": "1.2.0"
}
```

topdf.py (a standalone batch-OCR helper script):

```python
import fitz
from tqdm import tqdm
import os
from paddleocr import PaddleOCR
from docx import Document  # used to create Word documents
from docx.shared import Pt  # used to set font size
from doc_to_pdf import docx_to_pdf  # local helper module (not included in this post)
import shutil


def pdf_to_png(pdf_path, img_path, parts_per_page=4):
    """
    Split each PDF page into several images
    :param pdf_path: path to the PDF file
    :param img_path: output image directory
    :param parts_per_page: number of images each page is split into
    """
    # Make sure the image output directory exists
    os.makedirs(img_path, exist_ok=True)
    # Open the PDF file
    pdf_doc = fitz.open(pdf_path)
    cnt = 0
    # Iterate over every page
    for pg in tqdm(range(pdf_doc.page_count), total=pdf_doc.page_count, desc='Splitting PDF'):
        page = pdf_doc[pg]
        page_width = page.rect.width
        page_height = page.rect.height
        # Height of each slice
        split_height = page_height / parts_per_page
        for part in range(parts_per_page):
            # Define the crop region
            clip_rect = fitz.Rect(0, split_height * part, page_width, split_height * (part + 1))
            pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), clip=clip_rect, alpha=False)
            cnt += 1
            # Save as a PNG image
            part_img_path = os.path.join(img_path, f"img_{cnt}.png")
            pix.save(part_img_path)
    print(f"PDF splitting finished; images saved to: {img_path}")


def traversal_file(pdf_name, img_path, out_path):
    # Initialize the OCR model once, instead of reloading it inside the loop
    det = r"/home/ialover/document/PaddleOCR-release-2.6.1/model/ch_PP-OCRv4_det_infer"
    rec = r"/home/ialover/document/PaddleOCR-release-2.6.1/model/ch_PP-OCRv4_rec_infer"
    ocr = PaddleOCR(use_angle_cls=True, rec_model_dir=rec, det_model_dir=det, lang='ch')
    # Create a new Word document
    document = Document()
    # Set a Chinese font (if needed)
    from docx.oxml.ns import qn
    style = document.styles['Normal']
    font = style.font
    font.name = 'SimSun'
    font.size = Pt(12)
    font.element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
    # Iterate over all image files and run OCR on each
    image_files = sorted(os.listdir(img_path), key=lambda x: int(x.split('_')[1].split('.')[0]))
    for fp in tqdm(image_files, desc='Running OCR'):
        file_path = os.path.join(img_path, fp)
        ocr_result = ocr.ocr(file_path, cls=True)
        # Write the recognition results into the Word document
        for one_content in ocr_result[0]:  # results for the first (only) image
            # print(one_content)
            text, confidence = one_content[1]  # text content and confidence score
            document.add_paragraph(text)  # write the text only
    # Save the Word document
    output_docx = os.path.join(out_path, f'{pdf_name}.docx')
    document.save(output_docx)
    print(f'OCR results saved to {output_docx}')


def remove_dir(input_path):
    for fg in os.listdir(input_path):
        file_path = os.path.join(input_path, fg)
        if os.path.isfile(file_path):
            os.remove(file_path)


def doctopdf(pdf_input_dir, pdf_book_dir):
    for fp in os.listdir(pdf_input_dir):
        file_pdf = os.path.join(pdf_input_dir, fp)
        target_pdf_file = os.path.join(pdf_book_dir, fp)
        if file_pdf.endswith('.pdf'):
            shutil.move(file_pdf, target_pdf_file)
        else:
            docx_to_pdf(file_pdf, pdf_book_dir)


def deal_with_pdf_dir(pdf_book_dir, img_path, pdf_output_dir):
    for fp in os.listdir(pdf_book_dir):
        file_path = os.path.join(pdf_book_dir, fp)
        pdf_to_png(file_path, img_path)
        file_name_with_ext = os.path.basename(file_path)
        pdf_name = os.path.splitext(file_name_with_ext)[0]
        traversal_file(pdf_name=pdf_name, img_path=img_path, out_path=pdf_output_dir)


if __name__ == "__main__":
    img_path = "./pdf_img_dir/"
    pdf_input_dir = './pdf_input_dir'
    pdf_book_dir = './pdf_book_dir/'
    out_path = './pdf_output_dir'
    os.makedirs(out_path, exist_ok=True)
    doctopdf(pdf_input_dir=pdf_input_dir, pdf_book_dir=pdf_book_dir)
    deal_with_pdf_dir(pdf_book_dir=pdf_book_dir, img_path=img_path, pdf_output_dir=out_path)
    remove_dir(img_path)
    remove_dir(pdf_input_dir)
    remove_dir(pdf_book_dir)
```

download_models.py:

```python
#!/usr/bin/env python
from huggingface_hub import snapshot_download
if __name__ == "__main__":
    mineru_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download(
        "opendatalab/PDF-Extract-Kit-1.0",
        allow_patterns=mineru_patterns,
        local_dir="/opt/",
    )

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download(
        "hantian/layoutreader",
        allow_patterns=layoutreader_pattern,
        local_dir="/opt/layoutreader/",
    )

    model_dir = model_dir + "/models"
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
```

entrypoint.sh:

```bash
#!/usr/bin/env bash
set -euo pipefail
. /app/venv/bin/activate
exec uvicorn app:app "$@"
```

Is the entrypoint.sh file missing?
> Is the entrypoint.sh file missing?

@ruanjunmin It has been added now. Put all the files in one directory, then run docker build.
Thanks for the contribution. Could you tidy up the code and open a PR to put it under plugins/model?
> @ruanjunmin It has been added now. Put all the files in one directory, then run docker build.

Awesome, I've got it running. Thanks for sharing!
Is this different from the MinerU integration built into FastGPT? How do I use it?
> Awesome, I've got it running. Thanks for sharing!
> Is this different from the MinerU integration built into FastGPT? How do I use it?
It segments the images in a PDF into regions, such as image regions, table regions, and text regions. In the output, text contained inside images is extracted by OCR, while pure images (ones that are neither text nor tables) are cropped out and placed directly into the knowledge-base document. Compared with the previous MinerU integration, it does more than just extract text. You can see this in the screenshots I posted, or run your own comparison.
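As for how to call it: below is a minimal client sketch (my own example, not part of the service). It assumes the service is reachable on localhost:7333, the port app.py uses when run directly; adjust the host/port to match your Docker port mapping, and `test.pdf` is just a placeholder file name.

```python
# Minimal client sketch for the /v2/parse/file endpoint in app.py.
# Assumes the service listens on localhost:7333 (app.py run directly);
# with the Docker image, use whatever host port you mapped to 8000.
import requests

with open("test.pdf", "rb") as f:  # placeholder file name
    resp = requests.post(
        "http://localhost:7333/v2/parse/file",
        files={"file": ("test.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
result = resp.json()
print(result["pages"])            # page count reported by the service
print(result["markdown"][:500])   # markdown with images inlined as data URLs
```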
> Thanks for the contribution. Could you tidy up the code and open a PR to put it under plugins/model?
I've been a bit busy lately; I'll find time to open a PR as soon as I can.
> @ruanjunmin It has been added now. Put all the files in one directory, then run docker build.

Where is entrypoint.sh? I can't find it in the original post.
It has been added.
Does the big block of code at the top go into main.py?
> Does the big block of code at the top go into main.py?

Just save it as app.py; that is what these lines in entrypoint.sh refer to:
`. /app/venv/bin/activate` followed by `exec uvicorn app:app "$@"`
During docker build the downloads are too slow and the build eventually fails. Any way to fix this? It has failed three times now.
> During docker build the downloads are too slow and the build eventually fails. Any way to fix this? It has failed three times now.

I'd suggest using a proxy. If you don't have one, point pip at a domestic mirror such as Tsinghua's, and switch the model downloads to ModelScope (not sure the models are available there) or hf-mirror. You'll need to modify the Dockerfile yourself; an AI assistant can help with that.
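For the Hugging Face download step in download_models.py specifically, one option (a sketch, untested) is to route huggingface_hub through hf-mirror.com via the HF_ENDPOINT environment variable, which huggingface_hub honors; it must be set before the library is imported:

```python
# Sketch: route huggingface_hub downloads through a mirror.
# HF_ENDPOINT must be set before huggingface_hub is imported; alternatively,
# add `ENV HF_ENDPOINT=https://hf-mirror.com` to the Dockerfile build stage.
import os

os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")

from huggingface_hub import snapshot_download  # noqa: E402, picks up HF_ENDPOINT
```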
Could you share a start command? `docker run -d -p 7333:8000 mineru_fastgpt` didn't start successfully. For reference, I built the image with `docker build --no-cache -t mineru_fastgpt .`
I just can't get the build to succeed. Could someone kindly share a prebuilt image? Thanks.
I've published a cu128 image: ssryps999/mineru:v0, port mapping 7232:8000.
https://github.com/labring/FastGPT/pull/5662: see this PR for the Docker image integrated with FastGPT; it supports processing multiple files.