MinerU icon indicating copy to clipboard operation
MinerU copied to clipboard

提取PDF中表格的其他方案(间接)

Open beiluo opened this issue 6 months ago • 11 comments

因为0.6版本不支持提取PDF中的表格,而0.7版本的表格提取功能暂时不可用(巨慢),所以参考Issues中其他人给出的方案,自己实现了一个版本。

首先需要安装模块

pip install rapidocr_onnxruntime
pip install rapid_table

代码main.py

import json
import os
import re
import sys
from typing import Dict, List, Optional

from rapid_table import RapidTable
from rapidocr_onnxruntime import RapidOCR

# Module-level engine instances, created once at import time and reused by
# perform_ocr for every image.
# Table engine: perform_ocr calls it with (image path, OCR result).
table_engine = RapidTable()
# OCR engine: perform_ocr calls it with an image path.
ocr_engine = RapidOCR()

def read_markdown_file(file_path: str) -> str:
    """Return the entire text of the file at ``file_path``, decoded as UTF-8."""
    with open(file_path, encoding='utf-8') as handle:
        text = handle.read()
    return text

def write_markdown_file(file_path: str, content: str):
    """Overwrite the file at ``file_path`` with ``content``, encoded as UTF-8."""
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.write(content)

def read_json_file(file_path: str) -> List[Dict]:
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return the decoded value."""
    with open(file_path, encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def perform_ocr(img_path: str) -> str:
    """Run OCR and then table recognition on one image and return the table markup.

    The module-level ``ocr_engine`` is called with the image path; the first
    element of its 2-tuple result is forwarded, together with the image path,
    to the module-level ``table_engine``.

    Args:
        img_path: Path of the table image to process.

    Returns:
        The first element of the table engine's 3-tuple result (presumably the
        table rendered as an HTML string -- confirm against the installed
        rapid_table version).
    """
    ocr_output = ocr_engine(img_path)
    ocr_result, _ = ocr_output

    table_output = table_engine(img_path, ocr_result)
    # Only the first element is used; the other two are discarded.
    html_text, _cell_boxes, _elapsed = table_output
    return html_text

def replace_image_with_ocr_content(markdown_content: str, image_path: str, ocr_content: str) -> str:
    """Replace every Markdown image that points at ``image_path`` with ``ocr_content``.

    Images are expected in the form ``![alt text](image_path)``; the alt text
    may be empty but must sit on a single line.

    Args:
        markdown_content: The Markdown text to search.
        image_path: Image target exactly as written inside the parentheses.
        ocr_content: Text inserted verbatim in place of each matching image.

    Returns:
        The Markdown text with all matching images replaced. The input is
        returned unchanged when no image matches.
    """
    # The alt text is matched with a negated class instead of ``.*?`` so a match
    # cannot begin at an earlier ``![`` on the same line and swallow a preceding
    # image on its way to the target path.
    image_pattern = r"!\[[^\]\n]*\]\(" + re.escape(image_path) + r"\)"
    # A callable replacement inserts the text literally. A plain string would be
    # treated as a template, so backslashes or ``\1``-style sequences in the OCR
    # output would raise re.error or be expanded as group references.
    return re.sub(image_pattern, lambda _match: ocr_content, markdown_content)

def find_markdown_file(base_path: str) -> Optional[str]:
    """Locate the Markdown file inside the ``auto`` sub-folder of ``base_path``.

    Args:
        base_path: Directory that is expected to contain an ``auto`` folder.

    Returns:
        The path of the first ``.md`` file in ``<base_path>/auto`` (in sorted
        name order), or ``None`` when the folder is missing or holds no
        Markdown file.
    """
    auto_folder = os.path.join(base_path, 'auto')
    # A missing folder is reported as "not found" instead of raising, so the
    # caller's own error message can be shown.
    if not os.path.isdir(auto_folder):
        return None

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # deterministic when several Markdown files are present.
    for name in sorted(os.listdir(auto_folder)):
        candidate = os.path.join(auto_folder, name)
        # Skip directories that merely happen to be named "*.md".
        if name.endswith('.md') and os.path.isfile(candidate):
            return candidate
    return None

def main(base_path: str):
    """Replace table images in the Markdown output with OCR-recognised table content.

    Looks up the Markdown file under ``<base_path>/auto`` and the matching
    ``<name>_content_list.json`` next to it. For every JSON entry of type
    ``table`` that carries an ``img_path`` and whose image exists on disk, the
    image is passed to ``perform_ocr`` and the corresponding image reference in
    the Markdown text is replaced with the result. The Markdown file is then
    overwritten in place.

    Args:
        base_path: Output directory that contains the ``auto`` folder.

    Returns:
        None. An error is printed and the function returns early when the
        Markdown file or the JSON file cannot be found.
    """
    # Locate the Markdown file.
    markdown_file_path = find_markdown_file(base_path)
    if not markdown_file_path:
        print(f"错误:在 {os.path.join(base_path, 'auto')} 中未找到 Markdown 文件")
        return

    # Derive the content-list JSON path from the Markdown file name.
    stem, _extension = os.path.splitext(os.path.basename(markdown_file_path))
    json_file_path = os.path.join(base_path, "auto", f"{stem}_content_list.json")

    # Stop early when the JSON file is missing.
    if not os.path.exists(json_file_path):
        print(f"错误:无法找到JSON文件: {json_file_path}")
        return

    # Load both inputs.
    markdown_content = read_markdown_file(markdown_file_path)
    json_data = read_json_file(json_file_path)

    # Collect the entries that need OCR; their count is the progress total.
    table_entries = [
        entry for entry in json_data
        if entry['type'] == 'table' and 'img_path' in entry
    ]
    total_items = len(table_entries)

    # Process each table image; only images that exist count towards progress.
    ocr_count = 0
    for entry in table_entries:
        relative_img = entry['img_path']
        img_path = os.path.join(base_path, 'auto', relative_img)
        if not os.path.exists(img_path):
            print(f"警告:图片文件不存在 {img_path}")
            continue

        ocr_count += 1
        ocr_content = perform_ocr(img_path)
        # The Markdown references the image by the relative path from the JSON.
        markdown_content = replace_image_with_ocr_content(markdown_content, relative_img, ocr_content)
        print(f"OCR 进度: {ocr_count}/{total_items}")

    # Save the updated Markdown back to the same file.
    write_markdown_file(markdown_file_path, markdown_content)
    print(f"处理完成,已更新 {markdown_file_path} 文件。")

if __name__ == "__main__":
    # Exactly one command-line argument is required: the output directory
    # that contains the "auto" folder.
    if len(sys.argv) != 2:
        # Show the name the script was actually invoked with, so the usage
        # hint stays correct whatever the file is called.
        print(f"用法: python {os.path.basename(sys.argv[0])} <base_path>")
        sys.exit(1)

    base_path = sys.argv[1]
    main(base_path)

使用方式

python main.py {magic-pdf输出路径}

路径示例

E:\mineru\output\magic-pdf\companies-list

需要读取的文件(支持<0.7版本)

E:\mineru\output\magic-pdf\companies-list\auto\companies-list.md
E:\mineru\output\magic-pdf\companies-list\auto\companies-list_content_list.json

目前速度还可以,在笔记本RTX4060上基本上1S以内一张图

期待0.7更好的支持表格提取,这只是一个可用的临时方案。

beiluo avatar Aug 08 '24 02:08 beiluo