MinerU icon indicating copy to clipboard operation
MinerU copied to clipboard

将模型加载和解析的内容分开

Open 2257396011 opened this issue 1 year ago • 14 comments

目前只能提前输入解析的方式,文件路径,是否使用表格检测,ocr等,然后再加载模型并解析。感觉可以改成先不输入任何信息,先加载好模型,然后再输入信息进行解析操作,感觉这种也方便部署和进行连续的pdf解析等操作。

2257396011 avatar Aug 15 '24 02:08 2257396011

您好, 我也有同样的需求和问题, 我想要将识别的过程部署在线上服务器, 使用gpu处理, 本机只需要在需要时调用api即可, 您有好的解决办法的话麻烦告诉我一下

lygiants avatar Aug 15 '24 07:08 lygiants

您好,我也有同样的需求和问题,我想要将识别的流程部署在线上服务器,使用gpu处理,本机只需要在需要时调用api即可,您有解决办法麻烦告诉我一下好吗

ok

2257396011 avatar Aug 15 '24 07:08 2257396011

您好,我也有同样的需求和问题,我想要将识别的流程部署在线服务器上,使用gpu处理,本机只需要在需要时调用api即可,您有解决方法麻烦告诉我一下好吗?

你可以先自己试着修改一下,主要是doc_analyze.....这个里边的doc_analyze函数,你需要把模型加载部分和pdf的导入部分分开,然后就是Pipe这个里边的程序,都需要修改。

2257396011 avatar Aug 15 '24 07:08 2257396011

有人解决了吗

cskkx1 avatar Aug 27 '24 02:08 cskkx1

有人解决了吗

我解决了,目前可以支持将加载和导入pdf分开了,但是只支持txt方式和ocr方式,那个auto必须在之前输入pdf来判断使用txt还是ocr所以不行

2257396011 avatar Aug 27 '24 02:08 2257396011

主程序代码改动: def pdf_parse_main( parse_method: str = 'ocr', is_json_md_dump: bool = True ): try: model_json = []

    # 选择解析方式
    if parse_method == "txt":
        pipe = TXTPipe(model_json)
    elif parse_method == "ocr":
        pipe = OCRPipe(model_json)
    else:
        logger.error("unknown parse method, only auto, ocr, txt allowed")
        exit(1)

    # 执行分类
    pipe.pipe_classify()

    # 如果没有传入模型数据,则使用内置模型解析
    if not model_json:
        if model_config.__use_inside_model__:
            pipe.pipe_analyze()  # 解析
        else:
            logger.error("need model list input")
            exit(1)
    while True:
        time.sleep(0.00001)
        pdf_path = input("请输入PDF文件路径(输入 'exit' 以退出程序): ")
        if pdf_path.lower() == 'exit':
            break
        if not os.path.exists(pdf_path):
            print(f"错误:文件 {pdf_path} 不存在")
            continue
        pdf_name = os.path.basename(pdf_path).split(".")[0]
        pdf_path_parent = os.path.dirname(pdf_path)


        output_path = os.path.join(pdf_path_parent, pdf_name)

        output_image_path = os.path.join(output_path, 'images')

        # 获取图片的父路径,为的是以相对路径保存到 .md 和 conent_list.json 文件中
        image_path_parent = os.path.basename(output_image_path)

        pdf_bytes = open(pdf_path, "rb").read()  # 读取 pdf 文件的二进制数据

        pipe.set_pdf(pdf_bytes)
        image_writer = DiskReaderWriter(output_image_path)
        pipe.set_image_writer(image_writer)
        md_writer = DiskReaderWriter(output_path)
        # 执行解析
        pipe.pipe_jiexi()

        # 执行解析
        pipe.pipe_parse()

        # 保存 text 和 md 格式的结果
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)

        pdf_path_without_extension, _ = os.path.splitext(pdf_path)
        main(pdf_path_without_extension)



except KeyboardInterrupt:
    print("\n程序已退出")
except Exception as e:
    logger.exception(e)

2257396011 avatar Aug 27 '24 02:08 2257396011

ocrpipe代码: class OCRPipe(AbsPipe):

def __init__(self, model_list: list, is_debug: bool = False):
    super().__init__(model_list, is_debug)

def pipe_classify(self):
    pass

def set_pdf(self, pdf_bytes: bytes):
    """设置 PDF 数据"""
    self.pdf_bytes = pdf_bytes

def pipe_analyze(self):
    self.custom_model = doc_analyze(ocr=True)

def set_image_writer(self, image_writer: AbsReaderWriter):
    super().set_image_writer(image_writer)

def pipe_jiexi(self):
    self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

def pipe_parse(self):
    self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
    result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
    logger.info("ocr_pipe mk content list finished")
    return result

def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
    result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
    logger.info(f"ocr_pipe mk {md_make_mode} finished")
    return result

2257396011 avatar Aug 27 '24 02:08 2257396011

txtpipe代码: class TXTPipe(AbsPipe):

def __init__(self,model_list: list, is_debug: bool = False):
    super().__init__(model_list, is_debug)

def pipe_classify(self):
    pass

def set_pdf(self, pdf_bytes: bytes):
    """设置 PDF 数据"""
    self.pdf_bytes = pdf_bytes

def pipe_analyze(self):
    self.custom_model = doc_analyze(ocr=False)

def set_image_writer(self, image_writer: AbsReaderWriter):
    super().set_image_writer(image_writer)

def pipe_jiexi(self):
    self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

def pipe_parse(self):
    self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
    result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
    logger.info("txt_pipe mk content list finished")
    return result

def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
    result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
    logger.info(f"txt_pipe mk {md_make_mode} finished")
    return result

2257396011 avatar Aug 27 '24 02:08 2257396011

abspipe代码: class AbsPipe(ABC): """ txt和ocr处理的抽象类 """ PIP_OCR = "ocr" PIP_TXT = "txt"

def __init__(self, model_list: list, is_debug: bool = False):
    self.model_list = model_list
    self.pdf_mid_data = None  # 未压缩
    self.is_debug = is_debug

def set_image_writer(self, image_writer: AbsReaderWriter):
    self.image_writer = image_writer
def get_compress_pdf_mid_data(self):
    return JsonCompressor.compress_json(self.pdf_mid_data)

@abstractmethod
def pipe_classify(self):
    """
    有状态的分类
    """
    raise NotImplementedError

@abstractmethod
def pipe_analyze(self):
    """
    有状态的跑模型分析
    """
    raise NotImplementedError

@abstractmethod
def pipe_parse(self):
    """
    有状态的解析
    """
    raise NotImplementedError

2257396011 avatar Aug 27 '24 02:08 2257396011

在model_json为空的情况下使用

2257396011 avatar Aug 27 '24 02:08 2257396011

txtpipe代码: class TXTPipe(AbsPipe):

def __init__(self,model_list: list, is_debug: bool = False):
    super().__init__(model_list, is_debug)

def pipe_classify(self):
    pass

def set_pdf(self, pdf_bytes: bytes):
    """设置 PDF 数据"""
    self.pdf_bytes = pdf_bytes

def pipe_analyze(self):
    self.custom_model = doc_analyze(ocr=False)

def set_image_writer(self, image_writer: AbsReaderWriter):
    super().set_image_writer(image_writer)

def pipe_jiexi(self):
    self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

def pipe_parse(self):
    self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
    result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
    logger.info("txt_pipe mk content list finished")
    return result

def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
    result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
    logger.info(f"txt_pipe mk {md_make_mode} finished")
    return result

有完整的重构代码吗?

hlzhu1983 avatar Aug 29 '24 09:08 hlzhu1983

txtpipe代码: class TXTPipe(AbsPipe):

def __init__(self,model_list: list, is_debug: bool = False):
    super().__init__(model_list, is_debug)

def pipe_classify(self):
    pass

def set_pdf(self, pdf_bytes: bytes):
    """设置 PDF 数据"""
    self.pdf_bytes = pdf_bytes

def pipe_analyze(self):
    self.custom_model = doc_analyze(ocr=False)

def set_image_writer(self, image_writer: AbsReaderWriter):
    super().set_image_writer(image_writer)

def pipe_jiexi(self):
    self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

def pipe_parse(self):
    self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
    result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
    logger.info("txt_pipe mk content list finished")
    return result

def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
    result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
    logger.info(f"txt_pipe mk {md_make_mode} finished")
    return result

有完整的重构代码吗?

没放的代码就是没变动的,可以对照原代码看一下

2257396011 avatar Aug 30 '24 01:08 2257396011

txtpipe代码: class TXTPipe(AbsPipe):

def __init__(self,model_list: list, is_debug: bool = False):
    super().__init__(model_list, is_debug)

def pipe_classify(self):
    pass

def set_pdf(self, pdf_bytes: bytes):
    """设置 PDF 数据"""
    self.pdf_bytes = pdf_bytes

def pipe_analyze(self):
    self.custom_model = doc_analyze(ocr=False)

def set_image_writer(self, image_writer: AbsReaderWriter):
    super().set_image_writer(image_writer)

def pipe_jiexi(self):
    self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

def pipe_parse(self):
    self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
    result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
    logger.info("txt_pipe mk content list finished")
    return result

def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
    result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
    logger.info(f"txt_pipe mk {md_make_mode} finished")
    return result

有完整的重构代码吗?

没放的代码就是没变动的,可以对照原代码看一下

doc_analyze和doc_jiexi重构了吧?

hlzhu1983 avatar Aug 30 '24 01:08 hlzhu1983

txtpipe代码: class TXTPipe(AbsPipe):

def __init__(self,model_list: list, is_debug: bool = False):
    super().__init__(model_list, is_debug)

def pipe_classify(self):
    pass

def set_pdf(self, pdf_bytes: bytes):
    """设置 PDF 数据"""
    self.pdf_bytes = pdf_bytes

def pipe_analyze(self):
    self.custom_model = doc_analyze(ocr=False)

def set_image_writer(self, image_writer: AbsReaderWriter):
    super().set_image_writer(image_writer)

def pipe_jiexi(self):
    self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

def pipe_parse(self):
    self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
    result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
    logger.info("txt_pipe mk content list finished")
    return result

def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
    result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
    logger.info(f"txt_pipe mk {md_make_mode} finished")
    return result

有完整的重构代码吗?

没放的代码就是没变动的,可以对照原代码看一下

doc_analyze和doc_jiexi重构了吧?

是的忘了发了

def doc_analyze(ocr: bool = False, show_log: bool = False):
    """Return the model that ``ModelSingleton().get_model(ocr, show_log)`` provides.

    No PDF data is involved here; running the model over a document is done
    separately by ``doc_jiexi``.
    """
    return ModelSingleton().get_model(ocr, show_log)

def doc_jiexi(pdf_bytes: bytes, custom_model):
    """Run ``custom_model`` on every page image of a PDF and collect the results.

    Args:
        pdf_bytes: PDF data passed to ``load_images_from_pdf``.
        custom_model: Callable invoked once per page image.

    Returns:
        A list with one dict per page, in page order:
        ``{"layout_dets": <model result>, "page_info": {"page_no", "height", "width"}}``.
    """
    pages = load_images_from_pdf(pdf_bytes)
    per_page_results = []
    # The timer starts after image loading, so only the model calls are timed.
    started_at = time.time()
    for page_no, page in enumerate(pages):
        image, width, height = page["img"], page["width"], page["height"]
        detections = custom_model(image)
        per_page_results.append(
            {
                "layout_dets": detections,
                "page_info": {"page_no": page_no, "height": height, "width": width},
            }
        )
    elapsed = time.time() - started_at
    logger.info(f"doc analyze cost: {elapsed}")
    return per_page_results

2257396011 avatar Aug 30 '24 02:08 2257396011

@2257396011 @lygiants @cskkx1 @hlzhu1983

可以通过以下方法对模型预加载,只需要调用一次init_model方法,后续解析代码都可以不用更改 https://github.com/opendatalab/MinerU/issues/517#issuecomment-2324324940

myhloli avatar Sep 02 '24 10:09 myhloli