将模型加载和解析的内容分开
目前只能先输入解析方式、文件路径、是否使用表格检测和 OCR 等参数,然后才加载模型并解析。建议改成先加载好模型(此时不需要输入任何信息),之后再输入这些信息进行解析;这样更便于部署,也便于连续解析多个 PDF。
您好, 我也有同样的需求和问题, 我想要将识别的过程部署在线上服务器, 使用gpu处理, 本机只需要在需要时调用api即可, 您有好的解决办法的话麻烦告诉我一下
您好,我也有同样的需求和问题,我想要将识别的流程部署在线上服务器,使用gpu处理,本机只需要在需要时调用api即可,您有解决办法麻烦告诉我一下好吗
ok
您好,我也有同样的需求和问题,我想要将识别的流程部署在线服务器上,使用gpu处理,本机只需要在需要时调用api即可,您有解决方法麻烦告诉我一下好吗?
你可以先自己试着修改一下:主要是 doc_analyze_by_custom_model.py 里的 doc_analyze 函数,需要把模型加载部分和 PDF 导入部分分开;另外 Pipe 相关的类(AbsPipe、TXTPipe、OCRPipe)也都需要相应修改。
有人解决了吗
有人解决了吗
我解决了,目前可以把模型加载和 PDF 导入分开了,但只支持 txt 和 ocr 两种方式;auto 方式需要先读入 PDF 才能判断使用 txt 还是 ocr,所以暂时不支持。
# Main program changes (posted in the thread):
def _parse_one_pdf(pipe, pdf_path: str, is_json_md_dump: bool) -> None:
    """Parse one PDF with an already-initialised pipe and write its outputs.

    Outputs go to a directory next to the PDF, named after the PDF without
    its extension; images go to an ``images`` sub-directory.
    """
    # Strip only the final extension, so "a.b.pdf" maps to "a.b" and the
    # output directory agrees with the path handed to main() below.
    pdf_path_without_extension, _ = os.path.splitext(pdf_path)
    pdf_name = os.path.basename(pdf_path_without_extension)
    output_path = os.path.join(os.path.dirname(pdf_path), pdf_name)
    output_image_path = os.path.join(output_path, 'images')

    # Parent directory name of the images, so they are stored as relative
    # paths in the .md and content_list.json files.
    image_path_parent = os.path.basename(output_image_path)

    # Read the PDF as raw bytes; the context manager closes the handle.
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    pipe.set_pdf(pdf_bytes)
    image_writer = DiskReaderWriter(output_image_path)
    pipe.set_image_writer(image_writer)
    md_writer = DiskReaderWriter(output_path)

    # Run the already-loaded model on this PDF, then parse the result.
    pipe.pipe_jiexi()
    pipe.pipe_parse()

    # Produce the content list and the markdown text.
    content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
    md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

    if is_json_md_dump:
        json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)

    # NOTE(review): the original paste lost its indentation; this call is
    # assumed to run for every PDF regardless of is_json_md_dump -- confirm.
    main(pdf_path_without_extension)


def pdf_parse_main(
        parse_method: str = 'ocr',
        is_json_md_dump: bool = True
):
    """Load the model once, then parse PDFs whose paths are typed interactively.

    The loop prompts for a PDF path, parses that file, and repeats until the
    user enters ``exit`` or interrupts with Ctrl-C.

    Args:
        parse_method: ``"txt"`` or ``"ocr"``. Any other value logs an error
            and exits with status 1.
        is_json_md_dump: when true, ``json_md_dump`` is called with the
            results of each parsed PDF.
    """
    try:
        model_json = []

        # Choose the parsing pipeline.
        if parse_method == "txt":
            pipe = TXTPipe(model_json)
        elif parse_method == "ocr":
            pipe = OCRPipe(model_json)
        else:
            # Only the two branches above exist, so "auto" is not advertised.
            logger.error("unknown parse method, only ocr, txt allowed")
            raise SystemExit(1)

        # Run classification.
        pipe.pipe_classify()

        # Without externally supplied model data, use the built-in model.
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                logger.error("need model list input")
                raise SystemExit(1)

        while True:
            pdf_path = input("请输入PDF文件路径(输入 'exit' 以退出程序): ").strip()
            if pdf_path.lower() == 'exit':
                break
            if not os.path.exists(pdf_path):
                print(f"错误:文件 {pdf_path} 不存在")
                continue
            try:
                _parse_one_pdf(pipe, pdf_path, is_json_md_dump)
            except Exception as e:
                # A single bad PDF must not end the session: log it and keep
                # the loaded model available for the next path.
                logger.exception(e)
    except KeyboardInterrupt:
        print("\n程序已退出")
    except Exception as e:
        logger.exception(e)
# OCRPipe code (posted in the thread):
class OCRPipe(AbsPipe):
    """OCR pipeline in which model loading is separate from supplying the PDF.

    Call order: ``pipe_analyze()`` once to obtain the model, then for each
    PDF ``set_pdf()`` / ``set_image_writer()`` followed by ``pipe_jiexi()``
    and ``pipe_parse()``.
    """

    def __init__(self, model_list: list, is_debug: bool = False):
        super().__init__(model_list, is_debug)
        # Filled in later by set_pdf() and pipe_analyze(); initialised here so
        # the attributes always exist and misuse can be reported clearly.
        self.pdf_bytes = None
        self.custom_model = None

    def pipe_classify(self):
        """Do nothing; this pipe has no classification step."""
        pass

    def set_pdf(self, pdf_bytes: bytes):
        """Set the PDF data."""
        self.pdf_bytes = pdf_bytes

    def pipe_analyze(self):
        """Obtain the model via ``doc_analyze(ocr=True)`` and keep it on the pipe."""
        self.custom_model = doc_analyze(ocr=True)

    def set_image_writer(self, image_writer: AbsReaderWriter):
        """Store the writer by delegating to the base class."""
        super().set_image_writer(image_writer)

    def pipe_jiexi(self):
        """Run the stored model on the current PDF and store the model output.

        ('jiexi' is pinyin for "parse".)

        Raises:
            RuntimeError: if ``pipe_analyze()`` or ``set_pdf()`` has not been
                called yet.
        """
        if self.custom_model is None:
            raise RuntimeError("pipe_analyze() must be called before pipe_jiexi()")
        if self.pdf_bytes is None:
            raise RuntimeError("set_pdf() must be called before pipe_jiexi()")
        self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

    def pipe_parse(self):
        """Build ``pdf_mid_data`` from the PDF bytes and the model output."""
        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                          is_debug=self.is_debug)

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
        """Return the base-class content list and log completion."""
        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
        logger.info("ocr_pipe mk content list finished")
        return result

    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
        """Return the base-class markdown output and log completion."""
        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
        logger.info(f"ocr_pipe mk {md_make_mode} finished")
        return result
# TXTPipe code (posted in the thread):
class TXTPipe(AbsPipe):
    """Text pipeline in which model loading is separate from supplying the PDF.

    Call order: ``pipe_analyze()`` once to obtain the model, then for each
    PDF ``set_pdf()`` / ``set_image_writer()`` followed by ``pipe_jiexi()``
    and ``pipe_parse()``.
    """

    def __init__(self, model_list: list, is_debug: bool = False):
        super().__init__(model_list, is_debug)
        # Filled in later by set_pdf() and pipe_analyze(); initialised here so
        # the attributes always exist and misuse can be reported clearly.
        self.pdf_bytes = None
        self.custom_model = None

    def pipe_classify(self):
        """Do nothing; this pipe has no classification step."""
        pass

    def set_pdf(self, pdf_bytes: bytes):
        """Set the PDF data."""
        self.pdf_bytes = pdf_bytes

    def pipe_analyze(self):
        """Obtain the model via ``doc_analyze(ocr=False)`` and keep it on the pipe."""
        self.custom_model = doc_analyze(ocr=False)

    def set_image_writer(self, image_writer: AbsReaderWriter):
        """Store the writer by delegating to the base class."""
        super().set_image_writer(image_writer)

    def pipe_jiexi(self):
        """Run the stored model on the current PDF and store the model output.

        ('jiexi' is pinyin for "parse".)

        Raises:
            RuntimeError: if ``pipe_analyze()`` or ``set_pdf()`` has not been
                called yet.
        """
        if self.custom_model is None:
            raise RuntimeError("pipe_analyze() must be called before pipe_jiexi()")
        if self.pdf_bytes is None:
            raise RuntimeError("set_pdf() must be called before pipe_jiexi()")
        self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model)

    def pipe_parse(self):
        """Build ``pdf_mid_data`` from the PDF bytes and the model output."""
        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                          is_debug=self.is_debug)

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
        """Return the base-class content list and log completion."""
        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
        logger.info("txt_pipe mk content list finished")
        return result

    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
        """Return the base-class markdown output and log completion."""
        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
        logger.info(f"txt_pipe mk {md_make_mode} finished")
        return result
# AbsPipe code (posted in the thread; only the changed part of the class is shown):
class AbsPipe(ABC):
    """
    Abstract base class for txt and ocr processing.
    """
    PIP_OCR = "ocr"
    PIP_TXT = "txt"

    def __init__(self, model_list: list, is_debug: bool = False):
        self.model_list = model_list
        self.pdf_mid_data = None  # uncompressed
        self.is_debug = is_debug
        # Assigned later through set_image_writer(); initialised here so the
        # attribute exists even before a writer has been supplied.
        self.image_writer = None

    def set_image_writer(self, image_writer: AbsReaderWriter):
        """Store the writer on the pipe."""
        self.image_writer = image_writer

    def get_compress_pdf_mid_data(self):
        """Return ``pdf_mid_data`` passed through ``JsonCompressor.compress_json``."""
        return JsonCompressor.compress_json(self.pdf_mid_data)

    @abstractmethod
    def pipe_classify(self):
        """
        Stateful classification.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_analyze(self):
        """
        Stateful model analysis.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_parse(self):
        """
        Stateful parsing.
        """
        raise NotImplementedError
以上代码适用于 model_json 为空(即使用内置模型解析)的情况。
txtpipe代码: class TXTPipe(AbsPipe):
def __init__(self,model_list: list, is_debug: bool = False): super().__init__(model_list, is_debug) def pipe_classify(self): pass def set_pdf(self, pdf_bytes: bytes): """设置 PDF 数据""" self.pdf_bytes = pdf_bytes def pipe_analyze(self): self.custom_model = doc_analyze(ocr=False) def set_image_writer(self, image_writer: AbsReaderWriter): super().set_image_writer(image_writer) def pipe_jiexi(self): self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model) def pipe_parse(self): self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): result = super().pipe_mk_uni_format(img_parent_path, drop_mode) logger.info("txt_pipe mk content list finished") return result def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) logger.info(f"txt_pipe mk {md_make_mode} finished") return result
有完整的重构代码吗?
txtpipe代码: class TXTPipe(AbsPipe):
def __init__(self,model_list: list, is_debug: bool = False): super().__init__(model_list, is_debug) def pipe_classify(self): pass def set_pdf(self, pdf_bytes: bytes): """设置 PDF 数据""" self.pdf_bytes = pdf_bytes def pipe_analyze(self): self.custom_model = doc_analyze(ocr=False) def set_image_writer(self, image_writer: AbsReaderWriter): super().set_image_writer(image_writer) def pipe_jiexi(self): self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model) def pipe_parse(self): self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): result = super().pipe_mk_uni_format(img_parent_path, drop_mode) logger.info("txt_pipe mk content list finished") return result def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) logger.info(f"txt_pipe mk {md_make_mode} finished") return result有完整的重构代码吗?
没放的代码就是没变动的,可以对照原代码看一下
txtpipe代码: class TXTPipe(AbsPipe):
def __init__(self,model_list: list, is_debug: bool = False): super().__init__(model_list, is_debug) def pipe_classify(self): pass def set_pdf(self, pdf_bytes: bytes): """设置 PDF 数据""" self.pdf_bytes = pdf_bytes def pipe_analyze(self): self.custom_model = doc_analyze(ocr=False) def set_image_writer(self, image_writer: AbsReaderWriter): super().set_image_writer(image_writer) def pipe_jiexi(self): self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model) def pipe_parse(self): self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): result = super().pipe_mk_uni_format(img_parent_path, drop_mode) logger.info("txt_pipe mk content list finished") return result def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) logger.info(f"txt_pipe mk {md_make_mode} finished") return result有完整的重构代码吗?
没放的代码就是没变动的,可以对照原代码看一下
doc_analyze和doc_jiexi重构了吧?
txtpipe代码: class TXTPipe(AbsPipe):
def __init__(self,model_list: list, is_debug: bool = False): super().__init__(model_list, is_debug) def pipe_classify(self): pass def set_pdf(self, pdf_bytes: bytes): """设置 PDF 数据""" self.pdf_bytes = pdf_bytes def pipe_analyze(self): self.custom_model = doc_analyze(ocr=False) def set_image_writer(self, image_writer: AbsReaderWriter): super().set_image_writer(image_writer) def pipe_jiexi(self): self.model_list = doc_jiexi(self.pdf_bytes, self.custom_model) def pipe_parse(self): self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): result = super().pipe_mk_uni_format(img_parent_path, drop_mode) logger.info("txt_pipe mk content list finished") return result def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) logger.info(f"txt_pipe mk {md_make_mode} finished") return result有完整的重构代码吗?
没放的代码就是没变动的,可以对照原代码看一下
doc_analyze和doc_jiexi重构了吧?
# Thread reply: "Yes, I forgot to post these."
def doc_analyze(ocr: bool = False, show_log: bool = False):
    """Return the model that ``ModelSingleton().get_model(ocr, show_log)`` provides.

    No PDF is involved at this stage; both arguments are passed straight
    through to ``get_model``.
    """
    return ModelSingleton().get_model(ocr, show_log)
def doc_jiexi(pdf_bytes: bytes, custom_model):
    """Run ``custom_model`` on every page image of a PDF.

    Args:
        pdf_bytes: raw PDF data, passed to ``load_images_from_pdf``.
        custom_model: callable applied to each page's ``"img"`` entry.

    Returns:
        A list with one dict per page, holding the model result under
        ``"layout_dets"`` and the page number, height and width under
        ``"page_info"``.
    """

    def _analyze_page(page_no, page):
        # Read all page fields before invoking the model.
        picture, width, height = page["img"], page["width"], page["height"]
        detections = custom_model(picture)
        return {
            "layout_dets": detections,
            "page_info": {"page_no": page_no, "height": height, "width": width},
        }

    pages = load_images_from_pdf(pdf_bytes)

    # Only the model pass is timed, not the image loading above.
    started_at = time.time()
    model_json = [_analyze_page(page_no, page) for page_no, page in enumerate(pages)]
    elapsed = time.time() - started_at

    logger.info(f"doc analyze cost: {elapsed}")
    return model_json
@2257396011 @lygiants @cskkx1 @hlzhu1983
可以通过以下方法对模型预加载,只需要调用一次init_model方法,后续解析代码都可以不用更改 https://github.com/opendatalab/MinerU/issues/517#issuecomment-2324324940