MinerU icon indicating copy to clipboard operation
MinerU copied to clipboard

识别pdf报错:json.decoder.JSONDecodeError: Expecting ',' delimiter: line 28 column 9 (char 727)

Open sliontc opened this issue 2 months ago • 3 comments

Description of the bug | 错误描述

(MinerU) E:\development\mineru>magic-pdf -p CN101213040B.pdf -o output -m ocr -l ch
import tensorrt_llm failed, if do not use tensorrt, ignore this message
import lmdeploy failed, if do not use lmdeploy, ignore this message
2024-12-09 22:12:39.107 | ERROR | magic_pdf.tools.cli:parse_doc:108 - Expecting ',' delimiter: line 28 column 9 (char 727)
Traceback (most recent call last):

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, │ │ └ {'__name__': '__main__', '__doc__': None, '__package__': '', '__loader__': <zipimporter object "C:\Users\sgw\miniconda3\envs... │ └ <code object <module> at 0x000001D891894030, file "C:\Users\sgw\miniconda3\envs\MinerU\Scripts\magic-pdf.exe\__main__.py", li... └ <function _run_code at 0x000001D891871480>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\runpy.py", line 86, in _run_code exec(code, run_globals) │ └ {'__name__': '__main__', '__doc__': None, '__package__': '', '__loader__': <zipimporter object "C:\Users\sgw\miniconda3\envs... └ <code object <module> at 0x000001D891894030, file "C:\Users\sgw\miniconda3\envs\MinerU\Scripts\magic-pdf.exe\__main__.py", li...

File "C:\Users\sgw\miniconda3\envs\MinerU\Scripts\magic-pdf.exe\__main__.py", line 7, in <module> sys.exit(cli()) │ │ └ <Command cli> │ └ <built-in function exit> └ <module 'sys' (built-in)>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\click\core.py", line 1157, in __call__ return self.main(*args, **kwargs) │ │ │ └ {} │ │ └ () │ └ <function BaseCommand.main at 0x000001D891D051B0> └ <Command cli>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\click\core.py", line 1078, in main rv = self.invoke(ctx) │ │ └ <click.core.Context object at 0x000001D8918D89A0> │ └ <function Command.invoke at 0x000001D891D05C60> └ <Command cli>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\click\core.py", line 1434, in invoke return ctx.invoke(self.callback, **ctx.params) │ │ │ │ │ └ {'path': 'CN101213040B.pdf', 'output_dir': 'output', 'method': 'ocr', 'lang': 'ch', 'debug_able': False, 'start_page_id': 0, ... │ │ │ │ └ <click.core.Context object at 0x000001D8918D89A0> │ │ │ └ <function cli at 0x000001D8C1BE04C0> │ │ └ <Command cli> │ └ <function Context.invoke at 0x000001D891D049D0> └ <click.core.Context object at 0x000001D8918D89A0>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\click\core.py", line 783, in invoke return __callback(*args, **kwargs) │ └ {'path': 'CN101213040B.pdf', 'output_dir': 'output', 'method': 'ocr', 'lang': 'ch', 'debug_able': False, 'start_page_id': 0, ... └ ()

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\tools\cli.py", line 114, in cli parse_doc(path) │ └ 'CN101213040B.pdf' └ <function cli.<locals>.parse_doc at 0x000001D8918BB880>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\tools\cli.py", line 95, in parse_doc do_parse( └ <function do_parse at 0x000001D8C1BA7EB0>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\tools\common.py", line 134, in do_parse pipe.pipe_analyze() │ └ <function OCRPipe.pipe_analyze at 0x000001D8C1BA7520> └ <magic_pdf.pipe.OCRPipe.OCRPipe object at 0x000001D8C1BB0DF0>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\pipe\OCRPipe.py", line 22, in pipe_analyze self.model_list = doc_analyze(self.pdf_bytes, ocr=True, │ │ │ │ └ b'%PDF-1.7\n%\xc2\xb5\xc2\xb6\n\n1 0 obj\n<</Type/Catalog/Pages 2 0 R>>\nendobj\n\n2 0 obj\n<</Type/Pages/Count 18/Kids[5 0 R... │ │ │ └ <magic_pdf.pipe.OCRPipe.OCRPipe object at 0x000001D8C1BB0DF0> │ │ └ <function doc_analyze at 0x000001D8AB1423B0> │ └ [] └ <magic_pdf.pipe.OCRPipe.OCRPipe object at 0x000001D8C1BB0DF0>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\model\doc_analyze_by_custom_model.py", line 147, in doc_analyze custom_model = model_manager.get_model(ocr, show_log, lang, layout_model, formula_enable, table_enable) │ │ │ │ │ │ │ └ None │ │ │ │ │ │ └ None │ │ │ │ │ └ None │ │ │ │ └ 'ch' │ │ │ └ False │ │ └ True │ └ <function ModelSingleton.get_model at 0x000001D8AB142320> └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x000001D8C1BB1090>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\model\doc_analyze_by_custom_model.py", line 75, in get_model self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang, layout_model=layout_model, │ │ │ │ │ │ │ └ None │ │ │ │ │ │ └ 'ch' │ │ │ │ │ └ False │ │ │ │ └ True │ │ │ └ <function custom_model_init at 0x000001D8AB142200> │ │ └ (True, False, 'ch', None, None, None) │ └ {} └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x000001D8C1BB1090>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\model\doc_analyze_by_custom_model.py", line 100, in custom_model_init local_models_dir = get_local_models_dir() └ <function get_local_models_dir at 0x000001D8AB141CF0>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\libs\config_reader.py", line 59, in get_local_models_dir config = read_config() └ <function read_config at 0x000001D8AB1417E0>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\site-packages\magic_pdf\libs\config_reader.py", line 26, in read_config config = json.load(f) │ │ └ <_io.TextIOWrapper name='C:\Users\sgw\magic-pdf.json' mode='r' encoding='utf-8'> │ └ <function load at 0x000001D89385ECB0> └ <module 'json' from 'C:\Users\sgw\miniconda3\envs\MinerU\lib\json\__init__.py'>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\json\__init__.py", line 293, in load return loads(fp.read(), │ │ └ <method 'read' of '_io.TextIOWrapper' objects> │ └ <_io.TextIOWrapper name='C:\Users\sgw\magic-pdf.json' mode='r' encoding='utf-8'> └ <function loads at 0x000001D89385ED40>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\json\__init__.py", line 346, in loads return _default_decoder.decode(s) │ │ └ '{\n "bucket_info": {\n "bucket-name-1": [\n "ak",\n "sk",\n "endpoint"\n ]... │ └ <function JSONDecoder.decode at 0x000001D89385E560> └ <json.decoder.JSONDecoder object at 0x000001D893864A60>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\json\decoder.py", line 337, in decode obj, end = self.raw_decode(s, idx=_w(s, 0).end()) │ │ │ │ └ '{\n "bucket_info": {\n "bucket-name-1": [\n "ak",\n "sk",\n "endpoint"\n ]... │ │ │ └ <built-in method match of re.Pattern object at 0x000001D8937F5150> │ │ └ '{\n "bucket_info": {\n "bucket-name-1": [\n "ak",\n "sk",\n "endpoint"\n ]... │ └ <function JSONDecoder.raw_decode at 0x000001D89385E5F0> └ <json.decoder.JSONDecoder object at 0x000001D893864A60>

File "C:\Users\sgw\miniconda3\envs\MinerU\lib\json\decoder.py", line 353, in raw_decode obj, end = self.scan_once(s, idx) │ │ │ └ 0 │ │ └ '{\n "bucket_info": {\n "bucket-name-1": [\n "ak",\n "sk",\n "endpoint"\n ]... │ └ <_json.Scanner object at 0x000001D89383A800> └ <json.decoder.JSONDecoder object at 0x000001D893864A60>

json.decoder.JSONDecodeError: Expecting ',' delimiter: line 28 column 9 (char 727)

How to reproduce the bug | 如何复现

1. `magic-pdf -p CN101213040B.pdf -o output -m ocr -l ch`

Operating system | 操作系统

Windows

Python version | Python 版本

3.10

Software version | 软件版本 (magic-pdf --version)

0.10.x

Device mode | 设备模式

cpu

sliontc avatar Dec 09 '24 14:12 sliontc