MinerU
MinerU copied to clipboard
AssertionError: ('Unhandled', 12) 因 pdfminer 不支持 UTF-32 编码,导致乱码检测过程报错
Description of the bug | 错误描述
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Cell In[8], line 10
8 jso_useful_key = {"_pdf_type": "", "model_list": []}
9 pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
---> 10 pipe.pipe_classify()
11 pipe.pipe_analyze()
12 pipe.pipe_parse()
File ~/.local/lib/python3.10/site-packages/magic_pdf/pipe/UNIPipe.py:26, in UNIPipe.pipe_classify(self)
25 def pipe_classify(self):
---> 26 self.pdf_type = AbsPipe.classify(self.pdf_bytes)
File ~/.local/lib/python3.10/site-packages/magic_pdf/pipe/AbsPipe.py:66, in AbsPipe.classify(pdf_bytes)
61 @staticmethod
62 def classify(pdf_bytes: bytes) -> str:
63 """
64 根据pdf的元数据,判断是文本pdf,还是ocr pdf
65 """
---> 66 pdf_meta = pdf_meta_scan(pdf_bytes)
67 if pdf_meta.get("_need_drop", False): # 如果返回了需要丢弃的标志,则抛出异常
68 raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
File ~/.local/lib/python3.10/site-packages/magic_pdf/filter/pdf_meta_scan.py:339, in pdf_meta_scan(pdf_bytes)
337 text_language = get_language(doc)
338 # logger.info(f"text_language: {text_language}")
--> 339 invalid_chars = check_invalid_chars(pdf_bytes)
340 # logger.info(f"invalid_chars: {invalid_chars}")
341
342 # 最后输出一条json
343 res = {
344 "is_needs_password": is_needs_password,
345 "is_encrypted": is_encrypted,
(...)
357 "metadata": doc.metadata
358 }
File ~/.local/lib/python3.10/site-packages/magic_pdf/filter/pdf_meta_scan.py:305, in check_invalid_chars(pdf_bytes)
301 def check_invalid_chars(pdf_bytes):
302 """
303 乱码检测
304 """
--> 305 return detect_invalid_chars(pdf_bytes)
File ~/.local/lib/python3.10/site-packages/magic_pdf/libs/pdf_check.py:44, in detect_invalid_chars(src_pdf_bytes)
42 sample_pdf_bytes = sample_docs.tobytes()
43 sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
---> 44 text = extract_text(sample_pdf_file_like_object)
45 text = text.replace("\n", "")
46 # logger.info(text)
File ~/.local/lib/python3.10/site-packages/pdfminer/high_level.py:175, in extract_text(pdf_file, password, page_numbers, maxpages, caching, codec, laparams)
166 interpreter = PDFPageInterpreter(rsrcmgr, device)
168 for page in PDFPage.get_pages(
169 fp,
170 page_numbers,
(...)
173 caching=caching,
174 ):
--> 175 interpreter.process_page(page)
177 return output_string.getvalue()
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:997, in PDFPageInterpreter.process_page(self, page)
995 ctm = (1, 0, 0, 1, -x0, -y0)
996 self.device.begin_page(page, ctm)
--> 997 self.render_contents(page.resources, page.contents, ctm=ctm)
998 self.device.end_page(page)
999 return
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:1016, in PDFPageInterpreter.render_contents(self, resources, streams, ctm)
1014 self.init_resources(resources)
1015 self.init_state(ctm)
-> 1016 self.execute(list_value(streams))
1017 return
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:1042, in PDFPageInterpreter.execute(self, streams)
1040 log.debug("exec: %s %r", name, args)
1041 if len(args) == nargs:
-> 1042 func(*args)
1043 else:
1044 log.debug("exec: %s", name)
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:972, in PDFPageInterpreter.do_Do(self, xobjid_arg)
970 resources = self.resources.copy()
971 self.device.begin_figure(xobjid, bbox, matrix)
--> 972 interpreter.render_contents(
973 resources, [xobj], ctm=mult_matrix(matrix, self.ctm)
974 )
975 self.device.end_figure(xobjid)
976 elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:1014, in PDFPageInterpreter.render_contents(self, resources, streams, ctm)
1007 """Render the content streams.
1008
1009 This method may be called recursively.
1010 """
1011 log.debug(
1012 "render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
1013 )
-> 1014 self.init_resources(resources)
1015 self.init_state(ctm)
1016 self.execute(list_value(streams))
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:384, in PDFPageInterpreter.init_resources(self, resources)
382 objid = spec.objid
383 spec = dict_value(spec)
--> 384 self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
385 elif k == "ColorSpace":
386 for (csid, spec) in dict_value(v).items():
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:234, in PDFResourceManager.get_font(self, objid, spec)
232 if k in spec:
233 subspec[k] = resolve1(spec[k])
--> 234 font = self.get_font(None, subspec)
235 else:
236 if settings.STRICT:
File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:225, in PDFResourceManager.get_font(self, objid, spec)
222 font = PDFType3Font(self, spec)
223 elif subtype in ("CIDFontType0", "CIDFontType2"):
224 # CID Font
--> 225 font = PDFCIDFont(self, spec)
226 elif subtype == "Type0":
227 # Type0 Font
228 dfonts = list_value(spec["DescendantFonts"])
File ~/.local/lib/python3.10/site-packages/pdfminer/pdffont.py:1097, in PDFCIDFont.__init__(self, rsrcmgr, spec, strict)
1095 if ttf:
1096 try:
-> 1097 self.unicode_map = ttf.create_unicode_map()
1098 except TrueTypeFont.CMapNotFound:
1099 pass
File ~/.local/lib/python3.10/site-packages/pdfminer/pdffont.py:830, in TrueTypeFont.create_unicode_map(self)
828 char2gid[c] = (c + idd) & 0xFFFF
829 else:
--> 830 assert False, str(("Unhandled", fmttype))
831 if not char2gid:
832 raise TrueTypeFont.CMapNotFound
AssertionError: ('Unhandled', 12)
How to reproduce the bug | 如何复现
Operating system | 操作系统
Linux
Python version | Python 版本
3.10
Software version | 软件版本 (magic-pdf --version)
0.8.x
Device mode | 设备模式
cuda
已通过魔改 `detect_invalid_chars` 函数的方式修复~
已通过魔改 `detect_invalid_chars` 函数的方式修复~
怎么改的?有示例代码吗?
怎么改的?有示例代码吗?