pdfminer.six icon indicating copy to clipboard operation
pdfminer.six copied to clipboard

"error: unpack requires a buffer of 14 bytes" in cmapdb.py

Open IceFlameWorm opened this issue 3 years ago • 3 comments

This error still occurs when I try to parse the following PDF file with pdfplumber, even though I have upgraded pdfminer.six to the latest version: test.pdf

My code is as follows:

import pdfplumber
pdf = pdfplumber.open(pdf_file)
page = pdf.pages[0]
page.extract_text()

When the above code is run on the attached file, the following error is raised:

error                                     Traceback (most recent call last)
<ipython-input-6-d4d51c6cf413> in <module>
----> 1 print(page.extract_text())

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in extract_text(self, x_tolerance, y_tolerance)
    178         y_tolerance=utils.DEFAULT_Y_TOLERANCE):
    179 
--> 180         return utils.extract_text(self.chars,
    181             x_tolerance=x_tolerance,
    182             y_tolerance=y_tolerance)

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\container.py in chars(self)
     33     @property
     34     def chars(self):
---> 35         return self.objects.get("char", [])
     36 
     37     @property

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in objects(self)
     64     def objects(self):
     65         if hasattr(self, "_objects"): return self._objects
---> 66         self._objects = self.parse_objects()
     67         return self._objects
     68 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in parse_objects(self)
    152                     process_object(child)
    153 
--> 154         for obj in self.layout._objs:
    155             process_object(obj)
    156 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in layout(self)
     58     def layout(self):
     59         if hasattr(self, "_layout"): return self._layout
---> 60         self._layout = self.pdf.process_page(self.page_obj)
     61         return self._layout
     62 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\pdf.py in process_page(self, page)
     49 
     50     def process_page(self, page):
---> 51         self.interpreter.process_page(page)
     52         return self.device.get_result()
     53 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
    893             ctm = (1, 0, 0, 1, -x0, -y0)
    894         self.device.begin_page(page, ctm)
--> 895         self.render_contents(page.resources, page.contents, ctm=ctm)
    896         self.device.end_page(page)
    897         return

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
    906         self.init_resources(resources)
    907         self.init_state(ctm)
--> 908         self.execute(list_value(streams))
    909         return
    910 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in execute(self, streams)
    931                         log.debug('exec: %s %r', name, args)
    932                         if len(args) == nargs:
--> 933                             func(*args)
    934                     else:
    935                         log.debug('exec: %s', name)

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in do_Tj(self, s)
    806     def do_Tj(self, s):
    807         """Show text"""
--> 808         self.do_TJ([s])
    809         return
    810 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in do_TJ(self, seq)
    801             return
    802         self.device.render_string(self.textstate, seq, self.ncs,
--> 803                                   self.graphicstate.copy())
    804         return
    805 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string(self, textstate, seq, ncs, graphicstate)
     81                 seq, matrix, textstate.linematrix, font, fontsize,
     82                 scaling, charspace, wordspace, rise, dxscale, ncs,
---> 83                 graphicstate)
     84         return
     85 

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string_horizontal(self, seq, matrix, pos, font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
     94                 needcharspace = True
     95             else:
---> 96                 for cid in font.decode(obj):
     97                     if needcharspace:
     98                         x += charspace

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdffont.py in decode(self, bytes)
    766 
    767     def decode(self, bytes):
--> 768         return self.cmap.decode(bytes)
    769 
    770     def char_disp(self, cid):

c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\cmapdb.py in decode(self, code)
    115         n = len(code)//2
    116         if n:
--> 117             return struct.unpack('>%dH' % n, code)
    118         else:
    119             return ()

error: unpack requires a buffer of 14 bytes

IceFlameWorm avatar Sep 18 '21 06:09 IceFlameWorm

I’m getting a very similar problem, any solution?

Vricken avatar Jul 08 '22 17:07 Vricken

Similar to https://github.com/pdfminer/pdfminer.six/issues/785

I can replicate this. A solution is needed.

$ python tools/pdf2txt.py ~/Downloads/test.pdf 
Traceback (most recent call last):
  File "/home/pieter/projects/pdfminer-upstream/tools/pdf2txt.py", line 317, in <module>
    sys.exit(main())
  File "/home/pieter/projects/pdfminer-upstream/tools/pdf2txt.py", line 311, in main
    outfp = extract_text(**vars(parsed_args))
  File "/home/pieter/projects/pdfminer-upstream/tools/pdf2txt.py", line 62, in extract_text
    pdfminer.high_level.extract_text_to_fp(fp, **locals())
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/high_level.py", line 121, in extract_text_to_fp
    interpreter.process_page(page)
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 991, in process_page
    self.render_contents(page.resources, page.contents, ctm=ctm)
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 1010, in render_contents
    self.execute(list_value(streams))
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 1036, in execute
    func(*args)
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 903, in do_Tj
    self.do_TJ([s])
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 896, in do_TJ
    self.device.render_string(
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfdevice.py", line 133, in render_string
    textstate.linematrix = self.render_string_horizontal(
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfdevice.py", line 170, in render_string_horizontal
    for cid in font.decode(obj):
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdffont.py", line 1175, in decode
    return self.cmap.decode(bytes)
  File "/home/pieter/projects/pdfminer-upstream/pdfminer/cmapdb.py", line 136, in decode
    return struct.unpack(">%dH" % n, code)
struct.error: unpack requires a buffer of 26 bytes

pietermarsman avatar Aug 08 '22 20:08 pietermarsman

You can repair the PDF by rewriting it with Ghostscript, using this command:

gs -o b.pdf -sDEVICE=pdfwrite a.pdf

FANGOD avatar Oct 19 '23 02:10 FANGOD