pdfminer.six
pdfminer.six copied to clipboard
"error: unpack requires a buffer of 14 bytes" in cmapdb.py
This error still occurs when I try to parse the following PDF file with pdfplumber, even though I have upgraded pdfminer.six to the latest version: test.pdf
My code is as follows:
import pdfplumber
pdf = pdfplumber.open(pdf_file)
page = pdf.pages[0]
page.extract_text()
When the above code is run on the attached file, the following error is raised:
error Traceback (most recent call last)
<ipython-input-6-d4d51c6cf413> in <module>
----> 1 print(page.extract_text())
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in extract_text(self, x_tolerance, y_tolerance)
178 y_tolerance=utils.DEFAULT_Y_TOLERANCE):
179
--> 180 return utils.extract_text(self.chars,
181 x_tolerance=x_tolerance,
182 y_tolerance=y_tolerance)
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\container.py in chars(self)
33 @property
34 def chars(self):
---> 35 return self.objects.get("char", [])
36
37 @property
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in objects(self)
64 def objects(self):
65 if hasattr(self, "_objects"): return self._objects
---> 66 self._objects = self.parse_objects()
67 return self._objects
68
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in parse_objects(self)
152 process_object(child)
153
--> 154 for obj in self.layout._objs:
155 process_object(obj)
156
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\page.py in layout(self)
58 def layout(self):
59 if hasattr(self, "_layout"): return self._layout
---> 60 self._layout = self.pdf.process_page(self.page_obj)
61 return self._layout
62
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfplumber\pdf.py in process_page(self, page)
49
50 def process_page(self, page):
---> 51 self.interpreter.process_page(page)
52 return self.device.get_result()
53
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
893 ctm = (1, 0, 0, 1, -x0, -y0)
894 self.device.begin_page(page, ctm)
--> 895 self.render_contents(page.resources, page.contents, ctm=ctm)
896 self.device.end_page(page)
897 return
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
906 self.init_resources(resources)
907 self.init_state(ctm)
--> 908 self.execute(list_value(streams))
909 return
910
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in execute(self, streams)
931 log.debug('exec: %s %r', name, args)
932 if len(args) == nargs:
--> 933 func(*args)
934 else:
935 log.debug('exec: %s', name)
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in do_Tj(self, s)
806 def do_Tj(self, s):
807 """Show text"""
--> 808 self.do_TJ([s])
809 return
810
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in do_TJ(self, seq)
801 return
802 self.device.render_string(self.textstate, seq, self.ncs,
--> 803 self.graphicstate.copy())
804 return
805
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string(self, textstate, seq, ncs, graphicstate)
81 seq, matrix, textstate.linematrix, font, fontsize,
82 scaling, charspace, wordspace, rise, dxscale, ncs,
---> 83 graphicstate)
84 return
85
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string_horizontal(self, seq, matrix, pos, font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
94 needcharspace = True
95 else:
---> 96 for cid in font.decode(obj):
97 if needcharspace:
98 x += charspace
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\pdffont.py in decode(self, bytes)
766
767 def decode(self, bytes):
--> 768 return self.cmap.decode(bytes)
769
770 def char_disp(self, cid):
c:\programfiles\anaconda3\envs\py36\lib\site-packages\pdfminer\cmapdb.py in decode(self, code)
115 n = len(code)//2
116 if n:
--> 117 return struct.unpack('>%dH' % n, code)
118 else:
119 return ()
error: unpack requires a buffer of 14 bytes
I’m getting a very similar problem, any solution?
Similar to https://github.com/pdfminer/pdfminer.six/issues/785
I can replicate this. Solution is needed.
$ python tools/pdf2txt.py ~/Downloads/test.pdf
Traceback (most recent call last):
File "/home/pieter/projects/pdfminer-upstream/tools/pdf2txt.py", line 317, in <module>
sys.exit(main())
File "/home/pieter/projects/pdfminer-upstream/tools/pdf2txt.py", line 311, in main
outfp = extract_text(**vars(parsed_args))
File "/home/pieter/projects/pdfminer-upstream/tools/pdf2txt.py", line 62, in extract_text
pdfminer.high_level.extract_text_to_fp(fp, **locals())
File "/home/pieter/projects/pdfminer-upstream/pdfminer/high_level.py", line 121, in extract_text_to_fp
interpreter.process_page(page)
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 991, in process_page
self.render_contents(page.resources, page.contents, ctm=ctm)
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 1010, in render_contents
self.execute(list_value(streams))
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 1036, in execute
func(*args)
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 903, in do_Tj
self.do_TJ([s])
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfinterp.py", line 896, in do_TJ
self.device.render_string(
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfdevice.py", line 133, in render_string
textstate.linematrix = self.render_string_horizontal(
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdfdevice.py", line 170, in render_string_horizontal
for cid in font.decode(obj):
File "/home/pieter/projects/pdfminer-upstream/pdfminer/pdffont.py", line 1175, in decode
return self.cmap.decode(bytes)
File "/home/pieter/projects/pdfminer-upstream/pdfminer/cmapdb.py", line 136, in decode
return struct.unpack(">%dH" % n, code)
struct.error: unpack requires a buffer of 26 bytes
As a workaround, you can repair the PDF by rewriting it with this Ghostscript command:
gs -o b.pdf -sDEVICE=pdfwrite a.pdf