pdfquery
pdfquery copied to clipboard
ValueError: Invalid attribute name u'AAPL:AKExtras'
Processing a PDF whose annotations have a colon in one of their key names raises an exception:
Traceback (most recent call last):
File "test_ocr.py", line 633, in test_petition
analyze = analyze_bankruptcy_petition(pdf_txt = pdf_txt, pdf_fp = file)
File "program.py", line 255, in analyze_bankruptcy_petition
pdfq.load(*pages_to_analyze)
File "..\libs\pdfquery\pdfquery.py", line 385, in load
self.tree = self.get_tree(*_flatten(page_numbers))
File "..\libs\pdfquery\pdfquery.py", line 484, in get_tree
_flatten(page_numbers)]
File "..\libs\pdfquery\pdfquery.py", line 603, in get_layout
layout = self._add_annots(layout, page.annots)
File "..\libs\pdfquery\pdfquery.py", line 663, in _add_annots
elem = parser.makeelement('Annot', annot)
File "parser.pxi", line 878, in lxml.etree._BaseParser.makeelement (src/lxml/lxml.etree.c:74798)
File "apihelpers.pxi", line 156, in lxml.etree._makeElement (src/lxml/lxml.etree.c:12231)
File "apihelpers.pxi", line 144, in lxml.etree._makeElement (src/lxml/lxml.etree.c:12106)
File "apihelpers.pxi", line 298, in lxml.etree._initNodeAttributes (src/lxml/lxml.etree.c:13603)
File "apihelpers.pxi", line 1554, in lxml.etree._attributeValidOrRaise (src/lxml/lxml.etree.c:24197)
ValueError: Invalid attribute name u'AAPL:AKExtras'
This is the PDF that causes the problem. I fixed this bug by monkeypatching the function _add_annots in pdfquery.py:
def _add_annots(self, layout, annots):
    """Add the page's annotations to the layout object as ``Annot`` elements.

    Each annotation dictionary is turned into the attribute mapping of a
    new ``Annot`` element, which is then added to ``layout``:

    * ``Rect`` is renamed to ``bbox`` and passed through
      ``self._set_hwxy_attrs``.
    * If the annotation has an ``A`` entry with a ``URI``, that URI is
      copied to a top-level ``URI`` attribute.
    * Values that are not strings are converted with ``obj_to_string``.
    * Colons in attribute names are replaced by underscores, because
      ``parser.makeelement`` raises ``ValueError: Invalid attribute name``
      for names such as ``AAPL:AKExtras``. This is done for every key,
      including keys whose value is already a string.

    :param layout: layout object exposing ``add(elem)``.
    :param annots: the page's annotations (possibly an unresolved
        reference), or a falsy value when the page has none.
    :return: the same ``layout`` object.
    """
    # Function-scope import keeps this usable as a drop-in monkeypatch.
    import logging

    if not annots:
        return layout

    for annot in resolve1(annots):
        annot = resolve1(annot)
        if annot.get('Rect') is not None:
            annot['bbox'] = annot.pop('Rect')  # Rename key
            annot = self._set_hwxy_attrs(annot)
        try:
            annot['URI'] = resolve1(annot['A'])['URI']
        except KeyError:
            # No action dictionary, or an action without a URI.
            pass

        # Build a separate attribute dict rather than adding/removing keys
        # on ``annot`` while iterating over it.
        attrs = {}
        for key, value in six.iteritems(annot):
            if not isinstance(value, six.string_types):
                value = obj_to_string(value)
            if ":" in key:
                # Must happen for string values too, otherwise makeelement
                # still rejects the attribute name.
                logging.warning("Converting key: %s", key)
                key = key.replace(":", "_")
            attrs[key] = value

        elem = parser.makeelement('Annot', attrs)
        layout.add(elem)
    return layout
Thanks very much, @speedplane! Your monkeypatch worked for me, too.
In case you are interested, I also had to add this exception handler around the bbox handling:
if annots:
for annot in resolve1(annots):
annot = resolve1(annot)
if annot.get('Rect') is not None:
try:
annot['bbox'] = annot.pop('Rect') # Rename key
annot = self._set_hwxy_attrs(annot)
except Exception as e:
print('PDFQuery._add_annots: cant form bbox?!',e,annot)
try:
annot['URI'] = resolve1(annot['A'])['URI']
except KeyError:
pass