pdfminer
pdfminer copied to clipboard
AssertionError in drange when processing a page
Hi. I get an error when processing a page in some PDF files. Code:
# Extract the layout of a single page (`page_num`, 1-based) from `filename`.
# The result is left in `layout_dict`, which stays None if the page is not
# found or if pdfminer fails while processing it.
#
# The file is opened in a context manager so the handle is released even when
# parsing or the extraction check raises before the page loop is reached.
with open(filename, 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # No password is supplied here.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for layout analysis.
    laparams = LAParams()
    laparams.all_texts = True
    # The attribute is spelled `detect_vertical`; the previous spelling
    # (`dectect_vertical`) only created an unused attribute, so vertical
    # text detection was never actually enabled.
    laparams.detect_vertical = True
    laparams.word_margin = 0.06
    # Modified for PERAN.
    laparams.line_margin = 0.5
    # Create a PDF page aggregator object (used instead of a plain PDFDevice).
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    ipage = 0
    layout_dict = None
    try:
        # Walk the pages, counting from 1, until the requested one is reached.
        for ipage, page in enumerate(PDFPage.create_pages(document), 1):
            if ipage != page_num:
                continue
            offset = (0, 0)
            # Compute the cropbox offset (the coordinates we store are
            # relative to the cropbox).
            if page.mediabox != page.cropbox:
                offset = (page.mediabox[0] - page.cropbox[0],
                          page.cropbox[3] - page.mediabox[3])
            # Page box with the cropbox origin moved to (0, 0).
            # NOTE(review): the pasted snippet lost its indentation; this is
            # computed unconditionally because it is used below whether or
            # not the boxes differ -- confirm against the original file.
            pagebox = [0, 0,
                       page.cropbox[2] - page.cropbox[0],
                       page.cropbox[3] - page.cropbox[1]]
            interpreter.process_page(page)
            layout = device.get_result()
            layout_dict = extract_layout_dict(layout, overlap_pct,
                                              include_textline, include_char,
                                              offset, pagebox)
            # analyze_overlap(layout_dict, overlap_pct=70)
            if correct_images:
                correct_blocimages(layout_dict)
            # Only one page is wanted; stop iterating.
            break
    except Exception as e:
        # Best-effort: report the pdfminer failure and carry on with
        # layout_dict possibly left as None.
        print("Error PDFMiner: %s" % format(e))
[CAP191211012.PDF](https://github.com/euske/pdfminer/files/3961030/CAP191211012.PDF)
Output:
File "/var/www/worker_ocr/layout_analyzer.py", line 1265, in analyzer
interpreter.process_page(page)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 833, in process_page
self.device.end_page(page)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/converter.py", line 35, in end_page
self.cur_item.analyze(self.laparams)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 646, in analyze
obj.analyze(laparams)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 686, in analyze
LTLayoutContainer.analyze(self, laparams)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 653, in analyze
textboxes = list(self.group_textlines(laparams, textlines))
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 551, in group_textlines
plane.extend(lines)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 283, in extend
self.add(obj)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 288, in add
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 275, in _getrange
for y in drange(y0, y1, self.gridsize):
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 121, in drange
assert v0 < v1
AssertionError