pdfminer icon indicating copy to clipboard operation
pdfminer copied to clipboard

AssertionError in drange when processing a page

Open jserrano-rebold opened this issue 5 years ago • 0 comments

Hi. I get an error when processing a page in some PDF files. Code:

    # Analyze the layout of a single page (page_num, 1-based) of the PDF at
    # `filename` and leave the result in `layout_dict` (None if the page was
    # not found or PDFMiner failed).
    #
    # The file is opened in a context manager so the handle is released even
    # if parsing fails before the page loop is reached.
    with open(filename, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure
        # (no password is supplied here).
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for layout analysis.
        laparams = LAParams()
        laparams.all_texts = True
        # NOTE: this was previously misspelled `dectect_vertical`, which only
        # created an unused attribute and left vertical detection disabled.
        laparams.detect_vertical = True
        laparams.word_margin = 0.06
        # Modified for PERAN
        laparams.line_margin = 0.5
        # Create a PDF page aggregator object that collects the page layout.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        ipage = 0
        layout_dict = None
        try:
            # Walk the pages (numbered from 1) until the requested one.
            for ipage, page in enumerate(PDFPage.create_pages(document), 1):
                if ipage != page_num:
                    continue
                offset = (0, 0)
                # Compute the displacement of the cropbox relative to the
                # mediabox (the coordinates we store are cropbox-based).
                if page.mediabox != page.cropbox:
                    offset = (page.mediabox[0] - page.cropbox[0],
                              page.cropbox[3] - page.mediabox[3])
                # Page box expressed with its origin at the cropbox corner.
                pagebox = [0, 0,
                           page.cropbox[2] - page.cropbox[0],
                           page.cropbox[3] - page.cropbox[1]]
                interpreter.process_page(page)
                layout = device.get_result()
                layout_dict = extract_layout_dict(
                    layout, overlap_pct, include_textline, include_char,
                    offset, pagebox)
                if correct_images:
                    correct_blocimages(layout_dict)
                break
        except Exception as e:
            # Best effort: report the PDFMiner failure and carry on with
            # layout_dict left as None.
            print("Error PDFMiner: %s" % format(e))
[CAP191211012.PDF](https://github.com/euske/pdfminer/files/3961030/CAP191211012.PDF)

Output:

  File "/var/www/worker_ocr/layout_analyzer.py", line 1265, in analyzer
    interpreter.process_page(page)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 833, in process_page
    self.device.end_page(page)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/converter.py", line 35, in end_page
    self.cur_item.analyze(self.laparams)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 646, in analyze
    obj.analyze(laparams)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 686, in analyze
    LTLayoutContainer.analyze(self, laparams)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 653, in analyze
    textboxes = list(self.group_textlines(laparams, textlines))
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 551, in group_textlines
    plane.extend(lines)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 283, in extend
    self.add(obj)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 288, in add
    for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 275, in _getrange
    for y in drange(y0, y1, self.gridsize):
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 121, in drange
    assert v0 < v1
AssertionError

jserrano-rebold avatar Dec 13 '19 13:12 jserrano-rebold