docling
docling copied to clipboard
Tables recognized as images
Bug
When parsing the attached documentation, some of the tables are incorrectly recognized as images, even though the following option is set: pipeline_options.generate_table_images = False
Steps to reproduce
import pathlib
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
import os
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
# Reproduction script: convert a PDF with docling, print how many groups,
# texts, tables and pictures were detected, export every picture item as a
# PNG, and save a Markdown rendering with the images embedded.
pdf_path = "Operating_Manual_PlanTeam-SPACE_first20.pdf"
# NOTE(review): model_name is assigned but never used in this snippet.
model_name = os.path.expanduser("~/models/models--Hiveurban--multilingual-e5-large-pooled/snapshots/72881863bb80d223adce6fdb057972bdbdaddf9f")
output_dir = pathlib.Path("images_docling")

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
# NOTE(review): OCR is switched off above, so these Tesseract options
# presumably have no effect on the run -- confirm against the docling docs.
pipeline_options.ocr_options = TesseractOcrOptions(force_full_page_ocr=True, lang=["deu"])
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True
# The setting the report is about: table images are explicitly disabled.
pipeline_options.generate_table_images = False

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    },
)
conv_res = converter.convert(pdf_path)

print("Dokument name:", conv_res.document.name)
print("Number of groups:", len(conv_res.document.groups))
print("Number of texts:", len(conv_res.document.texts))
print("Number of tables:", len(conv_res.document.tables))
print("Number of images:", len(conv_res.document.pictures))

output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_res.input.file.stem

# NOTE(review): table_counter is initialised but never incremented or read
# in this snippet; only pictures are exported below.
table_counter = 0
picture_counter = 0
for element, _level in conv_res.document.iterate_items():
    if isinstance(element, PictureItem):
        picture_counter += 1
        element_image_filename = (
            output_dir / f"{doc_filename}-picture-{picture_counter}.png"
        )
        with element_image_filename.open("wb") as fp:
            element.get_image(conv_res.document).save(fp, "PNG")

# Save markdown with embedded pictures
md_filename = output_dir / f"{doc_filename}-with-images.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)
Docling version
2.34.0
Python version
3.10.12
@PeterStaar-IBM Could you please assign this to me? I am really interested.