Proposal: Add Docling Integration for End-to-End Document Extraction
Motivation / Use-Case
LangExtract today works only on raw text strings. In real-world workflows the source is usually a PDF, DOCX, or PPTX. Users currently have to:
- Manually convert the file to text (losing layout & provenance).
- Feed the plain text into LangExtract.
- Manually map extractions back to the original document for verification.
A single-step pipeline would make LangExtract dramatically easier to adopt.
Proposed Solution
Integrate the Docling library as an optional front end:
- Docling converts multiple document formats into a unified `DoclingDocument`.
- It preserves provenance (page, bounding box, reading order).
- The extracted text chunks are fed into LangExtract exactly as plain text is today.
- The resulting extractions are mapped back to the original document via the provenance metadata.

The integration would be opt-in (`pip install langextract[docling]`) so the core package stays dependency-free; a minimal sketch of the pipeline follows.
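The sketch below uses Docling's stock `HierarchicalChunker` purely as a stand-in for whatever chunking strategy the integration would ship with; the proof of concept further down swaps in a custom chunker and wires the results into LangExtract.

```python
# Minimal sketch, assuming the docling package is installed; "report.pdf" is a
# placeholder path and HierarchicalChunker is only a stand-in, not a proposal.
from docling.chunking import HierarchicalChunker
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("report.pdf").document  # unified DoclingDocument

for chunk in HierarchicalChunker().chunk(doc):
    # chunk.text is the plain text that would be fed to LangExtract, exactly as
    # raw strings are today; chunk.meta.doc_items[*].prov carries the page
    # numbers and bounding boxes needed to map extractions back onto the PDF.
    print(chunk.text[:60], chunk.meta.doc_items[0].prov)
```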
Proof of Concept
I have a minimal working implementation:
`markdown_chunker.py`

```python
from typing import Any, Iterator

from docling.chunking import BaseChunk, BaseChunker, DocChunk, DocMeta
from docling.document_converter import DocumentConverter
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
from docling_core.types import DoclingDocument as DLDocument
from docling_core.types.doc.document import (
    DocItem,
    InlineGroup,
    LevelNumber,
    ListGroup,
)
from pydantic import ConfigDict


class HierarchicalMarkdownChunker(BaseChunker):
    r"""Modified HierarchicalChunker preserving markdown tables and sections.

    Args:
        delim (str): Delimiter to use for merging text. Defaults to "\n".
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def chunk(
        self,
        dl_doc: DLDocument,
        **kwargs: Any,
    ) -> Iterator[BaseChunk]:
        r"""Chunk the provided document.

        Args:
            dl_doc (DLDocument): document to chunk

        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
        my_doc_ser = MarkdownDocSerializer(doc=dl_doc)
        heading_by_level: dict[LevelNumber, str] = {}
        visited: set[str] = set()
        ser_res = create_ser_result()
        excluded_refs = my_doc_ser.get_excluded_refs(**kwargs)

        for item, level in dl_doc.iterate_items(with_groups=True):
            if item.self_ref in excluded_refs:
                continue
            elif (
                isinstance(item, (ListGroup, InlineGroup, DocItem))
                and item.self_ref not in visited
            ):
                ser_res = my_doc_ser.serialize(item=item, visited=visited)
            else:
                continue

            if not ser_res.text:
                continue

            if doc_items := [u.item for u in ser_res.spans]:
                c = DocChunk(
                    text=ser_res.text,
                    meta=DocMeta(
                        doc_items=doc_items,
                        headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                        or None,
                        origin=dl_doc.origin,
                    ),
                )
                yield c
```
`pdf_extract.py`

```python
import dataclasses
from pathlib import Path
from typing import Iterable, List

from docling.document_converter import DocumentConverter
from docling_core.types.doc.document import ProvenanceItem

from langextract import data, extract
from markdown_chunker import HierarchicalMarkdownChunker


@dataclasses.dataclass(init=False)
class ExtractionWithProvenance(data.Extraction):
    """Extraction extended with provenance information."""

    provenance: List[ProvenanceItem] | None = None

    def __init__(self, provenance=None, **kwargs):
        super().__init__(**kwargs)
        self.provenance = provenance


@dataclasses.dataclass
class AnnotatedDocumentWithProvenance(data.AnnotatedDocument):
    """AnnotatedDocument with provenance-aware extractions."""

    extractions: list[ExtractionWithProvenance] | None = None


def _add_provenance_to_doc(
    doc: data.AnnotatedDocument,
    chunks_with_offsets: list[tuple],
) -> AnnotatedDocumentWithProvenance:
    """Attach provenance info to each extraction in a document.

    Args:
        doc: The annotated document returned from extract().
        chunks_with_offsets: List of (chunk, start_offset, end_offset).

    Returns:
        AnnotatedDocumentWithProvenance with provenance-aware extractions.
    """
    new_extractions = []
    extraction_fields = {
        f.name for f in dataclasses.fields(data.Extraction) if not f.name.startswith("_")
    }
    for extraction in doc.extractions or []:
        prov_items = []
        if extraction.char_interval:
            start = extraction.char_interval.start_pos or 0
            end = extraction.char_interval.end_pos or 0
            # Collect provenance from every chunk that overlaps the extraction span.
            for chunk, c_start, c_end in chunks_with_offsets:
                if start < c_end and end > c_start:
                    prov_items.extend(p for x in chunk.meta.doc_items for p in x.prov)
        # Rebuild the extraction as an ExtractionWithProvenance, keeping only the
        # fields defined on data.Extraction.
        extraction_dict = dataclasses.asdict(extraction)
        filtered_extraction_dict = {
            k: v for k, v in extraction_dict.items() if k in extraction_fields
        }
        new_extractions.append(
            ExtractionWithProvenance(**filtered_extraction_dict, provenance=prov_items)
        )
    return AnnotatedDocumentWithProvenance(
        extractions=new_extractions,
        text=doc.text,
    )


def extract_with_file_support(
    source: str | Path | data.Document | Iterable[data.Document],
    **kwargs,
) -> data.AnnotatedDocument | AnnotatedDocumentWithProvenance | Iterable[data.AnnotatedDocument]:
    """Wrapper for `extract` that also supports file paths.

    Args:
        source: Raw text, a file path (str or Path), a Document,
            or an iterable of Documents.
        **kwargs: Additional arguments passed through to `extract`.

    Returns:
        An AnnotatedDocument with the extracted information when source is a
        raw string or URL, an AnnotatedDocumentWithProvenance when source is a
        file path, or an iterable of AnnotatedDocuments when the input is an
        iterable of Documents.

    Raises:
        FileNotFoundError: If a file path is provided but does not exist.
        OSError: If the file cannot be read.
    """
    if not isinstance(source, (str, Path)):
        return extract(source, **kwargs)
    if not Path(source).is_file():
        return extract(source, **kwargs)

    # Convert the file with Docling and chunk the resulting DoclingDocument.
    converter = DocumentConverter()
    result = converter.convert(source)
    chunks = list(HierarchicalMarkdownChunker().chunk(result.document))

    # Join the chunk texts into one string, remembering each chunk's character
    # offsets so extractions can later be mapped back to the source document.
    normalized_text = ""
    chunks_with_offsets = []
    offset = 0
    for c in chunks:
        start = offset
        end = start + len(c.text)
        chunks_with_offsets.append((c, start, end))
        normalized_text += c.text + "\n\n"
        offset = end + 2  # account for "\n\n"

    result = extract(normalized_text, **kwargs)
    return _add_provenance_to_doc(result, chunks_with_offsets)
```
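To make the offset bookkeeping above concrete, here is a tiny self-contained illustration of the overlap test; the chunk names and lengths are made up and no Docling objects are involved.

```python
# Two chunks of lengths 10 and 20 joined with "\n\n" occupy these character
# ranges in the normalized text: chunk-A -> [0, 10), chunk-B -> [12, 32).
chunks_with_offsets = [("chunk-A", 0, 10), ("chunk-B", 12, 32)]

def overlapping_chunks(start: int, end: int) -> list[str]:
    """Return the chunks whose [c_start, c_end) range overlaps [start, end)."""
    return [
        chunk
        for chunk, c_start, c_end in chunks_with_offsets
        if start < c_end and end > c_start
    ]

print(overlapping_chunks(3, 8))   # ['chunk-A']            -> provenance from chunk A only
print(overlapping_chunks(8, 15))  # ['chunk-A', 'chunk-B'] -> extraction crosses a chunk boundary
```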
Example usage:

```python
import langextract as lx
import textwrap

from pdf_extract import extract_with_file_support

# 1. Define the prompt and extraction rules
prompt = textwrap.dedent("""\
    Extract characters, emotions, and relationships in order of appearance.
    Use exact text for extractions. Do not paraphrase or overlap entities.
    Provide meaningful attributes for each entity to add context.""")

# 2. Provide a high-quality example to guide the model
examples = [
    lx.data.ExampleData(
        text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
        extractions=[
            lx.data.Extraction(
                extraction_class="character",
                extraction_text="ROMEO",
                attributes={"emotional_state": "wonder"},
            ),
            lx.data.Extraction(
                extraction_class="emotion",
                extraction_text="But soft!",
                attributes={"feeling": "gentle awe"},
            ),
            lx.data.Extraction(
                extraction_class="relationship",
                extraction_text="Juliet is the sun",
                attributes={"type": "metaphor"},
            ),
        ],
    )
]

source = "<sample pdf file>.pdf"
result = extract_with_file_support(
    source=source,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)

# result.extractions[0].extraction_text
# result.extractions[0].provenance
```
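Each entry in `provenance` is a docling-core `ProvenanceItem`, so the source location of an extraction can be recovered. A minimal sketch, assuming `result` from the run above and the `page_no`/`bbox` fields exposed by docling-core:

```python
# Sketch: report where each extraction came from in the source PDF.
for extraction in result.extractions or []:
    for prov in extraction.provenance or []:
        # page_no is the page number; bbox is the bounding box on that page.
        print(
            f"{extraction.extraction_class}: {extraction.extraction_text!r} "
            f"-> page {prov.page_no}, bbox {prov.bbox}"
        )
```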
Limitations of the example
- Provenance is sentence-level, not word-level.
- No updated visualizer (yet) to highlight extractions in the original PDF.
Related issues: #79, #121, #178
Would this be worth submitting as a PR?
Update: I've published langextract-docling, a wrapper that adds PDF support to LangExtract.
It works as a drop-in replacement for `lx.extract(...)`, with support for PDF files (local paths or URLs).
```python
import langextract_docling as lx

result = lx.extract(
    text_or_documents="path/to/file.pdf",
    prompt_description="Extract entities",
    examples=[...],
)
```
Install via:

```bash
pip install langextract-docling
```
Feedback welcome!
This is awesome, @tijoseymathew - thank you for sharing! Native PDF support is on the roadmap for LangExtract, and your example with Docling will be very valuable input for planning the final approach. If anyone is processing larger-scale PDFs with langextract-docling, please report back as well.
Hi @tijoseymathew - just to clarify, langextract-docling currently seems to return an annotated document rather than a PDF; eventually, the aim would be to overlay the grounded data on the PDF itself. Thanks again for creating the fork and opening this issue!
Hi @aksg87, I wasn't sure if there was enough interest earlier, so I didn't expand the library further. Based on this thread, I've created a new branch, `feat/provenance`, that adds bounding-box provenance directly to `AnnotatedDocument` and enables PDF visualization.
Here’s a quick GIF of the annotated PDF output:
There are a few quirks I’m still ironing out, but the core flow is working. Bounding boxes currently map to the span of words that produced the extraction, not the exact token.
To test it locally:
```bash
uvx --from git+https://github.com/tijoseymathew/langextract-docling/@feat/provenance python test_lx_docling.py
```
Sample code:

```python
import langextract_docling as lx
import textwrap

# 1. Define the prompt and extraction rules
prompt = textwrap.dedent("""\
    Extract characters, emotions, and relationships in order of appearance.
    Use exact text for extractions. Do not paraphrase or overlap entities.
    Provide meaningful attributes for each entity to add context.""")

# 2. Provide a high-quality example to guide the model
examples = [
    lx.data.ExampleData(
        text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
        extractions=[
            lx.data.Extraction(
                extraction_class="character",
                extraction_text="ROMEO",
                attributes={"emotional_state": "wonder"},
            ),
            lx.data.Extraction(
                extraction_class="emotion",
                extraction_text="But soft!",
                attributes={"feeling": "gentle awe"},
            ),
            lx.data.Extraction(
                extraction_class="relationship",
                extraction_text="Juliet is the sun",
                attributes={"type": "metaphor"},
            ),
        ],
    )
]

# The input PDF to be processed
pdf_file = "<local pdf file>"

# Run the extraction
result = lx.extract(
    text_or_documents=pdf_file,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)

# Save the results to a JSONL file
lx.io.save_annotated_documents(
    [result], output_name="extraction_results.jsonl", output_dir="."
)

# Generate the visualization from the file
html_content = lx.visualize("extraction_results.jsonl")
with open("visualization.html", "w") as f:
    if hasattr(html_content, "data"):
        f.write(html_content.data)  # For Jupyter/Colab
    else:
        f.write(html_content)
```
Let me know if this direction matches what you were thinking.
@tijoseymathew I think this is really great; lots of clients are asking for precise annotations in their PDFs. Please keep up the work, there is definitely interest.
+1 @tijoseymathew this is great, and a very valuable reference! Ultimately, going from data items back to the PDF will be extremely useful for those workflows.