docling
docling copied to clipboard
Docling crashes on the attached docx
Bug
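Docling crashes with a pydantic ValidationError (SectionHeaderItem `level` must be less than or equal to 100, but 111 was given) when converting the attached docx file.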
The file features examples of:
- standard heading styles;
- custom heading styles (styles that have an exact outline level specified);
- tables;
- formulas;
- enumerations with numbers (single-level);
- enumerations with numbers (multi-level);
- enumerations with bullet-points;
- headers;
- footers;
- footnotes.
The file was created using Word from Microsoft Office Professional Plus 2019.
Steps to reproduce
Create a Python script named convert_with_docling.py:
#!/usr/bin/env python3
import sys
from docling.document_converter import DocumentConverter
def main():
    """Convert a document to Markdown with Docling.

    Reads two command-line arguments: the source document and the path of
    the Markdown file to write. If the argument count is wrong, prints a
    usage message and exits with status 1.
    """
    # Exactly two arguments are required besides the script name.
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <source> <output_file>")
        sys.exit(1)

    source = sys.argv[1]
    output_file = sys.argv[2]

    converter = DocumentConverter()
    # The ValidationError shown in the report is raised inside this call.
    result = converter.convert(source)

    # Write the Markdown export as UTF-8 regardless of the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(result.document.export_to_markdown())
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Run:
python convert_with_docling.py test_doc-small.docx output.md
I get the following exception:
Traceback (most recent call last):
File "./pandoc_test/convert_with_docling.py", line 18, in <module>
main()
File "./pandoc_test/convert_with_docling.py", line 13, in main
result = converter.convert(source)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/pydantic/_internal/_validate_call.py", line 38, in wrapper_function
return wrapper(*args, **kwargs)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/pydantic/_internal/_validate_call.py", line 111, in __call__
res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/document_converter.py", line 203, in convert
return next(all_res)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/document_converter.py", line 226, in convert_all
for conv_res in conv_res_iter:
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/document_converter.py", line 261, in _convert
for item in map(
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/document_converter.py", line 302, in _process_document
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/document_converter.py", line 325, in _execute_pipeline
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/pipeline/base_pipeline.py", line 53, in execute
raise e
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/pipeline/base_pipeline.py", line 45, in execute
conv_res = self._build_document(conv_res)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/pipeline/simple_pipeline.py", line 41, in _build_document
conv_res.document = conv_res.input._backend.convert()
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/backend/msword_backend.py", line 119, in convert
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/backend/msword_backend.py", line 195, in walk_linear
self.handle_text_elements(element, docx_obj, doc)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/backend/msword_backend.py", line 324, in handle_text_elements
self.add_header(doc, p_level, text)
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling/backend/msword_backend.py", line 371, in add_header
self.parents[curr_level] = doc.add_heading(
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/docling_core/types/doc/document.py", line 1746, in add_heading
section_header_item = SectionHeaderItem(
File "~/anaconda3/envs/docling/lib/python3.10/site-packages/pydantic/main.py", line 214, in __init__
validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for SectionHeaderItem
level
Input should be less than or equal to 100 [type=less_than_equal, input_value=111, input_type=int]
For further information visit https://errors.pydantic.dev/2.10/v/less_than_equal
Docling version
2.18.0
Python version
Python 3.10.4