Docling is not providing good results for the Arabic language
I have been working with Arabic PDFs. What if an Arabic PDF consists of scanned images? At this stage, we have to classify whether a page contains machine-readable text or a scanned image. Do we have a way to figure out whether a page has machine-readable text or is a scanned image? If we can figure that out, then we can apply the OCR pipeline to that particular page. I would be grateful if anyone could provide me with the configurations required to get better results for the Arabic language.
@mudassir206 We have so far taken care of correct representation of arabic script from digital PDF text. For embedded bitmaps (e.g. scanned pages) we currently depend on the capabilities of the available OCR engines. Some have automatic language detection built in, others expect an explicit languages parameter. There is an internal mechanism which automatically engages OCR on bitmap regions in a page when they are present. Could you provide a PDF sample for us to check on?
Here is an example financial statement of a listed company in the local market.
@cau-git Thanks for the response. We have tested with a couple of samples where Docling failed to extract text from bitmaps. There are also a couple of scenarios where the Arabic text lines got interchanged: the text reads right-to-left, but after extraction it came out reversed. Below is the code configuration we used, and I have attached a PDF file. --/////// import logging import time from pathlib import Path from PIL import Image import numpy as np import cv2
from docling_core.types.doc import ( ImageRefMode, PictureItem, TableItem, DocItemLabel, ) from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( PdfPipelineOptions, EasyOcrOptions, TableFormerMode, TesseractCliOcrOptions, ) from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(name) IMAGE_RESOLUTION_SCALE = 2.0 TARGET_DPI = 300
def rescale_image_to_dpi(
    input_path: Path, output_path: Path, target_dpi: int = 300
) -> Image.Image:
    """Resample the image at *input_path* to an effective *target_dpi*.

    The resampled image is written to *output_path* with its DPI metadata set
    to *target_dpi*, and is also returned.

    Args:
        input_path: Source image file.
        output_path: Destination file for the rescaled image.
        target_dpi: Desired resolution in dots per inch.

    Returns:
        The resampled PIL image.
    """
    with Image.open(input_path) as img:
        # BUGFIX: some images carry a DPI of 0 in their metadata, which
        # previously caused a ZeroDivisionError — fall back to 72 DPI
        # (the conventional screen default) in that case.
        original_dpi = img.info.get("dpi", (72, 72))[0] or 72
        scale_factor = target_dpi / original_dpi
        # BUGFIX: clamp to at least 1 pixel per side — int() truncation on a
        # very small image could otherwise ask resize() for a zero dimension.
        new_size = (
            max(1, int(img.width * scale_factor)),
            max(1, int(img.height * scale_factor)),
        )
        rescaled_img = img.resize(new_size, resample=Image.LANCZOS)
        rescaled_img.save(output_path, dpi=(target_dpi, target_dpi))
        return rescaled_img
def preprocess_image_for_ocr(pil_image: Image.Image) -> np.ndarray:
    """Prepare an image for Arabic OCR.

    Converts the RGB input to grayscale, applies an edge-preserving
    bilateral denoise, and binarizes with an adaptive mean threshold.

    Args:
        pil_image: RGB PIL image to preprocess.

    Returns:
        A binarized single-channel numpy array suitable for OCR.
    """
    rgb_array = np.array(pil_image)
    grayscale = cv2.cvtColor(rgb_array, cv2.COLOR_RGB2GRAY)
    # Bilateral filtering smooths noise while keeping glyph edges sharp.
    smoothed = cv2.bilateralFilter(grayscale, 9, 75, 75)
    binary = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        25,
        15,
    )
    return binary
def main():
    """Convert an Arabic PDF with Docling, OCR-ing embedded pictures separately.

    Pipeline:
      1. Convert the PDF with a layout/table pipeline using region-level
         Tesseract OCR (Arabic + English).
      2. For every embedded picture: export it, rescale to TARGET_DPI,
         preprocess it for OCR, run full-page OCR on the preprocessed image,
         and replace the picture item with the recognized text.
      3. Export the result to markdown, one file per page.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("/home/mudassir/Downloads/all_test 1.pdf")
    output_dir = Path("/home/mudassir/projects/docling_/all_test_refined_tesser")

    # Region-level OCR options for the main PDF pipeline.
    ocr_options_1 = TesseractCliOcrOptions(
        force_full_page_ocr=False, lang=["ara", "eng"]
    )
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
    pipeline_options.ocr_options = ocr_options_1
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Full-page OCR options for the standalone picture images extracted below.
    image_pipeline_options = PdfPipelineOptions()
    ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True, lang=["ara", "eng"])
    image_pipeline_options.do_ocr = True
    image_pipeline_options.ocr_options = ocr_options
    image_pipeline_options.do_table_structure = True
    image_pipeline_options.generate_page_images = True
    image_pipeline_options.generate_picture_images = True
    image_pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
    image_converter = DocumentConverter(
        format_options={
            InputFormat.IMAGE: PdfFormatOption(pipeline_options=image_pipeline_options)
        }
    )

    start_time = time.time()
    conv_res = doc_converter.convert(input_doc_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    table_counter = 0
    picture_counter = 0
    replacements = []
    for element, _ in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            img_path = output_dir / f"{doc_filename}-table-{table_counter}.png"
            with img_path.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")
        elif isinstance(element, PictureItem):
            picture_counter += 1
            original_path = (
                output_dir / f"{doc_filename}-picture-{picture_counter}-orig.png"
            )
            rescaled_path = output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            # Save the original picture image.
            with original_path.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")
            # Rescale to the target DPI, then preprocess for OCR; the
            # preprocessed result overwrites rescaled_path.
            rescaled_img = rescale_image_to_dpi(
                original_path, rescaled_path, TARGET_DPI
            )
            processed_array = preprocess_image_for_ocr(rescaled_img)
            processed_pil = Image.fromarray(processed_array)
            processed_pil.save(rescaled_path, dpi=(TARGET_DPI, TARGET_DPI))
            # BUGFIX: OCR the preprocessed image, not the original. Previously
            # image_converter.convert(original_path) was called, so all of the
            # rescaling/denoising/thresholding work above was discarded.
            res = image_converter.convert(rescaled_path).document.export_to_text()
            # Replace the PictureItem with the OCR'd text, preserving provenance.
            prov = (
                element.prov[0]
                if isinstance(element.prov, list) and element.prov
                else None
            )
            new_text = conv_res.document.add_text(
                label=DocItemLabel.TEXT,
                text=res,
                prov=prov,
            )
            replacements.append((element, new_text))

    # Apply replacements after iteration so we don't mutate the tree mid-loop.
    for old_item, new_item in replacements:
        conv_res.document.replace_item(new_item=new_item, old_item=old_item)
    conv_res.document.print_element_tree()

    # Save markdown page by page.
    total_pages = conv_res.document.num_pages()
    for page_no in range(1, total_pages + 1):
        md_content = conv_res.document.export_to_markdown(
            page_no=page_no,
            image_mode=ImageRefMode.REFERENCED,
            enable_chart_tables=True,
            image_placeholder="<!-- image -->",
            escape_underscores=True,
        )
        md_file = output_dir / f"{doc_filename}-page-{page_no}.md"
        with md_file.open("w", encoding="utf-8") as f:
            f.write(md_content)

    _log.info(
        f"Document processed and exported in {(time.time() - start_time):.2f} seconds."
    )


# BUGFIX: was `if name == "main": main()`, which raises NameError and never
# runs — the standard entry-point guard uses the dunder __name__.
if __name__ == "__main__":
    main()
@cau-git -- I would be grateful if you could look into the above images and compare the second line.
@mudassir206 Yes, it's clear the words should read from right to left (RTL reading). Some of the content is correctly rendered RTL.
Check this very simple code that generates an HTML file:
# Minimal example: convert a PDF with Docling's defaults and dump it to HTML.
from docling.document_converter import DocumentConverter

source = "doc2.pdf"
conv = DocumentConverter()
document = conv.convert(source).document
html = document.export_to_html()
with open("Doc2.html", "w", encoding="utf-8") as out:
    out.write(html)
Brother @Jalmood, I used the above code you posted. Do you have a better configuration that can solve the issue?
Brother @Jalmood, I used the above code you posted. Do you have a better configuration that can solve the issue?
I am using LlamaCloud and Mistral and it's working great with Arabic PDF files ;) check the attached MD file.
Brother @Jalmood, thanks for the response. I will check with Mistral.