Detect language in EasyOCR at runtime for multi-language image

Open TheRabidWolverine opened this issue 1 year ago • 1 comments

EasyOCR seems to require that a reader be loaded only with a set of mutually compatible languages. For example, English is compatible with all languages, but Spanish is not necessarily compatible with Japanese. Our situation is this: an image can contain text in multiple languages in various areas, and there is no guarantee that those languages form a mutually compatible set. Moreover, the languages are not known ahead of time; they can be any combination. How can EasyOCR be used to detect the language of each part of the image and run OCR on each part separately? We could have loaded EasyOCR with all languages to cover every case, but since not all languages are compatible with each other, that approach cannot be used.

Example image:

Feb 01 '24 20:02 TheRabidWolverine

The following works great for me; feel free to use it:

import easyocr
import concurrent.futures
from PIL import Image
import numpy as np

from easyocr.utils import get_paragraph
from easyocr.config import (
    latin_lang_list,
    arabic_lang_list,
    cyrillic_lang_list,
)

# Language-code groups defined locally; the latin/arabic/cyrillic groups
# come from easyocr.config (see the imports at the top of the file).
bengali_lang_list = ['bn', 'as']
devanagari_lang_list = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom'
]
# Each of these languages gets its own single-language reader below.
other_lang_list = ['th', 'ch_sim', 'ch_tra', 'ja', 'ko', 'te', 'kn']

# Reader name -> easyocr.Reader instance. Every reader is constructed eagerly
# at import time with gpu=True: first one reader per language group, then one
# reader per entry of other_lang_list (keyed by the language code itself).
readers_map = {
    group_name: easyocr.Reader(group_langs, gpu=True)
    for group_name, group_langs in (
        ('latin_langs', latin_lang_list),
        ('arabic_langs', arabic_lang_list),
        ('bengali_langs', bengali_lang_list),
        ('cyrillic_langs', cyrillic_lang_list),
        ('devanagari_langs', devanagari_lang_list),
    )
}
for lang in other_lang_list:
    readers_map[lang] = easyocr.Reader([lang], gpu=True)


def boxes_are_close(box_a, box_b, eps=3):
    """
    Tell whether two boxes have (almost) the same axis-aligned extents.

    Each box is a sequence of points (normally 4) where point[0] is x and
    point[1] is y. The boxes are reduced to their min/max x and y, and those
    four numbers are compared pairwise.

    eps is the maximum allowed difference, in pixels, for each of the four
    extents. Returns True if all four differ by at most eps, else False.
    """
    def extents(box):
        # Collapse a polygon to (min_x, min_y, max_x, max_y).
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        return min(xs), min(ys), max(xs), max(ys)

    return all(
        abs(side_a - side_b) <= eps
        for side_a, side_b in zip(extents(box_a), extents(box_b))
    )


def _keep_best_per_box(entries, eps):
    """Collapse entries with matching boxes, keeping the most confident one.

    entries is a list of [box, text, conf, reader_name]. Entries are scanned
    in order; each not-yet-claimed entry claims every later unclaimed entry
    whose box matches its own according to boxes_are_close(..., eps=eps).
    From each such cluster only the entry with the highest confidence is
    kept (the earliest one wins ties).
    """
    kept = []
    claimed = [False] * len(entries)

    for i, entry in enumerate(entries):
        if claimed[i]:
            continue
        cluster = [i] + [
            j for j in range(i + 1, len(entries))
            if not claimed[j]
            and boxes_are_close(entry[0], entries[j][0], eps=eps)
        ]
        # max() returns the first maximal element, so the earliest entry
        # wins when confidences are equal.
        best = max(cluster, key=lambda idx: entries[idx][2])
        kept.append(entries[best])
        for idx in cluster:
            claimed[idx] = True

    return kept


def recognize_text_from_image(img: Image.Image,
                              x_ths=1.0,
                              y_ths=0.5,
                              run_parallel=True,
                              box_eps=0):
    """
    Recognize text in *img* with every reader in ``readers_map`` and merge
    the results.

    Steps:
      1) Run each reader with detail=1 and collect the results as
         [[box, text, conf, reader_name], ...].
      2) Collapse results whose bounding boxes match (boxes_are_close with
         eps=box_eps), keeping the entry with the highest confidence.
      3) Split the survivors into two groups: those produced by the
         'arabic_langs' reader and everything else.
      4) Call get_paragraph(...) on each non-empty group with
         mode='rtl' for the arabic group and mode='ltr' otherwise.
      5) Return the arabic paragraphs followed by all the others.

    Args:
        img: PIL image; it is converted to RGB and to a numpy array once,
            and that array is shared by all readers.
        x_ths: forwarded to get_paragraph.
        y_ths: forwarded to get_paragraph.
        run_parallel: if True, readers run concurrently in a thread pool;
            if False, they run one after another (lower peak GPU usage).
            Both modes gather results in ``readers_map`` order, so they
            yield the same merged output for the same reader results.
        box_eps: per-extent pixel tolerance used when deciding that two
            boxes are duplicates. The default 0 requires identical extents.

    Returns:
        A list: get_paragraph's output for the arabic group followed by its
        output for the remaining results.
        NOTE(review): the original description said each paragraph is
        [box, text, conf]; EasyOCR's get_paragraph appears to return
        [box, text] pairs without confidence - confirm against the
        installed version.

    A reader that raises is reported via print() and skipped; the remaining
    readers still contribute their results.
    """
    base_np_img = np.array(img.convert("RGB"))

    def run_reader(reader, name):
        # Tag every detail=1 result with the name of the reader that made it.
        return [
            [box, txt, cf, name]
            for box, txt, cf in reader.readtext(base_np_img, detail=1)
        ]

    # Step 1. Gather the results of every reader into raw_all.
    raw_all = []

    if run_parallel:
        # --- PARALLEL MODE ---
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                (rname, executor.submit(run_reader, rdr, rname))
                for rname, rdr in readers_map.items()
            ]
            # Collect in submission order rather than completion order so
            # the merged output does not depend on thread timing.
            for rname, future in futures:
                try:
                    raw_all.extend(future.result())
                except Exception as e:
                    # Deliberate best effort: one failing reader must not
                    # discard the results of the others.
                    print(f'Ошибка в ридере "{rname}": {e}')
    else:
        # --- SEQUENTIAL MODE ---
        for rname, rdr in readers_map.items():
            try:
                raw_all.extend(run_reader(rdr, rname))
            except Exception as e:
                # Deliberate best effort, same as in parallel mode.
                print(f'Ошибка в ридере "{rname}": {e}')

    # Step 2. Collapse duplicate boxes, keeping the most confident reading.
    grouped = _keep_best_per_box(raw_all, box_eps)

    # Step 3. Split into arabic / everything else.
    arabic_list = [
        [box, txt, cf]
        for box, txt, cf, rdr_name in grouped
        if rdr_name == 'arabic_langs'
    ]
    other_list = [
        [box, txt, cf]
        for box, txt, cf, rdr_name in grouped
        if rdr_name != 'arabic_langs'
    ]

    # Step 4. Build paragraphs; get_paragraph is only called on non-empty input.
    arabic_paragraphs = []
    other_paragraphs = []

    if arabic_list:
        arabic_paragraphs = get_paragraph(arabic_list, x_ths=x_ths, y_ths=y_ths, mode='rtl')
    if other_list:
        other_paragraphs = get_paragraph(other_list, x_ths=x_ths, y_ths=y_ths, mode='ltr')

    # Step 5. Return the combined list, arabic first.
    return arabic_paragraphs + other_paragraphs


if __name__ == '__main__':
    # Test scenario: run the full multi-reader pipeline on one sample image
    # and print every resulting paragraph.
    x_ths_val = 1.0
    y_ths_val = 0.5

    # Open the image in a context manager so its file handle is closed once
    # recognition is done (recognize_text_from_image converts the image to a
    # numpy array right away, so nothing reads from the file afterwards).
    with Image.open('/home/gordey/Pictures/test-photos/cover6_1438789097-576x288.png') as test_image:
        result = recognize_text_from_image(test_image, x_ths=x_ths_val, y_ths=y_ths_val)

    for item in result:
        # One paragraph per line, exactly as returned by get_paragraph.
        print(item)

Feb 19 '25 18:02 GordeyTsy