fix: If image_processor is not None, use it; same with tokenizer

Open xiaoymin opened this issue 1 year ago • 1 comments

Hello @VikParuchuri, I came across the Surya project and it is great.

I used the official python code locally without any problems. The model would be downloaded remotely from huggingface to the local computer, and then parsed and executed. However, when I tested it on the server, I found some problems.

For some reason, the server cannot access huggingface, so the model file can only be downloaded to the server manually, and then the model loading directory is specified through code.

Official code example:

If Hugging Face is accessible, the following code downloads the models and runs without changes:

from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor

# IMAGE_PATH is not defined in this snippet; set it to the image file to process.
image = Image.open(IMAGE_PATH)
langs = ["en"] # Replace with your languages
# The loaders are called with no arguments, so each falls back to its defaults.
det_processor, det_model = load_det_processor(), load_det_model()
rec_model, rec_processor = load_rec_model(), load_rec_processor()

# The image and its language list are each wrapped in a list before the call.
predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)

I looked at the source code of SuryaProcessor and found a problem in its initialization method:


class SuryaProcessor(DonutProcessor):
    """Processor built on DonutProcessor from a SuryaImageProcessor and a Byt5LangTokenizer."""

    def __init__(self, image_processor=None, tokenizer=None, train=False, **kwargs):
        # NOTE: the next two assignments unconditionally replace the values the
        # caller passed in, so the `image_processor` and `tokenizer` arguments
        # are effectively ignored.
        image_processor = SuryaImageProcessor.from_pretrained(settings.RECOGNITION_MODEL_CHECKPOINT)
        tokenizer = Byt5LangTokenizer()
        # These checks run after the reassignment above, so they test the freshly
        # created objects rather than the caller's arguments.
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        # `train` and `**kwargs` are accepted but never read in this constructor.
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
        self._in_target_context_manager = False

The image_processor and tokenizer arguments passed to this method have no effect, because both are overwritten on the first two lines of the constructor. Even if I pass them when creating the SuryaProcessor object, I still cannot make it load the model from a local directory.

So, in order to run the code on the server, I had to write a new class, modeled on SuryaProcessor, in which the image_processor and tokenizer arguments actually take effect. This is the code I tested on the server:

from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
from surya.model.recognition.tokenizer import Byt5LangTokenizer
from transformers import DonutImageProcessor,DonutProcessor
from surya.model.recognition.processor import SuryaProcessor, SuryaImageProcessor
from surya.settings import settings




class SuryaLocalProcessor(DonutProcessor):
    """DonutProcessor variant that honours caller-supplied components.

    A caller can pass an `image_processor` and/or `tokenizer` that were loaded
    from a local directory. Defaults are only created for the components that
    were not supplied.
    """

    def __init__(self, image_processor=None, tokenizer=None, train=False, **kwargs):
        """Initialise the processor.

        Args:
            image_processor: Image processor to use. When None, one is loaded
                via `SuryaImageProcessor.from_pretrained` using
                `settings.RECOGNITION_MODEL_CHECKPOINT`.
            tokenizer: Tokenizer to use. When None, a `Byt5LangTokenizer()` is
                created.
            train: Accepted for signature compatibility; not read here.
            **kwargs: Accepted for signature compatibility; not read here.

        Raises:
            ValueError: If either component is still None after defaulting.
        """
        # Compare against None explicitly: `x or default` would also discard a
        # caller-supplied object that merely evaluates as falsy (for example
        # one that defines `__len__` and currently reports 0).
        if image_processor is None:
            image_processor = SuryaImageProcessor.from_pretrained(settings.RECOGNITION_MODEL_CHECKPOINT)
        if tokenizer is None:
            tokenizer = Byt5LangTokenizer()

        # Defensive: guards against a default factory handing back None.
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
        self._in_target_context_manager = False

    def __call__(self, *args, **kwargs):
        """Process images and/or text.

        Keyword arguments `images`, `text` and `lang` are extracted; a first
        positional argument, if present, overrides `images`. Remaining
        positional arguments go to the image processor, and remaining keyword
        arguments go to both the image processor and the tokenizer.

        Returns:
            The image processor output when only images are given, the
            tokenizer output when only text is given, or the image processor
            output extended with "labels" and "langs" entries taken from the
            tokenizer output when both are given.

        Raises:
            ValueError: If neither images nor text is provided.
        """
        # For backward compatibility
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        images = kwargs.pop("images", None)
        text = kwargs.pop("text", None)
        lang = kwargs.pop("lang", None)

        # A leading positional argument is treated as the images input.
        if args:
            images = args[0]
            args = args[1:]

        if images is None and text is None:
            raise ValueError("You need to specify either an `images` or `text` input to process.")

        if images is not None:
            inputs = self.image_processor(images, *args, **kwargs)

        if text is not None:
            encodings = self.tokenizer(text, lang, **kwargs)

        if text is None:
            return inputs
        if images is None:
            return encodings

        # Both inputs given: attach the tokenizer results to the image inputs.
        inputs["labels"] = encodings["input_ids"]
        inputs["langs"] = encodings["langs"]
        return inputs

def load_local_processor(checkpoint=settings.RECOGNITION_MODEL_CHECKPOINT):
    """Build a SuryaLocalProcessor whose image processor comes from `checkpoint`.

    The image processor is loaded with `SuryaImageProcessor.from_pretrained`;
    no tokenizer is passed, so SuryaLocalProcessor chooses it. The image
    processor and tokenizer are then configured from `settings` before the
    processor is returned.
    """
    local_processor = SuryaLocalProcessor(
        image_processor=SuryaImageProcessor.from_pretrained(checkpoint),
    )

    # Configure the image side, then cap the tokenizer length.
    local_processor.image_processor.train = False
    local_processor.image_processor.max_size = settings.RECOGNITION_IMAGE_SIZE
    local_processor.tokenizer.model_max_length = settings.RECOGNITION_MAX_TOKENS
    return local_processor


# Local paths to the input image and the manually downloaded model directories.
IMAGE_PATH = "/mnt/surya-ocr-demo/ocr-test9.png"
det_model_path = "/mnt/surya-ocr-demo/vikp:surya_det2"
rec_model_path = "/mnt/surya-ocr-demo/vikp:surya_rec"
langs = ["zh"]  # Replace with your languages

image = Image.open(IMAGE_PATH)

# Load every component from the local checkpoints, in the original order.
det_processor = load_det_processor(checkpoint=det_model_path)
det_model = load_det_model(checkpoint=det_model_path)
rec_model = load_rec_model(checkpoint=rec_model_path)
rec_processor = load_local_processor(checkpoint=rec_model_path)

predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)

# Print each prediction, followed by each of its text lines.
for prediction in predictions:
    print(prediction)
    for text_line in prediction.text_lines:
        print(text_line)

Therefore, I think the constructor of SuryaProcessor should respect the image_processor and tokenizer arguments passed in by the caller, and only fall back to the defaults when they are not provided.

Mar 25 '24 13:03 xiaoymin

Plz update this request ASAP!!!!!! Really important changes!!!

Jul 11 '24 02:07 ChengsongLu