
OVMS model loading time is much longer than OpenVINO model loading time, and OVMS crashes from time to time

Quallyjiang opened this issue 5 months ago • 5 comments

Describe the bug In our application we need to switch models due to resource constraints. When we compared model loading time in OVMS vs. native OpenVINO, we found the overhead is quite heavy.

To Reproduce Steps to reproduce the behavior:

  1. Update the testing script below with your OVMS root and the model files exported for OVMS:
import gc
import itertools
import json
from pathlib import Path
import time
import httpx
from transformers import AutoTokenizer
import subprocess

from optimum.intel import OVModelForCausalLM

def ov_load_model(model_path: Path):
    ov_config = {"INFERENCE_PRECISION_HINT": "f16"}
    model = OVModelForCausalLM.from_pretrained(model_path, compile=True, compile_only=True, device="GPU", ov_config=ov_config)
    model.compile()
    tok = AutoTokenizer.from_pretrained(model_path)
    return model, tok


class OVMS:
    _ovms_process = None
    port = 9091
    rest_port = 8081
    model_config = Path("model_config.json")
    client = httpx.Client(trust_env=False)

    @classmethod
    def prompt_to_start_ovms(cls, ovms_root: Path):
        ovms_binary_path = ovms_root / "ovms.exe"
        assert ovms_binary_path.exists()

        # Set environment variables for ovms.exe execution
        cmd_start_server = f'"{ovms_root/"setupvars.bat"}" && "{ovms_binary_path}" --port {cls.port} --rest_port {cls.rest_port} --config_path "{cls.model_config.absolute()}" --file_system_poll_wait_seconds 0'
        print(cmd_start_server)
        input(f"Use command to start OVMS server: {cmd_start_server}\nPress Enter to continue...")

    @classmethod
    def update_ovms_config_file(cls, model_path: Path):
        config_path = cls.model_config
        if not config_path.exists():
            raise FileNotFoundError(f"OVMS config file {config_path} does not exist.")
        
        config_data = {
            "mediapipe_config_list": [{
                "name": model_path.name,
                "base_path": str(model_path)
            }],
            "model_config_list": []
        }
        with config_path.open("w", encoding="utf-8") as f:
            json.dump(config_data, f, indent=4, ensure_ascii=False)
    
    @classmethod
    def wait_until_model_status(cls, model_name: str, wait_ready=True):
        url = f"http://localhost:{cls.rest_port}/v2/models/{model_name}/ready"
        #print(f"Checking if model {model_name} is ready:{wait_ready} at {url}...")
        while True:
            try:
                response = cls.client.get(url)
                if wait_ready and response.status_code == 200:
                    #print(f"Model {model_name} is ready.")
                    return
                elif not wait_ready and response.status_code == 404:
                    #print(f"Model {model_name} is unloaded as expected")
                    return
            except httpx.RequestError as e:
                print(f"Request error: {e}, retrying...")
            time.sleep(0.1)

    @classmethod
    def reload_ovms_config(cls):
        url = f"http://localhost:{cls.rest_port}/v1/config/reload"
        response = cls.client.post(url, timeout=60)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to reload OVMS config: {response.text}")

def ovms_load_model(model_path: Path):
    OVMS.update_ovms_config_file(model_path)
    OVMS.reload_ovms_config()


class TimerContext:
    def __init__(self, name: str):
        self.name = name
        self.start_time = None

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        elapsed_time = time.time() - self.start_time
        print(f"{self.name} took {elapsed_time:.2f} seconds")

def test_ovms_load_model(ovms_root:Path, model_a:Path, model_b:Path, iterations=10):
    OVMS.update_ovms_config_file(model_b)
    OVMS.prompt_to_start_ovms(ovms_root)
    OVMS.wait_until_model_status(model_b.name, wait_ready=True)
    for i, model_path in itertools.product(range(iterations), (model_a, model_b)):
        with TimerContext(f"Loading model {model_path.name:40}"):
            ovms_load_model(model_path)
        time.sleep(1)  # wait a bit before unloading the current model

def test_ov_load_model(model_a: Path, model_b: Path, iterations=10):
    with TimerContext(f"Intialize by loading model {model_b.name:40}"):
        model, tok = ov_load_model(model_b)
    for i, model_path in itertools.product(range(iterations), (model_a, model_b)):
        with TimerContext(f"Loading model {model_path.name:40}"):
            del model, tok
            gc.collect()
            model, tok = ov_load_model(model_path)


if __name__ == "__main__":
    ovms_root = Path(r"c:\workspace\ovms")
    model_a = Path(r"C:\workspace\local_models\Qwen2.5-7B-Instruct-int4-ov")
    model_b = Path(r"C:\workspace\local_models\Phi-3-mini-4k-instruct-int4-ov")

    if not model_a.exists() or not model_b.exists():
        raise FileNotFoundError("Model paths do not exist.")
    
    print("Testing OVMS model loading...")
    test_ovms_load_model(ovms_root, model_a, model_b, 5)

    input("Terminate ovms process and press Enter to continue...")

    print("Testing OpenVINO model loading...")
    test_ov_load_model(model_a, model_b, 5)
  2. Run the test script in a command prompt and follow the printed command to start and stop OVMS.
  3. Sample output:
Testing OVMS model loading...
"C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Use command to start OVMS server: "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Press Enter to continue...
Loading model Qwen2.5-7B-Instruct-int4-ov              took 6.59 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 4.49 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 5.99 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 3.98 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 6.19 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 6.13 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 6.10 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 4.15 seconds
Terminate ovms process and press Enter to continue...
Testing OpenVINO model loading...
Initialize by loading model Phi-3-mini-4k-instruct-int4-ov           took 3.54 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 6.56 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 1.66 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 3.54 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 1.65 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 3.52 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 1.64 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 1.63 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 1.65 seconds

The overhead is quite heavy: 3.5s -> 6.1s for the 7B model and 1.65s -> 4s for the 3.8B model.

Expected behavior Similar or better model loading time compared to native OpenVINO.

Logs See the client traceback and OVMS output below.

Configuration

  1. OVMS version: 2025.2.1
  2. OVMS config.json file: script generated
  3. pbtxt:
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"

node: {
  name: "LLMExecutor"
  calculator: "HttpLLMCalculator"
  input_stream: "LOOPBACK:loopback"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  input_side_packet: "LLM_NODE_RESOURCES:llm"
  output_stream: "LOOPBACK:loopback"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  input_stream_info: {
    tag_index: 'LOOPBACK:0',
    back_edge: true
  }
  node_options: {
      [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
          models_path: "./",
          plugin_config: '{}',
          enable_prefix_caching: false,
          cache_size: 2,
          max_num_seqs: 256,
          device: "GPU",
      }
  }
  input_stream_handler {
    input_stream_handler: "SyncSetInputStreamHandler",
    options {
      [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
        sync_set {
          tag_index: "LOOPBACK:0"
        }
      }
    }
  }
}
  4. OVMS will crash from time to time. Script output:
Loading model Qwen2.5-7B-Instruct-int4-ov              took 8.65 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov           took 4.18 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov              took 4.81 seconds
Traceback (most recent call last):
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
    yield
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
    resp = self._pool.handle_request(req)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
    raise exc from None
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
    response = connection.handle_request(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection.py", line 103, in handle_request
    return self._connection.handle_request(request)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 136, in handle_request
    raise exc
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 106, in handle_request
    ) = self._receive_response_headers(**kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 177, in _receive_response_headers
    event = self._receive_event(timeout=timeout)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 217, in _receive_event
    data = self._network_stream.read(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_backends\sync.py", line 126, in read
    with map_exceptions(exc_map):
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
    self.gen.throw(value)
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
    raise to_exc(exc) from exc
httpcore.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 123, in <module>
    test_ovms_load_model(ovms_root, model_a, model_b, 5)
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 101, in test_ovms_load_model
    ovms_load_model(model_path)
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 79, in ovms_load_model
    OVMS.reload_ovms_config()
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 73, in reload_ovms_config
    response = cls.client.post(url, timeout=60)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1144, in post
    return self.request(
           ^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 825, in request
    return self.send(request, auth=auth, follow_redirects=follow_redirects)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 914, in send
    response = self._send_handling_auth(
               ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
    response = self._send_handling_redirects(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
    response = self._send_single_request(request)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
    response = transport.handle_request(request)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
    with map_httpcore_exceptions():
         ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
    self.gen.throw(value)
  File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
    raise mapped_exc(message) from exc
httpx.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host

and the OVMS output:

[2025-07-29 11:58:26.840][44664][llm_executor][info][llm_executor.hpp:90] All requests: 0; Scheduled requests: 0;
[2025-07-29 11:58:26.892][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RETIRED after handling: RetireEvent: 
[2025-07-29 11:58:27.897][114868][modelmanager][error][src/modelmanager.cpp:960] Configuration file doesn't have models property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:658] Configuration file doesn't have custom node libraries property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:701] Configuration file doesn't have pipelines property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/mediapipe_internal/mediapipefactory.cpp:103] Reloading mediapipe graph: Qwen2.5-7B-Instruct-int4-ov
[2025-07-29 11:58:27.897][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RELOADING after handling: ReloadEvent: 
[2025-07-29 11:58:27.898][114868][serving][info][src/mediapipe_internal/mediapipegraphdefinition.cpp:421] MediapipeGraphDefinition initializing graph nodes
[2025-07-29 11:58:27.898][114868][modelmanager][info][src/llm/servable_initializer.cpp:268] Initializing Language Model Continuous Batching servable

(superpy312) qually@HP-LNL C:\workspace\qually\applications.ai.superbuilder>


Quallyjiang, Jul 29 '25

The comparison is between using Optimum Intel (ov_load_model) and GenAI (OVMS); it is not a comparison of the same pipeline types. Can you change the optimum-intel model loading to load the model via GenAI? That would be a fair comparison.

Here are samples that you can use; modify them slightly to load the model via OpenVINO GenAI: https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/cpp/text_generation/README.md
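A minimal sketch of what such a GenAI-based loader could look like in Python (illustrative only; it assumes the openvino_genai package is installed, and genai_load_model is not part of your script):

import time
from pathlib import Path

import openvino_genai as ov_genai


def genai_load_model(model_path: Path, device: str = "GPU"):
    # LLMPipeline reads the exported OpenVINO model and tokenizer from the folder
    # and compiles it for the target device, which is closer to what OVMS does internally.
    return ov_genai.LLMPipeline(str(model_path), device)


if __name__ == "__main__":
    model_path = Path(r"C:\workspace\local_models\Phi-3-mini-4k-instruct-int4-ov")
    start = time.time()
    pipe = genai_load_model(model_path)
    print(f"GenAI load of {model_path.name} took {time.time() - start:.2f} seconds")

Timing this loader in your existing loop would make the numbers directly comparable to the OVMS ones.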

dkalinowski, Jul 29 '25

GenAI loading speed is about the same as OVMS. Should I go there to report the performance issue compared to Optimum OV model loading time?

And how about the crash? It's more important than the loading speed.

Quallyjiang, Jul 29 '25

Do you monitor RAM/VRAM usage during model reload? The scenario you have showcased in the Python script switches the base_path of a model. The OVMS order of operations is that it always tries to serve the old model until the new one is loaded correctly. The downside is that both models can be loaded at the same time (for a short period, while swapping the model). Are you sure it is not an Out of Memory issue? @Quallyjiang
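As a sketch (assuming psutil is installed and you know the OVMS PID; monitor_ovms_memory is just an illustrative helper), RAM of the OVMS process could be sampled around each reload like this. VRAM would still need to be watched separately, e.g. in Task Manager:

import time

import psutil


def monitor_ovms_memory(pid: int, duration_s: float = 10.0, interval_s: float = 0.5):
    # Print the resident set size of the OVMS process periodically, so that a spike
    # during the model swap (old and new model resident at once) becomes visible.
    proc = psutil.Process(pid)
    end = time.time() + duration_s
    while time.time() < end:
        rss_mb = proc.memory_info().rss / (1024 * 1024)
        print(f"OVMS RSS: {rss_mb:.0f} MiB")
        time.sleep(interval_s)

Calling it right after reload_ovms_config() should show whether both models are briefly resident at the same time.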

Regarding model loading speed, if this is reproducible in a bare OpenVINO GenAI app - yes, you can report it there. However, the difference is most likely due to the different pipeline type (Optimum does not use continuous batching, has different params, and works only for sequential processing).

dkalinowski, Jul 31 '25

I've monitored the VRAM and RAM status and found nothing abnormal. I've also tried unloading the model first and then loading it; it still crashes if I keep loading for about 10 or 15 minutes. Can you try to reproduce it?

    # Note: `Optional` requires `from typing import Optional` in the script imports.
    @classmethod
    def update_ovms_config_file(cls, model_path: Optional[Path]):
        config_path = cls.model_config
        if not config_path.exists():
            raise FileNotFoundError(f"OVMS config file {config_path} does not exist.")
        
        if model_path is None:
            config_data = {
                "mediapipe_config_list": [],
                "model_config_list": []
            }
        else:
            config_data = {
                "mediapipe_config_list": [{
                    "name": model_path.name,
                    "base_path": str(model_path)
                }],
                "model_config_list": []
            }
            
        with config_path.open("w", encoding="utf-8") as f:
            json.dump(config_data, f, indent=4, ensure_ascii=False)


def ovms_load_model_2steps(model_path: Path):
    OVMS.update_ovms_config_file(None)
    OVMS.reload_ovms_config()
    OVMS.update_ovms_config_file(model_path)
    OVMS.reload_ovms_config()
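
A variant I could also try (a sketch only, reusing the existing wait_until_model_status helper; the previous_model_name parameter is new here) waits until the old model is fully retired before loading the new one:

def ovms_load_model_2steps_with_wait(model_path: Path, previous_model_name: str):
    # Step 1: clear the config, then wait until the previously served model
    # reports 404 on the ready endpoint, i.e. it is fully retired.
    OVMS.update_ovms_config_file(None)
    OVMS.reload_ovms_config()
    OVMS.wait_until_model_status(previous_model_name, wait_ready=False)

    # Step 2: add the new model and wait until it reports ready.
    OVMS.update_ovms_config_file(model_path)
    OVMS.reload_ovms_config()
    OVMS.wait_until_model_status(model_path.name, wait_ready=True)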

Quallyjiang, Jul 31 '25

BTW, in case of OOM, do you expect it to crash quietly?

Quallyjiang, Jul 31 '25