OVMS model loading time is much longer than openvino model loading time and crashes from time to time
Describe the bug In our application we have a need to switch models due to resource constraint. When we compare model loading time in OVMS vs in native openvino, we found the overhead is quite heavy.
To Reproduce Steps to reproduce the behavior:
- Update testing script for ovms root and model files exported for ovms:
import gc
import itertools
import json
from pathlib import Path
import time
import httpx
from transformers import AutoTokenizer
import subprocess
from optimum.intel import OVModelForCausalLM
def ov_load_model(model_path: Path):
    """Load a causal LM directly via Optimum-Intel on GPU, plus its tokenizer.

    Returns (model, tokenizer). Uses the FP16 inference precision hint and
    requests ahead-of-time compilation for the GPU device.
    """
    model = OVModelForCausalLM.from_pretrained(
        model_path,
        compile=True,
        compile_only=True,
        device="GPU",
        ov_config={"INFERENCE_PRECISION_HINT": "f16"},
    )
    model.compile()
    tok = AutoTokenizer.from_pretrained(model_path)
    return model, tok
class OVMS:
    """Helper for driving a manually started OVMS server over its REST API."""

    _ovms_process = None
    port = 9091
    rest_port = 8081
    model_config = Path("model_config.json")
    client = httpx.Client(trust_env=False)

    @classmethod
    def prompt_to_start_ovms(cls, ovms_root: Path):
        """Print the OVMS launch command and wait for the user to run it."""
        binary = ovms_root / "ovms.exe"
        assert binary.exists()
        # setupvars.bat must run in the same shell so ovms.exe sees its env.
        cmd_start_server = (
            f'"{ovms_root/"setupvars.bat"}" && "{binary}" --port {cls.port}'
            f' --rest_port {cls.rest_port} --config_path "{cls.model_config.absolute()}"'
            f' --file_system_poll_wait_seconds 0'
        )
        print(cmd_start_server)
        input(f"Use command to start OVMS server: {cmd_start_server}\nPress Enter to continue...")

    @classmethod
    def update_ovms_config_file(cls, model_path: Path):
        """Overwrite the config file so OVMS serves exactly one mediapipe model."""
        config_path = cls.model_config
        if not config_path.exists():
            raise FileNotFoundError(f"OVMS config file {config_path} does not exist.")
        entry = {"name": model_path.name, "base_path": str(model_path)}
        config_data = {"mediapipe_config_list": [entry], "model_config_list": []}
        with config_path.open("w", encoding="utf-8") as f:
            json.dump(config_data, f, indent=4, ensure_ascii=False)

    @classmethod
    def wait_until_model_status(cls, model_name: str, wait_ready=True):
        """Poll the v2 readiness endpoint until the model is ready.

        With wait_ready=False, waits until the model is gone (404) instead.
        NOTE: loops forever if the expected status never arrives.
        """
        url = f"http://localhost:{cls.rest_port}/v2/models/{model_name}/ready"
        expected_status = 200 if wait_ready else 404
        while True:
            try:
                if cls.client.get(url).status_code == expected_status:
                    return
            except httpx.RequestError as e:
                print(f"Request error: {e}, retrying...")
            time.sleep(0.1)

    @classmethod
    def reload_ovms_config(cls):
        """Ask OVMS to re-read its config file; raise on any non-200 response."""
        url = f"http://localhost:{cls.rest_port}/v1/config/reload"
        response = cls.client.post(url, timeout=60)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to reload OVMS config: {response.text}")
def ovms_load_model(model_path: Path):
    """Point the OVMS config at *model_path* and trigger a config reload."""
    OVMS.update_ovms_config_file(model_path)
    OVMS.reload_ovms_config()
class TimerContext:
    """Context manager that prints the duration of the enclosed block.

    Uses time.perf_counter() — a monotonic, high-resolution clock — instead
    of time.time(), which can jump backward/forward if the system clock is
    adjusted mid-measurement and would skew (or negate) the reported time.
    """

    def __init__(self, name: str):
        self.name = name          # label used in the printed report
        self.start_time = None    # set on __enter__

    def __enter__(self):
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Report even when the block raised; exceptions still propagate
        # because we return None (falsy) from __exit__.
        elapsed_time = time.perf_counter() - self.start_time
        print(f"{self.name} took {elapsed_time:.2f} seconds")
def test_ovms_load_model(ovms_root: Path, model_a: Path, model_b: Path, iterations=10):
    """Time repeated A/B model swaps driven through the OVMS config-reload API."""
    OVMS.update_ovms_config_file(model_b)
    OVMS.prompt_to_start_ovms(ovms_root)
    OVMS.wait_until_model_status(model_b.name, wait_ready=True)
    for _ in range(iterations):
        for model_path in (model_a, model_b):
            with TimerContext(f"Loading model {model_path.name:40}"):
                ovms_load_model(model_path)
            # Pause briefly before the next swap unloads the current model.
            time.sleep(1)
def test_ov_load_model(model_a: Path, model_b: Path, iterations=10):
    """Time repeated A/B model swaps using direct Optimum-Intel loading."""
    with TimerContext(f"Intialize by loading model {model_b.name:40}"):
        model, tok = ov_load_model(model_b)
    for _ in range(iterations):
        for model_path in (model_a, model_b):
            with TimerContext(f"Loading model {model_path.name:40}"):
                # Release the previous model before loading the next one so
                # both are never resident at the same time.
                del model, tok
                gc.collect()
                model, tok = ov_load_model(model_path)
if __name__ == "__main__":
    ovms_root = Path(r"c:\workspace\ovms")
    model_a = Path(r"C:\workspace\local_models\Qwen2.5-7B-Instruct-int4-ov")
    model_b = Path(r"C:\workspace\local_models\Phi-3-mini-4k-instruct-int4-ov")
    if not (model_a.exists() and model_b.exists()):
        raise FileNotFoundError("Model paths do not exist.")
    print("Testing OVMS model loading...")
    test_ovms_load_model(ovms_root, model_a, model_b, 5)
    input("Terminate ovms process and press Enter to continue...")
    print("Testing OpenVINO model loading...")
    test_ov_load_model(model_a, model_b, 5)
- run the test script in command prompt, follow command to start and stop ovms
- Sample output:
Testing OVMS model loading...
"C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Use command to start OVMS server: "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Press Enter to continue...
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.59 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.49 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 5.99 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.98 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.19 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.13 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.10 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.15 seconds
Terminate ovms process and press Enter to continue...
Testing OpenVINO model loading...
Intialize by loading model Phi-3-mini-4k-instruct-int4-ov took 3.54 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.56 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.66 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.54 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.65 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.52 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.64 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.63 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.65 seconds
Overhead is quite heavy. 3.5s -> 6.1s for 7B model and 1.65s -> 4s for 3.8B model
Expected behavior Similar or better model loading time compared to native OpenVINO.
Logs Logs from OVMS, ideally with --log_level DEBUG. Logs from client.
Configuration
- OVMS version: 2025.2.1
- OVMS config.json file: script generated
- pbtxt:
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node: {
name: "LLMExecutor"
calculator: "HttpLLMCalculator"
input_stream: "LOOPBACK:loopback"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
input_side_packet: "LLM_NODE_RESOURCES:llm"
output_stream: "LOOPBACK:loopback"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
input_stream_info: {
tag_index: 'LOOPBACK:0',
back_edge: true
}
node_options: {
[type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
models_path: "./",
plugin_config: '{}',
enable_prefix_caching: false,
cache_size: 2,
max_num_seqs: 256,
device: "GPU",
}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler",
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "LOOPBACK:0"
}
}
}
}
}
- ovms will crash from time to time: script output:
Loading model Qwen2.5-7B-Instruct-int4-ov took 8.65 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.18 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 4.81 seconds
Traceback (most recent call last):
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
yield
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
resp = self._pool.handle_request(req)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
raise exc from None
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
response = connection.handle_request(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection.py", line 103, in handle_request
return self._connection.handle_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 136, in handle_request
raise exc
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 106, in handle_request
) = self._receive_response_headers(**kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 177, in _receive_response_headers
event = self._receive_event(timeout=timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 217, in _receive_event
data = self._network_stream.read(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_backends\sync.py", line 126, in read
with map_exceptions(exc_map):
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
self.gen.throw(value)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
raise to_exc(exc) from exc
httpcore.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 123, in <module>
test_ovms_load_model(ovms_root, model_a, model_b, 5)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 101, in test_ovms_load_model
ovms_load_model(model_path)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 79, in ovms_load_model
OVMS.reload_ovms_config()
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 73, in reload_ovms_config
response = cls.client.post(url, timeout=60)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1144, in post
return self.request(
^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 825, in request
return self.send(request, auth=auth, follow_redirects=follow_redirects)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 914, in send
response = self._send_handling_auth(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
response = self._send_handling_redirects(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
response = self._send_single_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
response = transport.handle_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
with map_httpcore_exceptions():
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
self.gen.throw(value)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host
and ovms output:
[2025-07-29 11:58:26.840][44664][llm_executor][info][llm_executor.hpp:90] All requests: 0; Scheduled requests: 0;
[2025-07-29 11:58:26.892][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RETIRED after handling: RetireEvent:
[2025-07-29 11:58:27.897][114868][modelmanager][error][src/modelmanager.cpp:960] Configuration file doesn't have models property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:658] Configuration file doesn't have custom node libraries property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:701] Configuration file doesn't have pipelines property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/mediapipe_internal/mediapipefactory.cpp:103] Reloading mediapipe graph: Qwen2.5-7B-Instruct-int4-ov
[2025-07-29 11:58:27.897][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RELOADING after handling: ReloadEvent:
[2025-07-29 11:58:27.898][114868][serving][info][src/mediapipe_internal/mediapipegraphdefinition.cpp:421] MediapipeGraphDefinition initializing graph nodes
[2025-07-29 11:58:27.898][114868][modelmanager][info][src/llm/servable_initializer.cpp:268] Initializing Language Model Continuous Batching servable
(superpy312) qually@HP-LNL C:\workspace\qually\applications.ai.superbuilder>
Additional context Add any other context about the problem here.
The comparision is between using Optimum Intel (ov_load_model) vs GenAI (ovms). It is not comparision of the same pipeline types. Can you change optimum-intel model loading to loading the model via GenAI? It would be fair comparision.
Here are samples that you can use, slightly modify to load the model via OpenVINO GenAI: https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/cpp/text_generation/README.md
GenAI loading speed is about the same as OVMS. Should I go there to report a performance issue comparing it against the Optimum OV model loading time?
And how about the crash? It's more important than the loading speed.
Do you monitor RAM/vRAM usage during model reload? The scenario you have showcased in the python script switches base_path of a model. OVMS order of operations is that it always tries to serve old model, until the new one is loaded correctly. The downside is that it is possible to have both models loaded at the same time (for short period of time - swapping the model). Are you sure it is not Out of Memory issue? @Quallyjiang
Regarding model loading speed, if this is reproducible in bare OpenVINO GenAI app - yes, you can report it there. However, the difference is most likely due to different pipeline type (optimum does not use continuous batching, have different params, works only for sequential processing).
I've monitored the VRAM and RAM status; nothing abnormal was found. And I've tried to unload the model first and then load it — it will still crash if I keep loading for about 10 or 15 minutes. Can you try to reproduce it?
from typing import Optional  # required: the snippet annotates with Optional[Path]

@classmethod
def update_ovms_config_file(cls, model_path: Optional[Path]):
    """Rewrite the OVMS config file.

    model_path=None writes an empty config (unloads every served model);
    otherwise the config serves exactly the one model at *model_path*.

    Raises FileNotFoundError if the config file does not already exist.
    """
    config_path = cls.model_config
    if not config_path.exists():
        raise FileNotFoundError(f"OVMS config file {config_path} does not exist.")
    # Build only the branch-dependent list instead of duplicating the dict.
    mediapipe_entries = (
        [] if model_path is None
        else [{"name": model_path.name, "base_path": str(model_path)}]
    )
    config_data = {
        "mediapipe_config_list": mediapipe_entries,
        "model_config_list": [],
    }
    with config_path.open("w", encoding="utf-8") as f:
        json.dump(config_data, f, indent=4, ensure_ascii=False)
def ovms_load_model_2steps(model_path: Path):
    """Unload whatever is currently served, then load *model_path*.

    Performs two config/reload cycles: first an empty config (unload all),
    then a config containing only the requested model.
    """
    for target in (None, model_path):
        OVMS.update_ovms_config_file(target)
        OVMS.reload_ovms_config()
btw, in case of OOM, do you expect it to crash quietly?