Describe the bug
2024-05-07 14:43:59,173 xinference.api.restful_api 836 ERROR [address=0.0.0.0:45545, pid=2324] CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=, num_gpus=
CUDA call was originally invoked at:
File "", line 1, in
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/forkserver.py", line 274, in main
code = _serve_one(child_r, fds,
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/forkserver.py", line 313, in _serve_one
code = spawn._main(child_r, parent_sentinel)
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/spawn.py", line 125, in _main
prepare(preparation_data)
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/spawn.py", line 236, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/spawn.py", line 287, in _fixup_main_from_path
main_content = runpy.run_path(main_path,
File "/root/miniconda3/envs/xinference/lib/python3.10/runpy.py", line 289, in run_path
return _run_module_code(code, init_globals, run_name,
File "/root/miniconda3/envs/xinference/lib/python3.10/runpy.py", line 96, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/root/miniconda3/envs/xinference/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/root/miniconda3/envs/xinference/bin/xinference-local", line 5, in
from xinference.deploy.cmdline import local
File "", line 1027, in _find_and_load
File "", line 992, in _find_and_load_unlocked
File "", line 241, in _call_with_frames_removed
File "", line 1027, in _find_and_load
File "", line 992, in _find_and_load_unlocked
File "", line 241, in _call_with_frames_removed
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/init.py", line 38, in
_install()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/init.py", line 35, in _install
install_model()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/model/init.py", line 17, in _install
from .llm import _install as llm_install
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/model/llm/init.py", line 19, in
from .core import (
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/model/llm/core.py", line 23, in
from ...core.utils import parse_replica_model_uid
File "", line 1027, in _find_and_load
File "", line 992, in _find_and_load_unlocked
File "", line 241, in _call_with_frames_removed
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/init.py", line 15, in
from .model import ModelActor
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/model.py", line 47, in
from ..device_utils import empty_cache
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/device_utils.py", line 17, in
import torch
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/init.py", line 1478, in
_C._initExtension(manager_path())
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/cuda/init.py", line 238, in
_lazy_call(_check_capability)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/cuda/init.py", line 235, in _lazy_call
_queued_calls.append((callable, traceback.format_stack()))
Traceback (most recent call last):
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/api/restful_api.py", line 731, in launch_model
model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/context.py", line 227, in send
return self._process_result_message(result)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/pool.py", line 659, in send
result = await self._run_coro(message.message_id, coro)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/api.py", line 384, in on_receive
return await super().on_receive(message) # type: ignore
File "xoscar/core.pyx", line 558, in on_receive
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
result = await result
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/supervisor.py", line 850, in launch_builtin_model
await _launch_model()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/supervisor.py", line 814, in _launch_model
await _launch_one_model(rep_model_uid)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/supervisor.py", line 796, in _launch_one_model
await worker_ref.launch_builtin_model(
File "xoscar/core.pyx", line 284, in __pyx_actor_method_wrapper
async with lock:
File "xoscar/core.pyx", line 287, in xoscar.core.__pyx_actor_method_wrapper
result = await result
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/utils.py", line 45, in wrapped
ret = await func(*args, **kwargs)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/worker.py", line 697, in launch_builtin_model
await model_ref.load()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/context.py", line 227, in send
return self._process_result_message(result)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/pool.py", line 659, in send
result = await self._run_coro(message.message_id, coro)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xoscar/api.py", line 384, in on_receive
return await super().on_receive(message) # type: ignore
File "xoscar/core.pyx", line 558, in on_receive
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 524, in xoscar.core._BaseActor.on_receive
result = func(*args, **kwargs)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/model.py", line 239, in load
self._model.load()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 178, in load
self._engine = AsyncLLMEngine.from_engine_args(engine_args)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 346, in from_engine_args
engine_config = engine_args.create_engine_config()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/vllm/engine/arg_utils.py", line 520, in create_engine_config
model_config = ModelConfig(
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/vllm/config.py", line 131, in init
self._verify_quantization()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/vllm/config.py", line 170, in _verify_quantization
elif GPTQMarlinConfig.is_marlin_compatible(quant_cfg):
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/gptq_marlin.py", line 144, in is_marlin_compatible
major, minor = torch.cuda.get_device_capability()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/cuda/init.py", line 430, in get_device_capability
prop = get_device_properties(device)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/cuda/init.py", line 444, in get_device_properties
_lazy_init() # will define _get_device_properties
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/cuda/init.py", line 312, in _lazy_init
raise DeferredCudaCallError(msg) from e
torch.cuda.DeferredCudaCallError: [address=0.0.0.0:45545, pid=2324] CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=, num_gpus=
CUDA call was originally invoked at:
File "", line 1, in
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/forkserver.py", line 274, in main
code = _serve_one(child_r, fds,
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/forkserver.py", line 313, in _serve_one
code = spawn._main(child_r, parent_sentinel)
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/spawn.py", line 125, in _main
prepare(preparation_data)
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/spawn.py", line 236, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "/root/miniconda3/envs/xinference/lib/python3.10/multiprocessing/spawn.py", line 287, in _fixup_main_from_path
main_content = runpy.run_path(main_path,
File "/root/miniconda3/envs/xinference/lib/python3.10/runpy.py", line 289, in run_path
return _run_module_code(code, init_globals, run_name,
File "/root/miniconda3/envs/xinference/lib/python3.10/runpy.py", line 96, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/root/miniconda3/envs/xinference/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/root/miniconda3/envs/xinference/bin/xinference-local", line 5, in
from xinference.deploy.cmdline import local
File "", line 1027, in _find_and_load
File "", line 992, in _find_and_load_unlocked
File "", line 241, in _call_with_frames_removed
File "", line 1027, in _find_and_load
File "", line 992, in _find_and_load_unlocked
File "", line 241, in _call_with_frames_removed
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/init.py", line 38, in
_install()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/init.py", line 35, in _install
install_model()
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/model/init.py", line 17, in _install
from .llm import _install as llm_install
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/model/llm/init.py", line 19, in
from .core import (
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/model/llm/core.py", line 23, in
from ...core.utils import parse_replica_model_uid
File "", line 1027, in _find_and_load
File "", line 992, in _find_and_load_unlocked
File "", line 241, in _call_with_frames_removed
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/init.py", line 15, in
from .model import ModelActor
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/core/model.py", line 47, in
from ..device_utils import empty_cache
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/xinference/device_utils.py", line 17, in
import torch
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/init.py", line 1478, in
_C._initExtension(manager_path())
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/cuda/init.py", line 238, in
_lazy_call(_check_capability)
File "/root/miniconda3/envs/xinference/lib/python3.10/site-packages/torch/cuda/init.py", line 235, in _lazy_call
_queued_calls.append((callable, traceback.format_stack()))
To Reproduce
To help us reproduce this bug, please provide the information below:
Python == 3.10.14
xinference==0.10.3