vLLM fails to launch qwen2-gptq: Server error: 400 - [address=0.0.0.0:46141, pid=668] Marlin does not support weight_bits = uint4b8. Only types = [] are supported (for group_size = 128, min_capability = 75, zp = False).
System Info
Ubuntu 24, started via Docker; 4× RTX 2080 Ti; CUDA 12.6; Driver Version: 560.31.02
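The "min_capability = 75" in the error corresponds to the RTX 2080 Ti's compute capability 7.5; vLLM's Marlin kernels need compute capability 8.0 or newer, which is why the supported type list comes back empty. A quick sketch, using the standard torch API, to confirm what PyTorch reports inside the container:

```python
# Sketch: print the compute capability PyTorch sees for each GPU.
# Marlin kernels need SM >= 8.0; an RTX 2080 Ti reports SM 7.5.
import torch

for i in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(i)
    print(f"GPU {i}: {torch.cuda.get_device_name(i)} -> SM {major}.{minor}")
```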
Running Xinference with Docker?
- [X] docker
- [ ] pip install
- [ ] installation from source
Version info
Image ID: sha256:98de95c148937f7ee71522584f4f57097ba8dcafd344098b23ccda5e8e96b7d5
The command used to start Xinference
sudo docker run -d -v /home/njzd/xinf/data/.xinference:/root/.xinference -v /home/njzd/xinf/data/huggingface:/root/.cache/huggingface -v /home/njzd/xinf/data/modelscope:/root/.cache/modelscope -e XINFERENCE_MODEL_SRC=modelscope -p 9997:9997 --gpus all registry.cn-hangzhou.aliyuncs.com/xprobe_xinference/xinference:latest xinference-local -H 0.0.0.0
Reproduction
In the web UI, click qwen2-instruct.
Select the vLLM engine, GPTQ format, 0.5B size, N-GPU=auto, replica=1.
The launch fails with the error shown below.
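For reference, a minimal sketch of the same launch through the Python client; the parameter names follow the xinference client API, but the exact values (notably the "0_5" size string and the "Int4" quantization label) are assumptions read off the UI choices:

```python
from xinference.client import Client

# Connect to the supervisor started by the docker command above.
client = Client("http://localhost:9997")

# Same selection as in the web UI: qwen2-instruct, vLLM engine,
# GPTQ format, 0.5B, N-GPU=auto, replica=1.
model_uid = client.launch_model(
    model_name="qwen2-instruct",
    model_engine="vllm",
    model_format="gptq",
    model_size_in_billions="0_5",  # assumed size string for the 0.5B build
    quantization="Int4",           # assumed quantization label
    n_gpu="auto",
    replica=1,
)
```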
Error details:
2024-09-02 06:15:25,819 xinference.api.restful_api 1 ERROR [address=0.0.0.0:36197, pid=749] Marlin does not support weight_bits = uint4b8. Only types = [] are supported (for group_size = 128, min_capability = 75, zp = False).
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/xinference/api/restful_api.py", line 878, in launch_model
model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 656, in send
result = await self._run_coro(message.message_id, coro)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 367, in _run_coro
return await coro
File "/usr/local/lib/python3.10/dist-packages/xoscar/api.py", line 384, in on_receive
return await super().on_receive(message) # type: ignore
File "xoscar/core.pyx", line 558, in on_receive
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
result = await result
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 1027, in launch_builtin_model
await _launch_model()
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 991, in _launch_model
await _launch_one_model(rep_model_uid)
File "/usr/local/lib/python3.10/dist-packages/xinference/core/supervisor.py", line 970, in _launch_one_model
await worker_ref.launch_builtin_model(
File "xoscar/core.pyx", line 284, in __pyx_actor_method_wrapper
async with lock:
File "xoscar/core.pyx", line 287, in xoscar.core.__pyx_actor_method_wrapper
result = await result
File "/usr/local/lib/python3.10/dist-packages/xinference/core/utils.py", line 45, in wrapped
ret = await func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/xinference/core/worker.py", line 882, in launch_builtin_model
await model_ref.load()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 656, in send
result = await self._run_coro(message.message_id, coro)
File "/usr/local/lib/python3.10/dist-packages/xoscar/backends/pool.py", line 367, in _run_coro
return await coro
File "/usr/local/lib/python3.10/dist-packages/xoscar/api.py", line 384, in on_receive
return await super().on_receive(message) # type: ignore
File "xoscar/core.pyx", line 558, in on_receive
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
result = await result
File "/usr/local/lib/python3.10/dist-packages/xinference/core/model.py", line 300, in load
self._model.load()
File "/usr/local/lib/python3.10/dist-packages/xinference/model/llm/vllm/core.py", line 239, in load
self._engine = AsyncLLMEngine.from_engine_args(engine_args)
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 471, in from_engine_args
engine = cls(
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 381, in init
self.engine = self._init_engine(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 552, in _init_engine
return engine_class(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 249, in init
self.model_executor = executor_class(
File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 47, in init
self._init_executor()
File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 36, in _init_executor
self.driver_worker.load_model()
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 139, in load_model
self.model_runner.load_model()
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 722, in load_model
self.model = get_model(model_config=self.model_config,
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/init.py", line 21, in get_model
return loader.load_model(model_config=model_config,
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 324, in load_model
model = _initialize_model(model_config, self.load_config,
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 152, in _initialize_model
quant_config = _get_quantization_config(model_config, load_config)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 93, in _get_quantization_config
quant_config = get_quant_config(model_config, load_config)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/weight_utils.py", line 132, in get_quant_config
return quant_cls.from_config(hf_quant_config)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/quantization/gptq_marlin.py", line 84, in from_config
return cls(weight_bits, group_size, desc_act, is_sym,
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/quantization/gptq_marlin.py", line 51, in init
verify_marlin_supported(quant_type=self.quant_type,
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/quantization/utils/marlin_utils.py", line 88, in verify_marlin_supported
raise ValueError(err_msg)
ValueError: [address=0.0.0.0:36197, pid=749] Marlin does not support weight_bits = uint4b8. Only types = [] are supported (for group_size = 128, min_capability = 75, zp = False).
Expected behavior
The model should launch successfully.
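Judging from the traceback, vLLM auto-selects the gptq_marlin quantization backend for this GPTQ checkpoint, but Marlin requires compute capability >= 8.0 while the 2080 Ti is 7.5, hence "Only types = []". If that is the cause, a possible workaround is forcing the non-Marlin GPTQ backend; a minimal sketch against plain vLLM (not a confirmed Xinference option, and the model id is illustrative):

```python
from vllm import LLM

# Force the plain "gptq" backend instead of the auto-selected
# "gptq_marlin" one; the plain GPTQ kernel runs on SM 7.5 GPUs.
llm = LLM(
    model="Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4",  # illustrative model id
    quantization="gptq",
)
```

When starting a vLLM server directly, the equivalent engine argument is --quantization gptq.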