LLaMA-Factory
How to pass the dtype parameter when using vLLM for inference
Traceback (most recent call last):
  File "/data/disk2/ybZhang/LLaMA-Factory/src/cli_demo.py", line 49, in <module>
    main()
  File "/data/disk2/ybZhang/LLaMA-Factory/src/cli_demo.py", line 15, in main
    chat_model = ChatModel()
  File "/data/disk2/ybZhang/LLaMA-Factory/src/llmtuner/chat/chat_model.py", line 25, in __init__
    self.engine: "BaseEngine" = VllmEngine(model_args, data_args, finetuning_args, generating_args)
  File "/data/disk2/ybZhang/LLaMA-Factory/src/llmtuner/chat/vllm_engine.py", line 37, in __init__
    self.model = AsyncLLMEngine.from_engine_args(engine_args)
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 361, in from_engine_args
    engine = cls(
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 319, in __init__
    self.engine = self._init_engine(*args, **kwargs)
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 437, in _init_engine
    return engine_class(*args, **kwargs)
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 148, in __init__
    self.model_executor = executor_class(
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 41, in __init__
    self._init_executor()
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 22, in _init_executor
    self._init_non_spec_worker()
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 50, in _init_non_spec_worker
    self.driver_worker.init_device()
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/worker/worker.py", line 103, in init_device
    _check_if_gpu_supports_dtype(self.model_config.dtype)
  File "/home/ybZhang/miniconda3/envs/glm-f/lib/python3.10/site-packages/vllm/worker/worker.py", line 327, in _check_if_gpu_supports_dtype
    raise ValueError(
ValueError: Bfloat16 is only supported on GPUs with compute capability of at least 8.0. Your Tesla V100-SXM2-32GB GPU has compute capability 7.0. You can use float16 instead by explicitly setting the`dtype` flag in CLI, for example: --dtype=half.
This has been fixed.
I have already pulled the latest code, but the same problem persists.
As its final check, PyTorch attempts to create a torch.bfloat16 tensor. On a V100 with torch==2.3.0 and CUDA 12.1 this creation succeeds, so the check returns True. That makes the transformers check return True, which in turn makes LLaMA-Factory conclude that bf16 is available.
torch.tensor([1.0], dtype=torch.bfloat16, device=device)
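A minimal way to reproduce that false positive on a V100 (the printed values in the comments are assumptions based on the environment described above):

import torch

device = torch.cuda.current_device()
print(torch.cuda.get_device_capability(device))  # (7, 0) on a V100: no native bf16 hardware
# The probe tensor is still created without raising, so PyTorch reports bf16 as supported.
probe = torch.tensor([1.0], dtype=torch.bfloat16, device=device)
print(probe.dtype)                     # torch.bfloat16
print(torch.cuda.is_bf16_supported())  # True on this torch==2.3.0 / CUDA 12.1 setup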
LLaMA-Factory vllm_engine.py
infer_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
infer_dtype = str(infer_dtype).split(".")[-1]
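One possible workaround at this point, sketched under the assumption that it runs inside VllmEngine.__init__ with the same config object as in the snippet above; the compute-capability guard is an addition for illustration, not LLaMA-Factory's actual code:

import torch

infer_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
# Assumed guard: fall back to float16 on pre-Ampere GPUs (compute capability < 8.0),
# because vLLM refuses to load bf16 weights there.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8:
    infer_dtype = torch.float16
infer_dtype = str(infer_dtype).split(".")[-1]  # e.g. "float16", later passed as vLLM's dtype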
LLaMA-Factory misc.py
import torch
from transformers.utils import is_torch_bf16_gpu_available

try:
    _is_bf16_available = is_torch_bf16_gpu_available()
except Exception:
    _is_bf16_available = False

def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype:
    r"""
    Infers the optimal dtype according to the model_dtype and device compatibility.
    """
    if _is_bf16_available and model_dtype == torch.bfloat16:
        return torch.bfloat16
    elif _is_fp16_available:  # _is_fp16_available is defined elsewhere in misc.py (not shown here)
        return torch.float16
    else:
        return torch.float32
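For comparison, a stricter check would also require Ampere-or-newer hardware before reporting bf16 as available; the helper below is a hypothetical sketch, not part of LLaMA-Factory:

import torch

def is_bf16_natively_supported() -> bool:
    # Hypothetical helper: require compute capability >= 8.0 in addition to
    # PyTorch's own report, ruling out V100-style false positives.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 8 and torch.cuda.is_bf16_supported()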
transformers/src/transformers/utils/import_utils.py
def is_torch_bf16_gpu_available():
    if not is_torch_available():
        return False

    import torch

    return torch.cuda.is_available() and torch.cuda.is_bf16_supported()
pytorch/torch/cuda/__init__.py
def is_bf16_supported():
    r"""Return a bool indicating if the current CUDA/ROCm device supports dtype bfloat16."""
    # Check for ROCm, if true return true, no ROCM_VERSION check required,
    # since it is supported on AMD GPU archs.
    if torch.version.hip:
        return True

    device = torch.cuda.current_device()

    # Check for CUDA version and device compute capability.
    # This is a fast way to check for it.
    cuda_version = torch.version.cuda
    if (
        cuda_version is not None
        and int(cuda_version.split(".")[0]) >= 11
        and torch.cuda.get_device_properties(device).major >= 8
    ):
        return True

    # Finally try to create a bfloat16 device.
    return _check_bf16_tensor_supported(device)

@lru_cache(maxsize=16)
def _check_bf16_tensor_supported(device: _device_t):
    try:
        torch.tensor([1.0], dtype=torch.bfloat16, device=device)
        return True
    except Exception:
        return False
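On the V100 from this issue neither early return fires (no ROCm build, compute capability 7 < 8), so the call falls through to the tensor-creation probe and returns True. A short trace of those branches, assuming the same torch==2.3.0 / CUDA 12.1 environment:

import torch

device = torch.cuda.current_device()
print(torch.version.hip)                               # None -> ROCm branch skipped
print(torch.version.cuda)                              # "12.1" -> CUDA major >= 11, but...
print(torch.cuda.get_device_properties(device).major)  # 7 -> capability branch not taken
print(torch.cuda.is_bf16_supported())                  # True, via _check_bf16_tensor_supported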
Setting the dtype explicitly resolves this:

vllm_dtype: float16