FastChat
FastChat copied to clipboard
CUDA_VISIBLE_DEVICES="3,4" fails with --num-gpus 2
It seems that when CUDA_VISIBLE_DEVICES="3,4" is set, the visible GPUs are remapped to indices 0 and 1, but the code still indexes memory by the original device numbers (or `get_gpu_memory(num_gpus)` returns fewer entries than `range(num_gpus)` expects), so `available_gpu_memory[i]` raises an IndexError.
CUDA_VISIBLE_DEVICES="3,4" python3 -m fastchat.serve.model_worker --model-path ../text-generation-webui/models/vicuna-13b/ --port 310001 --worker http://localhost:31001 --num-gpus 2
2023-06-16 10:07:22 | INFO | model_worker | args: Namespace(host='localhost', port=310001, worker_address='http://localhost:31001', controller_address='http://localhost:21001', model_path='../text-generation-webui/models/vicuna-13b/', device='cuda', gpus=None, num_gpus=2, max_gpu_memory=None, load_8bit=False, cpu_offloading=False, gptq_ckpt=None, gptq_wbits=16, gptq_groupsize=-1, gptq_act_order=False, model_names=None, limit_model_concurrency=5, stream_interval=2, no_register=False)
2023-06-16 10:07:22 | INFO | model_worker | Loading the model ['vicuna-13b'] on worker 5b10be ...
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /easybuild/2020/software/Python/3.10.4-GCCcore-11.3.0/lib/python3.10/runpy.py:196 in │
│ _run_module_as_main │
│ │
│ 193 │ main_globals = sys.modules["__main__"].__dict__ │
│ 194 │ if alter_argv: │
│ 195 │ │ sys.argv[0] = mod_spec.origin │
│ ❱ 196 │ return _run_code(code, main_globals, None, │
│ 197 │ │ │ │ │ "__main__", mod_spec) │
│ 198 │
│ 199 def run_module(mod_name, init_globals=None, │
│ │
│ /easybuild/2020/software/Python/3.10.4-GCCcore-11.3.0/lib/python3.10/runpy.py:86 in _run_code │
│ │
│ 83 │ │ │ │ │ __loader__ = loader, │
│ 84 │ │ │ │ │ __package__ = pkg_name, │
│ 85 │ │ │ │ │ __spec__ = mod_spec) │
│ ❱ 86 │ exec(code, run_globals) │
│ 87 │ return run_globals │
│ 88 │
│ 89 def _run_module_code(code, init_globals=None, │
│ │
│ /p/haicluster/llama/FastChat/fastchat/serve/model_worker.py:452 in <module> │
│ │
│ 449 │ │ act_order=args.gptq_act_order, │
│ 450 │ ) │
│ 451 │ │
│ ❱ 452 │ worker = ModelWorker( │
│ 453 │ │ args.controller_address, │
│ 454 │ │ args.worker_address, │
│ 455 │ │ worker_id, │
│ │
│ /p/haicluster/llama/FastChat/fastchat/serve/model_worker.py:89 in __init__ │
│ │
│ 86 │ │ self.device = device │
│ 87 │ │ │
│ 88 │ │ logger.info(f"Loading the model {self.model_names} on worker {worker_id} ...") │
│ ❱ 89 │ │ self.model, self.tokenizer = load_model( │
│ 90 │ │ │ model_path, │
│ 91 │ │ │ device, │
│ 92 │ │ │ num_gpus, │
│ │
│ /p/haicluster/llama/FastChat/fastchat/model/model_adapter.py:137 in load_model │
│ │
│ 134 │ │ │ │ │ "device_map" │
│ 135 │ │ │ │ ] = "sequential" # This is important for not the same VRAM sizes │
│ 136 │ │ │ │ available_gpu_memory = get_gpu_memory(num_gpus) │
│ ❱ 137 │ │ │ │ kwargs["max_memory"] = { │
│ 138 │ │ │ │ │ i: str(int(available_gpu_memory[i] * 0.85)) + "GiB" │
│ 139 │ │ │ │ │ for i in range(num_gpus) │
│ 140 │ │ │ │ } │
│ │
│ /p/haicluster/llama/FastChat/fastchat/model/model_adapter.py:138 in <dictcomp> │
│ │
│ 135 │ │ │ │ ] = "sequential" # This is important for not the same VRAM sizes │
│ 136 │ │ │ │ available_gpu_memory = get_gpu_memory(num_gpus) │
│ 137 │ │ │ │ kwargs["max_memory"] = { │
│ ❱ 138 │ │ │ │ │ i: str(int(available_gpu_memory[i] * 0.85)) + "GiB" │
│ 139 │ │ │ │ │ for i in range(num_gpus) │
│ 140 │ │ │ │ } │
│ 141 │ │ │ else: │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
IndexError: list index out of range