llama-stack
I used the official Docker image and downloaded the weights from Meta. An md5sum check showed the files were fine, but the server still fails to start, which leaves me confused.
I can also confirm that CUDA is usable from inside Docker:
```
root@720:~/.llama/checkpoints# nvidia-smi
Sat Oct 12 03:22:06 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 Tesla P40 Off | 00000000:05:00.0 Off | Off |
| N/A 34C P0 49W / 250W | 0MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 Tesla P40 Off | 00000000:42:00.0 Off | Off |
| N/A 38C P0 45W / 250W | 0MiB / 24576MiB | 1% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
```
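For completeness, here is a minimal way to double-check GPU visibility from Python inside the container (a sketch, assuming torch is installed in the llamastack-local-gpu image):

```python
# Minimal CUDA visibility check from inside the container
# (assumes torch is installed in the llamastack-local-gpu image).
import torch

print(torch.cuda.is_available())   # expected: True
print(torch.cuda.device_count())   # expected: 2 (both Tesla P40s)
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))
```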
```
root@720:~/.llama/checkpoints/Llama3.1-8B-Instruct# ls -alh
total 15G
drwxr-xr-x 2 root root 4.0K Oct 12 02:31 .
drwxr-xr-x 4 root root 4.0K Oct 12 02:42 ..
-rw-r--r-- 1 root root 15G Oct 12 02:31 consolidated.00.pth
-rw-r--r-- 1 root root 199 Oct 12 02:31 params.json
-rw-r--r-- 1 root root 8.7M Oct 12 02:31 tokenizer.json
-rw-r--r-- 1 root root 489K Oct 12 02:31 tokenizer.model
```
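The md5sum check mentioned above just recomputes the digests of these files for comparison against the checksums Meta provides with the download; a small sketch of that check (the path assumes the host-side checkpoint directory shown above):

```python
# Recompute MD5 digests of the downloaded checkpoint files
# (a sketch; compare against the checksums provided with the Meta download).
import hashlib
from pathlib import Path

CKPT_DIR = Path.home() / ".llama/checkpoints/Llama3.1-8B-Instruct"  # assumed location

def md5_of(path: Path, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

for name in ("consolidated.00.pth", "params.json", "tokenizer.json", "tokenizer.model"):
    print(f"{md5_of(CKPT_DIR / name)}  {name}")
```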
```
root@720:~/.llama/checkpoints# docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack/llamastack-local-gpu
Resolved 8 providers in topological order
Api.models: routing_table
Api.inference: router
Api.shields: routing_table
Api.safety: router
Api.memory_banks: routing_table
Api.memory: router
Api.agents: meta-reference
Api.telemetry: meta-reference
Traceback (most recent call last):
File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/server/server.py", line 351, in <module>
fire.Fire(main)
File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 135, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 468, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/server/server.py", line 288, in main
impls, specs = asyncio.run(resolve_impls_with_routing(config))
File "/usr/local/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/usr/local/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/resolver.py", line 104, in resolve_impls_with_routing
impl = await instantiate_provider(spec, deps, configs[api])
File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/resolver.py", line 174, in instantiate_provider
impl = await instantiate_provider(
File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/resolver.py", line 192, in instantiate_provider
impl = await fn(*args)
File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/impls/meta_reference/inference/__init__.py", line 18, in get_provider_impl
await impl.initialize()
File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/impls/meta_reference/inference/inference.py", line 38, in initialize
self.generator = LlamaModelParallelGenerator(self.config)
File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/impls/meta_reference/inference/model_parallel.py", line 72, in __init__
self.formatter = ChatFormat(Tokenizer(tokenizer_path))
File "/usr/local/lib/python3.10/site-packages/llama_models/llama3/api/tokenizer.py", line 77, in __init__
mergeable_ranks = load_tiktoken_bpe(model_path)
File "/usr/local/lib/python3.10/site-packages/tiktoken/load.py", line 145, in load_tiktoken_bpe
return {
File "/usr/local/lib/python3.10/site-packages/tiktoken/load.py", line 147, in <dictcomp>
for token, rank in (line.split() for line in contents.splitlines() if line)
ValueError: not enough values to unpack (expected 2, got 1)
```
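The failure happens in load_tiktoken_bpe, which expects every non-empty line of tokenizer.model to contain a base64-encoded token and an integer rank separated by whitespace; the ValueError means at least one line has only a single field. A quick diagnostic that reproduces just that step outside the server (a sketch; the path assumes the same ~/.llama mount as the docker run above):

```python
# Reproduce the failing tokenizer load outside the server to see whether
# tokenizer.model is a valid tiktoken BPE file.
from tiktoken.load import load_tiktoken_bpe

TOKENIZER_PATH = "/root/.llama/checkpoints/Llama3.1-8B-Instruct/tokenizer.model"  # container-side path

# A valid tiktoken BPE file has "<base64 token> <rank>" on every non-empty line.
with open(TOKENIZER_PATH, "rb") as f:
    for lineno, line in enumerate(f.read().splitlines(), start=1):
        if line and len(line.split()) != 2:
            print(f"line {lineno} is malformed: {line[:80]!r}")
            break
    else:
        print("all non-empty lines have two fields")

# If the format check passes, this is the same call that fails in the traceback:
ranks = load_tiktoken_bpe(TOKENIZER_PATH)
print(f"loaded {len(ranks)} merge ranks")
```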