Context
Dell's TGI container reports the local model mount path, not the Hugging Face repo, as its `model_id` in TGI's `/info` response:
{'model_id': '/model', 'model_sha': None, 'model_dtype': 'torch.float16', 'model_device_type': 'cuda', 'model_pipeline_tag': None, ...}
Stock TGI reports the repo name that the adapter expects:
{'model_id': 'meta-llama/Llama-3.1-8B-Instruct', 'model_sha': '0e9e39f249a16976918f6564b8830bc894c89659', ...}
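The discrepancy can be seen directly; a minimal check against TGI's standard GET /info route, assuming the container is up on port 5009 and `requests` is installed:

# query TGI's /info endpoint and print the reported model id
import requests

info = requests.get("http://127.0.0.1:5009/info").json()
print(info["model_id"])  # Dell's image prints '/model'; stock TGI prints the HF repo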
Error w/ Dell's TGI
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/server/server.py", line 343, in <module>
    fire.Fire(main)
  File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 135, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 468, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/server/server.py", line 279, in main
    impls = asyncio.run(resolve_impls_with_routing(config))
  File "/usr/local/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/resolver.py", line 181, in resolve_impls_with_routing
    impl = await instantiate_provider(
  File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/resolver.py", line 268, in instantiate_provider
    impl = await fn(*args)
  File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/routers/__init__.py", line 31, in get_routing_table_impl
    await impl.initialize()
  File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/routers/routing_tables.py", line 58, in initialize
    models = await p.list_models()
  File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/adapters/inference/tgi/tgi.py", line 55, in list_models
    identifier = self.huggingface_repo_to_llama_model_id[repo]
KeyError: '/model'
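The adapter's `huggingface_repo_to_llama_model_id` mapping is keyed by Hugging Face repo names, so the `/model` id reported by Dell's image can never be a key. Schematically (illustrative values, not the adapter's actual table):

# illustrative mapping; the real one is built in tgi.py's list_models
huggingface_repo_to_llama_model_id = {
    "meta-llama/Llama-3.1-8B-Instruct": "Llama3.1-8B-Instruct",
}
huggingface_repo_to_llama_model_id["/model"]  # -> KeyError: '/model'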
Fix
- Add a dedicated Dell TGI config and a Dell TGI adapter that resolves the model from the run config rather than from the `model_id` TGI reports (a sketch of the idea follows)
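A minimal sketch of the adapter shape, assuming the class and config field names (the merged code may differ):

# hypothetical names throughout; only the idea is from this PR:
# trust the run config, not the model_id Dell's TGI reports
from dataclasses import dataclass

@dataclass
class DellTGIImplConfig:
    url: str        # e.g. "http://127.0.0.1:5009"
    hf_repo: str    # the repo the container actually serves

class DellTGIAdapter:
    def __init__(self, config: DellTGIImplConfig, repo_to_model_id: dict):
        self.config = config
        self.repo_to_model_id = repo_to_model_id

    async def list_models(self) -> list:
        # the stock TGI adapter reads the repo from GET /info, which
        # Dell's image reports as '/model'; resolve from config instead
        return [self.repo_to_model_id[self.config.hf_repo]]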
Test
- Start TGI
docker run --network host -it \
--shm-size 1g \
-p 5009:5009 \
--gpus 1 \
-e NUM_SHARD=1 \
-e MAX_BATCH_PREFILL_TOKENS=32768 \
-e MAX_INPUT_TOKENS=8000 \
-e MAX_TOTAL_TOKENS=8192 \
registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct --port 5009
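Optionally smoke-test the container before starting the stack; a quick check using TGI's standard /generate route (assumes `requests`):

# send one short generation request to the TGI container above
import requests

resp = requests.post(
    "http://127.0.0.1:5009/generate",
    json={"inputs": "Hello", "parameters": {"max_new_tokens": 16}},
)
print(resp.json()["generated_text"])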
- Start chromadb
chroma run --path ./chroma_vdb --host 127.0.0.1 --port 6000
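A quick connectivity check for the chroma server (assumes the `chromadb` Python client is installed):

# heartbeat returns a nanosecond timestamp when the server is reachable
import chromadb

client = chromadb.HttpClient(host="127.0.0.1", port=6000)
print(client.heartbeat())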
- Start Llama Stack
llama stack run ./tests/examples/dell-tgi-run.yaml
- Client
python -m llama_stack.apis.inference.client $LOCALHOST 5000 --model Llama3.1-8B-Instruct