lmdeploy
lmdeploy copied to clipboard
[Bug] Requests to the OpenAI-compatible server fail with asyncio.exceptions.TimeoutError
Checklist
- [X] 1. I have searched related issues but cannot get the expected help.
- [X] 2. The bug has not been fixed in the latest version.
- [x] 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
Describe the bug
When I use the OpenAI-compatible server to serve the gemma-2-2b model with the following configuration:
# Reproduction snippet: start the server for the gemma-2-2b model.
# NOTE: `model_path` is not defined in this snippet; it must be set beforehand.

# Select the chat template by name.
chat_template_config = ChatTemplateConfig('gemma')
# Engine options handed to serve() through `backend_config`.
backend_config = PytorchEngineConfig(tp=1,
                                     session_len=3072,
                                     max_batch_size=48,
                                     cache_max_entry_count=0.8,
                                     enable_prefix_caching=True,
                                     max_prefill_token_num=2048)
# Launch the server on port 8000 with the 'pytorch' backend; DEBUG logging
# is what produces the detailed engine log lines quoted in this report.
serve(model_path=model_path,
      model_name='gemma2_2b_it_prod',
      backend='pytorch',
      log_level='DEBUG',
      backend_config=backend_config,
      chat_template_config=chat_template_config,
      server_port=8000)
When I send an HTTP request (with curl) to /v1/chat/completions, I get the following error log:
2024-08-22 16:22:29,397 - lmdeploy - DEBUG - Engine loop is not alive.
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 171, in __no_threadsafe_get
return await asyncio.wait_for(self.resp_que.get(), timeout)
File "/opt/conda/lib/python3.8/asyncio/tasks.py", line 501, in wait_for
raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/lib/python3.8/site-packages/uvicorn/protocols/http/h11_impl.py", line 406, in run_asgi
result = await app( # type: ignore[func-returns-value]
File "/opt/conda/lib/python3.8/site-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
return await self.app(scope, receive, send)
File "/opt/conda/lib/python3.8/site-packages/fastapi/applications.py", line 1054, in __call__
await super().__call__(scope, receive, send)
File "/opt/conda/lib/python3.8/site-packages/starlette/applications.py", line 123, in __call__
await self.middleware_stack(scope, receive, send)
File "/opt/conda/lib/python3.8/site-packages/starlette/middleware/errors.py", line 164, in __call__
await self.app(scope, receive, _send)
File "/opt/conda/lib/python3.8/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/opt/conda/lib/python3.8/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
await app(scope, receive, sender)
File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 754, in __call__
await self.middleware_stack(scope, receive, send)
INFO: 10.104.137.121:42538 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 774, in app
await route.handle(scope, receive, send)
File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 295, in handle
await self.app(scope, receive, send)
File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 77, in app
await wrap_app_handling_exceptions(app, request)(scope, receive, send)
File "/opt/conda/lib/python3.8/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
await app(scope, receive, sender)
File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 74, in app
response = await f(request)
File "/opt/conda/lib/python3.8/site-packages/fastapi/routing.py", line 278, in app
raw_response = await run_endpoint_function(
File "/opt/conda/lib/python3.8/site-packages/fastapi/routing.py", line 191, in run_endpoint_function
return await dependant.call(**values)
File "/root/app/lmdeploy_main.py", line 365, in chat_completions_v1
async for res in result_generator:
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/serve/async_engine.py", line 620, in generate
async for outputs in generator.async_stream_infer(
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine_instance.py", line 177, in async_stream_infer
resp = await self.req_sender.async_recv(req_id)
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 314, in async_recv
resp: Response = await self._async_resp_get()
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 187, in _async_resp_get
return await __no_threadsafe_get()
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 175, in __no_threadsafe_get
exit(1)
File "/opt/conda/lib/python3.8/_sitebuiltins.py", line 26, in __call__
raise SystemExit(code)
SystemExit: 1
lmdeploy.pytorch.engine.request.RequestSender._async_resp_get
async def _async_resp_get(self):
    """Get the next response for this sender.

    The behavior depends on ``self.is_thread_safe()``:

    * Thread-safe mode: the response is fetched with the synchronous
      ``self._resp_get()``, followed by ``await asyncio.sleep(0)`` before
      the value is returned.
    * Otherwise: ``self.resp_que.get()`` is awaited in a loop with a
      1-second timeout per attempt. After every timeout,
      ``self.manager.is_loop_alive()`` is checked; if it is falsy, a debug
      message is logged and ``exit(1)`` is called.

    Returns:
        The item returned by ``self._resp_get()`` or ``self.resp_que.get()``.

    Raises:
        SystemExit: raised by ``exit(1)`` when a wait times out and
            ``self.manager.is_loop_alive()`` is falsy.
        Exception: any other exception raised while waiting on the queue
            is logged with ``logger.exception`` and re-raised.
    """
    # Timeout (in seconds) for each single wait on the response queue.
    timeout = 1

    async def __no_threadsafe_get():
        # Poll the queue until a response arrives, or stop the process if
        # the manager reports that the loop is no longer alive.
        while True:
            try:
                return await asyncio.wait_for(self.resp_que.get(), timeout)
            except asyncio.TimeoutError:
                # No response within `timeout` seconds: check whether the
                # loop is still alive before waiting again.
                if not self.manager.is_loop_alive():
                    logger.debug('Engine loop is not alive.')
                    # exit() raises SystemExit(1); when this runs inside a
                    # request handler it propagates up through that handler.
                    exit(1)
                continue
            except Exception as e:
                # Unexpected failure: log with traceback and propagate.
                logger.exception(
                    f'sender[{self.sender_id}] get response failed: {e}')
                raise e

    if self.is_thread_safe():
        ret = self._resp_get()
        # sleep(0) suspends this coroutine once so other tasks can run.
        await asyncio.sleep(0)
        return ret
    else:
        return await __no_threadsafe_get()
Why does self.manager.is_loop_alive() report that the engine loop is not alive?
Reproduction
Start the server with a command like `lmdeploy serve api_server xxxxxx`.
Environment
lmdeploy 0.5.3
Error traceback
No response
@lvhan028 @grimoire
2024-08-23 09:29:43,409 - lmdeploy - INFO - prompt='<start_of_turn>user\nhello<end_of_turn>\n<start_of_turn>model\n', gen_config=EngineGenerationConfig(n=1, max_new_tokens=256, top_p=0.1, top_k=40, temperature=0.0, repetition_penalty=1.0, ignore_eos=False, random_seed=10490577554887956612, stop_words=[107], bad_words=None, min_new_tokens=None, skip_special_tokens=True, logprobs=None), prompt_token_id=[2, 106, 1645, 108, 17534, 107, 108, 106, 2516, 108], adapter_name=None.
2024-08-23 09:29:43,409 - lmdeploy - INFO - session_id=1, history_tokens=0, input_tokens=10, max_new_tokens=256, seq_start=True, seq_end=True, step=0, prep=True
2024-08-23 09:29:43,410 - lmdeploy - WARNING - `temperature` is 0, set to 1e-6
2024-08-23 09:29:43,410 - lmdeploy - DEBUG - creating engine loop task.
2024-08-23 09:29:43,422 - lmdeploy - DEBUG - <SchedulePrefilling> take time: 0.40 ms
2024-08-23 09:29:43,423 - lmdeploy - DEBUG - <CreateModelInputs> take time: 0.78 ms
2024-08-23 09:29:43,426 - lmdeploy - DEBUG - <ForwardTask>: batch_size=1 num_tokens=10
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-08-23 09:29:47,801 - lmdeploy - ERROR - Engine loop failed with error: map::at
Traceback (most recent call last):
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 17, in _raise_exception_on_finish
task.result()
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 857, in async_loop
await self._async_loop()
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 847, in _async_loop
await __step(True)
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 833, in __step
raise e
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 825, in __step
raise out
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 774, in _async_loop_background
await self._async_step_background(
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 683, in _async_step_background
output = await self._async_model_forward(inputs,
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/utils.py", line 255, in __tmp
return (await func(*args, **kwargs))
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 590, in _async_model_forward
return await __forward(inputs)
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 568, in __forward
return await self.model_agent.async_forward(
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/model_agent.py", line 756, in async_forward
output = self._forward_impl(inputs,
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/model_agent.py", line 723, in _forward_impl
output = model_forward(
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/model_agent.py", line 497, in model_forward
output = patched_model.patched_forward(
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/patch.py", line 210, in __call__
output = self._model(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 994, in forward
outputs = self.model(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 236, in forward
return self._continuous_batching_forward(
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 203, in _continuous_batching_forward
layer_outputs = decoder_layer(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 586, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 168, in forward
return self._contiguous_batching_forward_impl(
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 132, in _contiguous_batching_forward_impl
paged_attention_fwd(
File "<string>", line 3, in paged_attention_fwd
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/kernels/dispatcher.py", line 95, in load_and_call
return self.dispatched_func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/kernels/cuda/pagedattention.py", line 778, in paged_attention_fwd
_fwd_kernel[grid](q,
File "<string>", line 16, in __fwd_kernel_launcher
File "/opt/conda/lib/python3.8/site-packages/triton/runtime/jit.py", line 167, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/triton/runtime/jit.py", line 416, in run
self.cache[device][key] = compile(
File "/opt/conda/lib/python3.8/site-packages/triton/compiler/compiler.py", line 193, in compile
next_module = compile_ir(module, metadata)
File "/opt/conda/lib/python3.8/site-packages/triton/compiler/backends/cuda.py", line 199, in <lambda>
stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, self.capability)
File "/opt/conda/lib/python3.8/site-packages/triton/compiler/backends/cuda.py", line 173, in make_llir
ret = translate_triton_gpu_to_llvmir(src, capability, tma_infos, runtime.TARGET.NVVM)
IndexError: map::at
File "/opt/conda/lib/python3.8/site-packages/triton/compiler/backends/cuda.py", line 173, in make_llir ret = translate_triton_gpu_to_llvmir(src, capability, tma_infos, runtime.TARGET.NVVM)
The triton kernel compilation failed on your device. What is your triton version?