lmdeploy [Bug] Requests to the OpenAI-compatible server fail with asyncio.exceptions.TimeoutError

Checklist

[X] 1. I have searched related issues but cannot get the expected help.
[X] 2. The bug has not been fixed in the latest version.
[x] 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.

Describe the bug

When I use the OpenAI-compatible API server to serve the gemma-2-2b model with the following configuration:

chat_template_config = ChatTemplateConfig('gemma')
    backend_config = PytorchEngineConfig(tp=1,
                                         session_len=3072,
                                         max_batch_size=48,
                                         cache_max_entry_count=0.8,
                                         enable_prefix_caching=True,
                                         max_prefill_token_num=2048)
    serve(model_path=model_path,
          model_name='gemma2_2b_it_prod',
          backend='pytorch',
          log_level='DEBUG',
          backend_config=backend_config,
          chat_template_config=chat_template_config,
          server_port=8000)

When I send an HTTP request (curl) to /v1/chat/completions, I get the following error log:

2024-08-22 16:22:29,397 - lmdeploy - DEBUG - Engine loop is not alive.
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 171, in __no_threadsafe_get
    return await asyncio.wait_for(self.resp_que.get(), timeout)
  File "/opt/conda/lib/python3.8/asyncio/tasks.py", line 501, in wait_for
    raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/uvicorn/protocols/http/h11_impl.py", line 406, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "/opt/conda/lib/python3.8/site-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
    return await self.app(scope, receive, send)
  File "/opt/conda/lib/python3.8/site-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/opt/conda/lib/python3.8/site-packages/starlette/applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/opt/conda/lib/python3.8/site-packages/starlette/middleware/errors.py", line 164, in __call__
    await self.app(scope, receive, _send)
  File "/opt/conda/lib/python3.8/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
  File "/opt/conda/lib/python3.8/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 754, in __call__
    await self.middleware_stack(scope, receive, send)
INFO:     10.104.137.121:42538 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
  File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 774, in app
    await route.handle(scope, receive, send)
  File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 295, in handle
    await self.app(scope, receive, send)
  File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 77, in app
    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
  File "/opt/conda/lib/python3.8/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "/opt/conda/lib/python3.8/site-packages/starlette/routing.py", line 74, in app
    response = await f(request)
  File "/opt/conda/lib/python3.8/site-packages/fastapi/routing.py", line 278, in app
    raw_response = await run_endpoint_function(
  File "/opt/conda/lib/python3.8/site-packages/fastapi/routing.py", line 191, in run_endpoint_function
    return await dependant.call(**values)
  File "/root/app/lmdeploy_main.py", line 365, in chat_completions_v1
    async for res in result_generator:
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/serve/async_engine.py", line 620, in generate
    async for outputs in generator.async_stream_infer(
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine_instance.py", line 177, in async_stream_infer
    resp = await self.req_sender.async_recv(req_id)
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 314, in async_recv
    resp: Response = await self._async_resp_get()
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 187, in _async_resp_get
    return await __no_threadsafe_get()
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 175, in __no_threadsafe_get
    exit(1)
  File "/opt/conda/lib/python3.8/_sitebuiltins.py", line 26, in __call__
    raise SystemExit(code)
SystemExit: 1

lmdeploy.pytorch.engine.request.RequestSender._async_resp_get

 async def _async_resp_get(self):
        """Get the next response for this sender.

        In thread-safe mode the response is fetched with the synchronous
        ``self._resp_get()``. Otherwise ``self.resp_que`` is polled with a
        one-second timeout, and each time a poll times out the engine loop
        is checked via ``self.manager.is_loop_alive()``.

        Returns:
            The item taken from ``self.resp_que`` (or the value returned by
            ``self._resp_get()`` in thread-safe mode).

        Raises:
            SystemExit: via ``exit(1)`` when a poll times out and the engine
                loop is reported as not alive.
            Exception: any other error raised while waiting on the queue is
                logged and re-raised.
        """
        # Poll interval handed to ``asyncio.wait_for`` as its timeout.
        timeout = 1

        async def __no_threadsafe_get():
            # Keep polling until a response arrives or the engine loop dies.
            while True:
                try:
                    return await asyncio.wait_for(self.resp_que.get(), timeout)
                except asyncio.TimeoutError:
                    # Nothing arrived within ``timeout``; only keep waiting
                    # while the engine loop is still alive.
                    if not self.manager.is_loop_alive():
                        logger.debug('Engine loop is not alive.')
                        # NOTE(review): exit() raises SystemExit from inside
                        # the coroutine, and the reason is only logged at
                        # DEBUG level -- consider raising a dedicated error
                        # with a clearer message instead.
                        exit(1)
                    continue
                except Exception as e:
                    # Unexpected failure: log it with the sender id and
                    # propagate it to the caller.
                    logger.exception(
                        f'sender[{self.sender_id}] get response failed: {e}')
                    raise e

        if self.is_thread_safe():
            # Synchronous fetch.
            # NOTE(review): presumably blocks until a response is
            # available -- confirm against ``_resp_get``.
            ret = self._resp_get()
            # Yield control to the event loop once before returning.
            await asyncio.sleep(0)
            return ret
        else:
            return await __no_threadsafe_get()

Why does self.manager.is_loop_alive() report that the engine loop is not alive?

Reproduction

Start the server with a command like `lmdeploy serve api_server xxxxxx`.

Environment

lmdeploy 0.5.3

Error traceback

No response

Aug 22 '24 16:08 wlwqq

@lvhan028 @grimoire

Aug 23 '24 06:08 wlwqq

2024-08-23 09:29:43,409 - lmdeploy - INFO - prompt='<start_of_turn>user\nhello<end_of_turn>\n<start_of_turn>model\n', gen_config=EngineGenerationConfig(n=1, max_new_tokens=256, top_p=0.1, top_k=40, temperature=0.0, repetition_penalty=1.0, ignore_eos=False, random_seed=10490577554887956612, stop_words=[107], bad_words=None, min_new_tokens=None, skip_special_tokens=True, logprobs=None), prompt_token_id=[2, 106, 1645, 108, 17534, 107, 108, 106, 2516, 108], adapter_name=None.
2024-08-23 09:29:43,409 - lmdeploy - INFO - session_id=1, history_tokens=0, input_tokens=10, max_new_tokens=256, seq_start=True, seq_end=True, step=0, prep=True
2024-08-23 09:29:43,410 - lmdeploy - WARNING - `temperature` is 0, set to 1e-6
2024-08-23 09:29:43,410 - lmdeploy - DEBUG - creating engine loop task.
2024-08-23 09:29:43,422 - lmdeploy - DEBUG - <SchedulePrefilling> take time: 0.40 ms
2024-08-23 09:29:43,423 - lmdeploy - DEBUG - <CreateModelInputs> take time: 0.78 ms
2024-08-23 09:29:43,426 - lmdeploy - DEBUG - <ForwardTask>: batch_size=1 num_tokens=10
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-08-23 09:29:47,801 - lmdeploy - ERROR - Engine loop failed with error: map::at
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/request.py", line 17, in _raise_exception_on_finish
    task.result()
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 857, in async_loop
    await self._async_loop()
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 847, in _async_loop
    await __step(True)
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 833, in __step
    raise e
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 825, in __step
    raise out
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 774, in _async_loop_background
    await self._async_step_background(
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 683, in _async_step_background
    output = await self._async_model_forward(inputs,
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/utils.py", line 255, in __tmp
    return (await func(*args, **kwargs))
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 590, in _async_model_forward
    return await __forward(inputs)
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/engine.py", line 568, in __forward
    return await self.model_agent.async_forward(
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/model_agent.py", line 756, in async_forward
    output = self._forward_impl(inputs,
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/model_agent.py", line 723, in _forward_impl
    output = model_forward(
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/engine/model_agent.py", line 497, in model_forward
    output = patched_model.patched_forward(
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/patch.py", line 210, in __call__
    output = self._model(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 994, in forward
    outputs = self.model(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 236, in forward
    return self._continuous_batching_forward(
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 203, in _continuous_batching_forward
    layer_outputs = decoder_layer(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 586, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 168, in forward
    return self._contiguous_batching_forward_impl(
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/models/gemma.py", line 132, in _contiguous_batching_forward_impl
    paged_attention_fwd(
  File "<string>", line 3, in paged_attention_fwd
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/kernels/dispatcher.py", line 95, in load_and_call
    return self.dispatched_func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/lmdeploy/pytorch/kernels/cuda/pagedattention.py", line 778, in paged_attention_fwd
    _fwd_kernel[grid](q,
  File "<string>", line 16, in __fwd_kernel_launcher
  File "/opt/conda/lib/python3.8/site-packages/triton/runtime/jit.py", line 167, in <lambda>
    return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/triton/runtime/jit.py", line 416, in run
    self.cache[device][key] = compile(
  File "/opt/conda/lib/python3.8/site-packages/triton/compiler/compiler.py", line 193, in compile
    next_module = compile_ir(module, metadata)
  File "/opt/conda/lib/python3.8/site-packages/triton/compiler/backends/cuda.py", line 199, in <lambda>
    stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, self.capability)
  File "/opt/conda/lib/python3.8/site-packages/triton/compiler/backends/cuda.py", line 173, in make_llir
    ret = translate_triton_gpu_to_llvmir(src, capability, tma_infos, runtime.TARGET.NVVM)
IndexError: map::at

Aug 23 '24 09:08 wlwqq

File "/opt/conda/lib/python3.8/site-packages/triton/compiler/backends/cuda.py", line 173, in make_llir
    ret = translate_triton_gpu_to_llvmir(src, capability, tma_infos, runtime.TARGET.NVVM)

The triton kernel compilation failed on your device. What is your triton version?

Aug 26 '24 09:08 grimoire