
KeyError when managing concurrent requests.

Open daniyal214 opened this issue 1 year ago • 1 comment

@rayleizhu I'm facing this KeyError when trying to send concurrent requests.

import time
from threading import Thread

from vllm import LLM

class ConcurrencyTest:
    def __init__(self, model, quant=None, enforce_eager=False, tensor_parallel_size=2,
                 enable_relay_attention=True, sys_prompt=None, sys_schema=None,
                 sys_prompt_file=None, sys_schema_file=None):
        self.llm = LLM(model=model, quantization=quant, enforce_eager=enforce_eager,
                       tensor_parallel_size=tensor_parallel_size,
                       enable_relay_attention=enable_relay_attention,
                       sys_prompt=sys_prompt,
                       sys_schema=sys_schema,
                       sys_prompt_file=sys_prompt_file,
                       sys_schema_file=sys_schema_file)

    def generate_and_store_output(self, prompt, sampling_params, index):
        start_time = time.time()
        output = self.llm.generate(prompt, sampling_params)
        print(f"OUTPUT>>>>>>>>: {output}")
 

    def generate_concurrently(self, prompts, sampling_params):
        threads = []
        start_time = time.time()
        for i, prompt in enumerate(prompts):
            print(f"Thread created********: {i+1}")
            thread = Thread(target=self.generate_and_store_output, args=(prompt, sampling_params, i))
            threads.append(thread)
            thread.start()
            print("Thread Started")

        for thread in threads:
            thread.join()

concurrency_test.generate_concurrently(prompts, sampling_params)
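
For reference, the synchronous LLM.generate() also accepts a list of prompts in a single call and batches them inside one engine loop, which avoids threads entirely. A minimal sketch, assuming a ConcurrencyTest instance named concurrency_test as in the call above, with placeholder prompts and sampling values:

from vllm import SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, max_tokens=128)

# One batched call instead of one thread per prompt; the engine schedules
# all requests internally.
outputs = concurrency_test.llm.generate(prompts, sampling_params)
for out in outputs:
    print(out.outputs[0].text)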

I saw a similar issue in the vLLM library as well ("Key Error when handle multiple requests simultaneously"). I'm not sure whether it has been fixed in the latest release. Could you please provide some insight into whether there's a quick way to address this?

Error Faced:

Exception in thread Thread-2 (generate_and_store_output):
Traceback (most recent call last):
  File "/root/anaconda3/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/root/anaconda3/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jovyan/testing_concurrency.py", line 21, in generate_and_store_output
    output = self.llm.generate(prompt, sampling_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/entrypoints/llm.py", line 175, in generate
    return self._run_engine(use_tqdm)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/entrypoints/llm.py", line 201, in _run_engine
    step_outputs = self.llm_engine.step()
                   ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/engine/llm_engine.py", line 650, in step
    return self._process_model_outputs(output, scheduler_outputs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/engine/llm_engine.py", line 612, in _process_model_outputs
    self._process_sequence_group_outputs(seq_group, outputs)
  File "/home/jovyan/vllm-ra/vllm/engine/llm_engine.py", line 451, in _process_sequence_group_outputs
    parent_child_dict[sample.parent_seq_id].append(sample)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
KeyError: 0

daniyal214 commented on Mar 19 '24 10:03

I got it to work using the AsyncLLMEngine class.

from vllm import AsyncLLMEngine, AsyncEngineArgs

engine_args = AsyncEngineArgs(
    model=model, quantization=quant, enforce_eager=enforce_eager,
    tensor_parallel_size=tensor_parallel_size,
    enable_relay_attention=enable_relay_attention,
    sys_prompt=sys_prompt,
    sys_schema=sys_schema,
    sys_prompt_file=sys_prompt_file,
    sys_schema_file=sys_schema_file,
)

self.engine = AsyncLLMEngine.from_engine_args(engine_args)
# generate() is an async generator and needs a unique request_id per request;
# it has to be consumed with `async for` (see the sketch below).
self.engine.generate(prompt, sampling_params, request_id)
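
For completeness, here is a minimal sketch of driving the async engine with several concurrent requests, assuming an engine built as above; generate_one and run_all are illustrative helper names, and the SamplingParams values are placeholders:

import asyncio
import uuid

from vllm import SamplingParams

async def generate_one(engine, prompt, sampling_params):
    # generate() is an async generator; each request needs a unique request_id.
    request_id = str(uuid.uuid4())
    final = None
    async for output in engine.generate(prompt, sampling_params, request_id):
        final = output  # keep the last (finished) RequestOutput
    return final

async def run_all(engine, prompts, sampling_params):
    # Submit every request at once; the engine batches them internally.
    tasks = [generate_one(engine, p, sampling_params) for p in prompts]
    return await asyncio.gather(*tasks)

# outputs = asyncio.run(run_all(engine, prompts, SamplingParams(max_tokens=128)))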

daniyal214 commented on Mar 19 '24 12:03