KeyError when managing concurrent requests.
@rayleizhu I'm facing this KeyError when trying to send concurrent requests.
import time
from threading import Thread

from vllm import LLM


class ConcurrencyTest:
    def __init__(self, model, quant=None, enforce_eager=False, tensor_parallel_size=2,
                 enable_relay_attention=True, sys_prompt=None, sys_schema=None,
                 sys_prompt_file=None, sys_schema_file=None):
        self.llm = LLM(model=model, quantization=quant, enforce_eager=enforce_eager,
                       tensor_parallel_size=tensor_parallel_size,
                       enable_relay_attention=enable_relay_attention,
                       sys_prompt=sys_prompt,
                       sys_schema=sys_schema,
                       sys_prompt_file=sys_prompt_file,
                       sys_schema_file=sys_schema_file)

    def generate_and_store_output(self, prompt, sampling_params, index):
        start_time = time.time()
        output = self.llm.generate(prompt, sampling_params)
        print(f"OUTPUT>>>>>>>>: {output}")

    def generate_concurrently(self, prompts, sampling_params):
        # one thread per prompt, all calling generate() on the same LLM instance
        threads = []
        start_time = time.time()
        for i, prompt in enumerate(prompts):
            print(f"Thread created********: {i+1}")
            thread = Thread(target=self.generate_and_store_output,
                            args=(prompt, sampling_params, i))
            threads.append(thread)
            thread.start()
            print("Thread Started")
        for thread in threads:
            thread.join()


# concurrency_test is an instance of ConcurrencyTest constructed as above
concurrency_test.generate_concurrently(prompts, sampling_params)
I saw a similar issue in the upstream vLLM library as well: "Key Error when handle multiple requests simultaneously". I'm not sure whether it has been fixed in the latest release. Could you please provide some insight into whether there's a quick way to address this?
Error Faced:
Exception in thread Thread-2 (generate_and_store_output):
Traceback (most recent call last):
  File "/root/anaconda3/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/root/anaconda3/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jovyan/testing_concurrency.py", line 21, in generate_and_store_output
    output = self.llm.generate(prompt, sampling_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/entrypoints/llm.py", line 175, in generate
    return self._run_engine(use_tqdm)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/entrypoints/llm.py", line 201, in _run_engine
    step_outputs = self.llm_engine.step()
                   ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/engine/llm_engine.py", line 650, in step
    return self._process_model_outputs(output, scheduler_outputs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/vllm-ra/vllm/engine/llm_engine.py", line 612, in _process_model_outputs
    self._process_sequence_group_outputs(seq_group, outputs)
  File "/home/jovyan/vllm-ra/vllm/engine/llm_engine.py", line 451, in _process_sequence_group_outputs
    parent_child_dict[sample.parent_seq_id].append(sample)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
KeyError: 0
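In the meantime, a quick workaround is to drop the threads entirely and hand the whole prompt list to a single generate call, since the synchronous LLM.generate accepts a list of prompts and batches them inside the engine. A minimal sketch, reusing the concurrency_test instance from above:

outputs = concurrency_test.llm.generate(prompts, sampling_params)  # one call, internal batching
for output in outputs:
    print(output.outputs[0].text)

This sidesteps the concurrent calls into LLMEngine.step() that appear to trigger the KeyError, at the cost of submitting all prompts up front.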
I got it to work using the AsyncLLMEngine class.
from vllm import AsyncLLMEngine, AsyncEngineArgs

# constructed the same way as the synchronous LLM above
engine_args = AsyncEngineArgs(
    model=model, quantization=quant, enforce_eager=enforce_eager,
    tensor_parallel_size=tensor_parallel_size,
    enable_relay_attention=enable_relay_attention,
    sys_prompt=sys_prompt,
    sys_schema=sys_schema,
    sys_prompt_file=sys_prompt_file,
    sys_schema_file=sys_schema_file,
)
self.engine = AsyncLLMEngine.from_engine_args(engine_args)

# generate() is an async generator and needs a unique request_id per request
self.engine.generate(prompt, sampling_params, request_id)
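For actually sending several prompts concurrently, the async generator returned by generate() can be driven from asyncio, for example with asyncio.gather. A minimal standalone sketch; the model path, sampling parameters, and prompt list are placeholders:

import asyncio
import uuid

from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams

engine_args = AsyncEngineArgs(model="your-model-path")  # placeholder model
engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=256)

async def generate_one(prompt):
    request_id = str(uuid.uuid4())  # must be unique per request
    final_output = None
    # generate() yields cumulative RequestOutput objects; keep the last one
    async for output in engine.generate(prompt, sampling_params, request_id):
        final_output = output
    return final_output

async def main(prompts):
    # all requests are interleaved by the engine's single background loop
    results = await asyncio.gather(*(generate_one(p) for p in prompts))
    for result in results:
        print(result.outputs[0].text)

asyncio.run(main(["prompt one", "prompt two", "prompt three"]))

Unlike the Thread-based version, every request goes through the engine's single event loop, so generate() is never entered from multiple OS threads at once.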