vllm
vllm copied to clipboard
Benchmarking script does not limit the maximum concurrency
If the current benchmarking script is run with an infinite request rate (INF
arrivals), it does not limit the maximum concurrency level, as shown here.
If we change it to the version below, we can cap the maximum concurrency and so apply a finely controlled load level.
semaphore = asyncio.Semaphore(max_concurrency) # Semaphore to limit concurrency
async def make_request(request, sem):
async with sem: # Ensure only max_concurrency tasks run in parallel
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
model=model_id,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
# Call the request function directly here and return its result
return await request_func(request_func_input=request_func_input, pbar=pbar)
tasks = []
for request in input_requests: # Direct iteration may replace async iteration based on design
# Enqueue task without immediately awaiting it
tasks.append(make_request(request, semaphore))
# Manage inter-arrival time
if request_rate != float("inf"):
await asyncio.sleep(1.0 / request_rate)
outputs = await asyncio.gather(*tasks) # Wait for all tasks to complete