Multiple API bases
Hi,
I'd like to see support added for providing multiple API bases that are round-robined across consecutive requests. This would allow me to host the same LLM on two servers and cut the time it takes to receive responses roughly in half. I have already implemented a custom version of the `HFClientVLLM` class that does this, but I'd like to see it become part of the library's core functionality. The custom class is provided below:
```python
import dspy
from dsp.modules.hf_client import HFModel, send_hfvllm_request_v00
from config import openai_api_key, openai_api_bases, model_name


# Monkey patch the HFClientVLLM class to round robin api bases instead of using a single one.
class HFClientVLLM(HFModel):
    def __init__(self, model, urls, **kwargs):
        super().__init__(model=model, is_client=True)
        self.urls = [url.replace('/v1', '') for url in urls]
        self.headers = {"Content-Type": "application/json"}

    def _generate(self, prompt, **kwargs):
        kwargs = {**self.kwargs, **kwargs}

        payload = {
            "model": kwargs["model"],
            "prompt": prompt,
            "max_tokens": kwargs["max_tokens"],
            "temperature": kwargs["temperature"],
        }

        # Round robin the urls.
        url = self.urls.pop(0)
        self.urls.append(url)

        response = send_hfvllm_request_v00(
            f"{url}/v1/completions",
            json=payload,
            headers=self.headers,
        )

        try:
            json_response = response.json()
            completions = json_response["choices"]
            response = {
                "prompt": prompt,
                "choices": [{"text": c["text"]} for c in completions],
            }
            return response
        except Exception:
            print("Failed to parse JSON response:", response.text)
            raise Exception("Received invalid JSON response from server")
```
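For reference, here is a minimal usage sketch showing how the patched client could be wired into a DSPy program. It assumes `openai_api_bases` (imported from the `config` module above) is a list of base URLs for two vLLM servers hosting the same model; consecutive calls then alternate between them:

```python
# Hypothetical usage sketch: configure DSPy with the round-robin client.
# Assumes `openai_api_bases` is a list of vLLM endpoints, e.g.
# ["http://server-a:8000/v1", "http://server-b:8000/v1"].
lm = HFClientVLLM(model=model_name, urls=openai_api_bases)
dspy.settings.configure(lm=lm)

# Each consecutive request is sent to the next server in the list.
qa = dspy.Predict("question -> answer")
print(qa(question="What does round-robin load balancing do?").answer)
```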