How to Evaluate a Model that Calls an API?
Hi authors, thank you for your amazing work!
I noticed that in base_eval_template.py, it seems like the evaluation is designed for locally deployed models. If I want to evaluate API-based models like GPT-4o, how should I modify the code?
Looking forward to your guidance!
I have the same problem. Do you already know how to do this with API-based models?
I have the same problem. Do you already know how to do this with API-based models?
Sadly not, I have tried another benchmark. :)
I modified agentenv/agentenv/controller/task.py and agentenv/agentenv/controller/utils.py, and successfully implemented evaluation based on the Qwen API.
task.py
def _generate_experience_one(
    self,
    model,
    tokenizer,
    client: BaseEnvClient,
    idx: int,
    generation_config: Optional[GenerationConfig] = None,
    max_rounds: Optional[int] = None,
    api: bool = False,
) -> ExperienceOutput:
    """Roll out one episode against ``client`` using an OpenAI-compatible API.

    Args:
        model: When ``api`` is true, a mapping with the keys ``'name'``
            (model identifier), ``'url'`` (API base URL) and ``'key'``
            (API key).
        tokenizer: Not used by the API path shown here.
        client: Environment client; it is reset to ``idx``, observed, and
            stepped with each generated reply.
        idx: Index of the environment instance to reset to.
        generation_config: Not used by the API path shown here.
        max_rounds: Optional cap on the number of model/environment rounds.
        api: Run the API-based rollout when true.

    Returns:
        An ``ExperienceOutput`` holding the conversation and the last reward.
        ``text``, ``seq_ids``, ``attention_mask`` and ``action_mask`` are
        placeholders because nothing is tokenized on the API path.
    """
    client.reset(idx)
    print(idx)
    reward = 0.0
    done = False
    state = client.observe()
    conversation = list(client.conversation_start)
    conversation.append(
        ConversationMessage({"from": "human", "loss": None, "value": state})
    )
    if api:
        rounds = 0
        input_length = 0
        model_name = model['name']
        model_url = model['url']
        model_key = model['key']
        # Supply your own provider API key here (the original note mentions
        # the Bailian key, e.g. api_key="sk-xxx").
        client_api = OpenAI(
            api_key=model_key,
            base_url=model_url,
        )
        while not done:
            # Rebuild the OpenAI-style message list every round so that it
            # includes all replies and observations appended so far.
            conversation_api = convert_langchain_to_openai(conversation)
            completion = client_api.chat.completions.create(
                model=model_name,  # swap in another reasoning model as needed
                messages=conversation_api,
                # enable_thinking switches on the thinking phase; per the
                # original note, QwQ and DeepSeek-R1 always think and do not
                # support this parameter.
                extra_body={"enable_thinking": True},
                stream=True,
                stream_options={"include_usage": True},
            )
            answer_parts = []
            for chunk in completion:
                if not chunk.choices:
                    # A chunk without choices normally carries the usage
                    # statistics, but usage may be absent on such a chunk,
                    # so guard before dereferencing it.
                    if chunk.usage is not None:
                        input_length = chunk.usage.total_tokens
                        print(input_length)
                    continue
                delta = chunk.choices[0].delta
                # Reasoning deltas are ignored; only the final answer text
                # is sent to the environment.
                content = getattr(delta, "content", None)
                if content:
                    answer_parts.append(content)
            # NOTE(review): indentation was lost in the posted snippet; this
            # check is assumed to end the episode (before the reply is
            # recorded) once the reported token total exceeds 4096.
            if input_length > 4096:
                break
            generated_text = "".join(answer_parts)
            print(generated_text)
            conversation.append(
                ConversationMessage(
                    {"from": "gpt", "loss": True, "value": generated_text}
                )
            )
            step_output = client.step(generated_text)
            state, reward, done = (
                step_output.state,
                step_output.reward,
                step_output.done,
            )
            conversation.append(
                ConversationMessage(
                    {"from": "human", "loss": None, "value": state}
                )
            )
            rounds += 1
            if max_rounds is not None and rounds >= max_rounds:
                break
    # NOTE(review): the posted snippet shows no branch for api=False, and the
    # position of this return relative to `if api:` is assumed. With
    # api=False this returns the initial conversation and a reward of 0.0.
    return ExperienceOutput(
        conversation=conversation,
        reward=reward,
        text='a',
        seq_ids=[1],
        attention_mask=[1],
        action_mask=[1],
    )
in your eval.py
# Connection settings for the API-backed model; BASE_URL and YOUR_KEY are
# placeholders to be replaced with the provider's endpoint and your API key.
model = {
    "name": "qwq-32b",
    "url": "BASE_URL",
    "key": "YOUR_KEY",
}
感谢感谢,有空试一下
---Original--- From: "Zhen @.> Date: Mon, May 5, 2025 14:27 PM To: @.>; Cc: "Andre @.@.>; Subject: Re: [WooooDyy/AgentGym] How to Evaluate a Model that Calls an API?(Issue #44)
CostaliyA left a comment (WooooDyy/AgentGym#44)
I modified agentenv/agentenv/controller/task.py and agentenv/agentenv/controller/utils.py, successfully implemented evaluation based on the QWEN API. task.py def _generate_experience_one( self, model, tokenizer, client: BaseEnvClient, idx: int, generation_config: Optional[GenerationConfig] = None, max_rounds: Optional[int] = None, api:bool=False ) -> ExperienceOutput: client.reset(idx) reward = 0.0 done = False state = client.observe() conversation = list(client.conversation_start) conversation.append( ConversationMessage({"from": "human", "loss": None, "value": state}) ) if api: #conversation_tokenized = self._tokenize_conversation(conversation, tokenizer) conversation_api=convert_langchain_to_openai(conversation) rounds = 0 model_name=model['name'] model_url=model['url'] model_key=model['key'] client_api = OpenAI( # 如果没有配置环境变量,请用百炼API Key替换:api_key="sk-xxx" api_key=model_key, base_url=model_url ) while not done: completion = client_api.chat.completions.create( model=model_name, # 您可以按需更换为其它深度思考模型 messages=conversation_api, # enable_thinking 参数开启思考过程,QwQ 与 DeepSeek-R1 模型总会进行思考,不支持该参数 extra_body={"enable_thinking": True}, stream=True, stream_options={ "include_usage": True }, ) reasoning_content = "" # 完整思考过程 answer_content = "" # 完整回复 is_answering = False # 是否进入回复阶段 for chunk in completion: if not chunk.choices: # print("\nUsage:") # print(chunk.usage) continue delta = chunk.choices[0].delta # 只收集思考内容 if hasattr(delta, "reasoning_content") and delta.reasoning_content is not None: # if not is_answering: # print(delta.reasoning_content, end="", flush=True) reasoning_content += delta.reasoning_content # 收到content,开始进行回复 if hasattr(delta, "content") and delta.content: if not is_answering: # print("\n" + "=" * 20 + "完整回复" + "=" * 20 + "\n") is_answering = True # print(delta.content, end="", flush=True) answer_content += delta.content if chunk.usage is not None: input_length=chunk.usage.total_tokens if input_length > 4096: break generated_text=answer_content 
print(generated_text) conversation.append( ConversationMessage( {"from": "gpt", "loss": True, "value": generated_text} ) ) step_output = client.step(generated_text) state, reward, done = ( step_output.state, step_output.reward, step_output.done, ) env_message = ConversationMessage( {"from": "human", "loss": None, "value": state} ) # env_message_tokenized = self._tokenize_conversation_one( # env_message, tokenizer # ) conversation.append(env_message) # conversation_tokenized["text"] += env_message_tokenized["text"] # conversation_tokenized["input_ids"] += env_message_tokenized["input_ids"] # conversation_tokenized["action_mask"] += env_message_tokenized[ # "action_mask" # ] rounds += 1 if max_rounds is not None and rounds >= max_rounds: print(idx) break return ExperienceOutput( conversation=conversation, reward=reward, text='a', seq_ids=[1], attention_mask=[1], action_mask=[1], ) in your eval.py model={'name':r'qwq-32b','url':r"BASE_URL",'key':r'YOUR_KEY'}
— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you commented.Message ID: @.***>
同一目录下的utils.py还需要加个api的入参来区分,类似
from dataclasses import dataclass


@dataclass
class EvaluationOutput:
    """Result of one evaluation run.

    Declared as a dataclass so that it can be built with keyword arguments
    (``experiences=..., score=..., success=...``), which is how the
    evaluator constructs it.
    """

    # One entry per rollout that was evaluated.
    experiences: Sequence[ExperienceOutput]
    # Mean reward over all rollouts.
    score: float
    # Fraction of rollouts whose reward equals 1.
    success: float
class BaseAgentEnvController:
    """Pairs one agent with a list of tasks and collects rollouts from them.

    ``api`` is stored on the instance and forwarded to each task's
    ``generate_experience`` call.
    """

    def __init__(
        self,
        agent: Agent,
        tasks: Sequence[BaseTask],
        api: bool = False,
    ) -> None:
        self.agent = agent
        self.tasks = tasks
        self.api = api

    def generate_experience(
        self,
        idxs: Sequence[int] | Sequence[Sequence[int]] | None = None,
        generation_config: Optional[GenerationConfig] = None,
        max_rounds: Optional[int] = None,
        api: bool = False,
    ) -> list[ExperienceOutput]:
        """Collect experiences for the given indices.

        Args:
            idxs: Either a flat sequence of ints, which is run on the first
                task only, or one sequence of ints per task, matched to
                ``self.tasks`` by position.
            generation_config: Forwarded unchanged to each task.
            max_rounds: Forwarded unchanged to each task.
            api: Forwarded to each task as ``api``; the API path is used
                when either this argument or ``self.api`` is true.

        Returns:
            The concatenated experiences returned by the tasks.

        Raises:
            ValueError: If ``idxs`` is None, empty, or its first element is
                neither an int nor a sequence.
        """
        # Indexing idxs[0] below would otherwise fail with an unrelated
        # TypeError/IndexError; report the format problem explicitly.
        if idxs is None or len(idxs) == 0:
            raise ValueError("Incorrect Format for idxs")

        # Both branches must agree on which flag they forward.
        use_api = api or self.api

        experience = []
        if isinstance(idxs[0], int):
            experience += self.tasks[0].generate_experience(
                self.agent.model,
                self.agent.tokenizer,
                idxs,
                generation_config,
                max_rounds,
                api=use_api,
            )
        elif isinstance(idxs[0], Sequence):
            for idx, task in enumerate(self.tasks):
                experience += task.generate_experience(
                    self.agent.model,
                    self.agent.tokenizer,
                    idxs[idx],
                    generation_config,
                    max_rounds,
                    api=use_api,
                )
        else:
            raise ValueError("Incorrect Format for idxs")
        return experience
class Evaluator(BaseAgentEnvController):
    """Controller that runs rollouts and summarises their rewards."""

    def eval(
        self,
        generation_config: Optional[GenerationConfig] = None,
        max_rounds: Optional[int] = None,
        idxs: Sequence[int] | Sequence[Sequence[int]] | None = None,
        api: bool = False,
    ) -> EvaluationOutput:
        """Generate experiences and compute the mean reward and success rate.

        Args:
            generation_config: Forwarded to ``generate_experience``.
            max_rounds: Forwarded to ``generate_experience``.
            idxs: Indices to evaluate. When None, every task is evaluated on
                ``range(len(task.clients[0]))``.
            api: Request the API path for this call; it is also used when
                the controller was constructed with ``api=True``.

        Returns:
            An ``EvaluationOutput`` with the experiences, the mean reward as
            ``score``, and the fraction of rewards equal to 1 as ``success``.
        """
        if idxs is None:
            idxs = [list(range(len(task.clients[0]))) for task in self.tasks]
        exps = self.generate_experience(
            idxs=idxs,
            generation_config=generation_config,
            max_rounds=max_rounds,
            # Honour the per-call flag instead of silently ignoring it.
            api=api or self.api,
        )
        rewards = np.array([exp.reward for exp in exps])
        return EvaluationOutput(
            experiences=exps,
            score=rewards.mean(),
            success=(rewards == 1).mean(),
        )
同一目录下的utils.py还需要加个api的入参来区分,类似
class EvaluationOutput: experiences: Sequence[ExperienceOutput] score: float success: float
class BaseAgentEnvController: def __init__(self, agent: Agent, tasks: Sequence[BaseTask],api:bool=False) -> None: self.agent = agent self.tasks = tasks self.api=api
def generate_experience( self, idxs: Sequence[int] | Sequence[Sequence[int]] | None = None, generation_config: Optional[GenerationConfig] = None, max_rounds: Optional[int] = None, api:bool=False ) -> list[ExperienceOutput]: experience = [] if isinstance(idxs[0], int): experience += self.tasks[0].generate_experience( self.agent.model, self.agent.tokenizer, idxs, generation_config, max_rounds, api=self.api ) elif isinstance(idxs[0], Sequence): for idx, task in enumerate(self.tasks): experience += task.generate_experience( self.agent.model, self.agent.tokenizer, idxs[idx], generation_config, max_rounds, api=api ) else: raise ValueError("Incorrect Format for idxs") return experienceclass Evaluator(BaseAgentEnvController): def eval( self, generation_config: Optional[GenerationConfig] = None, max_rounds: Optional[int] = None, idxs: Sequence[int] | Sequence[Sequence[int]] | None = None, api:bool=False ) -> EvaluationOutput: exps = self.generate_experience( idxs=idxs if idxs is not None else [list(range(len(task.clients[0]))) for task in self.tasks], generation_config=generation_config, max_rounds=max_rounds, api=self.api ) rewards = np.array([exp.reward for exp in exps]) return EvaluationOutput( experiences=exps, score=rewards.mean(), success=(rewards == 1).mean() )
import 部分能麻烦贴一下吗,convert_langchain_to_openai函数不知道从哪来的
只是实现两种模版转换的一个函数,你可以用https://api.python.langchain.com/en/latest/adapters/langchain_community.adapters.openai.convert_openai_messages.html 这个函数,也可以参考我写的
def convert_langchain_to_openai(
    conversation: list[ConversationMessage],
):
    """Translate conversation messages into OpenAI chat-completion messages.

    A message whose ``"from"`` field is ``"human"`` becomes a ``user``
    message; every other message becomes an ``assistant`` message. The
    ``"value"`` field is copied into ``content``.
    """
    return [
        {
            'role': 'user' if turn["from"] == "human" else 'assistant',
            'content': turn['value'],
        }
        for turn in conversation
    ]
很好的功能,建议提PR
I modified agentenv/agentenv/controller/task.py and agentenv/agentenv/controller/utils.py, and successfully implemented evaluation based on the Qwen API.
task.py def _generate_experience_one( self, model, tokenizer, client: BaseEnvClient, idx: int, generation_config: Optional[GenerationConfig] = None, max_rounds: Optional[int] = None, api:bool=False ) -> ExperienceOutput: client.reset(idx) print(idx) reward = 0.0 done = False state = client.observe() conversation = list(client.conversation_start) conversation.append( ConversationMessage({"from": "human", "loss": None, "value": state}) ) if api: #conversation_tokenized = self._tokenize_conversation(conversation, tokenizer)
rounds = 0 input_length=0 model_name=model['name'] model_url=model['url'] model_key=model['key'] client_api = OpenAI( # 如果没有配置环境变量,请用百炼API Key替换:api_key="sk-xxx" api_key=model_key, base_url=model_url ) while not done: #input_length=0 conversation_api=convert_langchain_to_openai(conversation) #print(conversation_api) completion = client_api.chat.completions.create( model=model_name, # 您可以按需更换为其它深度思考模型 messages=conversation_api, # enable_thinking 参数开启思考过程,QwQ 与 DeepSeek-R1 模型总会进行思考,不支持该参数 extra_body={"enable_thinking": True}, stream=True, stream_options={ "include_usage": True }, ) # for chunk in completion: # #print(chunk) # if chunk.usage is not None: # input_length=chunk.usage.total_tokens #break # if input_length exceeds 4096, break reasoning_content = "" # 完整思考过程 answer_content = "" # 完整回复 is_answering = False # 是否进入回复阶段 for chunk in completion: if not chunk.choices: input_length=chunk.usage.total_tokens print(input_length) continue delta = chunk.choices[0].delta # 只收集思考内容 if hasattr(delta, "reasoning_content") and delta.reasoning_content is not None: # if not is_answering: # print(delta.reasoning_content, end="", flush=True) reasoning_content += delta.reasoning_content # 收到content,开始进行回复 if hasattr(delta, "content") and delta.content: if not is_answering: # print("\n" + "=" * 20 + "完整回复" + "=" * 20 + "\n") is_answering = True # print(delta.content, end="", flush=True) answer_content += delta.content if input_length > 4096: break generated_text=answer_content print(generated_text) conversation.append( ConversationMessage( {"from": "gpt", "loss": True, "value": generated_text} ) ) step_output = client.step(generated_text) state, reward, done = ( step_output.state, step_output.reward, step_output.done, ) env_message = ConversationMessage( {"from": "human", "loss": None, "value": state} ) # env_message_tokenized = self._tokenize_conversation_one( # env_message, tokenizer # ) conversation.append(env_message) # conversation_tokenized["text"] += 
env_message_tokenized["text"] # conversation_tokenized["input_ids"] += env_message_tokenized["input_ids"] # conversation_tokenized["action_mask"] += env_message_tokenized[ # "action_mask" # ] rounds += 1 if max_rounds is not None and rounds >= max_rounds: break return ExperienceOutput( conversation=conversation, reward=reward, text='a', seq_ids=[1], attention_mask=[1], action_mask=[1], )in your eval.py model={'name':r'qwq-32b','url':r"BASE_URL",'key':r'YOUR_KEY'}
Hi @CostaliyA , thank you so much for your excellent code—it’s really impressive!
I was wondering if you would be able to share your eval.py code as well? It would be very helpful.
Thanks again for your great contribution!
I modified agentenv/agentenv/controller/task.py and agentenv/agentenv/controller/utils.py, successfully implemented evaluation based on the QWEN API. task.py def _generate_experience_one( self, model, tokenizer, client: BaseEnvClient, idx: int, generation_config: Optional[GenerationConfig] = None, max_rounds: Optional[int] = None, api:bool=False ) -> ExperienceOutput: client.reset(idx) print(idx) reward = 0.0 done = False state = client.observe() conversation = list(client.conversation_start) conversation.append( ConversationMessage({"from": "human", "loss": None, "value": state}) ) if api: #conversation_tokenized = self._tokenize_conversation(conversation, tokenizer)
rounds = 0 input_length=0 model_name=model['name'] model_url=model['url'] model_key=model['key'] client_api = OpenAI( # 如果没有配置环境变量,请用百炼API Key替换:api_key="sk-xxx" api_key=model_key, base_url=model_url ) while not done: #input_length=0 conversation_api=convert_langchain_to_openai(conversation) #print(conversation_api) completion = client_api.chat.completions.create( model=model_name, # 您可以按需更换为其它深度思考模型 messages=conversation_api, # enable_thinking 参数开启思考过程,QwQ 与 DeepSeek-R1 模型总会进行思考,不支持该参数 extra_body={"enable_thinking": True}, stream=True, stream_options={ "include_usage": True }, ) # for chunk in completion: # #print(chunk) # if chunk.usage is not None: # input_length=chunk.usage.total_tokens #break # if input_length exceeds 4096, break reasoning_content = "" # 完整思考过程 answer_content = "" # 完整回复 is_answering = False # 是否进入回复阶段 for chunk in completion: if not chunk.choices: input_length=chunk.usage.total_tokens print(input_length) continue delta = chunk.choices[0].delta # 只收集思考内容 if hasattr(delta, "reasoning_content") and delta.reasoning_content is not None: # if not is_answering: # print(delta.reasoning_content, end="", flush=True) reasoning_content += delta.reasoning_content # 收到content,开始进行回复 if hasattr(delta, "content") and delta.content: if not is_answering: # print("\n" + "=" * 20 + "完整回复" + "=" * 20 + "\n") is_answering = True # print(delta.content, end="", flush=True) answer_content += delta.content if input_length > 4096: break generated_text=answer_content print(generated_text) conversation.append( ConversationMessage( {"from": "gpt", "loss": True, "value": generated_text} ) ) step_output = client.step(generated_text) state, reward, done = ( step_output.state, step_output.reward, step_output.done, ) env_message = ConversationMessage( {"from": "human", "loss": None, "value": state} ) # env_message_tokenized = self._tokenize_conversation_one( # env_message, tokenizer # ) conversation.append(env_message) # conversation_tokenized["text"] += 
env_message_tokenized["text"] # conversation_tokenized["input_ids"] += env_message_tokenized["input_ids"] # conversation_tokenized["action_mask"] += env_message_tokenized[ # "action_mask" # ] rounds += 1 if max_rounds is not None and rounds >= max_rounds: break return ExperienceOutput( conversation=conversation, reward=reward, text='a', seq_ids=[1], attention_mask=[1], action_mask=[1], )in your eval.py model={'name':r'qwq-32b','url':r"BASE_URL",'key':r'YOUR_KEY'}
Hi @CostaliyA , thank you so much for your excellent code—it’s really impressive!
I was wondering if you would be able to share your eval.py code as well? It would be very helpful.
Thanks again for your great contribution!
Hello, I didn't back up the previous code :(. I remember that I just changed the model to model={'name':r'qwq-32b','url':r"BASE_URL",'key':r'YOUR_KEY'} and it worked.