[Bug]: "Missing parameter 'tool_call_id'" with MultimodalConversableAgent after a tool call
Describe the bug
I am having issues continuing a conversation with a multimodal agent after a function call has been made.
My goal is to save a report that a first multimodal agent generates from an image. I then ask an assistant agent to use a registered function to save the report to a JSON file. Finally, I would like a "fact checker" multimodal agent to check whether the information in the image is consistent with the information in the report.
Unfortunately, I am getting the following error message:
BadRequestError: Error code: 400 - {'error': {'message': "Missing parameter 'tool_call_id': messages with role 'tool' must have a 'tool_call_id'.", 'type': 'invalid_request_error', 'param': 'messages.[4].tool_call_id', 'code': None}}
This problem only arises if the next agent is a multimodal agent.
Steps to reproduce
Here is the code used to replicate the problem (pyautogen version 0.2.26):
import sys
path_project = #PATH TO PROJECT FOLDER
sys.path.append(path_project)
import json
import autogen
from autogen import AssistantAgent, UserProxyAgent
from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent
from typing import Annotated
openaikey = #YOUR API KEY
seed = 46
timeout = 120
no_temp = 0.0
temp = 0.5
high_temp = 1.2
path_for_json = path_project + #NAME OF JSON
def is_terminal_message(content):
    have_content = content.get("content", None) is not None
    if have_content and 'TERMINATE' in content["content"]:
        return True
    return False

v4_config_list = [{
    "model": "gpt-4-turbo",
    "api_key": openaikey
}]

gpt4_config_list = [{
    "model": "gpt-4-turbo-2024-04-09",
    "api_key": openaikey
}]

user_proxy = UserProxyAgent(
    name="User",
    system_message="User. Once the question has been verified by the fact_checker, you must say TERMINATE to end the conversation.",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    llm_config={
        "cache_seed": seed,
        "temperature": temp,
        "config_list": gpt4_config_list,
        "timeout": timeout,
    },
    is_termination_msg=is_terminal_message,
    code_execution_config=False,
)
assistant = MultimodalConversableAgent(
    name="Assistant",
    system_message="Describer. You describe the image accurately and generate a report on the image.",
    human_input_mode="NEVER",
    llm_config={
        "cache_seed": seed,
        "temperature": high_temp,
        "config_list": v4_config_list,
        "timeout": timeout + 500,
        "max_tokens": 1200
    },
)
coder = AssistantAgent(
    name="Coder",
    system_message="Coder. Save the report using the given function. YOU MUST USE THE FUNCTION PROVIDED. Once the report has been saved, say APPROVED.",
    llm_config={
        "cache_seed": seed,
        "temperature": temp,
        "config_list": gpt4_config_list,
        "timeout": timeout,
    },
    is_termination_msg=is_terminal_message,
    human_input_mode="NEVER",
    code_execution_config=False,
)
fact_checker = MultimodalConversableAgent(
    name="Fact_Checker",
    system_message="Fact_Checker. You must check the report for accuracy and correct any errors you find.",
    human_input_mode="NEVER",
    llm_config={
        "cache_seed": seed,
        "temperature": high_temp,
        "config_list": v4_config_list,
        "timeout": timeout + 500,
        "max_tokens": 1200
    },
)

def register_report(
    title: Annotated[str, "The title of the report"],
    report: Annotated[str, "The report generated"],
    tldr: Annotated[str, "A quick summary of the report"],
) -> str:
    quest = {
        "title": title,
        "report": report,
        "tldr": tldr
    }
    with open(f"{path_for_json}", 'w') as f:
        json.dump(quest, f)
    return f"""
Title: {title}
Report: {report}
TLDR: {tldr}
file name: {path_for_json}
"""
coder.register_for_llm(name="register_report", description="Save the report with the title, report and a tldr.")(register_report)
user_proxy.register_for_execution(name="register_report")(register_report)
def custom_speaker_selection_func(last_speaker: autogen.Agent, groupchat: autogen.GroupChat):
    messages = groupchat.messages
    if len(messages) <= 1:
        return assistant
    elif last_speaker is user_proxy:
        if messages[-2]['name'] == coder.name:
            return coder
        else:
            return assistant
    elif last_speaker is assistant:
        return coder
    elif last_speaker is coder:
        if "APPROVED" in messages[-1]['content']:
            return fact_checker
        else:
            return user_proxy
    elif last_speaker is fact_checker:
        return user_proxy
    else:
        return "random"

groupchat = autogen.GroupChat(
    agents=[user_proxy, assistant, coder, fact_checker],
    messages=[],
    max_round=30,
    speaker_selection_method=custom_speaker_selection_func,
)
manager_llm_config = {
    "config_list": gpt4_config_list,
    "seed": seed,
    "timeout": timeout,
}

groupchat_manager = autogen.GroupChatManager(
    groupchat=groupchat,
    name="GroupChatManager",
    llm_config=manager_llm_config,
)
user_proxy.initiate_chat(
    groupchat_manager,
    clear_history=True,
    message=f"""Make a report based on the image provided: <img {#YOUR IMAGE}>"""
)
The error should be the one stated above.
Model Used
gpt-4-turbo-2024-04-09, gpt-4-turbo
Expected Behavior
I would like for the fact_checker agent to be able to check the validity of the report.
Screenshots and logs
No response
Additional Information
pyautogen: 0.2.26, Python: 3.11.7
I have the same problem with a multimodal agent too. The function does its job, but the chat always ends with this error. I managed to deal with it by ignoring the error.
Has this problem been solved? I have this problem too.
Hi, I recently revisited this problem and realized that you have to give vision_capability to the group chat manager to make it work.
You can follow this example here.
Also, I created a simple demo with a tool here.
I hope it helps!
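For reference, the vision_capability setup mentioned above looks roughly like the sketch below. It reuses the config list and group chat manager names from the repro above, and the lmm_config values are placeholders, not recommended settings:

from autogen.agentchat.contrib.capabilities.vision_capability import VisionCapability

# Give the group chat manager its own vision-capable model so it can interpret
# image messages while relaying them between agents.
vision_capability = VisionCapability(
    lmm_config={
        "config_list": v4_config_list,  # any vision-capable model config
        "temperature": 0.5,
        "max_tokens": 300,
    }
)
vision_capability.add_to_agent(groupchat_manager)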
I have this issue too. From what I can figure out, it is an issue with how the MultimodalConversableAgent structures the tool response message before it gets sent to OpenAI. The issue doesn't seem to be related to the group chat, as I get the exact same error in a two-way agent conversation as well as inside a group chat.
When I swap the agent that makes the function call from a MultimodalConversableAgent to a ConversableAgent, the error goes away. It is the caller agent causing the error after the function is executed and it gets the output, not the executor agent.
Here is the difference between how the two agents structure the message they append to the message list.
ConversableAgent: { 'tool_call_id': xxxxxx, 'role': 'tool', 'content': <output from the function> }
MultimodalConversableAgent: { 'content': [ { 'type': 'text', 'text': <output from the function> } ], 'tool_responses': [ { 'tool_call_id': xxxxxx, 'role': 'tool', 'content': <output from the function> } ], 'name': <name of the agent that executed the function>, 'role': 'tool' }
The MultimodalConversableAgent adds extra info that should not be in the message. It should just be sending the entries inside the 'tool_responses' item, not the other values.
There also seem to be some other differences between how the two agents structure regular messages that don't involve tool calls, but these don't cause an error. E.g. the 'content' item in the ConversableAgent message contains the text directly, while in the MultimodalConversableAgent message 'content' is a list of dicts with 'type' and 'text' keys.
Python: 3.12.2, AutoGen: 0.2.31, Models: GPT-4o and GPT-4-turbo (both give the same result)
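A minimal sketch of a normalization along those lines, assuming you flatten the message list yourself before it is sent to the API (flatten_tool_responses is a hypothetical helper, not part of autogen):

def flatten_tool_responses(messages):
    """Replace each aggregated 'tool' message with one message per tool call.

    Each entry in 'tool_responses' already carries the 'tool_call_id' that the
    OpenAI API expects, so emitting those entries directly avoids the 400 error.
    """
    flattened = []
    for msg in messages:
        if msg.get("role") == "tool" and "tool_responses" in msg:
            flattened.extend(msg["tool_responses"])
        else:
            flattened.append(msg)
    return flattened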
The order in which the agents are called is very important; it determines whether the bug can be reproduced. If an agent makes the function call first and a multimodal agent speaks afterwards, the bug will definitely appear, but not the other way around. Overall, this is hard to avoid in practical applications.
I was facing this issue, and I found a workaround. You can subclass MultimodalConversableAgent, overriding the method generate_oai_reply to ensure that all messages with role 'tool' also have a 'tool_call_id'. In my case, previous chats with tool calling were creating such faulty messages, lacking a tool_call_id even though the role was set to 'tool'.
This happened when several tool calls were suggested in a row. The chat looked like this:
{
    'content': [{'text': '', 'type': 'text'}],
    'name': '<TOOL_CALLER_AGENT>',
    'role': 'assistant',
    'tool_calls': [
        {
            'function': {
                'arguments': '<FUNC_1_ARGS>',
                'name': '<FUNC_1_NAME>'
            },
            'id': '<CALL_ID_FUNC_1>',
            'type': 'function'
        },
        {
            'function': {
                'arguments': '<FUNC_2_ARGS>',
                'name': '<FUNC_2_NAME>'
            },
            'id': '<CALL_ID_FUNC_2>',
            'type': 'function'
        }
    ]
},
{
    'content': '<SUM_OF_ALL_FUNCTION_RESPONSES>',
    'role': 'tool',
    'tool_responses': [
        {
            'content': '<FUNC_1_RESPONSE>',
            'role': 'tool',
            'tool_call_id': '<CALL_ID_FUNC_1>'
        },
        {
            'content': '<FUNC_2_RESPONSE>',
            'role': 'tool',
            'tool_call_id': '<CALL_ID_FUNC_2>'
        }
    ]
}
Instead, you can add the messages from the tool_responses field of the tool response directly into the chat, removing the "general" response that aggregates all the function responses, so that each tool message actually has a tool_call_id.
Here's the workaround.
from typing import Dict, List, Optional, Tuple, Union

from autogen import Agent, OpenAIWrapper
from autogen.agentchat.contrib.img_utils import message_formatter_pil_to_b64
from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent


class CustomMultimodalConversableAgent(MultimodalConversableAgent):
    def __init__(
        self,
        name: str,
        system_message: Optional[Union[str, List]] = "",
        is_termination_msg=None,
        *args,
        **kwargs,
    ):
        super().__init__(
            name,
            system_message,
            is_termination_msg=is_termination_msg,
            *args,
            **kwargs,
        )
        self.replace_reply_func(
            MultimodalConversableAgent.generate_oai_reply,
            CustomMultimodalConversableAgent.generate_oai_reply,
        )

    def generate_oai_reply(
        self,
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[OpenAIWrapper] = None,
    ) -> Tuple[bool, Union[str, Dict, None]]:
        """Generate a reply using autogen.oai."""
        client = self.client if config is None else config
        if client is None:
            return False, None
        if messages is None:
            messages = self._oai_messages[sender]

        # Custom modifications ##################################################
        # Replace each aggregated 'tool' message with its individual entries from
        # 'tool_responses', so every message with role 'tool' carries its own
        # 'tool_call_id'.
        new_messages = []
        for message in messages:
            if message["role"] != "tool":
                new_messages.append(message)
            else:
                for sub_message in message["tool_responses"]:
                    new_messages.append(sub_message)
        messages = new_messages
        #########################################################################

        messages_with_b64_img = message_formatter_pil_to_b64(self._oai_system_message + messages)
        response = client.create(context=messages[-1].pop("context", None), messages=messages_with_b64_img)
        extracted_response = client.extract_text_or_completion_object(response)[0]
        if not isinstance(extracted_response, str):
            extracted_response = extracted_response.model_dump()
        return True, extracted_response
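For completeness, a hypothetical usage example: the subclass is intended as a drop-in replacement for the multimodal agents, e.g. the fact_checker from the repro above (same llm_config values, reused here only for illustration):

fact_checker = CustomMultimodalConversableAgent(
    name="Fact_Checker",
    system_message="Fact_Checker. You must check the report for accuracy and correct any errors you find.",
    human_input_mode="NEVER",
    llm_config={
        "config_list": v4_config_list,
        "timeout": timeout + 500,
        "max_tokens": 1200,
    },
)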
Hope this helps!