[gpt-oss] Support MCP tool streaming with gpt-oss
Purpose
This change enables streaming support for MCP tools when using gpt-oss. It extends the harmony utilities and response-serving infrastructure to handle tool streaming, so tool calls and their results are streamed back to clients incrementally rather than returned as a single batch.
Test Plan
curl -X POST "http://localhost:8000/v1/responses" -H "Content-Type: application/json" -H "Authorization: Bearer dummy-api-key" -d '{
"model": "default",
"input": "Multiply 123*456 using the mcp.code_interpreter tool.",
"tools": [{
"type": "mcp",
"server_label": "code_interpreter",
"headers": {"test": "test"},
"server_url": "IGNORED"
}],
"stream": true,
"enable_response_messages": true
}'
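For reference, the same streaming request can be driven from Python. This is a minimal sketch, assuming a local vLLM server on port 8000 and the openai SDK's Responses streaming interface; `enable_response_messages` is a vLLM-specific extension, so it is passed via `extra_body` here rather than as a first-class parameter.

```python
from openai import OpenAI

# Minimal sketch (assumptions: vLLM server at localhost:8000, openai SDK's
# Responses streaming API; enable_response_messages is vLLM-specific and is
# therefore sent through extra_body).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy-api-key")

stream = client.responses.create(
    model="default",
    input="Multiply 123*456 using the mcp.code_interpreter tool.",
    tools=[{
        "type": "mcp",
        "server_label": "code_interpreter",
        "headers": {"test": "test"},
        "server_url": "IGNORED",
    }],
    stream=True,
    extra_body={"enable_response_messages": True},
)

for event in stream:
    # MCP tool-call arguments arrive as dedicated delta events, interleaved
    # with the usual reasoning and output-text deltas.
    if event.type == "response.mcp_call_arguments.delta":
        print("mcp args delta:", event.delta)
    elif event.type == "response.output_text.delta":
        print("text delta:", event.delta)
```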
Test Result
event: response.created
data: {"response":{"id":"resp_634aa3735d374e609c59128a4ca4c9ff","created_at":1762333234,"incomplete_details":null,"instructions":null,"metadata":null,"model":"default","object":"response","output":[],"parallel_tool_calls":true,"temperature":1.0,"tool_choice":"auto","tools":[{"server_label":"code_interpreter","type":"mcp","allowed_tools":null,"authorization":null,"connector_id":null,"headers":{"test":"test"},"require_approval":null,"server_description":null,"server_url":"IGNORED"}],"top_p":1.0,"background":false,"max_output_tokens":130895,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":"auto","status":"in_progress","text":null,"top_logprobs":null,"truncation":"disabled","usage":null,"user":null,"input_messages":null,"output_messages":null},"sequence_number":0,"type":"response.created"}
event: response.in_progress
data: {"response":{"id":"resp_634aa3735d374e609c59128a4ca4c9ff","created_at":1762333234,"incomplete_details":null,"instructions":null,"metadata":null,"model":"default","object":"response","output":[],"parallel_tool_calls":true,"temperature":1.0,"tool_choice":"auto","tools":[{"server_label":"code_interpreter","type":"mcp","allowed_tools":null,"authorization":null,"connector_id":null,"headers":{"test":"test"},"require_approval":null,"server_description":null,"server_url":"IGNORED"}],"top_p":1.0,"background":false,"max_output_tokens":130895,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":"auto","status":"in_progress","text":null,"top_logprobs":null,"truncation":"disabled","usage":null,"user":null,"input_messages":null,"output_messages":null},"sequence_number":1,"type":"response.in_progress"}
event: response.output_item.added
data: {"item":{"id":"msg_91e0b5be583e4ac38cfe7d55f025def7","summary":[],"type":"reasoning","content":null,"encrypted_content":null,"status":"in_progress"},"output_index":0,"sequence_number":2,"type":"response.output_item.added"}
event: response.reasoning_part.added
data: {"content_index":0,"item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"part":{"text":"","type":"reasoning_text"},"sequence_number":3,"type":"response.reasoning_part.added"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":"We","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":4,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":" need","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":5,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":" to","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":6,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":" compute","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":7,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":" ","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":8,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":"123","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":9,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":"*","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":10,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":"456","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":11,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":".","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":12,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":" Use","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":13,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":" python","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":14,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.delta
data: {"content_index":0,"delta":".","item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":15,"type":"response.reasoning_text.delta"}
event: response.reasoning_text.done
data: {"content_index":-1,"item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"sequence_number":16,"text":"We need to compute 123*456. Use python.","type":"response.reasoning_text.done"}
event: response.reasoning_part.done
data: {"content_index":-1,"item_id":"msg_91e0b5be583e4ac38cfe7d55f025def7","output_index":0,"part":{"text":"We need to compute 123*456. Use python.","type":"reasoning_text"},"sequence_number":17,"type":"response.reasoning_part.done"}
event: response.output_item.done
data: {"item":{"id":"msg_91e0b5be583e4ac38cfe7d55f025def7","summary":[],"type":"reasoning","content":[{"text":"We need to compute 123*456. Use python.","type":"reasoning_text"}],"encrypted_content":null,"status":"completed"},"output_index":0,"sequence_number":18,"type":"response.output_item.done"}
event: response.output_item.added
data: {"item":{"id":"mcp_4e15766739ed49a1860e8d7b377348d7","arguments":"","name":"python","server_label":"code_interpreter","type":"mcp_call","approval_request_id":null,"error":null,"output":null,"status":"in_progress","call_id":"mcp_f38d222820be4db7ba44e4b7e63b0c0f"},"output_index":1,"sequence_number":19,"type":"response.output_item.added"}
event: response.mcp_call.in_progress
data: {"item_id":"mcp_4e15766739ed49a1860e8d7b377348d7","output_index":1,"sequence_number":20,"type":"response.mcp_call.in_progress"}
event: response.mcp_call_arguments.delta
data: {"delta":"123","item_id":"mcp_4e15766739ed49a1860e8d7b377348d7","output_index":1,"sequence_number":21,"type":"response.mcp_call_arguments.delta"}
event: response.mcp_call_arguments.delta
data: {"delta":"*","item_id":"mcp_4e15766739ed49a1860e8d7b377348d7","output_index":1,"sequence_number":22,"type":"response.mcp_call_arguments.delta"}
event: response.mcp_call_arguments.delta
data: {"delta":"456","item_id":"mcp_4e15766739ed49a1860e8d7b377348d7","output_index":1,"sequence_number":23,"type":"response.mcp_call_arguments.delta"}
event: response.mcp_call_arguments.delta
data: {"delta":"\n","item_id":"mcp_4e15766739ed49a1860e8d7b377348d7","output_index":1,"sequence_number":24,"type":"response.mcp_call_arguments.delta"}
event: response.mcp_call_arguments.done
data: {"arguments":"123*456\n","item_id":"mcp_4e15766739ed49a1860e8d7b377348d7","output_index":1,"sequence_number":25,"type":"response.mcp_call_arguments.done","name":"python"}
event: response.mcp_call.completed
data: {"item_id":"mcp_4e15766739ed49a1860e8d7b377348d7","output_index":1,"sequence_number":26,"type":"response.mcp_call.completed"}
event: response.output_item.done
data: {"item":{"id":"mcp_4e15766739ed49a1860e8d7b377348d7","arguments":"123*456\n","name":"python","server_label":"code_interpreter","type":"mcp_call","approval_request_id":null,"error":null,"output":null,"status":"completed","call_id":"mcp_13e054b550474cd5aa66c71aefcebf00"},"output_index":1,"sequence_number":27,"type":"response.output_item.done"}
event: response.output_item.added
data: {"item":{"id":"msg_5e2d50b2c1704e9eb848d78929716445","content":[],"role":"assistant","status":"in_progress","type":"message"},"output_index":2,"sequence_number":28,"type":"response.output_item.added"}
event: response.content_part.added
data: {"content_index":0,"item_id":"msg_5e2d50b2c1704e9eb848d78929716445","output_index":2,"part":{"annotations":[],"text":"","type":"output_text","logprobs":[]},"sequence_number":29,"type":"response.content_part.added"}
event: response.output_text.delta
data: {"content_index":0,"delta":"The","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":30,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":" product","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":31,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":" of","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":32,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":" \\(","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":33,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":"123","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":34,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":" \\","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":35,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":"times","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":36,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":" ","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":37,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":"456","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":38,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":"\\","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":39,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":")","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":40,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":" is","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":41,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":" **","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":42,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":"56","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":43,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":",","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":44,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":"088","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":45,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":"**","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":46,"type":"response.output_text.delta"}
event: response.output_text.delta
data: {"content_index":0,"delta":".","item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":47,"type":"response.output_text.delta"}
event: response.output_text.done
data: {"content_index":-1,"item_id":"msg_5e2d50b2c1704e9eb848d78929716445","logprobs":[],"output_index":2,"sequence_number":48,"text":"The product of \\(123 \\times 456\\) is **56,088**.","type":"response.output_text.done"}
event: response.content_part.done
data: {"content_index":-1,"item_id":"msg_5e2d50b2c1704e9eb848d78929716445","output_index":2,"part":{"annotations":[],"text":"The product of \\(123 \\times 456\\) is **56,088**.","type":"output_text","logprobs":null},"sequence_number":49,"type":"response.content_part.done"}
event: response.output_item.done
data: {"item":{"id":"msg_5e2d50b2c1704e9eb848d78929716445","content":[{"annotations":[],"text":"The product of \\(123 \\times 456\\) is **56,088**.","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"},"output_index":2,"sequence_number":50,"type":"response.output_item.done"}
event: response.completed
data: {"response":{"id":"resp_634aa3735d374e609c59128a4ca4c9ff","created_at":1762333234,"incomplete_details":null,"instructions":null,"metadata":null,"model":"default","object":"response","output":[{"id":"rs_3b68bdfae7ae40f7a60aea582112be00","summary":[],"type":"reasoning","content":[{"text":"We need to compute 123*456. Use python.","type":"reasoning_text"}],"encrypted_content":null,"status":null},{"id":"rs_bf4d1d8780a94590b632b1934b761f14","summary":[],"type":"reasoning","content":[{"text":"123*456\n","type":"reasoning_text"}],"encrypted_content":null,"status":null},{"id":"rs_fadc43a8733046ac90d018cf77bd2985","summary":[],"type":"reasoning","content":[{"text":"123*456\n","type":"reasoning_text"}],"encrypted_content":null,"status":null},{"id":"msg_44a462509fc0476fad741b6f9cdc4dc5","content":[{"annotations":[],"text":"The product of \\(123 \\times 456\\) is **56,088**.","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":1.0,"tool_choice":"auto","tools":[{"server_label":"code_interpreter","type":"mcp","allowed_tools":null,"authorization":null,"connector_id":null,"headers":{"test":"test"},"require_approval":null,"server_description":null,"server_url":"IGNORED"}],"top_p":1.0,"background":false,"max_output_tokens":130863,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":"auto","status":"completed","text":null,"top_logprobs":null,"truncation":"disabled","usage":{"input_tokens":386,"input_tokens_details":{"cached_tokens":368,"input_tokens_per_turn":[177,209],"cached_tokens_per_turn":[176,192]},"output_tokens":52,"output_tokens_details":{"reasoning_tokens":18,"tool_output_tokens":2,"output_tokens_per_turn":[30,22],"tool_output_tokens_per_turn":[0,2]},"total_tokens":438},"user":null,"input_messages":[{"role":"system","name":null,"content":[{"model_identity":"You are ChatGPT, a large language model trained by OpenAI.","reasoning_effort":"Medium","conversation_start_date":"2025-11-05","knowledge_cutoff":"2024-06","channel_config":{"valid_channels":["analysis","final"],"channel_required":true},"tools":{"python":{"name":"python","description":"Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files). When you send a message containing python code to python, it will be executed in a stateful docker container, and the stdout of that process will be returned to you.","tools":[]}},"type":"system_content"}]},{"role":"user","name":null,"content":[{"type":"text","text":"Multiply 123*456 using the mcp.code_interpreter tool."}]}],"output_messages":[{"role":"assistant","name":null,"content":[{"type":"text","text":"We need to compute 123*456. Use python."}],"channel":"analysis"},{"role":"assistant","name":null,"content":[{"type":"text","text":"123*456\n"}],"channel":"analysis","recipient":"python","content_type":"code"},{"role":"tool","name":"python","content":[{"type":"text","text":"56088\n"}],"channel":"analysis","recipient":"assistant"},{"role":"assistant","name":null,"content":[{"type":"text","text":"123*456\n"}],"channel":"analysis","recipient":"python","content_type":"code"},{"role":"assistant","name":null,"content":[{"type":"text","text":"The product of \\(123 \\times 456\\) is **56,088**."}],"channel":"final"}]},"sequence_number":51,"type":"response.completed"}
Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating supported_models.md and examples for a new model.
- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the Google Doc.
💡 Codex Review
https://github.com/vllm-project/vllm/blob/476d94582f0191b1fbaee9d1b1f6170d7796a590/vllm/entrypoints/openai/serving_responses.py#L385-L388
Guard against missing tools when computing MCP tool names
When Harmony is enabled the code now unconditionally calls _extract_mcp_tool_names(request.tools) before creating the context. request.tools is optional and is None for normal requests that do not use tools, so this call immediately raises TypeError: 'NoneType' object is not iterable and the request fails before any tokens are generated. Before this change, requests without tools worked. The call should handle None (e.g. pass an empty list or skip the extraction) so that non‑tool conversations continue to function.
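A minimal sketch of the suggested guard, assuming the helper and request shape described in the review (the surrounding serving code is not shown):

```python
# Sketch only: treat a missing tools list as empty so non-tool requests
# keep working when Harmony is enabled.
mcp_tool_names = _extract_mcp_tool_names(request.tools) if request.tools else []
```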
CC @aarnphm @chaunceyjiang @yeqcharlotte
Are the call_id fields for the McpCall objects needed? I don't see those in the spec anywhere - is this just copy/paste from function calls, or is call_id used for McpCall?
@bbrowning thanks for catching that - yeah, I saw function calls were using it so I carried it over, but I realize we're not pairing the MCP calls to any output, so the call_id isn't needed.
@alecsolder thanks for the detailed comments - totally agree with the direction and it makes the overall design much cleaner. I went ahead and took a stab at implementing it with the namespace approach and simplified MCP handling.
Thanks for the review @alecsolder!
I was able to remove the extra checks and simplify the logic. I also updated the unit tests and caught an issue where multi-turn MCP requests were not being stored in previous messages.
I also went ahead and incorporated @bbrowning's comment about allowed_tools in https://github.com/vllm-project/vllm/issues/28261#issuecomment-3504841201
This pull request has merge conflicts that must be resolved before it can be merged. Please rebase the PR, @daniel-salib.
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
Thanks for adding this. I feel this is getting into a state where it's confusing for both users and developers to use. Could you add an example in the vllm/ folder showing how to spin up the relevant server, and update the recipes accordingly? https://github.com/vllm-project/recipes/blob/main/OpenAI/GPT-OSS.md
Will create new PRs breaking the current PR into smaller chunks.