UI-TARS
OOM Error During Example Code Test with UI-TARS-7B-DPO on A100×4 Local Deployment
Deployed UI-TARS-7B-DPO on a local server equipped with four A100 GPUs (4 × 80 GB). Tested using the example code, but encountered a CUDA out-of-memory (OOM) error.
Question: Why does a 7B model require such a large amount of GPU memory?
Model deployment code:
import os
import time
import uuid
import torch
from io import BytesIO
from PIL import Image
import requests
from flask import Flask, request, jsonify
from transformers import AutoProcessor, AutoModelForVision2Seq
# -----------------------------
# Model & Processor (global, lazy)
# -----------------------------
MODEL_ID = os.getenv("MODEL_ID", "ByteDance-Seed/UI-TARS-7B-DPO")
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
print(f"[BOOT] Loading {MODEL_ID} (dtype={DTYPE}) …")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
print("[BOOT] Model loaded.")
# -----------------------------
# Flask App
# -----------------------------
app = Flask(__name__)
# Simple API key gate (optional)
EXPECTED_KEY = os.getenv("API_KEY", "sk-local")
def _auth_ok(req) -> bool:
    # Accept "Authorization: Bearer <key>" or "X-API-Key: <key>"
    auth = req.headers.get("Authorization", "")
    if auth.startswith("Bearer "):
        return auth.split(" ", 1)[1].strip() == EXPECTED_KEY
    xk = req.headers.get("X-API-Key")
    return (xk is not None) and (xk == EXPECTED_KEY)
# -----------------------------
# Helpers
# -----------------------------
def fetch_image(url: str) -> Image.Image:
    """Download an image and return a PIL image."""
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    return Image.open(BytesIO(r.content)).convert("RGB")
def openai_to_hf_messages(openai_messages):
    """
    Convert OpenAI 'messages' into the format UI-TARS' processor expects:
    [
        {"role": "user", "content": [{"type": "image", "image": PIL}, {"type": "text", "text": "..."}]},
        {"role": "assistant", "content": [{"type": "text", "text": "..."}]},
        ...
    ]
    """
    hf_msgs = []
    for m in openai_messages:
        role = m.get("role", "user")
        content = m.get("content")
        parts = []
        # OpenAI content may be a string OR a list of content blocks
        if isinstance(content, str):
            parts.append({"type": "text", "text": content})
        elif isinstance(content, list):
            for c in content:
                ctype = c.get("type")
                if ctype in ("text", "input_text"):
                    parts.append({"type": "text", "text": c.get("text", "")})
                elif ctype in ("image_url", "image"):
                    # Support both {type:"image_url", image_url:{url:"..."}} and {type:"image", url:"..."}
                    url = None
                    if "image_url" in c and isinstance(c["image_url"], dict):
                        url = c["image_url"].get("url")
                    if url is None:
                        url = c.get("url")
                    if not url:
                        continue
                    try:
                        img = fetch_image(url)
                        parts.append({"type": "image", "image": img})
                    except Exception as e:
                        # If image fails, append a tiny text note instead of breaking the request
                        parts.append({"type": "text", "text": f"[image fetch failed: {e}]"})
                else:
                    # Ignore unknown blocks but keep going
                    pass
        else:
            # Fallback: treat as plain text
            parts.append({"type": "text", "text": str(content)})
        hf_msgs.append({"role": role, "content": parts})
    return hf_msgs
def generate_with_model(hf_messages, max_tokens=256, temperature=0.0, top_p=1.0):
    """
    Build inputs via processor.apply_chat_template and call model.generate.
    """
    # Build processor inputs on the model device
    inputs = processor.apply_chat_template(
        hf_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    gen_kwargs = {
        "max_new_tokens": int(max_tokens),
        "do_sample": (temperature is not None and float(temperature) > 0.0),
        "temperature": float(max(0.0, temperature)),
        "top_p": float(top_p),
        "use_cache": True,
    }
    with torch.inference_mode():
        output_ids = model.generate(**inputs, **gen_kwargs)
    # Decode only the new tokens
    prompt_len = inputs["input_ids"].shape[-1]
    decoded = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    return decoded
# -----------------------------
# OpenAI-compatible endpoint
# -----------------------------
@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
if not _auth_ok(request):
return jsonify({"error": {"message": "Unauthorized"}}), 401
payload = request.get_json(force=True, silent=True) or {}
messages = payload.get("messages", [])
max_tokens = payload.get("max_tokens", 65536)
temperature = payload.get("temperature", 0.0)
top_p = payload.get("top_p", 1.0)
stream = payload.get("stream", False)
model_name = payload.get("model", MODEL_ID)
# Convert OpenAI chat format to HF/UI-TARS format
hf_messages = openai_to_hf_messages(messages)
# We don’t implement streaming in this minimal server
if stream:
return jsonify({"error": {"message": "stream=True not supported by this server"}}), 400
started = time.time()
try:
text = generate_with_model(
hf_messages,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
)
except Exception as e:
return jsonify({"error": {"message": f"Generation failed: {e}"}}), 500
elapsed = time.time() - started
choice_id = str(uuid.uuid4())[:8]
# OpenAI-style response
resp = {
"id": f"chatcmpl-{str(uuid.uuid4())}",
"object": "chat.completion",
"created": int(time.time()),
"model": model_name,
"usage": { # token counts are optional (and non-trivial to compute exactly without re-tokenizing)
"prompt_tokens": None,
"completion_tokens": None,
"total_tokens": None,
},
"choices": [
{
"index": 0,
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": text,
},
"id": choice_id,
"logprobs": None,
}
],
"latency_ms": int(elapsed * 1000),
}
return jsonify(resp), 200
# Health check
@app.route("/health", methods=["GET"])
def health():
return jsonify({"status": "ok", "model": MODEL_ID}), 200
if __name__ == "__main__":
# Bind to 0.0.0.0 for container/VPS, default port 8000
port = int(os.getenv("PORT", "8000"))
app.run(host="0.0.0.0", port=port, threaded=True)
# nohup python -u flask_UI_TARS_7B.py > server.log 2>&1 &
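Quick sanity check (not part of the original example; a minimal sketch assuming the server runs on localhost:8000 with the default API_KEY of sk-local — adjust host, port, and key to your setup):
import requests

BASE_URL = "http://localhost:8000"               # hypothetical address; replace with your server
HEADERS = {"Authorization": "Bearer sk-local"}   # must match the server's API_KEY

# 1) /health confirms the model finished loading
print(requests.get(f"{BASE_URL}/health", timeout=10).json())

# 2) A tiny text-only completion confirms generation works before sending large screenshots
payload = {
    "model": "UI-TARS-7B-DPO",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 16,
    "temperature": 0.0,
}
r = requests.post(f"{BASE_URL}/v1/chat/completions", headers=HEADERS, json=payload, timeout=120)
print(r.status_code, r.json())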
Test code:
# pip install openai
import io
import re
import json
import base64
from PIL import Image
from io import BytesIO
from openai import OpenAI
def add_box_token(input_string):
    # Step 1: Split the string into individual actions
    if "Action: " in input_string and "start_box=" in input_string:
        suffix = input_string.split("Action: ")[0] + "Action: "
        actions = input_string.split("Action: ")[1:]
        processed_actions = []
        for action in actions:
            action = action.strip()
            # Step 2: Extract coordinates (start_box or end_box) using regex
            coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action)
            updated_action = action  # Start with the original action
            for coord_type, x, y in coordinates:
                # Wrap each coordinate pair in <|box_start|>/<|box_end|> tokens
                updated_action = updated_action.replace(
                    f"{coord_type}='({x},{y})'",
                    f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'",
                )
            processed_actions.append(updated_action)
        # Step 3: Reconstruct the final string
        final_string = suffix + "\n\n".join(processed_actions)
    else:
        final_string = input_string
    return final_string
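For illustration, this is what add_box_token does to a single assistant turn (the sample string below is hypothetical, not taken from test_messages.json):
# Hypothetical sample input showing the coordinate-wrapping behavior
sample = "Thought: click the Convert button.\nAction: click(start_box='(607,647)')"
print(add_box_token(sample))
# Thought: click the Convert button.
# Action: click(start_box='<|box_start|>(607,647)<|box_end|>')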
client = OpenAI(
    base_url="http://xxxxx:8000/v1",
    api_key="sk-local",  # must match the server's API_KEY
)
result = {}
messages = json.load(open("./data/test_messages.json"))
for message in messages:
    if message["role"] == "assistant":
        message["content"] = add_box_token(message["content"])
        print(message["content"])

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=messages,
    temperature=0.0,
    max_tokens=400,
    stream=False,
)
# With stream=False the API returns a single ChatCompletion object, not a stream of chunks
response = chat_completion.choices[0].message.content
print(response)
Error Info:
Thought: A color profile conversion dialog has popped up on the screen; before I can continue, I need to deal with it. There is a "Convert" button in the lower-right corner of the dialog, and I need to click it to confirm the conversion.
Action: click(start_box='<|box_start|>(607,647)<|box_end|>')
Thought: Looking at this photo of a vintage TV, I need to convert it to a palette-based mode. Let me first click the "Tools" option in the top menu bar so I can find the relevant color settings.
Action: click(start_box='<|box_start|>(220,71)<|box_end|>')
Thought: I just opened the Tools menu, but this is not the right path. To set the image to a palette-based mode, I need to close the currently open Tools menu and look for the correct color settings elsewhere. Let me click the Tools button again to close it.
Action: click(start_box='<|box_start|>(220,71)<|box_end|>')
Thought: I didn't find the option I wanted in the Tools menu, so let me try another approach. I notice a "Colors" option in the top menu bar, which should be where color settings are handled. I need to click it to keep looking for the option that converts the image to a palette-based mode.
Action: click(start_box='<|box_start|>(192,71)<|box_end|>')
Thought: I looked through the Colors menu carefully and there is no direct palette option here. Time for a different approach: close this menu first, then check the Edit menu, which should offer more image-processing options. Switching to a palette-based mode may have to be done through another path.
Action: click(start_box='<|box_start|>(192,71)<|box_end|>')
Thought: I didn't find the option I wanted in the Colors menu, so let me try another approach. I notice an "Edit" option in the top menu bar, which should be where these settings live. I need to click it to keep looking for the option that converts the image to a palette-based mode.
Action: click(start_box='<|box_start|>(65,71)<|box_end|>')
Thought: In the Edit menu I found the Preferences option, which is exactly what I need. Through it I should be able to find the setting that changes the image's color mode. Let me click it to open the settings window so I can convert the image to a palette-based mode.
Action: click(start_box='<|box_start|>(96,602)<|box_end|>')
---------------------------------------------------------------------------
InternalServerError Traceback (most recent call last)
Cell In[9], line 45
42 message["content"] = add_box_token(message["content"])
43 print(message["content"])
---> 45 chat_completion = client.chat.completions.create(
46 model="tgi",
47 messages=messages,
48 temperature=0.0,
49 max_tokens=400,
50 stream=False,
51 )
53 response = ""
54 for message in chat_completion:
File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/_utils/_utils.py:286, in required_args.<locals>.inner.<locals>.wrapper(*args, **kwargs)
284 msg = f"Missing required argument: {quote(missing[0])}"
285 raise TypeError(msg)
--> 286 return func(*args, **kwargs)
File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:1147, in Completions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, prompt_cache_key, reasoning_effort, response_format, safety_identifier, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, verbosity, web_search_options, extra_headers, extra_query, extra_body, timeout)
1101 @required_args(["messages", "model"], ["messages", "model", "stream"])
1102 def create(
1103 self,
(...) 1144 timeout: float | httpx.Timeout | None | NotGiven = not_given,
1145 ) -> ChatCompletion | Stream[ChatCompletionChunk]:
1146 validate_response_format(response_format)
-> [1147](https://file+.vscode-resource.vscode-cdn.net/xxx/notebook/~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:1147) return self._post(
1148 "/chat/completions",
1149 body=maybe_transform(
1150 {
1151 "messages": messages,
1152 "model": model,
1153 "audio": audio,
1154 "frequency_penalty": frequency_penalty,
1155 "function_call": function_call,
1156 "functions": functions,
1157 "logit_bias": logit_bias,
1158 "logprobs": logprobs,
1159 "max_completion_tokens": max_completion_tokens,
1160 "max_tokens": max_tokens,
1161 "metadata": metadata,
1162 "modalities": modalities,
1163 "n": n,
1164 "parallel_tool_calls": parallel_tool_calls,
1165 "prediction": prediction,
1166 "presence_penalty": presence_penalty,
1167 "prompt_cache_key": prompt_cache_key,
1168 "reasoning_effort": reasoning_effort,
1169 "response_format": response_format,
1170 "safety_identifier": safety_identifier,
1171 "seed": seed,
1172 "service_tier": service_tier,
1173 "stop": stop,
1174 "store": store,
1175 "stream": stream,
1176 "stream_options": stream_options,
1177 "temperature": temperature,
1178 "tool_choice": tool_choice,
1179 "tools": tools,
1180 "top_logprobs": top_logprobs,
1181 "top_p": top_p,
1182 "user": user,
1183 "verbosity": verbosity,
1184 "web_search_options": web_search_options,
1185 },
1186 completion_create_params.CompletionCreateParamsStreaming
1187 if stream
1188 else completion_create_params.CompletionCreateParamsNonStreaming,
1189 ),
1190 options=make_request_options(
1191 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
1192 ),
1193 cast_to=ChatCompletion,
1194 stream=stream or False,
1195 stream_cls=Stream[ChatCompletionChunk],
1196 )
File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/_base_client.py:1259, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
1245 def post(
1246 self,
1247 path: str,
(...) 1254 stream_cls: type[_StreamT] | None = None,
1255 ) -> ResponseT | _StreamT:
1256 opts = FinalRequestOptions.construct(
1257 method="post", url=path, json_data=body, files=to_httpx_files(files), **options
1258 )
-> 1259 return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/_base_client.py:1047, in SyncAPIClient.request(self, cast_to, options, stream, stream_cls)
1044 err.response.read()
1046 log.debug("Re-raising status error")
-> 1047 raise self._make_status_error_from_response(err.response) from None
1049 break
1051 assert response is not None, "could not resolve response (should never happen)"
InternalServerError: Error code: 500 - {'error': {'message': 'Generation failed: CUDA out of memory. Tried to allocate 76.02 GiB. GPU 0 has a total capacity of 79.15 GiB of which 25.10 GiB is free. Including non-PyTorch memory, this process has 54.04 GiB memory in use. Of the allocated memory 49.64 GiB is allocated by PyTorch, and 3.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)'}}