
OOM Error During Example Code Test with UI-TARS-7B-DPO on A100×4 Local Deployment

Zhuifeng414 opened this issue 2 months ago • 0 comments

Deployed UI-TARS-7B-DPO on a local server equipped with four A100 GPUs (4 × 80 GB). Tested it with the example code, but encountered a CUDA out-of-memory (OOM) error.

Question: Why does a 7B model require such a large amount of GPU memory?
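
For context, a rough back-of-envelope estimate (my own, not measured): the bf16 weights of a 7B model are only about 13 GB, so the 76 GiB allocation in the error below most likely comes from the visual tokens of the screenshots plus attention over a very long prompt, not from the weights themselves. Assuming a Qwen2-VL-style processor (one visual token per 28×28 pixels after patch merging):

# Rough, assumption-laden estimate of where the memory could go (not measured)
params = 7e9
weight_gb = params * 2 / 1024**3                 # bf16 = 2 bytes/param -> ~13 GB
tokens_per_image = (1920 * 1080) / (28 * 28)     # ~2645 visual tokens per 1080p screenshot
print(f"weights ~ {weight_gb:.0f} GB, visual tokens per 1080p image ~ {tokens_per_image:.0f}")
# With several screenshots in the message history the prompt can reach tens of
# thousands of tokens; without flash attention the attention scores materialize
# as a heads x seq_len x seq_len tensor, which is where a single 70+ GiB
# allocation can come from.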

Model deployment code:


import os
import time
import uuid
import torch
from io import BytesIO
from PIL import Image
import requests
from flask import Flask, request, jsonify

from transformers import AutoProcessor, AutoModelForVision2Seq

# -----------------------------
# Model & Processor (global, lazy)
# -----------------------------
MODEL_ID = os.getenv("MODEL_ID", "ByteDance-Seed/UI-TARS-7B-DPO")
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

print(f"[BOOT] Loading {MODEL_ID} (dtype={DTYPE}) …")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
print("[BOOT] Model loaded.")

# -----------------------------
# Flask App
# -----------------------------
app = Flask(__name__)

# Simple API key gate (optional)
EXPECTED_KEY = os.getenv("API_KEY", "sk-local")

def _auth_ok(req) -> bool:
    # Accept "Authorization: Bearer <key>" or "X-API-Key: <key>"
    auth = req.headers.get("Authorization", "")
    if auth.startswith("Bearer "):
        return auth.split(" ", 1)[1].strip() == EXPECTED_KEY
    xk = req.headers.get("X-API-Key")
    return (xk is not None) and (xk == EXPECTED_KEY)

# -----------------------------
# Helpers
# -----------------------------
def fetch_image(url: str) -> Image.Image:
    """Download an image and return a PIL image."""
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    return Image.open(BytesIO(r.content)).convert("RGB")

def openai_to_hf_messages(openai_messages):
    """
    Convert OpenAI 'messages' into the format UI-TARS' processor expects:
    [
      {"role": "user", "content": [{"type":"image","image": PIL}, {"type":"text","text":"..."}]},
      {"role": "assistant", "content": [{"type":"text","text":"..."}]},
      ...
    ]
    """
    hf_msgs = []
    for m in openai_messages:
        role = m.get("role", "user")
        content = m.get("content")
        parts = []

        # OpenAI content may be a string OR a list of content blocks
        if isinstance(content, str):
            parts.append({"type": "text", "text": content})
        elif isinstance(content, list):
            for c in content:
                ctype = c.get("type")
                if ctype in ("text", "input_text"):
                    parts.append({"type": "text", "text": c.get("text", "")})
                elif ctype in ("image_url", "image"):
                    # Support both {type:"image_url", image_url:{url:"..."}} and {type:"image", url:"..."}
                    url = None
                    if "image_url" in c and isinstance(c["image_url"], dict):
                        url = c["image_url"].get("url")
                    if url is None:
                        url = c.get("url")
                    if not url:
                        continue
                    try:
                        img = fetch_image(url)
                        parts.append({"type": "image", "image": img})
                    except Exception as e:
                        # If image fails, append a tiny text note instead of breaking the request
                        parts.append({"type": "text", "text": f"[image fetch failed: {e}]"})
                else:
                    # Ignore unknown blocks but keep going
                    pass
        else:
            # Fallback: treat as plain text
            parts.append({"type": "text", "text": str(content)})

        hf_msgs.append({"role": role, "content": parts})
    return hf_msgs

def generate_with_model(hf_messages, max_tokens=256, temperature=0.0, top_p=1.0):
    """
    Build inputs via processor.apply_chat_template and call model.generate.
    """
    # Build processor inputs on the model device
    inputs = processor.apply_chat_template(
        hf_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    gen_kwargs = {
        "max_new_tokens": int(max_tokens),
        "do_sample": (temperature is not None and float(temperature) > 0.0),
        "temperature": float(max(0.0, temperature)),
        "top_p": float(top_p),
        "use_cache": True,
    }

    with torch.inference_mode():
        output_ids = model.generate(**inputs, **gen_kwargs)

    # Decode only the new tokens
    prompt_len = inputs["input_ids"].shape[-1]
    decoded = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    return decoded

# -----------------------------
# OpenAI-compatible endpoint
# -----------------------------
@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    if not _auth_ok(request):
        return jsonify({"error": {"message": "Unauthorized"}}), 401

    payload = request.get_json(force=True, silent=True) or {}

    messages = payload.get("messages", [])
    max_tokens = payload.get("max_tokens", 65536)
    temperature = payload.get("temperature", 0.0)
    top_p = payload.get("top_p", 1.0)
    stream = payload.get("stream", False)
    model_name = payload.get("model", MODEL_ID)

    # Convert OpenAI chat format to HF/UI-TARS format
    hf_messages = openai_to_hf_messages(messages)

    # We don’t implement streaming in this minimal server
    if stream:
        return jsonify({"error": {"message": "stream=True not supported by this server"}}), 400

    started = time.time()
    try:
        text = generate_with_model(
            hf_messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
    except Exception as e:
        return jsonify({"error": {"message": f"Generation failed: {e}"}}), 500

    elapsed = time.time() - started
    choice_id = str(uuid.uuid4())[:8]

    # OpenAI-style response
    resp = {
        "id": f"chatcmpl-{str(uuid.uuid4())}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "usage": {  # token counts are optional (and non-trivial to compute exactly without re-tokenizing)
            "prompt_tokens": None,
            "completion_tokens": None,
            "total_tokens": None,
        },
        "choices": [
            {
                "index": 0,
                "finish_reason": "stop",
                "message": {
                    "role": "assistant",
                    "content": text,
                },
                "id": choice_id,
                "logprobs": None,
            }
        ],
        "latency_ms": int(elapsed * 1000),
    }
    return jsonify(resp), 200

# Health check
@app.route("/health", methods=["GET"])
def health():
    return jsonify({"status": "ok", "model": MODEL_ID}), 200

if __name__ == "__main__":
    # Bind to 0.0.0.0 for container/VPS, default port 8000
    port = int(os.getenv("PORT", "8000"))
    app.run(host="0.0.0.0", port=port, threaded=True)
    
    # nohup python -u flask_UI_TARS_7B.py > server.log 2>&1 &
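
For reference, a mitigation I plan to try (untested; it assumes the UI-TARS-7B-DPO processor is Qwen2-VL-based, where min_pixels/max_pixels cap the number of visual tokens per image, and that the flash-attn package is installed). The 65536 max_tokens fallback above is also worth lowering, although the test below already caps it at 400:

# Hedged sketch, not verified on this model: cap per-image visual tokens and
# avoid materializing the full seq_len x seq_len attention score tensor.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    min_pixels=256 * 28 * 28,      # floor on resized resolution
    max_pixels=1344 * 28 * 28,     # cap each screenshot at ~1344 visual tokens
)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    device_map="auto",
    attn_implementation="flash_attention_2",  # requires flash-attn to be installed
    trust_remote_code=True,
)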


Test code:

# pip install openai
import io
import re
import json
import base64
from PIL import Image
from io import BytesIO
from openai import OpenAI

def add_box_token(input_string):
    # Step 1: Split the string into individual actions
    if "Action: " in input_string and "start_box=" in input_string:
        suffix = input_string.split("Action: ")[0] + "Action: "
        actions = input_string.split("Action: ")[1:]
        processed_actions = []
        for action in actions:
            action = action.strip()
            # Step 2: Extract coordinates (start_box or end_box) using regex
            coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action)
            
            updated_action = action  # Start with the original action
            for coord_type, x, y in coordinates:
                # Wrap each coordinate with the box tokens expected by UI-TARS
                updated_action = updated_action.replace(f"{coord_type}='({x},{y})'", f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'")
            processed_actions.append(updated_action)
        
        # Step 3: Reconstruct the final string
        final_string = suffix + "\n\n".join(processed_actions)
    else:
        final_string = input_string
    return final_string

client = OpenAI(
    base_url="http://xxxxx:8000/v1",
    api_key="sk-local",  # must match the server's API_KEY
)

result = {}
messages = json.load(open("./data/test_messages.json"))
for message in messages:
    if message["role"] == "assistant":
        message["content"] = add_box_token(message["content"])
        print(message["content"])

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=messages,
    temperature=0.0,
    max_tokens=400,
    stream=False,
)

response = ""
for message in chat_completion:
    response += message.choices[0].delta.content
print(response)
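
Side note on the last loop: with stream=False the OpenAI SDK returns a single ChatCompletion object rather than chunks with .delta, so once the server-side error is fixed the non-streaming read would be:

# Non-streaming: the whole assistant message is already on the response object
response = chat_completion.choices[0].message.content
print(response)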

Error Info:


Thought: I see a color profile conversion dialog has popped up on the screen; I need to deal with it before continuing. There is a "Convert" button in the lower-right corner of the dialog, and I need to click it to confirm the conversion.
Action: click(start_box='<|box_start|>(607,647)<|box_end|>')
Thought: Looking at this photo of a retro TV, I need to set it to a palette-based mode. Let me first click the "Tools" option in the top menu bar so I can find the relevant color settings.
Action: click(start_box='<|box_start|>(220,71)<|box_end|>')
Thought: I just opened the Tools menu, but this isn't the right path. To set the image to a palette-based mode, I need to close the currently open Tools menu first and then look for the correct color settings. Let me click the Tools button again to close it.
Action: click(start_box='<|box_start|>(220,71)<|box_end|>')
Thought: I didn't find the option I wanted in the Tools menu, so let me try another approach. I notice there is a "Colors" option in the top menu bar, which should be where color settings are handled. I need to click it to keep looking for the option that sets the image to a palette-based mode.
Action: click(start_box='<|box_start|>(192,71)<|box_end|>')
Thought: I looked carefully through the Colors menu and there is no direct option for setting a palette here. It seems I need another approach: close this menu first, then check the Edit menu, which should offer more image-processing options. After all, a palette-based setting may have to be reached through a different path.
Action: click(start_box='<|box_start|>(192,71)<|box_end|>')
Thought: I didn't find the option I wanted in the Colors menu, so let me try another approach. I notice there is an "Edit" option in the top menu bar, which should be where these settings are handled. I need to click it to keep looking for the option that sets the image to a palette-based mode.
Action: click(start_box='<|box_start|>(65,71)<|box_end|>')
Thought: I found the Preferences option in the Edit menu, which is exactly what I need. Through it I should be able to find the setting for adjusting the image's color mode. Let me click it to open the settings screen so I can convert the image to a palette-based mode.
Action: click(start_box='<|box_start|>(96,602)<|box_end|>')
---------------------------------------------------------------------------
InternalServerError                       Traceback (most recent call last)
Cell In[9], line 45
     42         message["content"] = add_box_token(message["content"])
     43         print(message["content"])
---> 45 chat_completion = client.chat.completions.create(
     46     model="tgi",
     47     messages=messages,
     48     temperature=0.0,
     49     max_tokens=400,
     50     stream=False,
     51 )
     53 response = ""
     54 for message in chat_completion:

File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/_utils/_utils.py:286, in required_args.<locals>.inner.<locals>.wrapper(*args, **kwargs)
    284             msg = f"Missing required argument: {quote(missing[0])}"
    285     raise TypeError(msg)
--> 286     return func(*args, **kwargs)

File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:1147, in Completions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, prompt_cache_key, reasoning_effort, response_format, safety_identifier, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, verbosity, web_search_options, extra_headers, extra_query, extra_body, timeout)
   1101 @required_args(["messages", "model"], ["messages", "model", "stream"])
   1102 def create(
   1103     self,
   (...)   1144     timeout: float | httpx.Timeout | None | NotGiven = not_given,
   1145 ) -> ChatCompletion | Stream[ChatCompletionChunk]:
   1146     validate_response_format(response_format)
-> 1147     return self._post(
   1148         "/chat/completions",
   1149         body=maybe_transform(
   1150             {
   1151                 "messages": messages,
   1152                 "model": model,
   1153                 "audio": audio,
   1154                 "frequency_penalty": frequency_penalty,
   1155                 "function_call": function_call,
   1156                 "functions": functions,
   1157                 "logit_bias": logit_bias,
   1158                 "logprobs": logprobs,
   1159                 "max_completion_tokens": max_completion_tokens,
   1160                 "max_tokens": max_tokens,
   1161                 "metadata": metadata,
   1162                 "modalities": modalities,
   1163                 "n": n,
   1164                 "parallel_tool_calls": parallel_tool_calls,
   1165                 "prediction": prediction,
   1166                 "presence_penalty": presence_penalty,
   1167                 "prompt_cache_key": prompt_cache_key,
   1168                 "reasoning_effort": reasoning_effort,
   1169                 "response_format": response_format,
   1170                 "safety_identifier": safety_identifier,
   1171                 "seed": seed,
   1172                 "service_tier": service_tier,
   1173                 "stop": stop,
   1174                 "store": store,
   1175                 "stream": stream,
   1176                 "stream_options": stream_options,
   1177                 "temperature": temperature,
   1178                 "tool_choice": tool_choice,
   1179                 "tools": tools,
   1180                 "top_logprobs": top_logprobs,
   1181                 "top_p": top_p,
   1182                 "user": user,
   1183                 "verbosity": verbosity,
   1184                 "web_search_options": web_search_options,
   1185             },
   1186             completion_create_params.CompletionCreateParamsStreaming
   1187             if stream
   1188             else completion_create_params.CompletionCreateParamsNonStreaming,
   1189         ),
   1190         options=make_request_options(
   1191             extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
   1192         ),
   1193         cast_to=ChatCompletion,
   1194         stream=stream or False,
   1195         stream_cls=Stream[ChatCompletionChunk],
   1196     )

File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/_base_client.py:1259, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
   1245 def post(
   1246     self,
   1247     path: str,
   (...)   1254     stream_cls: type[_StreamT] | None = None,
   1255 ) -> ResponseT | _StreamT:
   1256     opts = FinalRequestOptions.construct(
   1257         method="post", url=path, json_data=body, files=to_httpx_files(files), **options
   1258     )
-> 1259     return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))

File ~/anaconda3/envs/BrowserAgent/lib/python3.11/site-packages/openai/_base_client.py:1047, in SyncAPIClient.request(self, cast_to, options, stream, stream_cls)
   1044             err.response.read()
   1046         log.debug("Re-raising status error")
-> 1047         raise self._make_status_error_from_response(err.response) from None
   1049     break
   1051 assert response is not None, "could not resolve response (should never happen)"

InternalServerError: Error code: 500 - {'error': {'message': 'Generation failed: CUDA out of memory. Tried to allocate 76.02 GiB. GPU 0 has a total capacity of 79.15 GiB of which 25.10 GiB is free. Including non-PyTorch memory, this process has 54.04 GiB memory in use. Of the allocated memory 49.64 GiB is allocated by PyTorch, and 3.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)'}}
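
The allocator hint from the error message can be tried by setting the variable before torch initializes CUDA (in the launch shell or at the very top of flask_UI_TARS_7B.py), though it only addresses fragmentation and cannot make a single 76 GiB allocation fit:

# Must run before the first CUDA allocation, i.e. before "import torch"
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")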

Zhuifeng414 · Oct 28 '25 01:10