Can't deploy locally on a Mac with M2 Apple Silicon chip
Deploying locally on a Mac with an M2 (Apple Silicon) chip fails with:
RuntimeError: Failed to infer device type
Can somebody provide an install guide for running this on a Mac with an Apple Silicon chip?
Download the arm64 version: https://github.com/bytedance/UI-TARS-desktop/releases/download/v0.0.3/UI.TARS-0.0.3-arm64.dmg
They are talking about the model, not the desktop app.
vLLM only works on Linux. You'll need to set up an endpoint with Hugging Face Transformers instead. This code worked for me:
import torch
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, ValidationError
from typing import List, Dict, Union, Optional
import base64
from PIL import Image
import io
import logging
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import time
# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

app = FastAPI()

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Check if MPS is available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and processor
model_path = "/Users/admin/Desktop/AI/GUI Agents/UI-TARS/ui-tars-2b-sft"
processor = AutoProcessor.from_pretrained(model_path)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map=device
)
# Pydantic models for request validation
class ImageUrl(BaseModel):
    url: str

class ContentItem(BaseModel):
    type: str
    text: Optional[str] = None
    image_url: Optional[ImageUrl] = None

class Message(BaseModel):
    role: str
    content: Union[str, List[ContentItem]]  # Can be either a string or a list of content items

class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Message]
    max_tokens: Optional[int] = 1000
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.7
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None
    seed: Optional[int] = None

class Choice(BaseModel):
    index: int
    message: Message
    finish_reason: str

class ChatCompletionResponse(BaseModel):
    id: str
    object: str
    created: int
    model: str
    choices: List[Choice]
def decode_base64_image(base64_string: str) -> Image.Image:
    try:
        # Remove the data URL prefix if present
        if "base64," in base64_string:
            base64_string = base64_string.split("base64,")[1]
        image_data = base64.b64decode(base64_string)
        image = Image.open(io.BytesIO(image_data))
        logging.debug(f"Decoded image size: {image.size}")
        return image
    except Exception as e:
        logging.error(f"Error decoding image: {str(e)}")
        raise

@app.middleware("http")
async def log_requests(request: Request, call_next):
    body = await request.body()
    logging.info(f"Raw request body: {body.decode()}")
    response = await call_next(request)
    return response
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    try:
        logging.info("Received chat completion request")
        logging.info(f"Full request data: {request.dict()}")

        # Process the last message
        last_message = request.messages[-1]
        messages = []

        # Convert the message to Qwen2-VL format
        content_list = []
        if isinstance(last_message.content, str):
            content_list.append({
                "type": "text",
                "text": last_message.content
            })
            logging.debug(f"Added text: {last_message.content[:100]}...")
        else:
            for content_item in last_message.content:
                if content_item.type == "text":
                    content_list.append({
                        "type": "text",
                        "text": content_item.text
                    })
                    logging.debug(f"Added text: {content_item.text[:100]}...")
                elif content_item.type == "image_url":
                    image_url = content_item.image_url.url
                    if image_url.startswith("data:image"):
                        logging.debug("Decoding base64 image")
                        image = decode_base64_image(image_url)
                        content_list.append({
                            "type": "image",
                            "image": image
                        })

        messages.append({
            "role": "user",
            "content": content_list
        })

        # Prepare text input using chat template
        logging.info("Preparing model inputs")
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Process image inputs
        image_inputs = []
        for message in messages:
            for content in message['content']:
                if content['type'] == 'image':
                    image_inputs.append(content['image'])

        # Prepare model inputs (pass None rather than an empty list when the request is text-only)
        inputs = processor(
            text=[text],
            images=image_inputs if image_inputs else None,
            padding=True,
            return_tensors="pt"
        )
        # Log input tensor shapes and memory usage
        for key, value in inputs.items():
            if isinstance(value, torch.Tensor):
                logging.debug(f"Input tensor {key}: shape={value.shape}, dtype={value.dtype}")
                logging.debug(f"Memory used by {key}: {value.element_size() * value.nelement() / 1024 / 1024:.2f} MB")

        # Move inputs to device
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        logging.info("Generating response")
        # Generate response
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens or 512,
            do_sample=True,
            temperature=request.temperature or 0.7,
            pad_token_id=processor.tokenizer.pad_token_id
        )

        # Trim the prompt tokens and decode only the newly generated part
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
        ]
        response_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]  # Get the first response since we only send one prompt

        # Create the response object
        response = ChatCompletionResponse(
            id=str(time.time()),
            object="chat.completion",
            created=int(time.time()),
            model=request.model,
            choices=[
                Choice(
                    index=0,
                    message=Message(
                        role="assistant",
                        content=[ContentItem(type="text", text=response_text)]
                    ),
                    finish_reason="stop"
                )
            ]
        )

        logging.info("Returning response")
        return response

    except ValidationError as e:
        logging.error(f"Validation error: {e.json()}")
        raise HTTPException(status_code=422, detail=str(e))
    except Exception as e:
        logging.error(f"Error processing request: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)
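In case it helps, this is how I smoke-test it. Save the script (I call it server.py, the name is arbitrary) and start it with python server.py, then send an OpenAI-style request. Below is a minimal client sketch, assuming the server is on localhost:8001; screenshot.png is a placeholder, use any image you have.

# Minimal client sketch for the endpoint above (localhost:8001 and screenshot.png are placeholders).
import base64
import requests

with open("screenshot.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

payload = {
    "model": "ui-tars-2b-sft",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what is on the screen."},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ],
        }
    ],
    "max_tokens": 256,
}

resp = requests.post("http://localhost:8001/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])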
Can somebody provide an install guide for running this on a Mac with an Apple Silicon chip?
Hugging Face instances are absurdly expensive, and vLLM on macOS is unbearably slow because it doesn't use MPS.
Is there not a way to use HF Transformers to load the model? I tried the example provided by HF, but it didn't work:
337 images = inputs.images
338 else:
--> 339 text = inputs["text"]
340 inputs_text = inputs["text"]
341 images = inputs["images"]
KeyError: 'text'
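I'm not sure which example that traceback comes from, but the error means the object it received has no "text" entry, i.e. the prompt and images never reached the processor in the form it expects. Loading UI-TARS with plain Transformers worked for me when I passed text= and images= as explicit keyword arguments, the same way the server above does. Here is that flow boiled down to a standalone sketch; the model path and screenshot.png are placeholders, adjust to your setup.

# Minimal sketch of running UI-TARS with plain Transformers on Apple Silicon.
# "path/to/ui-tars-2b-sft" and "screenshot.png" are placeholders.
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

device = "mps" if torch.backends.mps.is_available() else "cpu"
model_path = "path/to/ui-tars-2b-sft"  # local checkpoint directory

processor = AutoProcessor.from_pretrained(model_path)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map=device
)

image = Image.open("screenshot.png")
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Describe what is on the screen."},
    ],
}]

# Build the prompt with the chat template, then pass text and images
# to the processor as explicit keyword arguments.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(device)

generated = model.generate(**inputs, max_new_tokens=128)
trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])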