
Support DeepSeek/DeepSeek-OCR

Open · Jeffwan opened this issue 2 months ago · 5 comments

🚀 Feature Description and Motivation

https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR.html#running-deepseek-ocr

  1. The recipe does not expose an API interface; users need to leverage the LLM class.
  2. It does not have batch API support.

Use Case

Support OCR on images.

Proposed Solution

No response

Jeffwan · Oct 25 '25 05:10

Pass logits_processors via the request's extra_body:

def infer(img_base64):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        "image_url": {"url": "https://jeroen.github.io/images/testocr.png"},
                    },
                    {
                        "type": "image_url",
                        # "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        "image_url": {"url": "https://jeroen.github.io/images/testocr.png"},
                    },
                    {
                        "type": "text",
                        "text": "Extract the text from the above document as if you were reading it naturally.",
                    },
                ],
            }
        ],
        extra_body={
            "logits_processors": [
                "vllm.model_executor.models.deepseek_ocr.NGramPerReqLogitsProcessor",
            ]
        },
        temperature=0.0,
        max_tokens=4096
    )
    return response.choices[0].message.content
(screenshot of the response)

Jeffwan · Oct 25 '25 06:10

Alternatively, specify logits_processors in the engine startup command:

(screenshot of the serve command)
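
A minimal client sketch assuming the processor was registered at engine startup rather than per request; the serve invocation in the comment below reuses the recipe's dotted class path and is an assumption, not verified here. The request then needs no extra_body:

# Assumed server start, with the NGram processor registered engine-wide
# (flag spelling per the vLLM recipe, unverified here):
#   vllm serve deepseek-ai/DeepSeek-OCR \
#     --logits-processors vllm.model_executor.models.deepseek_ocr.NGramPerReqLogitsProcessor
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-OCR",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://jeroen.github.io/images/testocr.png"},
                },
                {
                    "type": "text",
                    "text": "Extract the text from the above document as if you were reading it naturally.",
                },
            ],
        }
    ],
    temperature=0.0,
    max_tokens=4096,
)
print(response.choices[0].message.content)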

Jeffwan · Oct 25 '25 06:10

Working version 1: plain vllm serve plus the standard OpenAI client:

vllm serve deepseek-ai/DeepSeek-OCR

from openai import OpenAI
import base64
import os

os.environ["OPENAI_API_KEY"] = ""
client = OpenAI(base_url="http://localhost:8000/v1")


model = "deepseek-ai/DeepSeek-OCR"

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def infer(img_base64):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        "image_url": {"url": "https://jeroen.github.io/images/testocr.png"},
                    },
                    {
                        "type": "text",
                        "text": "Extract the text from the above document as if you were reading it naturally.",
                    },
                ],
            }
        ],
        temperature=0.0
    )
    return response.choices[0].message.content




# img_base64 = encode_image("/workspace/test1.png")
# print(infer(img_base64))

print(infer(""))  # argument is unused here; the request points at the remote test image URL

Jeffwan · Oct 25 '25 06:10

Working version 2:

Follow the vLLM source docs (offline LLM class).
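
A minimal offline sketch of that approach, reusing the same processor class, prompt format, and extra_args as the wrapper below; the ngram_size/window_size values and the <td>/</td> whitelist token IDs come from the recipe, and the local image path is a placeholder:

from vllm import LLM, SamplingParams
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
from PIL import Image

llm = LLM(
    model="deepseek-ai/DeepSeek-OCR",
    enable_prefix_caching=False,
    mm_processor_cache_gb=0,
    logits_processors=[NGramPerReqLogitsProcessor],  # pass the class, not an instance
)

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=8192,
    extra_args={
        "ngram_size": 30,
        "window_size": 90,
        "whitelist_token_ids": {128821, 128822},  # <td>, </td>
    },
    skip_special_tokens=False,
)

image = Image.open("test1.png").convert("RGB")  # placeholder local image
outputs = llm.generate(
    [{"prompt": "<image>\nFree OCR.", "multi_modal_data": {"image": image}}],
    sampling_params,
)
print(outputs[0].outputs[0].text)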

Jeffwan · Oct 25 '25 06:10

Working version 3:

A wrapper exposing the standard OpenAI chat completions API on top of the offline LLM class:

# server.py
import base64
import io
import os
from typing import List, Optional, Union, Dict, Any, Tuple

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
import requests
from vllm import LLM, SamplingParams
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

# ---------------------------
# Model bootstrap
# ---------------------------
MODEL_NAME = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-OCR")

llm = LLM(
    model=MODEL_NAME,
    enable_prefix_caching=False,
    mm_processor_cache_gb=0,
    logits_processors=[NGramPerReqLogitsProcessor],  # class, not instance
)

# ---------------------------
# OpenAI-compatible schemas (simplified)
# ---------------------------
class ImageURL(BaseModel):
    url: str

class ContentItem(BaseModel):
    type: str  # "text" or "image_url"
    text: Optional[str] = None
    image_url: Optional[ImageURL] = None

class Message(BaseModel):
    role: str  # "user"/"system"/"assistant"
    content: Union[str, List[ContentItem]]

class ChatRequest(BaseModel):
    model: Optional[str] = None
    messages: List[Message]
    temperature: Optional[float] = 0.0
    max_tokens: Optional[int] = 8192
    stream: Optional[bool] = False
    extra_args: Optional[Dict[str, Any]] = None

class ChoiceMessage(BaseModel):
    role: str
    content: str

class Choice(BaseModel):
    index: int
    message: ChoiceMessage
    finish_reason: Optional[str] = "stop"

class ChatResponse(BaseModel):
    id: str
    object: str
    model: str
    choices: List[Choice]

app = FastAPI(title="OpenAI adapter for vLLM DeepSeek-OCR")

# ---------------------------
# Helpers
# ---------------------------

def _load_image_from_item(item: ContentItem) -> Optional[Image.Image]:
    if item.type != "image_url":
        return None
    url = item.image_url.url if item.image_url else ""
    if not url:
        return None

    # 1) data URL (base64)
    if url.startswith("data:"):
        try:
            b64 = url.split(",", 1)[1]
            raw = base64.b64decode(b64)
            return Image.open(io.BytesIO(raw)).convert("RGB")
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Invalid data URL: {e}")

    # 2) http(s) URL
    if url.startswith("http://") or url.startswith("https://"):
        try:
            r = requests.get(url, timeout=10)   # timeout
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "")
            if "image" not in ctype.lower():
                raise HTTPException(status_code=400, detail=f"URL did not return an image (Content-Type={ctype})")
            return Image.open(io.BytesIO(r.content)).convert("RGB")
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to fetch image: {e}")

    # 3) local path / file://
    if url.startswith("file://"):
        url = url[len("file://"):]
    if os.path.exists(url):
        try:
            return Image.open(url).convert("RGB")
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to open local image: {e}")

    raise HTTPException(status_code=400, detail="Unsupported image_url; use data:, http(s), or file path.")



def _extract_prompt_and_images(messages: List[Message]) -> Tuple[str, List[Image.Image]]:
    """
    Convert OpenAI chat messages into an OCR prompt and an image list.
    Rules:
    - Use the text from the most recent user message as the prompt (if none, use "Free OCR.").
    - Use all image_urls in that user message as the image set.
    """
    prompt_text = "Free OCR."
    images: List[Image.Image] = []

    # find last user
    last_user = None
    for m in messages[::-1]:
        if m.role == "user":
            last_user = m
            break
    if last_user is None:
        return prompt_text, images

    if isinstance(last_user.content, str):
        if last_user.content.strip():
            prompt_text = last_user.content.strip()
    else:
        text_parts = []
        for it in last_user.content:
            if it.type == "text" and it.text:
                text_parts.append(it.text)
            elif it.type == "image_url":
                img = _load_image_from_item(it)
                if img is not None:
                    images.append(img)
        if text_parts:
            prompt_text = "\n".join([t.strip() for t in text_parts if t.strip()])

    return prompt_text, images

def _build_vllm_batches(prompt: str, images: List[Image.Image]) -> List[Dict[str, Any]]:
    if not images:
        # If no image is provided, still allow a text-only prompt—but DeepSeek-OCR generally expects an image.
        return [{"prompt": prompt}]
    return [{"prompt": f"<image>\n{prompt}", "multi_modal_data": {"image": im}} for im in images]

# ---------------------------
# Endpoint
# ---------------------------
@app.post("/v1/chat/completions", response_model=ChatResponse)
def chat_completions(req: ChatRequest):
    # parse messages → prompt + images
    prompt, images = _extract_prompt_and_images(req.messages)
    model_inputs = _build_vllm_batches(prompt, images)

    # Sampling parameters (pass through as needed)
    extra_args = req.extra_args or {}
    # Provide sensible defaults matching the recipe example
    extra_args.setdefault("ngram_size", 30)
    extra_args.setdefault("window_size", 90)
    # whitelist_token_ids examples: <td>, </td>
    extra_args.setdefault("whitelist_token_ids", {128821, 128822})

    sampling_params = SamplingParams(
        temperature=req.temperature or 0.0,
        max_tokens=req.max_tokens or 8192,
        extra_args=extra_args,
        skip_special_tokens=False,
    )

    # This example uses synchronous generation (batch size = number of images)
    try:
        outputs = llm.generate(model_inputs, sampling_params)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"vLLM generate error: {e}")

    # Combine the results from all images into a single assistant message (you could also return multiple choices)
    texts = []
    for out in outputs:
        if not out.outputs:
            texts.append("")
        else:
            texts.append(out.outputs[0].text)

    merged_text = "\n\n".join(texts) if len(texts) > 1 else (texts[0] if texts else "")

    resp = ChatResponse(
        id="chatcmpl-deepseek-ocr-adapter",
        object="chat.completion",
        model=req.model or MODEL_NAME,
        choices=[
            Choice(
                index=0,
                message=ChoiceMessage(role="assistant", content=merged_text),
                finish_reason="stop"
            )
        ],
    )
    return resp

# Run uvicorn server:app --host 0.0.0.0 --port 8000
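
A quick client check against the wrapper, exercising its data-URL path with the stock OpenAI SDK (the local image path is a placeholder):

# client_test.py
import base64
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("test1.png", "rb") as f:  # placeholder local image
    img_base64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-OCR",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                },
                {"type": "text", "text": "Free OCR."},
            ],
        }
    ],
    temperature=0.0,
)
print(response.choices[0].message.content)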

Jeffwan · Oct 25 '25 06:10