Support DeepSeek/DeepSeek-OCR
🚀 Feature Description and Motivation
https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR.html#running-deepseek-ocr
- This recipe doesn't provide an API interface; users need to leverage the offline `LLM` class directly.
- It doesn't have batch API support.
Use Case
Support OCR on images.
Proposed Solution
No response
Use logits_processors in the request extra_body:
```python
def infer(img_base64):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        "image_url": {"url": "https://jeroen.github.io/images/testocr.png"},
                    },
                    {
                        "type": "image_url",
                        # "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        "image_url": {"url": "https://jeroen.github.io/images/testocr.png"},
                    },
                    {
                        "type": "text",
                        "text": "Extract the text from the above document as if you were reading it naturally.",
                    },
                ],
            }
        ],
        extra_body={
            "logits_processors": [
                "vllm.model_executor.models.deepseek_ocr.NGramPerReqLogitsProcessor",
            ]
        },
        temperature=0.0,
        max_tokens=4096,
    )
    return response.choices[0].message.content
```
Specify logits_processors in the engine startup command:
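A minimal sketch of such a command, assuming the vLLM version in use supports the `--logits-processors` startup flag and accepts the fully qualified class name used in the request above (flag name and path format may differ between releases):

```bash
vllm serve deepseek-ai/DeepSeek-OCR \
  --logits-processors vllm.model_executor.models.deepseek_ocr.NGramPerReqLogitsProcessor
```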
Work version 1: plain `vllm serve` plus the standard OpenAI client.

```bash
vllm serve deepseek-ai/DeepSeek-OCR
```
```python
from openai import OpenAI
import base64
import os

os.environ["OPENAI_API_KEY"] = ""
client = OpenAI(base_url="http://localhost:8000/v1")
model = "deepseek-ai/DeepSeek-OCR"


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def infer(img_base64):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        "image_url": {"url": "https://jeroen.github.io/images/testocr.png"},
                    },
                    {
                        "type": "text",
                        "text": "Extract the text from the above document as if you were reading it naturally.",
                    },
                ],
            }
        ],
        temperature=0.0,
    )
    return response.choices[0].message.content


# img_base64 = encode_image("/workspace/test1.png")
# print(infer(img_base64))
print(infer(""))
```
Work version 2: the offline `LLM` usage from the vLLM recipe/source docs linked above.
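A minimal sketch of that offline path, mirroring the engine arguments and sampling parameters used in the wrapper under work version 3 (the local image path is a placeholder; the extra_args values are taken from that code):

```python
from PIL import Image
from vllm import LLM, SamplingParams
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

llm = LLM(
    model="deepseek-ai/DeepSeek-OCR",
    enable_prefix_caching=False,
    mm_processor_cache_gb=0,
    logits_processors=[NGramPerReqLogitsProcessor],  # class, not instance
)

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=8192,
    # per-request args consumed by NGramPerReqLogitsProcessor
    extra_args={"ngram_size": 30, "window_size": 90, "whitelist_token_ids": {128821, 128822}},
    skip_special_tokens=False,
)

image = Image.open("test1.png").convert("RGB")  # placeholder path
outputs = llm.generate(
    [{"prompt": "<image>\nFree OCR.", "multi_modal_data": {"image": image}}],
    sampling_params,
)
print(outputs[0].outputs[0].text)
```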
Work version 3: a FastAPI wrapper exposing the standard OpenAI chat completions API on top of the offline engine.
```python
# server.py
import base64
import io
import os
from typing import List, Optional, Tuple, Union, Dict, Any

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
import requests

from vllm import LLM, SamplingParams
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

# ---------------------------
# Model bootstrap
# ---------------------------
MODEL_NAME = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-OCR")

# Engine arguments follow the DeepSeek-OCR recipe.
llm = LLM(
    model=MODEL_NAME,
    enable_prefix_caching=False,
    mm_processor_cache_gb=0,
    logits_processors=[NGramPerReqLogitsProcessor],  # class, not instance
)

# ---------------------------
# OpenAI-compatible schemas (simplified)
# ---------------------------
class ImageURL(BaseModel):
    url: str


class ContentItem(BaseModel):
    type: str  # "text" or "image_url"
    text: Optional[str] = None
    image_url: Optional[ImageURL] = None


class Message(BaseModel):
    role: str  # "user"/"system"/"assistant"
    content: Union[str, List[ContentItem]]


class ChatRequest(BaseModel):
    model: Optional[str] = None
    messages: List[Message]
    temperature: Optional[float] = 0.0
    max_tokens: Optional[int] = 8192
    stream: Optional[bool] = False
    extra_args: Optional[Dict[str, Any]] = None


class ChoiceMessage(BaseModel):
    role: str
    content: str


class Choice(BaseModel):
    index: int
    message: ChoiceMessage
    finish_reason: Optional[str] = "stop"


class ChatResponse(BaseModel):
    id: str
    object: str
    model: str
    choices: List[Choice]


app = FastAPI(title="OpenAI adapter for vLLM DeepSeek-OCR")

# ---------------------------
# Helpers
# ---------------------------
def _load_image_from_item(item: ContentItem) -> Optional[Image.Image]:
    if item.type != "image_url":
        return None
    url = item.image_url.url if item.image_url else ""
    if not url:
        return None
    # 1) data URL (base64)
    if url.startswith("data:"):
        try:
            b64 = url.split(",", 1)[1]
            raw = base64.b64decode(b64)
            return Image.open(io.BytesIO(raw)).convert("RGB")
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Invalid data URL: {e}")
    # 2) http(s) URL
    if url.startswith("http://") or url.startswith("https://"):
        try:
            r = requests.get(url, timeout=10)  # timeout
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "")
            if "image" not in ctype.lower():
                raise HTTPException(status_code=400, detail=f"URL did not return an image (Content-Type={ctype})")
            return Image.open(io.BytesIO(r.content)).convert("RGB")
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to fetch image: {e}")
    # 3) local path / file://
    if url.startswith("file://"):
        url = url[len("file://"):]
    if os.path.exists(url):
        try:
            return Image.open(url).convert("RGB")
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Failed to open local image: {e}")
    raise HTTPException(status_code=400, detail="Unsupported image_url; use data:, http(s), or file path.")


def _extract_prompt_and_images(messages: List[Message]) -> Tuple[str, List[Image.Image]]:
    """
    Convert OpenAI chat messages to an OCR prompt and image list.

    Rules:
    - Use the text from the most recent user message as the prompt (if none, use "Free OCR.").
    - Use all image_urls in that user message as the image set.
    """
    prompt_text = "Free OCR."
    images: List[Image.Image] = []
    # find the last user message
    last_user = None
    for m in messages[::-1]:
        if m.role == "user":
            last_user = m
            break
    if last_user is None:
        return prompt_text, images
    if isinstance(last_user.content, str):
        if last_user.content.strip():
            prompt_text = last_user.content.strip()
    else:
        text_parts = []
        for it in last_user.content:
            if it.type == "text" and it.text:
                text_parts.append(it.text)
            elif it.type == "image_url":
                img = _load_image_from_item(it)
                if img is not None:
                    images.append(img)
        if text_parts:
            prompt_text = "\n".join([t.strip() for t in text_parts if t.strip()])
    return prompt_text, images


def _build_vllm_batches(prompt: str, images: List[Image.Image]) -> List[Dict[str, Any]]:
    # Each image becomes one "<image>\n{prompt}" entry in the batch.
    if not images:
        # If no image is provided, still allow a text-only prompt, but DeepSeek-OCR generally expects an image.
        return [{"prompt": prompt}]
    return [{"prompt": f"<image>\n{prompt}", "multi_modal_data": {"image": im}} for im in images]


# ---------------------------
# Endpoint
# ---------------------------
@app.post("/v1/chat/completions", response_model=ChatResponse)
def chat_completions(req: ChatRequest):
    # parse messages -> prompt + images
    prompt, images = _extract_prompt_and_images(req.messages)
    model_inputs = _build_vllm_batches(prompt, images)

    # Sampling parameters (pass through as needed)
    extra_args = req.extra_args or {}
    # Provide sensible defaults to match the recipe example
    extra_args.setdefault("ngram_size", 30)
    extra_args.setdefault("window_size", 90)
    # whitelist_token_ids examples: <td>, </td>
    extra_args.setdefault("whitelist_token_ids", {128821, 128822})

    sampling_params = SamplingParams(
        temperature=req.temperature or 0.0,
        max_tokens=req.max_tokens or 8192,
        extra_args=extra_args,
        skip_special_tokens=False,
    )

    # This example uses synchronous generation (batch size = number of images)
    try:
        outputs = llm.generate(model_inputs, sampling_params)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"vLLM generate error: {e}")

    # Combine the results from all images into a single assistant message
    # (you could also return multiple choices)
    texts = []
    for out in outputs:
        if not out.outputs:
            texts.append("")
        else:
            texts.append(out.outputs[0].text)
    merged_text = "\n\n".join(texts) if len(texts) > 1 else (texts[0] if texts else "")

    resp = ChatResponse(
        id="chatcmpl-deepseek-ocr-adapter",
        object="chat.completion",
        model=req.model or MODEL_NAME,
        choices=[
            Choice(
                index=0,
                message=ChoiceMessage(role="assistant", content=merged_text),
                finish_reason="stop",
            )
        ],
    )
    return resp


# Run: uvicorn server:app --host 0.0.0.0 --port 8000
```
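For reference, a hypothetical request against this wrapper once it is running on localhost:8000; the JSON shape follows the simplified schemas above rather than the full OpenAI spec:

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "deepseek-ai/DeepSeek-OCR",
        "messages": [
          {
            "role": "user",
            "content": [
              {"type": "image_url", "image_url": {"url": "https://jeroen.github.io/images/testocr.png"}},
              {"type": "text", "text": "Free OCR."}
            ]
          }
        ],
        "max_tokens": 4096
      }'
```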