Super slow? What did I do wrong?
I'm using it with Docling. I can see the GPU being used, but it's still very slow. This is an M3 Max with 128 GB RAM.
nexa list
┌──────────────────────────────────┬─────────┬───────────┐
│ NAME                             │ SIZE    │ QUANTS    │
├──────────────────────────────────┼─────────┼───────────┤
│ NexaAI/Qwen3-VL-4B-Instruct-GGUF │ 5.8 GiB │ Q4_0,Q4_K │
└──────────────────────────────────┴─────────┴───────────┘

(base) bayes-max:docling kundeng$ nexa serve
Localhosting on http://127.0.0.1:18181/docs/ui
[GIN] 2025/10/24 - 08:13:38 | 400 | 12.514042ms | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:13:38 | 400 | 14.351042ms | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:14:59 | 200 | 24.874519041s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:15:31 | 200 | 31.831121875s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:16:10 | 200 | 38.34948275s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:16:43 | 200 | 33.343695459s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:17:13 | 200 | 30.358976916s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:18:00 | 200 | 46.042449s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:18:39 | 200 | 38.871925334s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:19:45 | 200 | 1m6s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:21:27 | 200 | 1m41s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:23:57 | 200 | 2m30s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:25:30 | 200 | 1m32s | 127.0.0.1 | POST "/v1/chat/completions"
[GIN] 2025/10/24 - 08:26:30 | 200 | 1m0s | 127.0.0.1 | POST "/v1/chat/completions"
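To separate the server's own speed from Docling's page-by-page pipeline, a single page can be timed directly against the endpoint. A rough sketch using the requests package, assuming the server accepts the OpenAI-style image_url content (base64 data URI) that Docling sends; page.png stands in for any rendered page image:

#!/usr/bin/env python3
"""Rough sketch (separate from the wrapper script below): time a single vision
request against the endpoint, outside of Docling. Assumes the server accepts
OpenAI-style image_url content (base64 data URI); page.png is a placeholder."""
import base64
import time

import requests

ENDPOINT = "http://127.0.0.1:18181/v1/chat/completions"  # nexa serve default shown above
MODEL = "NexaAI/Qwen3-VL-4B-Instruct-GGUF"               # name from nexa list

# encode one page image as a data URI, the same payload shape Docling sends
with open("page.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode()

payload = {
    "model": MODEL,
    "max_tokens": 1024,
    "messages": [{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
            {"type": "text", "text": "Convert this page to Markdown."},
        ],
    }],
}

start = time.time()
resp = requests.post(ENDPOINT, json=payload, timeout=300)
resp.raise_for_status()
print(f"{time.time() - start:.1f}s elapsed, usage: {resp.json().get('usage')}")

If a bare request like this is already slow, the bottleneck is the model/server rather than the Docling wrapper.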
The docling code is here:
#!/usr/bin/env python3
"""
Docling + External VLM (Nexa / Qwen3-VL / LM Studio / vLLM)
------------------------------------------------------------
CLI wrapper to let Docling use any OpenAI-compatible multimodal endpoint.
Examples:
uv run python docling_vlm_cli.py \
"https://arxiv.org/pdf/2510.15532" \
--endpoint "http://127.0.0.1:18181/v1/chat/completions" \
--model-id "Qwen3-VL-4B-Instruct-GGUF" \
--format md
"""
import argparse
import json
import os
import sys
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
# --------------------------------------------------------------------------- #
# Argument parsing
# --------------------------------------------------------------------------- #
def build_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(
        description="Run Docling with an external OpenAI-compatible VLM "
                    "(e.g. Nexa / LM Studio / vLLM / Ollama)."
    )
    p.add_argument(
        "source",
        help="Input PDF path or direct PDF URL (e.g. https://arxiv.org/pdf/xxxx.pdf)",
    )
    p.add_argument(
        "--endpoint", "--url",
        dest="endpoint",
        default=os.getenv("VLM_ENDPOINT", "http://localhost:1234/v1/chat/completions"),
        help="OpenAI-compatible /v1/chat/completions endpoint "
             "(env: VLM_ENDPOINT, default: http://localhost:1234/v1/chat/completions)",
    )
    p.add_argument(
        "--model-id",
        default=os.getenv("VLM_MODEL_ID", "qwen3-vl-8b-instruct"),
        help="Model ID exposed by your endpoint (env: VLM_MODEL_ID).",
    )
    p.add_argument(
        "--api-key",
        default=os.getenv("VLM_API_KEY", ""),
        help="API key sent as Authorization: Bearer <key> (env: VLM_API_KEY).",
    )
    p.add_argument(
        "--headers",
        default=os.getenv("VLM_HEADERS_JSON", ""),
        help="Extra HTTP headers as JSON (env: VLM_HEADERS_JSON). "
             "Example: '{\"X-Org\": \"team-ml\"}'",
    )
    p.add_argument("--output-dir", default="outputs", help="Where to write outputs.")
    p.add_argument("--timeout", type=int, default=180, help="Timeout (s) for responses.")
    p.add_argument(
        "--format",
        choices=["md", "html", "doctags"],
        default="md",
        help="Output format: md=Markdown, html=HTML, doctags=DocTags markup.",
    )
    p.add_argument("--temperature", type=float, default=0.0, help="Sampling temperature.")
    p.add_argument("--scale", type=float, default=2.0, help="Layout fidelity scaling.")
    return p
# --------------------------------------------------------------------------- #
# Helpers
# --------------------------------------------------------------------------- #
def parse_headers(api_key: str, headers_json: str) -> dict:
"""Return a safe headers dict (never None)."""
headers = {}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
if headers_json:
try:
extra = json.loads(headers_json)
if not isinstance(extra, dict):
raise ValueError("headers JSON must be an object")
headers.update(extra)
except Exception as e:
raise ValueError(f"Invalid --headers JSON: {e}") from e
return headers or {} # always a dict
# --------------------------------------------------------------------------- #
# Main pipeline
# --------------------------------------------------------------------------- #
def run_docling(args):
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    # map CLI -> Docling enum
    fmt_map = {"md": "MARKDOWN", "html": "HTML", "doctags": "DOCTAGS"}
    fmt = fmt_map.get(args.format.lower(), "MARKDOWN")

    headers = parse_headers(args.api_key, args.headers)

    prompt = (
        "You are a document conversion engine. Convert PDF pages (images + text) "
        "into clean, faithful output.\n"
        "For ALL mathematics: use LaTeX ONLY, no Unicode symbols.\n"
        "Wrap inline math in $...$ and block math in $$...$$.\n"
        "Preserve headings, tables, and layout structure. Do not hallucinate content."
    )

    vlm_opts = ApiVlmOptions(
        url=args.endpoint,
        params={"model": args.model_id},
        headers=headers,  # always a dict (fixed)
        prompt=prompt,
        response_format=getattr(ResponseFormat, fmt),
        temperature=args.temperature,
        timeout=args.timeout,
        scale=args.scale,
    )
    pipeline_opts = VlmPipelineOptions(
        enable_remote_services=True,
        vlm_options=vlm_opts,
    )
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_opts,
            )
        }
    )

    print(f"🚀 Converting {args.source}")
    print(f"   Endpoint: {args.endpoint}")
    print(f"   Model: {args.model_id}")
    if headers:
        print(f"   Headers: {list(headers.keys())}")

    result = converter.convert(source=args.source).document

    out_base = Path(args.output_dir) / Path(args.source).stem
    if args.format == "md":
        out_path = out_base.with_suffix(".md")
        out_path.write_text(result.export_to_markdown(), encoding="utf-8")
    elif args.format == "html":
        out_path = out_base.with_suffix(".html")
        out_path.write_text(result.export_to_html(), encoding="utf-8")
    else:  # doctags (DocTags markup, not JSON)
        out_path = out_base.with_suffix(".doctags")
        out_path.write_text(result.export_to_doctags(), encoding="utf-8")
    print(f"✅ Output written to: {out_path}")
# --------------------------------------------------------------------------- #
# Entrypoint
# --------------------------------------------------------------------------- #
def main():
    parser = build_parser()
    args = parser.parse_args()
    try:
        run_docling(args)
    except KeyboardInterrupt:
        print("\n🛑 Interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    main()
I have a similar issue: it becomes slower after some requests, and VRAM usage keeps increasing until it eats all VRAM and starts spilling into shared memory. I'm on a 4070 Ti Super, Windows CLI. It would be good if there were monitoring/debug options to check VRAM usage, and an option to limit it. (A rough way to watch it from the outside is sketched below.)
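A minimal polling sketch, assuming the nvidia-ml-py package (imported as pynvml):

#!/usr/bin/env python3
"""Minimal sketch: poll GPU memory every 2 s while the server is running.
Assumes the nvidia-ml-py package (pip install nvidia-ml-py, imported as pynvml)."""
import time

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first GPU
try:
    while True:
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print(f"VRAM used: {mem.used / 2**20:7.0f} MiB of {mem.total / 2**20:.0f} MiB")
        time.sleep(2)
except KeyboardInterrupt:
    pass
finally:
    pynvml.nvmlShutdown()

Plain nvidia-smi --query-gpu=memory.used --format=csv -l 2 gives roughly the same picture without any Python.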
Me too: 5070 Ti, also slow.
Same here: the first request runs at around 100 t/s, then it drops to 4 t/s. NexaAI/Qwen3-0.6B-GGUF, Linux, nexa CLI, 4090. Only reloading the model helps (/clear doesn't), and then the same pattern repeats. This happens through nexa infer; via nexa run it's always slow. The sketch below is one way to make the per-request drop visible.
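A rough measurement loop, assuming the default nexa serve port and an OpenAI-compatible endpoint that returns a usage block; the model name and prompt are placeholders to adjust:

#!/usr/bin/env python3
"""Rough sketch: send the same prompt repeatedly and print tokens/sec per request.
Assumes the default nexa serve port and an OpenAI-compatible endpoint that returns
a usage block; model name and prompt are placeholders."""
import time

import requests

ENDPOINT = "http://127.0.0.1:18181/v1/chat/completions"
MODEL = "NexaAI/Qwen3-0.6B-GGUF"

payload = {
    "model": MODEL,
    "messages": [{"role": "user", "content": "Write a haiku about GPUs."}],
    "max_tokens": 128,
    "temperature": 0.0,
}

for i in range(10):
    start = time.time()
    resp = requests.post(ENDPOINT, json=payload, timeout=300)
    elapsed = time.time() - start
    resp.raise_for_status()
    body = resp.json()
    usage = body.get("usage") or {}
    # use the server-reported completion token count if present, otherwise a rough word count
    tokens = usage.get("completion_tokens") or len(body["choices"][0]["message"]["content"].split())
    print(f"request {i + 1}: {elapsed:6.2f}s  ~{tokens / elapsed:6.1f} tok/s")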