server icon indicating copy to clipboard operation
server copied to clipboard

vLLM backend and multimodal inputs

Open CAlexander0614 opened this issue 10 months ago • 0 comments

import requests
import json
from PIL import Image
import base64
import io
import numpy as np



# Triton generate endpoint for the deployed vLLM multimodal model.
url = "http://172.17.102.43:8000/v2/models/vlm_model/generate"
# NOTE(review): `stream` is never used below — the request payload hard-codes
# "stream": False; either wire this variable into the payload or remove it.
stream = True

# Local path of the image to send with the request.
image_pth = 'A.jpg'

# The body is sent as JSON (see the requests.post call below).
headers = {
    "Content-Type": "application/json"
}

def resize_and_base64_encode(image_path, width, height, output_format='png'):
    """Resize an image file and return the re-encoded bytes as base64 text.

    Args:
        image_path: Path to the source image file.
        width: Target width in pixels.
        height: Target height in pixels.
        output_format: Format name passed to Pillow's save() (default 'png').

    Returns:
        The re-encoded image as a UTF-8 base64 string.
    """
    # Open the image
    img = Image.open(image_path)

    img_resized = img.resize((width, height), Image.Resampling.LANCZOS)

    # JPEG cannot store an alpha channel or palette data; without this guard
    # save() raises "cannot write mode RGBA as JPEG" for such inputs.
    if output_format.lower() in ('jpeg', 'jpg') and img_resized.mode in ('RGBA', 'LA', 'P'):
        img_resized = img_resized.convert('RGB')

    # Re-encode entirely in memory rather than via a temp file on disk.
    img_buffer = io.BytesIO()
    img_resized.save(img_buffer, format=output_format)
    img_bytes = img_buffer.getvalue()

    base64_string = base64.b64encode(img_bytes).decode('utf-8')

    return base64_string

# Resize to the model's expected input resolution before encoding.
base64_image = resize_and_base64_encode(image_pth,224,224)

# NOTE(review): `prompt` is never used — the payload below hard-codes its own
# text_input; either pass this in or delete it.
prompt= "what is a usdot number"

# Request payload for the Triton generate endpoint.
# NOTE(review): the server's 400 ("number of image tokens (0) must be the same
# as the number of images (1)") suggests text_input must contain the model's
# image placeholder token (e.g. "<image>") — confirm against the served
# model's prompt/chat template.
data = {
    "text_input": "Describe the content of this image.",
    "image": base64_image,
    "parameters": {
        "stream": False,
        "max_tokens": 256,
        "temperature": 0.7
    }
}

# Send the generation request. Passing json= lets requests serialize the body
# and set the Content-Type header itself, instead of manual json.dumps().
response = requests.post(url, headers=headers, json=data)

# Check if the request was successful
if response.status_code == 200:
    print("Response from API:", response.json())
else:
    # Surface the server's error body to aid debugging (e.g. the 400 above).
    print(f"Error: {response.status_code}")
    print(response.text)

Does the vLLM backend support multi-modal inputs? Above is the code used to send the request. However, I only get the following 400 response:

Error: 400 {"error":"Error generating stream: The number of image tokens (0) must be the same as the number of images (1)"}

Can't seem to find much documentation on this or examples.

Thank you.

CAlexander0614 avatar Jun 16 '25 10:06 CAlexander0614