vLLM backend and multimodal inputs
import requests
import json
from PIL import Image
import base64
import io
import numpy as np
# Triton inference server "generate" endpoint for the vLLM multimodal model.
url = "http://172.17.102.43:8000/v2/models/vlm_model/generate"
# Local image file attached to the request.
# (The unused `stream = True` flag was removed: streaming is controlled by the
# "stream" field inside the request payload, not by this script variable.)
image_pth = 'A.jpg'
# The server expects a JSON body.
headers = {
    "Content-Type": "application/json"
}
def resize_and_base64_encode(image_path, width, height, output_format='png'):
    """Resize an image to (width, height) and return it base64-encoded.

    Args:
        image_path: path to the source image file.
        width: target width in pixels.
        height: target height in pixels.
        output_format: format name passed to PIL when re-encoding
            (default 'png').

    Returns:
        The re-encoded image bytes as a base64 str.
    """
    # Use a context manager so the underlying file handle is closed
    # (the original Image.open was never closed — a resource leak).
    # resize() must happen inside the `with` because PIL loads lazily.
    with Image.open(image_path) as img:
        img_resized = img.resize((width, height), Image.Resampling.LANCZOS)
    img_buffer = io.BytesIO()
    img_resized.save(img_buffer, format=output_format)
    return base64.b64encode(img_buffer.getvalue()).decode('utf-8')
# Downscale to the model's expected input size and inline the image as base64.
# (Removed the unused `prompt` variable — it was never referenced.)
base64_image = resize_and_base64_encode(image_pth, 224, 224)

data = {
    # The prompt must contain the model's image placeholder token so the
    # backend can align the image embeddings with the text — omitting it
    # produces the 400 error "The number of image tokens (0) must be the
    # same as the number of images (1)". NOTE(review): the exact token is
    # model-specific ("<image>" for LLaVA-style models) — confirm against
    # your model's chat template / processor config.
    "text_input": "<image>\nDescribe the content of this image.",
    "image": base64_image,
    "parameters": {
        "stream": False,
        "max_tokens": 256,
        "temperature": 0.7
    }
}
# POST the payload. `json=` lets requests serialize the dict itself (no manual
# json.dumps needed); the timeout keeps the script from hanging forever if the
# server never answers.
response = requests.post(url, headers=headers, json=data, timeout=60)

# Print the parsed body on success; otherwise surface the server's error text,
# which carries the actual failure reason (e.g. the image-token mismatch).
if response.status_code == 200:
    print("Response from API:", response.json())
else:
    print(f"Error: {response.status_code}")
    print(response.text)
Does the vLLM backend support multimodal inputs? The code above is what I use to send the request, but I only get the following 400 response:
Error: 400 {"error":"Error generating stream: The number of image tokens (0) must be the same as the number of images (1)"}
Can't seem to find much documentation on this or examples.
Thank you.