[Feature-Request] Support for GPT Vision
I have already tried a solution suggested by @truebit, for this, earlier, without any luck - I have documented my attempt a little here:
Originally posted by @antoan in https://github.com/stanfordnlp/dspy/issues/459#issuecomment-1987333865
Hi @antoan , I believe you would have to pass the path through an image_url argument. Feel free to follow the openAI docs on vision for reference in passing the proper configuration.
I have the same desire to try this out with images. Looking at the GPT class, one current restriction I see is that the __call__ method only takes a single string parameter of 'prompt', which gets passed as a single 'message' (in OpenAI terms) to the LM: [{"role": "user", "content": prompt}]
A way forward would be to allow GPT to accept a list of 'prompts' which are type annotated somehow, and to dynamically construct the messages based on those types, i.e. 'inline base64 image' or an image_url.
Not sure if there is existing machinery within DSPy which can help with this (I've really only looked at this for about an hour... including reading the docs!)
I have the same desire to try this out with images, looking at the GPT class, one current restriction I see is that the
__call__ method only takes a single string parameter of 'prompt', which gets passed as a single 'message' (in OpenAI terms) to the LM: [{"role": "user", "content": prompt}]. A way forward would be to allow GPT to accept a list of 'prompts' which are type annotated somehow, and to dynamically construct the messages based on those types, i.e. 'inline base64 image' or an
image_url.Not sure if these is existing machinery with DSPy which can help with this (i've really only looked at this for 1hr or so... including reading the docs!)
Maybe start here https://github.com/stanfordnlp/dspy/blob/main/dsp/modules/lm.py
https://github.com/stanfordnlp/dspy/pull/675
I have made a PR
here is a potential implementation: https://github.com/stanfordnlp/dspy/blob/56a0949ad285e0a3dd5649de58a6f5fb6f734a60/dsp/modules/gpt4vision.py#L106C1-L147C20
I am also in need of this feature, started with this at https://github.com/stanfordnlp/dspy/pull/1099
Tested this with both Gemini and GPT-4o. If anybody is interested, welcome to try it out!
I have also been looking for a solution to this problem, and saw that someone had written a GPT4Vision class (https://github.com/stanfordnlp/dspy/blob/56a0949ad285e0a3dd5649de58a6f5fb6f734a60/dsp/modules/gpt4vision.py#L106C1-L147C20). But that one is too complicated. This is a simple solution I wrote based on the documentation, and it has actually been tested and is feasible:
import dspy
import requests
import base64
import re
from dsp import LM
def encode_image(image_path):
    """Read the file at *image_path* and return its contents base64-encoded as text."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode('utf-8')
class GPTVision(LM):
    """Minimal OpenAI chat-completions LM wrapper that attaches one image per request.

    The image's filesystem path travels inside the prompt on lines of the form
    ``Image Path: /path/to/img``; :meth:`basic_request` extracts the path,
    strips those lines from the prompt, and sends the image inline as a
    base64 ``image_url`` content part.
    """

    def __init__(self, model, api_key):
        # Bug fix: pass the caller's model through instead of hard-coding
        # "gpt-4o" here and in the request payload below.
        super().__init__(model)
        self.model = model
        self.api_key = api_key
        self.provider = "openai"
        self.history = []
        self.base_url = "https://api.openai.com/v1/chat/completions"

    def basic_request(self, prompt, **kwargs):
        """Send one chat request with the prompt text plus the extracted image.

        Extracts the image file location from the prompt, then removes those
        lines from the prompt before sending.
        """
        pattern = r'^Image Path: .*'
        matches = re.findall(pattern, prompt, re.MULTILINE)
        # NOTE(review): matches[1] presumably skips the field-description line
        # of the dspy-formatted prompt and takes the actual value; this raises
        # IndexError if fewer than two matches exist — confirm against the
        # prompt format before relying on it.
        image_path = matches[1].replace("Image Path: ", "")
        for match in matches:
            prompt = prompt.replace(f"\n{match}\n", "")
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        base64_image = encode_image(image_path)
        data = {
            **kwargs,
            "model": self.model,  # was hard-coded to "gpt-4o"
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }
        # A timeout prevents an unresponsive API from hanging the call forever.
        response = requests.post(self.base_url, headers=headers, json=data, timeout=120)
        response = response.json()
        self.history.append({
            "prompt": prompt,
            "response": response,
            "kwargs": kwargs
        })
        return response

    def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs):
        """Return the list of completion strings for *prompt*."""
        responses = self.request(prompt, **kwargs)
        completions = [choice["message"]["content"] for choice in responses["choices"]]
        return completions
class VqaCoT(dspy.Signature):
    """Answer the questions based on the pictures."""
    # NOTE(review): despite the desc, the demo below passes a filesystem path
    # here; GPTVision.basic_request extracts the "Image Path: ..." line and
    # base64-encodes the file itself — consider updating the desc text.
    image_path = dspy.InputField(desc="Base64 format of the image")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Answer based on image and question")
if __name__ == "__main__":
    # Demo: visual question answering over a local image via GPT-4o.
    vision_lm = GPTVision(model='gpt-4o', api_key="your api key of openai")
    vqa = dspy.ChainOfThought(VqaCoT)
    with dspy.context(lm=vision_lm):
        prediction = vqa(question="What is the occupation of the man in the picture?",
                         image_path="/Users/dream/myProjects/ITS-llm/demo/curry.jpeg")
        print(prediction)
    # Sample output:
    # Prediction(
    #     rationale='Question: What is the occupation of the man in the picture?\n\nReasoning: Let\'s think step by step in order to determine his occupation. We observe that he is wearing a basketball jersey with the team name "Golden State Warriors" and holding a basketball. These indicators suggest that he is likely employed in a profession related to basketball.',
    #     answer='The man is a basketball player.'
    # )
I have also been looking for a solution to this problem, and saw that someone had written a GPT4Vision class (https://github.com/stanfordnlp/dspy/blob/56a0949ad285e0a3dd5649de58a6f5fb6f734a60/dsp/modules/gpt4vision.py#L106C1-L147C20). But that one is too complicated. This is a simple solution I wrote based on the documentation, and it has actually been tested and is feasible:
# Reconstructed from a whitespace-mangled paste (the original was collapsed
# onto a single line and is not valid Python); logic is unchanged.
import dspy
import requests
import base64
import re
from dsp import LM


def encode_image(image_path):
    """Return the base64 text encoding of the file at *image_path*."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


class GPTVision(LM):
    """OpenAI chat-completions LM wrapper that attaches one image per request."""

    def __init__(self, model, api_key):
        super().__init__("gpt-4o")
        self.model = model
        self.api_key = api_key
        self.provider = "openai"
        self.history = []
        self.base_url = "https://api.openai.com/v1/chat/completions"

    def basic_request(self, prompt, **kwargs):
        # Extract the image file location from the prompt, then strip those
        # lines from the prompt before sending.
        pattern = r'^Image Path: .*'
        matches = re.findall(pattern, prompt, re.MULTILINE)
        image_path = matches[1].replace("Image Path: ", "")
        for match in matches:
            prompt = prompt.replace(f"\n{match}\n", "")
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        base64_image = encode_image(image_path)
        data = {
            **kwargs,
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }
        response = requests.post(self.base_url, headers=headers, json=data)
        response = response.json()
        self.history.append({
            "prompt": prompt,
            "response": response,
            "kwargs": kwargs
        })
        return response

    def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs):
        responses = self.request(prompt, **kwargs)
        completions = [choice["message"]["content"] for choice in responses["choices"]]
        return completions


class VqaCoT(dspy.Signature):
    """Answer the questions based on the pictures."""
    image_path = dspy.InputField(desc="Base64 format of the image")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Answer based on image and question")


if __name__ == "__main__":
    gpt4o = GPTVision(model='gpt-4o', api_key="your api key of openai")
    qa = dspy.ChainOfThought(VqaCoT)
    with dspy.context(lm=gpt4o):
        print(qa(question="What is the occupation of the man in the picture?",
                 image_path="/Users/dream/myProjects/ITS-llm/demo/curry.jpeg"))
    # Prediction(
    #     rationale='...',
    #     answer='The man is a basketball player.'
    # )
Thank you
Here is another simple GptVision class using the Azure OpenAI client in case it helps.
import base64
import json
import re
from dsp import LM
from dsp.modules.azure_openai import AzureOpenAI, chat_request
def encode_image(image_path):
    """Base64-encode the raw bytes of the file at *image_path*; return a UTF-8 string."""
    with open(image_path, "rb") as image_file:
        payload = image_file.read()
    encoded = base64.b64encode(payload)
    return encoded.decode('utf-8')
class GptVision(LM):
    """dsp LM wrapper that routes vision prompts through the Azure OpenAI client.

    The image's filesystem path travels inside the prompt on lines of the
    form ``Image Path: ...``; ``basic_request`` pulls the path out, strips
    those lines from the prompt, and sends the image inline as a base64
    ``image_url`` content part.
    """
    def __init__(self, model, api_version, api_base, azure_ad_token_provider, max_tokens):
        super().__init__(model)
        # Reuse the stock AzureOpenAI module purely for its configured client.
        client = AzureOpenAI(
            model=model,
            api_version=api_version,
            api_base=api_base,
            azure_ad_token_provider=azure_ad_token_provider,
            max_tokens=max_tokens,
        )
        self.client = client
        self.history = []
    def basic_request(self, prompt, **kwargs):
        """Send one chat request with the prompt text plus the extracted image."""
        # Keep the caller's kwargs for the history entry before they are merged.
        raw_kwargs = kwargs
        # Lines of the form "Image Path: /path/to/img" mark where the image lives.
        pattern = r'^Image Path: .*'
        matches = re.findall(pattern, prompt, re.MULTILINE)
        # NOTE(review): matches[1] presumably skips the field-description line
        # of the dspy-formatted prompt and takes the actual value; this raises
        # IndexError when fewer than two matches exist — confirm against the
        # prompt format.
        image_path = matches[1].replace("Image Path: ", "")
        for match in matches:
            prompt = prompt.replace(f"\n{match}\n", "")
        base64_image = encode_image(image_path)
        # self.kwargs comes from the LM base class (default generation params);
        # per-call kwargs override the defaults.
        kwargs = {**self.kwargs, **kwargs}
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
        kwargs["messages"] = messages
        # chat_request expects the whole request pre-serialized under this key
        # (matches the stock AzureOpenAI module's calling convention).
        kwargs = {"stringify_request": json.dumps(kwargs)}
        response = chat_request(self.client.client, **kwargs)
        history = {
            "prompt": prompt,
            "response": response,
            "kwargs": kwargs,
            "raw_kwargs": raw_kwargs,
        }
        self.history.append(history)
        return response
    def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs):
        """Return the list of completion strings for *prompt*."""
        responses = self.request(prompt, **kwargs)
        completions = [choice["message"]["content"] for choice in responses["choices"]]
        return completions