[Question]: Run multimodal LLM locally
Question Validation
- [X] I have searched both the documentation and discord for an answer.
Question
I am referring to this example: https://www.llamaindex.ai/blog/multimodal-rag-for-advanced-video-processing-with-llamaindex-lancedb-33be4804822e to run a multimodal LLM for a video processing use case. I want to know how to run the multimodal LLM for generation locally. I have highlighted the code block that currently uses the OpenAI API and needs to be replaced so that we can run a multimodal LLM locally.
I am using the embedding model from Huggingface.
Could anyone please help.
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import ImageNode
from pathlib import Path
from moviepy.editor import *
import speech_recognition as sr
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from PIL import Image
import matplotlib.pyplot as plt
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.core import SimpleDirectoryReader
import lancedb
import numpy as np
import os
video_url = "https://www.youtube.com/watch?v=d_qvLDhkg00"
output_video_path = "./video_data/"
output_folder = "./mixed_data/"
output_audio_path = "./mixed_data/output_audio.wav"
filepath = output_video_path + "input_vid.mp4"
Path(output_folder).mkdir(parents=True, exist_ok=True)
def plot_images(image_paths):
    images_shown = 0
    plt.figure(figsize=(16, 9))
    for img_path in image_paths:
        if os.path.isfile(img_path):
            image = Image.open(img_path)
            plt.subplot(2, 3, images_shown + 1)
            plt.imshow(image)
            plt.xticks([])
            plt.yticks([])
            images_shown += 1
            if images_shown >= 7:
                break
def download_video(url, output_path):
    """
    Download a video from a given url and save it to the output path.

    Parameters:
    url (str): The url of the video to download.
    output_path (str): The path to save the video to.

    Returns:
    dict: A dictionary containing the metadata of the video.
    """
    from pytube import YouTube

    yt = YouTube(url)
    metadata = {"Author": yt.author, "Title": yt.title, "Views": yt.views}
    yt.streams.get_highest_resolution().download(
        output_path=output_path, filename="input_vid.mp4"
    )
    return metadata
metadata_vid = download_video(video_url, output_video_path)
def video_to_images(video_path, output_folder):
    """
    Convert a video to a sequence of images and save them to the output folder.

    Parameters:
    video_path (str): The path to the video file.
    output_folder (str): The path to the folder to save the images to.
    """
    clip = VideoFileClip(video_path)
    clip.write_images_sequence(
        os.path.join(output_folder, "frame%04d.png"), fps=0.2  # configure this to control the frame rate
    )
def video_to_audio(video_path, output_audio_path):
    """
    Convert a video to audio and save it to the output path.

    Parameters:
    video_path (str): The path to the video file.
    output_audio_path (str): The path to save the audio to.
    """
    clip = VideoFileClip(video_path)
    audio = clip.audio
    audio.write_audiofile(output_audio_path)
def audio_to_text(audio_path):
    """
    Convert an audio file to text.

    Parameters:
    audio_path (str): The path to the audio file.

    Returns:
    text (str): The text recognized from the audio.
    """
    recognizer = sr.Recognizer()
    audio = sr.AudioFile(audio_path)

    with audio as source:
        # Record the audio data
        audio_data = recognizer.record(source)

        try:
            # Recognize the speech
            text = recognizer.recognize_whisper(audio_data)
        except sr.UnknownValueError:
            print("Speech recognition could not understand the audio.")
        except sr.RequestError as e:
            print(f"Could not request results from service; {e}")

    return text
metadata_vid = download_video(video_url, output_video_path)
video_to_images(filepath, output_folder)
video_to_audio(filepath, output_audio_path)
text_data = audio_to_text(output_audio_path)
with open(output_folder + "output_text.txt", "w") as file:
    file.write(text_data)
    print("Text data saved to file")
    file.close()
os.remove(output_audio_path)
print("Audio file removed")
# Building the multimodal index and vector store
uri = "tmp/sample-lancedb"
text_store = LanceDBVectorStore(uri=uri, table_name="text_collection")
image_store = LanceDBVectorStore(uri=uri, table_name="image_collection")
storage_context = StorageContext.from_defaults(
    vector_store=text_store, image_store=image_store
)

# Create the MultiModal index
documents = SimpleDirectoryReader(output_folder).load_data()
Settings.embed_model = HuggingFaceEmbedding(
model_name="sentence-transformers/clip-ViT-B-32-multilingual-v1"
)
# the MultiModalVectorStoreIndex uses the embedding model configured above
index = MultiModalVectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
)
retriever_engine = index.as_retriever(
    similarity_top_k=5, image_similarity_top_k=5
)
import json
metadata_str = json.dumps(metadata_vid)
qa_tmpl_str = (
"Given the provided information, including relevant images and retrieved context from the video,
accurately and precisely answer the query without any additional prior knowledge.\n"
"Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
"---------------------\n"
"Context: {context_str}\n"
"Metadata for video: {metadata_str} \n"
"---------------------\n"
"Query: {query_str}\n"
"Answer: "
)
def retrieve(retriever_engine, query_str):
    retrieval_results = retriever_engine.retrieve(query_str)

    retrieved_image = []
    retrieved_text = []
    for res_node in retrieval_results:
        if isinstance(res_node.node, ImageNode):
            retrieved_image.append(res_node.node.metadata["file_path"])
        else:
            display_source_node(res_node, source_length=200)
            retrieved_text.append(res_node.text)

    return retrieved_image, retrieved_text
query_str = "Using examples from video, explain all things covered in the video regarding the police activity"
img, txt = retrieve(retriever_engine=retriever_engine, query_str=query_str)
image_documents = SimpleDirectoryReader(
    input_dir=output_folder, input_files=img
).load_data()
context_str = "".join(txt)
plot_images(img)
########################################
# How do I replace this module with a local multimodal LLM? What class should I import?

What are my options if I want to import a multimodal model from Huggingface? How do I use it instead of the code below?

'''
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)

pprint(response_1.text)
'''
To run a multimodal LLM locally and replace the OpenAI API usage in your video processing project, you can use the HuggingFaceLLM class from the llama_index.llms.huggingface module with a model from Huggingface. Here's an example of how to set it up:
from llama_index.llms.huggingface import HuggingFaceLLM
# Initialize the HuggingFaceLLM with your chosen model
llm = HuggingFaceLLM(
model_name="your-model-name-here", # Specify your Huggingface model name
tokenizer_name="your-tokenizer-name-here", # Specify your tokenizer name, if different
context_window=2048, # Adjust based on your model's capabilities
max_new_tokens=256, # Adjust based on your needs
)
# Generate a response
response = llm.complete("Your prompt here")
print(response.text)
Make sure to replace "your-model-name-here" and "your-tokenizer-name-here" with the actual names of the model and tokenizer you wish to use. The context_window and max_new_tokens parameters should be adjusted according to the capabilities and requirements of your chosen model.
Regarding the availability of multimodal models on Huggingface for video processing, it's important to note that while there are models that support multimodal interactions, their direct application to video processing might be limited. The document mentions models like GPT4V, CLIP, and others, but also indicates that video processing capabilities are not directly supported at the moment. This suggests that while these models can be used for multimodal tasks, additional steps or custom integrations might be necessary for processing video content specifically.
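If the goal is a drop-in local replacement that keeps the complete(prompt, image_documents=...) call shape from the blog post, one option to consider is LlamaIndex's Ollama multimodal integration with a locally served LLaVA model. A minimal sketch, assuming the llama-index-multi-modal-llms-ollama package and an Ollama server with the llava model pulled (both assumptions to verify for your setup):

# pip install llama-index-multi-modal-llms-ollama
# and, on the Ollama side: `ollama pull llava`
from llama_index.multi_modal_llms.ollama import OllamaMultiModal

# Local LLaVA served by Ollama instead of the OpenAI API
local_mm_llm = OllamaMultiModal(model="llava")

response = local_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,  # the retrieved frames loaded with SimpleDirectoryReader
)
print(response.text)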
Sources
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- docs/docs/module_guides/models/multi_modal.md
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- llama-index-integrations/embeddings/llama-index-embeddings-huggingface/pyproject.toml
- llama-index-integrations/vector_stores/llama-index-vector-stores-lancedb/llama_index/vector_stores/lancedb/base.py
- docs/docs/examples/llm/huggingface.ipynb
- llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py
- llama-index-legacy/llama_index/legacy/embeddings/huggingface.py
- llama-index-legacy/llama_index/legacy/query_engine/multi_modal.py
- llama-index-legacy/llama_index/legacy/multi_modal_llms/openai.py
@dosubot The example I am referring to: https://www.llamaindex.ai/blog/multimodal-rag-for-advanced-video-processing-with-llamaindex-lancedb-33be4804822e, uses retrieved images from the image vector store along with the prompt as input to the final multimodal LLM shown below:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)

pprint(response_1.text)
Can you please give the code that can do the same with the HuggingFaceLLM, that is, use both the prompt and the images retrieved from the vector store as input to the multimodal llm?
To integrate a local multimodal LLM using Hugging Face models for processing both text and images in your video processing project, follow these steps:

1. Set Up the HuggingFaceLLM: First, ensure you have the necessary packages installed. If not, you might need to install them using pip. Then, import the HuggingFaceLLM class from llama_index.llms.huggingface. Initialize the HuggingFaceLLM with the model and tokenizer of your choice, along with any specific configurations like context window size and maximum new tokens. Here's an example setup:

   from llama_index.llms.huggingface import HuggingFaceLLM

   llm = HuggingFaceLLM(
       model_name="your-model-name-here",
       tokenizer_name="your-tokenizer-name-here",
       context_window=2048,
       max_new_tokens=256,
   )

2. Prepare Multimodal Inputs: Since your project involves video processing, you'll need to handle both text and images. Convert images to a format compatible with your LLM, which might involve encoding them into embeddings or converting them to text descriptions using a separate model (see the sketch after this list).

3. Generate Responses: Use the complete method of your HuggingFaceLLM instance to generate responses. If you've converted images to text, concatenate this text with your other textual inputs before generating a response.

4. Integration and Testing: Integrate this setup into your video processing pipeline, ensuring that the LLM processes both text and image inputs correctly. Test thoroughly to optimize model choice, input preprocessing, and configuration parameters.

This approach allows you to run a multimodal LLM locally, leveraging Hugging Face's models for enhanced control over your data and potentially reducing latency and costs associated with external API calls.
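As a minimal sketch of step 2, assuming the frame paths returned by the retrieve() function above and llama_index's ImageDocument schema, which can carry the image either as a file path or as a base64 string (an assumption to verify against your installed version):

import base64
from llama_index.core.schema import ImageDocument

def frames_to_image_documents(image_paths):
    # Wrap retrieved frame paths as ImageDocuments, embedding the bytes as base64
    # so a downstream multimodal wrapper can read either doc.image or doc.image_path.
    docs = []
    for path in image_paths:
        with open(path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("utf-8")
        docs.append(ImageDocument(image=encoded, image_path=path))
    return docs

# img holds the frame paths returned by retrieve(retriever_engine, query_str) earlier.
image_documents = frames_to_image_documents(img)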
Sources
- [Tracking]: Better support for Local Models
- [Question]: Using Local LLM for Image to Image instead of openai
- [Question]: How to use multi modal large model in local env base llama_index?
- local OpenAILike llm, when extracting database schema with ObjectIndex it always resorts to using OpenAI API
- llama-index-legacy/llama_index/legacy/program/multi_modal_llm_program.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/tests/test_llms_huggingface.py
- docs/docs/examples/multi_modal/multi_modal_video_RAG.ipynb
- docs/docs/module_guides/models/multi_modal.md
- llama-index-legacy/llama_index/legacy/query_engine/multi_modal.py
- llama-index-core/llama_index/core/indices/multi_modal/retriever.py
- llama-index-legacy/llama_index/legacy/indices/multi_modal/retriever.py
@dosubot could you give the code snippet using HuggingFaceLLM which does the same thing as the snippet below:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)

pprint(response_1.text)
Refer to https://www.llamaindex.ai/blog/multimodal-rag-for-advanced-video-processing-with-llamaindex-lancedb-33be4804822e for preparing the multimodal inputs.
To integrate a local multimodal LLM using Hugging Face models for processing both text and images in your video processing project, you'll need to follow a specific approach, since direct support for multimodal inputs (text and images) in a single model might vary. Here's a concise guide to achieve this with a hypothetical Hugging Face model that supports multimodal inputs:

1. Install Required Libraries: Ensure you have the necessary libraries installed. If not, install them using pip:

   pip install transformers llama-index-llms-huggingface

2. Select and Load a Multimodal Model: Choose a Hugging Face model that supports multimodal inputs. Initialize the model and tokenizer with the HuggingFaceLLM class. Adjust the model_name to the one you've selected:

   from llama_index.llms.huggingface import HuggingFaceLLM

   model_name = "your_multimodal_model_name_here"  # Replace with the actual model name

   llm = HuggingFaceLLM(
       model_name=model_name,
       tokenizer_name=model_name,
       context_window=3900,
       max_new_tokens=256,
       device_map="auto",
   )

3. Prepare Inputs: Convert images to a format compatible with your LLM. This might involve encoding them into embeddings or converting them to text descriptions using a separate model.

4. Generate Responses: Use the complete method of your HuggingFaceLLM instance to generate responses. If you've converted images to text, concatenate this text with your other textual inputs before generating a response.

This approach allows you to run a multimodal LLM locally, leveraging Hugging Face's models for enhanced control over your data and potentially reducing latency and costs associated with external API calls. Remember, the exact preprocessing steps for images and the method to include them in your inputs will depend on the specific requirements of the model you choose.
Sources
- [Question]: How to use multi modal large model in local env base llama_index?
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-legacy/llama_index/legacy/readers/file/image_reader.py
- llama-index-legacy/llama_index/legacy/query_engine/multi_modal.py
- llama-index-legacy/llama_index/legacy/llms/vllm.py
- llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py
- llama-index-legacy/llama_index/legacy/llms/nvidia_tensorrt.py
@dosu, what is the code for the complete method of HuggingFaceLLM if my input to the LLM is a text prompt paired with a list of images?
To replace the OpenAI API usage in your code snippet with a local Hugging Face model using HuggingFaceLLM from the llama_index.llms.huggingface module, follow these steps:
1. Install Required Libraries:

   pip install llama-index-core huggingface-hub torch text-generation transformers

2. Initialize the HuggingFaceLLM:

   from llama_index.llms.huggingface import HuggingFaceLLM
   from transformers import BlipProcessor, BlipForConditionalGeneration
   from PIL import Image
   import torch

   # Initialize the HuggingFaceLLM with your chosen model
   model_name = "your_multimodal_model_name_here"  # Replace with the actual model name
   llm = HuggingFaceLLM(
       model_name=model_name,
       tokenizer_name=model_name,
       context_window=3900,
       max_new_tokens=256,
       device_map="auto",
   )

3. Process Images and Generate Captions:

   # Initialize the BlipProcessor and BlipForConditionalGeneration for image processing
   processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
   model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

   # Function to process images and generate captions
   def process_images(image_paths):
       captions = []
       for image_path in image_paths:
           image = Image.open(image_path).convert("RGB")
           inputs = processor(image, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
           out = model.generate(**inputs)
           caption = processor.decode(out[0], skip_special_tokens=True)
           captions.append(caption)
       return captions

4. Prepare the Prompt and Generate a Response:

   # Prepare the prompt and image documents
   qa_tmpl_str = (
       "Given the provided information, including relevant images and retrieved context from the video, "
       "accurately and precisely answer the query without any additional prior knowledge.\n"
       "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
       "---------------------\n"
       "Context: {context_str}\n"
       "Metadata for video: {metadata_str} \n"
       "---------------------\n"
       "Query: {query_str}\n"
       "Answer: "
   )

   context_str = "Your context string here"
   query_str = "Your query string here"
   metadata_str = "Your metadata string here"
   image_documents = ["path/to/image1.png", "path/to/image2.png"]  # List of image paths

   # Process images to generate captions
   image_captions = process_images(image_documents)

   # Combine text and image captions into the prompt
   prompt = qa_tmpl_str.format(
       context_str=context_str, query_str=query_str, metadata_str=metadata_str
   ) + "\nImages: " + ", ".join(image_captions)

   # Generate a response
   response = llm.complete(prompt=prompt)
   print(response.text)
Replace "your_multimodal_model_name_here" with the actual model name you intend to use from Hugging Face. This setup allows you to run a multimodal LLM locally, handling both text and image inputs.
Sources
- [Tracking]: Better support for Local Models
- [Bug]: not able to import HuggingFaceLLM
- local OpenAILike llm, when extracting database schema with ObjectIndex it always resorts to using OpenAI API
- [Bug]: cannot select model other than openai models in openai llm integration
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-legacy/llama_index/legacy/readers/file/image_vision_llm_reader.py
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- docs/docs/module_guides/models/multi_modal.md
@dosu I do not want to convert the images into captions; rather, I want to send a list of images and a text prompt as input to the LLM using HuggingFaceLLM, just like it is done here:

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)
To achieve the same functionality using HuggingFaceLLM from the llama_index.llms.huggingface module, you can follow a similar approach to the OpenAIMultiModal implementation. Below is a step-by-step guide and code example to help you set up and use HuggingFaceLLM for your multimodal LLM use case:
1. Install Required Libraries:

   pip install llama-index-core huggingface-hub torch text-generation transformers

2. Define the HuggingFaceLLM Class:

   from typing import Sequence, Any
   from llama_index.legacy.schema import ImageDocument
   from llama_index.legacy.multi_modal_llms import MultiModalLLM, MultiModalLLMMetadata
   from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
   import torch
   from threading import Thread

   class HuggingFaceLLM(MultiModalLLM):
       @property
       def metadata(self) -> MultiModalLLMMetadata:
           return MultiModalLLMMetadata(
               model_name="huggingface-multimodal-model",
               is_chat_model=False,
               context_window=1024,
               num_output=256,
               num_input_files=10,
               is_function_calling_model=False,
           )

       def __init__(
           self,
           model_name: str,
           tokenizer_name: str,
           context_window: int = 1024,
           max_new_tokens: int = 256,
           device_map: str = "auto",
           **kwargs,
       ):
           self._model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, **kwargs)
           self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
           self.context_window = context_window
           self.max_new_tokens = max_new_tokens
           self.device_map = device_map

           class StopOnTokens(StoppingCriteria):
               def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs: Any) -> bool:
                   return input_ids[0][-1] in self.stopping_ids

           self._stopping_criteria = StoppingCriteriaList([StopOnTokens()])

       def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
           # Implement the logic to process the prompt and image_documents using the HuggingFace model
           # This is a placeholder implementation
           full_prompt = prompt
           inputs = self._tokenizer(full_prompt, return_tensors="pt")
           inputs = inputs.to(self._model.device)
           tokens = self._model.generate(
               **inputs,
               max_new_tokens=self.max_new_tokens,
               stopping_criteria=self._stopping_criteria,
               **kwargs,
           )
           completion_tokens = tokens[0][inputs["input_ids"].size(1):]
           completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
           return CompletionResponse(text=completion)

       def stream_complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponseGen:
           from transformers import TextIteratorStreamer

           full_prompt = prompt
           inputs = self._tokenizer(full_prompt, return_tensors="pt")
           inputs = inputs.to(self._model.device)
           streamer = TextIteratorStreamer(self._tokenizer, skip_prompt=True, skip_special_tokens=True)
           generation_kwargs = dict(
               inputs,
               streamer=streamer,
               max_new_tokens=self.max_new_tokens,
               stopping_criteria=self._stopping_criteria,
               **kwargs,
           )
           thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
           thread.start()

           def gen() -> CompletionResponseGen:
               text = ""
               for x in streamer:
                   text += x
                   yield CompletionResponse(text=text, delta=x)

           return gen()

       def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
           prompt = self.messages_to_prompt(messages)
           completion_response = self.complete(prompt, formatted=True, **kwargs)
           return completion_response_to_chat_response(completion_response)

       def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
           prompt = self.messages_to_prompt(messages)
           completion_response = self.stream_complete(prompt, formatted=True, **kwargs)
           return stream_completion_response_to_chat_response(completion_response)
3. Example Usage:

   # Example usage
   huggingface_llm = HuggingFaceLLM(
       model_name="HuggingFaceH4/zephyr-7b-beta",
       tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
   )
   prompt = "Describe the content of the images"
   image_documents = [
       ImageDocument(image_path="path/to/image1.jpg"),
       ImageDocument(image_path="path/to/image2.jpg"),
   ]
   response = huggingface_llm.complete(prompt, image_documents)
   print(response.text)
This code should help you achieve the same functionality as the OpenAIMultiModal example, but using a local Hugging Face model instead. You can extend this implementation to include other methods and functionalities as needed.
Sources
- llama-index-core/llama_index/core/multi_modal_llms/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-legacy/llama_index/legacy/program/multi_modal_llm_program.py
- llama-index-legacy/llama_index/legacy/multi_modal_llms/base.py
- llama-index-legacy/llama_index/legacy/schema.py
- llama-index-core/tests/program/test_multi_modal_llm_program.py
- docs/docs/examples/multi_modal/llava_demo.ipynb
- docs/docs/examples/multi_modal/replicate_multi_modal.ipynb
@dosubot could you please provide the code for the placeholder part of the complete function mentioned in your answer above. The goal is to use both images and a text prompt as input to the LLM, providing the same functionality as:

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)
def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
    # Implement the logic to process the prompt and image_documents using the HuggingFace model
    # This is a placeholder implementation
    full_prompt = prompt
    inputs = self._tokenizer(full_prompt, return_tensors="pt")
    inputs = inputs.to(self._model.device)

    tokens = self._model.generate(
        **inputs,
        max_new_tokens=self.max_new_tokens,
        stopping_criteria=self._stopping_criteria,
        **kwargs
    )
    completion_tokens = tokens[0][inputs["input_ids"].size(1):]
    completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
    return CompletionResponse(text=completion)
To implement the complete method in the HuggingFaceLLM class to handle both text and image inputs, you can follow the approach used in the OpenAIMultiModal class. This involves preprocessing the images, combining them with the text prompt, and then generating the response using the Hugging Face model.
Here is the complete code for the complete method:
from typing import Sequence, Any
from llama_index.core.multi_modal_llms.base import CompletionResponse, ImageDocument
from transformers import BlipProcessor, BlipForConditionalGeneration
class HuggingFaceLLM:
def __init__(self, model_name: str, tokenizer_name: str, max_new_tokens: int, **kwargs):
self._model = BlipForConditionalGeneration.from_pretrained(model_name)
self._tokenizer = BlipProcessor.from_pretrained(tokenizer_name)
self.max_new_tokens = max_new_tokens
self._model.to(kwargs.get("device", "cpu"))
def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
# Validate inputs
if not isinstance(prompt, str):
raise ValueError("Prompt must be a string.")
if not isinstance(image_documents, list) or not all(isinstance(doc, ImageDocument) for doc in image_documents):
raise ValueError("image_documents must be a list of ImageDocument objects.")
# Process images
images = [self._process_image(doc.image) for doc in image_documents]
# Prepare the model inputs
inputs = self._tokenizer(images, prompt, return_tensors="pt").to(self._model.device)
# Generate the response
tokens = self._model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
**kwargs
)
completion_tokens = tokens[0][inputs["input_ids"].size(1):]
completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
return CompletionResponse(text=completion)
def _process_image(self, image_base64: str):
from PIL import Image
import base64
from io import BytesIO
# Decode the base64 image
image_data = base64.b64decode(image_base64)
image = Image.open(BytesIO(image_data))
if image.mode != "RGB":
image = image.convert("RGB")
return image
# Example usage
image_documents = [ImageDocument(image="base64_image_string_here", text="Image description")]
huggingface_llm = HuggingFaceLLM(model_name="blip-image-captioning-base", tokenizer_name="blip-image-captioning-base", max_new_tokens=150)
response = huggingface_llm.complete(prompt="Describe the image", image_documents=image_documents)
print(response.text)
Explanation:

1. Initialization:
   - The HuggingFaceLLM class is initialized with the model and tokenizer names, and the maximum number of new tokens to generate.
   - The model and tokenizer are loaded using the BlipForConditionalGeneration and BlipProcessor classes from Hugging Face.

2. Complete Method:
   - The complete method validates the inputs to ensure the prompt is a string and the image documents are a list of ImageDocument objects.
   - The images are processed by decoding the base64 strings and converting them to RGB format if necessary.
   - The model inputs are prepared by combining the images and the text prompt using the tokenizer.
   - The model generates the response tokens, which are then decoded to get the final completion text.

3. Image Processing:
   - The _process_image method decodes the base64 image string and converts it to an RGB image using the PIL library.
This implementation should provide the same functionality as the OpenAIMultiModal class, allowing you to process both text and image inputs using a local Hugging Face model.
Sources
- llama-index-legacy/llama_index/legacy/multi_modal_llms/openai.py
- llama-index-core/llama_index/core/multi_modal_llms/base.py
- llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py
- llama-index-core/llama_index/core/program/multi_modal_llm_program.py
- llama-index-legacy/llama_index/legacy/readers/file/image_vision_llm_reader.py
- docs/docs/module_guides/models/multi_modal.md
- docs/docs/use_cases/multimodal.md
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- llama-index-legacy/llama_index/legacy/program/multi_modal_llm_program.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-core/tests/program/test_multi_modal_llm_program.py
@dosubot Please rewrite the code, if my LLM is https://huggingface.co/llava-hf/llava-1.5-7b-hf. Please make the necessary changes.
To replace the OpenAI API usage with the llava-hf/llava-1.5-7b-hf model from Hugging Face for processing both text and image inputs, you can use the following code:
Updated Code:
from typing import Sequence, Any
from llama_index.core.multi_modal_llms.base import CompletionResponse, ImageDocument
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer
import torch
from PIL import Image
import base64
from io import BytesIO
class HuggingFaceLLM:
def __init__(self, model_name: str, tokenizer_name: str, max_new_tokens: int, **kwargs):
self._model = AutoModelForCausalLM.from_pretrained(model_name)
self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
self.max_new_tokens = max_new_tokens
self._model.to(kwargs.get("device", "cpu"))
# Initialize the BLIP processor and model for image captioning
self._blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
self._blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16)
self._blip_model.to(kwargs.get("device", "cpu"))
def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
# Validate inputs
if not isinstance(prompt, str):
raise ValueError("Prompt must be a string.")
if not isinstance(image_documents, list) or not all(isinstance(doc, ImageDocument) for doc in image_documents):
raise ValueError("image_documents must be a list of ImageDocument objects.")
# Process images and generate captions
image_captions = [self._generate_image_caption(doc.image) for doc in image_documents]
# Combine image captions with the prompt
combined_prompt = prompt + " " + " ".join(image_captions)
# Prepare the model inputs
inputs = self._tokenizer(combined_prompt, return_tensors="pt").to(self._model.device)
# Generate the response
tokens = self._model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
**kwargs
)
completion_tokens = tokens[0][inputs["input_ids"].size(1):]
completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
return CompletionResponse(text=completion, raw={"model_output": tokens})
def _generate_image_caption(self, image_base64: str) -> str:
# Decode the base64 image
image_data = base64.b64decode(image_base64)
image = Image.open(BytesIO(image_data))
if image.mode != "RGB":
image = image.convert("RGB")
# Process the image and generate a caption
inputs = self._blip_processor(image, return_tensors="pt").to(self._blip_model.device)
out = self._blip_model.generate(**inputs)
caption = self._blip_processor.decode(out[0], skip_special_tokens=True)
return caption
# Example usage
image_documents = [ImageDocument(image="base64_image_string_here", text="Image description")]
huggingface_llm = HuggingFaceLLM(model_name="llava-hf/llava-1.5-7b-hf", tokenizer_name="llava-hf/llava-1.5-7b-hf", max_new_tokens=150)
prompt = "Describe the content of the images"
response = huggingface_llm.complete(prompt=prompt, image_documents=image_documents)
print(response.text)
Explanation:

1. Initialization:
   - The HuggingFaceLLM class initializes the AutoModelForCausalLM and AutoTokenizer for the llava-hf/llava-1.5-7b-hf model.
   - Additionally, it initializes the BlipProcessor and BlipForConditionalGeneration for image captioning.

2. Image Processing:
   - The _generate_image_caption method decodes the base64 image, processes it using the BlipProcessor, and generates a caption using the BlipForConditionalGeneration model.

3. Combining Prompts:
   - The complete method combines the generated image captions with the text prompt to create a combined prompt.

4. Generating the Response:
   - The combined prompt is tokenized and passed to the llava-hf/llava-1.5-7b-hf model to generate a response.
This setup ensures that the HuggingFaceLLM class can handle both text and image inputs using the llava-hf/llava-1.5-7b-hf model from Hugging Face.
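Since this approach feeds BLIP-generated captions rather than the frames themselves to the language model, here is an additional minimal sketch, assuming the transformers LLaVA classes (AutoProcessor and LlavaForConditionalGeneration, which are not part of the snippets above), of passing a retrieved image directly to llava-hf/llava-1.5-7b-hf together with the prompt:

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

def llava_complete(prompt, image_path, max_new_tokens=256):
    # LLaVA 1.5 expects an <image> placeholder inside a USER/ASSISTANT style prompt.
    image = Image.open(image_path).convert("RGB")
    llava_prompt = f"USER: <image>\n{prompt}\nASSISTANT:"
    inputs = processor(text=llava_prompt, images=image, return_tensors="pt").to(
        model.device, torch.float16
    )
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = processor.decode(output_ids[0], skip_special_tokens=True)
    # The decoded string contains the prompt as well; keep only the assistant's answer.
    return decoded.split("ASSISTANT:")[-1].strip()

# Example: answer the query with the top retrieved frame. qa_tmpl_str, context_str,
# query_str, metadata_str, and img come from the pipeline earlier in this thread.
answer = llava_complete(
    qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    img[0],
)
print(answer)

LLaVA 1.5 was tuned mainly on single-image prompts, so looping over a few retrieved frames and aggregating the answers may work better than stuffing every frame into one call.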
Sources
- local OpenAILike llm, when extracting database schema with ObjectIndex it always resorts to using OpenAI API
- [Question]: How to use an OpenAI compatible API?
- how can I use my model to predict answer?
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- llama-index-core/llama_index/core/schema.py
- llama-index-legacy/llama_index/legacy/schema.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- docs/docs/examples/multi_modal/llava_demo.ipynb
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-legacy/llama_index/legacy/readers/file/image_vision_llm_reader.py
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_caption/base.py
Hi, check here: I built a local multimodal RAG; it uses local LLaVA.