[Question]: Run multimodal LLM locally
Question Validation
- [X] I have searched both the documentation and discord for an answer.
Question
I am referring to this example: https://www.llamaindex.ai/blog/multimodal-rag-for-advanced-video-processing-with-llamaindex-lancedb-33be4804822e to run a multimodal LLM for a video processing use case. I want to know how to run the multimodal LLM for generation locally. I have highlighted the code block that currently uses the OpenAI API and needs to be replaced so that we can run a multimodal LLM locally.
I am using the embedding model from Huggingface.
Could anyone please help.
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import ImageNode
from pathlib import Path
from moviepy.editor import *
import speech_recognition as sr
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from PIL import Image
import matplotlib.pyplot as plt
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.core import SimpleDirectoryReader
import lancedb
import numpy as np
import os
video_url = "https://www.youtube.com/watch?v=d_qvLDhkg00"
output_video_path = "./video_data/"
output_folder = "./mixed_data/"
output_audio_path = "./mixed_data/output_audio.wav"
filepath = output_video_path + "input_vid.mp4"
Path(output_folder).mkdir(parents=True, exist_ok=True)
def plot_images(image_paths):
    images_shown = 0
    plt.figure(figsize=(16, 9))
    for img_path in image_paths:
        if os.path.isfile(img_path):
            image = Image.open(img_path)
            plt.subplot(2, 3, images_shown + 1)
            plt.imshow(image)
            plt.xticks([])
            plt.yticks([])
            images_shown += 1
            if images_shown >= 7:
                break
def download_video(url, output_path):
    """
    Download a video from a given url and save it to the output path.

    Parameters:
    url (str): The url of the video to download.
    output_path (str): The path to save the video to.

    Returns:
    dict: A dictionary containing the metadata of the video.
    """
    from pytube import YouTube

    yt = YouTube(url)
    metadata = {"Author": yt.author, "Title": yt.title, "Views": yt.views}
    yt.streams.get_highest_resolution().download(
        output_path=output_path, filename="input_vid.mp4"
    )
    return metadata
metadata_vid = download_video(video_url, output_video_path)
def video_to_images(video_path, output_folder):
    """
    Convert a video to a sequence of images and save them to the output folder.

    Parameters:
    video_path (str): The path to the video file.
    output_folder (str): The path to the folder to save the images to.
    """
    clip = VideoFileClip(video_path)
    clip.write_images_sequence(
        os.path.join(output_folder, "frame%04d.png"), fps=0.2  # configure this to control the frame rate
    )
def video_to_audio(video_path, output_audio_path):
    """
    Convert a video to audio and save it to the output path.

    Parameters:
    video_path (str): The path to the video file.
    output_audio_path (str): The path to save the audio to.
    """
    clip = VideoFileClip(video_path)
    audio = clip.audio
    audio.write_audiofile(output_audio_path)
def audio_to_text(audio_path):
    """
    Convert an audio file to text.

    Parameters:
    audio_path (str): The path to the audio file.

    Returns:
    text (str): The text recognized from the audio.
    """
    recognizer = sr.Recognizer()
    audio = sr.AudioFile(audio_path)

    with audio as source:
        # Record the audio data
        audio_data = recognizer.record(source)

        try:
            # Recognize the speech
            text = recognizer.recognize_whisper(audio_data)
        except sr.UnknownValueError:
            print("Speech recognition could not understand the audio.")
        except sr.RequestError as e:
            print(f"Could not request results from service; {e}")

    return text
metadata_vid = download_video(video_url, output_video_path)
video_to_images(filepath, output_folder)
video_to_audio(filepath, output_audio_path)
text_data = audio_to_text(output_audio_path)
with open(output_folder + "output_text.txt", "w") as file:
    file.write(text_data)
    print("Text data saved to file")
    file.close()
os.remove(output_audio_path)
print("Audio file removed")
# Building the multimodal index and vector store
uri = "tmp/sample-lancedb"
text_store = LanceDBVectorStore(uri=uri, table_name="text_collection")
image_store = LanceDBVectorStore(uri=uri, table_name="image_collection")
storage_context = StorageContext.from_defaults(
    vector_store=text_store, image_store=image_store
)

# Create the MultiModal index
documents = SimpleDirectoryReader(output_folder).load_data()
Settings.embed_model = HuggingFaceEmbedding(
model_name="sentence-transformers/clip-ViT-B-32-multilingual-v1"
)
# the MultiModalVectorStoreIndex uses the embedding model configured above
index = MultiModalVectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
)
retriever_engine = index.as_retriever(
    similarity_top_k=5, image_similarity_top_k=5
)
import json
metadata_str = json.dumps(metadata_vid)
qa_tmpl_str = (
"Given the provided information, including relevant images and retrieved context from the video,
accurately and precisely answer the query without any additional prior knowledge.\n"
"Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
"---------------------\n"
"Context: {context_str}\n"
"Metadata for video: {metadata_str} \n"
"---------------------\n"
"Query: {query_str}\n"
"Answer: "
)
def retrieve(retriever_engine, query_str):
    retrieval_results = retriever_engine.retrieve(query_str)

    retrieved_image = []
    retrieved_text = []
    for res_node in retrieval_results:
        if isinstance(res_node.node, ImageNode):
            retrieved_image.append(res_node.node.metadata["file_path"])
        else:
            display_source_node(res_node, source_length=200)
            retrieved_text.append(res_node.text)

    return retrieved_image, retrieved_text
query_str = "Using examples from video, explain all things covered in the video regarding the police activity"
img, txt = retrieve(retriever_engine=retriever_engine, query_str=query_str)
image_documents = SimpleDirectoryReader(
    input_dir=output_folder, input_files=img
).load_data()
context_str = "".join(txt)
plot_images(img)
########################################
# How do I replace this module with a local multimodal LLM? What class should I import?

What are my options if I want to import a multimodal model from Huggingface? How do I use it instead of the code below?

'''
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)

pprint(response_1.text)
'''
To run a multimodal LLM locally and replace the OpenAI API usage in your video processing project, you can use the HuggingFaceLLM class from the llama_index.llms.huggingface module with a model from Huggingface. Here's an example of how to set it up:
from llama_index.llms.huggingface import HuggingFaceLLM
# Initialize the HuggingFaceLLM with your chosen model
llm = HuggingFaceLLM(
model_name="your-model-name-here", # Specify your Huggingface model name
tokenizer_name="your-tokenizer-name-here", # Specify your tokenizer name, if different
context_window=2048, # Adjust based on your model's capabilities
max_new_tokens=256, # Adjust based on your needs
)
# Generate a response
response = llm.complete("Your prompt here")
print(response.text)
Make sure to replace "your-model-name-here" and "your-tokenizer-name-here" with the actual names of the model and tokenizer you wish to use. The context_window and max_new_tokens parameters should be adjusted according to the capabilities and requirements of your chosen model.
Regarding the availability of multimodal models on Huggingface for video processing, it's important to note that while there are models that support multimodal interactions, their direct application to video processing might be limited. The document mentions models like GPT4V, CLIP, and others, but also indicates that video processing capabilities are not directly supported at the moment. This suggests that while these models can be used for multimodal tasks, additional steps or custom integrations might be necessary for processing video content specifically.
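If the goal is a drop-in local replacement that keeps the complete(prompt, image_documents=...) call shape from the blog post, one option to consider is LlamaIndex's Ollama multimodal integration with a locally served LLaVA model. A minimal sketch, assuming the llama-index-multi-modal-llms-ollama package and an Ollama server with the llava model pulled (both assumptions to verify for your setup):

# pip install llama-index-multi-modal-llms-ollama
# and, on the Ollama side: `ollama pull llava`
from llama_index.multi_modal_llms.ollama import OllamaMultiModal

# Local LLaVA served by Ollama instead of the OpenAI API
local_mm_llm = OllamaMultiModal(model="llava")

response = local_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,  # the retrieved frames loaded with SimpleDirectoryReader
)
print(response.text)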
Sources
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- docs/docs/module_guides/models/multi_modal.md
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- llama-index-integrations/embeddings/llama-index-embeddings-huggingface/pyproject.toml
- llama-index-integrations/vector_stores/llama-index-vector-stores-lancedb/llama_index/vector_stores/lancedb/base.py
- docs/docs/examples/llm/huggingface.ipynb
- llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py
- llama-index-legacy/llama_index/legacy/embeddings/huggingface.py
- llama-index-legacy/llama_index/legacy/query_engine/multi_modal.py
- llama-index-legacy/llama_index/legacy/multi_modal_llms/openai.py
@dosubot The example I am referring to: https://www.llamaindex.ai/blog/multimodal-rag-for-advanced-video-processing-with-llamaindex-lancedb-33be4804822e, uses retrieved images from the image vector store along with the prompt as input to the final multimodal LLM shown below:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)

pprint(response_1.text)
Can you please give the code that can do the same with the HuggingFaceLLM, that is, use both the prompt and the images retrieved from the vector store as input to the multimodal llm?
To integrate a local multimodal LLM using Hugging Face models for processing both text and images in your video processing project, follow these steps:

1. Set Up the HuggingFaceLLM: First, ensure you have the necessary packages installed. If not, you might need to install them using pip. Then, import the HuggingFaceLLM class from llama_index.llms.huggingface. Initialize the HuggingFaceLLM with the model and tokenizer of your choice, along with any specific configurations like context window size and maximum new tokens. Here's an example setup:

   from llama_index.llms.huggingface import HuggingFaceLLM

   llm = HuggingFaceLLM(
       model_name="your-model-name-here",
       tokenizer_name="your-tokenizer-name-here",
       context_window=2048,
       max_new_tokens=256,
   )

2. Prepare Multimodal Inputs: Since your project involves video processing, you'll need to handle both text and images. Convert images to a format compatible with your LLM, which might involve encoding them into embeddings or converting them to text descriptions using a separate model (see the sketch after this list).

3. Generate Responses: Use the complete method of your HuggingFaceLLM instance to generate responses. If you've converted images to text, concatenate this text with your other textual inputs before generating a response.

4. Integration and Testing: Integrate this setup into your video processing pipeline, ensuring that the LLM processes both text and image inputs correctly. Test thoroughly to optimize model choice, input preprocessing, and configuration parameters.

This approach allows you to run a multimodal LLM locally, leveraging Hugging Face's models for enhanced control over your data and potentially reducing latency and costs associated with external API calls.
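As a minimal sketch of step 2, assuming the frame paths returned by the retrieve() function above and llama_index's ImageDocument schema, which can carry the image either as a file path or as a base64 string (an assumption to verify against your installed version):

import base64
from llama_index.core.schema import ImageDocument

def frames_to_image_documents(image_paths):
    # Wrap retrieved frame paths as ImageDocuments, embedding the bytes as base64
    # so a downstream multimodal wrapper can read either doc.image or doc.image_path.
    docs = []
    for path in image_paths:
        with open(path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("utf-8")
        docs.append(ImageDocument(image=encoded, image_path=path))
    return docs

# img holds the frame paths returned by retrieve(retriever_engine, query_str) earlier.
image_documents = frames_to_image_documents(img)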
Sources
- [Tracking]: Better support for Local Models
- [Question]: Using Local LLM for Image to Image instead of openai
- [Question]: How to use multi modal large model in local env base llama_index?
- local OpenAILike llm, when extracting database schema with ObjectIndex it always resorts to using OpenAI API
- llama-index-legacy/llama_index/legacy/program/multi_modal_llm_program.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/tests/test_llms_huggingface.py
- docs/docs/examples/multi_modal/multi_modal_video_RAG.ipynb
- docs/docs/module_guides/models/multi_modal.md
- llama-index-legacy/llama_index/legacy/query_engine/multi_modal.py
- llama-index-core/llama_index/core/indices/multi_modal/retriever.py
- llama-index-legacy/llama_index/legacy/indices/multi_modal/retriever.py
@dosubot could you give the code snippet using HuggingFaceLLM which does the same thing as the snippet below:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)

pprint(response_1.text)
Refer to https://www.llamaindex.ai/blog/multimodal-rag-for-advanced-video-processing-with-llamaindex-lancedb-33be4804822e for preparing the multimodal inputs.
To integrate a local multimodal LLM using Hugging Face models for processing both text and images in your video processing project, you'll need to follow a specific approach, since direct support for multimodal inputs (text and images) in a single model might vary. Here's a concise guide to achieve this with a hypothetical Hugging Face model that supports multimodal inputs:

1. Install Required Libraries: Ensure you have the necessary libraries installed. If not, install them using pip:

   pip install transformers llama-index-llms-huggingface

2. Select and Load a Multimodal Model: Choose a Hugging Face model that supports multimodal inputs. Initialize the model and tokenizer with the HuggingFaceLLM class. Adjust the model_name to the one you've selected:

   from llama_index.llms.huggingface import HuggingFaceLLM

   model_name = "your_multimodal_model_name_here"  # Replace with the actual model name

   llm = HuggingFaceLLM(
       model_name=model_name,
       tokenizer_name=model_name,
       context_window=3900,
       max_new_tokens=256,
       device_map="auto",
   )

3. Prepare Inputs: Convert images to a format compatible with your LLM. This might involve encoding them into embeddings or converting them to text descriptions using a separate model.

4. Generate Responses: Use the complete method of your HuggingFaceLLM instance to generate responses. If you've converted images to text, concatenate this text with your other textual inputs before generating a response.

This approach allows you to run a multimodal LLM locally, leveraging Hugging Face's models for enhanced control over your data and potentially reducing latency and costs associated with external API calls. Remember, the exact preprocessing steps for images and the method to include them in your inputs will depend on the specific requirements of the model you choose.
Sources
- [Question]: How to use multi modal large model in local env base llama_index?
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-legacy/llama_index/legacy/readers/file/image_reader.py
- llama-index-legacy/llama_index/legacy/query_engine/multi_modal.py
- llama-index-legacy/llama_index/legacy/llms/vllm.py
- llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py
- llama-index-legacy/llama_index/legacy/llms/nvidia_tensorrt.py
@dosu, what is the code for the complete method of HuggingFaceLLM if my input to the LLM is a text prompt paired with a list of images?
To replace the OpenAI API usage in your code snippet with a local Hugging Face model using HuggingFaceLLM from the llama_index.llms.huggingface module, follow these steps:
1. Install Required Libraries:

   pip install llama-index-core huggingface-hub torch text-generation transformers

2. Initialize the HuggingFaceLLM:

   from llama_index.llms.huggingface import HuggingFaceLLM
   from transformers import BlipProcessor, BlipForConditionalGeneration
   from PIL import Image
   import torch

   # Initialize the HuggingFaceLLM with your chosen model
   model_name = "your_multimodal_model_name_here"  # Replace with the actual model name
   llm = HuggingFaceLLM(
       model_name=model_name,
       tokenizer_name=model_name,
       context_window=3900,
       max_new_tokens=256,
       device_map="auto",
   )

3. Process Images and Generate Captions:

   # Initialize the BlipProcessor and BlipForConditionalGeneration for image processing
   processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
   model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

   # Function to process images and generate captions
   def process_images(image_paths):
       captions = []
       for image_path in image_paths:
           image = Image.open(image_path).convert("RGB")
           inputs = processor(image, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
           out = model.generate(**inputs)
           caption = processor.decode(out[0], skip_special_tokens=True)
           captions.append(caption)
       return captions

4. Prepare the Prompt and Generate a Response:

   # Prepare the prompt and image documents
   qa_tmpl_str = (
       "Given the provided information, including relevant images and retrieved context from the video, "
       "accurately and precisely answer the query without any additional prior knowledge.\n"
       "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
       "---------------------\n"
       "Context: {context_str}\n"
       "Metadata for video: {metadata_str} \n"
       "---------------------\n"
       "Query: {query_str}\n"
       "Answer: "
   )

   context_str = "Your context string here"
   query_str = "Your query string here"
   metadata_str = "Your metadata string here"
   image_documents = ["path/to/image1.png", "path/to/image2.png"]  # List of image paths

   # Process images to generate captions
   image_captions = process_images(image_documents)

   # Combine text and image captions into the prompt
   prompt = qa_tmpl_str.format(
       context_str=context_str, query_str=query_str, metadata_str=metadata_str
   ) + "\nImages: " + ", ".join(image_captions)

   # Generate a response
   response = llm.complete(prompt=prompt)
   print(response.text)
Replace "your_multimodal_model_name_here" with the actual model name you intend to use from Hugging Face. This setup allows you to run a multimodal LLM locally, handling both text and image inputs.
Sources
- [Tracking]: Better support for Local Models
- [Bug]: not able to import HuggingFaceLLM
- local OpenAILike llm, when extracting database schema with ObjectIndex it always resorts to using OpenAI API
- [Bug]: cannot select model other than openai models in openai llm integration
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-legacy/llama_index/legacy/readers/file/image_vision_llm_reader.py
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- docs/docs/module_guides/models/multi_modal.md
@dosu I do not want to convert the images into captions; rather, I want to send a list of images and a text prompt as input to the LLM using HuggingFaceLLM, just like it is done here:

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)
To achieve the same functionality using HuggingFaceLLM from the llama_index.llms.huggingface module, you can follow a similar approach to the OpenAIMultiModal implementation. Below is a step-by-step guide and code example to help you set up and use HuggingFaceLLM for your multimodal LLM use case:
1. Install Required Libraries:

   pip install llama-index-core huggingface-hub torch text-generation transformers

2. Define the HuggingFaceLLM Class:

   from typing import Sequence, Any
   from llama_index.legacy.schema import ImageDocument
   from llama_index.legacy.multi_modal_llms import MultiModalLLM, MultiModalLLMMetadata
   from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
   import torch
   from threading import Thread

   class HuggingFaceLLM(MultiModalLLM):
       @property
       def metadata(self) -> MultiModalLLMMetadata:
           return MultiModalLLMMetadata(
               model_name="huggingface-multimodal-model",
               is_chat_model=False,
               context_window=1024,
               num_output=256,
               num_input_files=10,
               is_function_calling_model=False,
           )

       def __init__(
           self,
           model_name: str,
           tokenizer_name: str,
           context_window: int = 1024,
           max_new_tokens: int = 256,
           device_map: str = "auto",
           **kwargs,
       ):
           self._model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, **kwargs)
           self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
           self.context_window = context_window
           self.max_new_tokens = max_new_tokens
           self.device_map = device_map

           class StopOnTokens(StoppingCriteria):
               def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs: Any) -> bool:
                   return input_ids[0][-1] in self.stopping_ids

           self._stopping_criteria = StoppingCriteriaList([StopOnTokens()])

       def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
           # Implement the logic to process the prompt and image_documents using the HuggingFace model
           # This is a placeholder implementation
           full_prompt = prompt
           inputs = self._tokenizer(full_prompt, return_tensors="pt")
           inputs = inputs.to(self._model.device)
           tokens = self._model.generate(
               **inputs,
               max_new_tokens=self.max_new_tokens,
               stopping_criteria=self._stopping_criteria,
               **kwargs,
           )
           completion_tokens = tokens[0][inputs["input_ids"].size(1):]
           completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
           return CompletionResponse(text=completion)

       def stream_complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponseGen:
           from transformers import TextIteratorStreamer

           full_prompt = prompt
           inputs = self._tokenizer(full_prompt, return_tensors="pt")
           inputs = inputs.to(self._model.device)
           streamer = TextIteratorStreamer(self._tokenizer, skip_prompt=True, skip_special_tokens=True)
           generation_kwargs = dict(
               inputs,
               streamer=streamer,
               max_new_tokens=self.max_new_tokens,
               stopping_criteria=self._stopping_criteria,
               **kwargs,
           )
           thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
           thread.start()

           def gen() -> CompletionResponseGen:
               text = ""
               for x in streamer:
                   text += x
                   yield CompletionResponse(text=text, delta=x)

           return gen()

       def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
           prompt = self.messages_to_prompt(messages)
           completion_response = self.complete(prompt, formatted=True, **kwargs)
           return completion_response_to_chat_response(completion_response)

       def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
           prompt = self.messages_to_prompt(messages)
           completion_response = self.stream_complete(prompt, formatted=True, **kwargs)
           return stream_completion_response_to_chat_response(completion_response)
3. Example Usage:

   # Example usage
   huggingface_llm = HuggingFaceLLM(
       model_name="HuggingFaceH4/zephyr-7b-beta",
       tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
   )
   prompt = "Describe the content of the images"
   image_documents = [
       ImageDocument(image_path="path/to/image1.jpg"),
       ImageDocument(image_path="path/to/image2.jpg"),
   ]
   response = huggingface_llm.complete(prompt, image_documents)
   print(response.text)
This code should help you achieve the same functionality as the OpenAIMultiModal example, but using a local Hugging Face model instead. You can extend this implementation to include other methods and functionalities as needed.
Sources
- llama-index-core/llama_index/core/multi_modal_llms/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-legacy/llama_index/legacy/program/multi_modal_llm_program.py
- llama-index-legacy/llama_index/legacy/multi_modal_llms/base.py
- llama-index-legacy/llama_index/legacy/schema.py
- llama-index-core/tests/program/test_multi_modal_llm_program.py
- docs/docs/examples/multi_modal/llava_demo.ipynb
- docs/docs/examples/multi_modal/replicate_multi_modal.ipynb
@dosubot could you please provide the code for the placeholder part of the complete function mentioned in your answer above. The goal is to use both images and a text prompt as input to the LLM, providing the same functionality as:

openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-turbo",
    api_key="sk-proj-4jX30IwiomEZDezL9pJJT3BlbkFJefAyiBjUhXUlYYW1kFdL",
    max_new_tokens=1500,
)

response_1 = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    image_documents=image_documents,
)
def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
    # Implement the logic to process the prompt and image_documents using the HuggingFace model
    # This is a placeholder implementation
    full_prompt = prompt
    inputs = self._tokenizer(full_prompt, return_tensors="pt")
    inputs = inputs.to(self._model.device)

    tokens = self._model.generate(
        **inputs,
        max_new_tokens=self.max_new_tokens,
        stopping_criteria=self._stopping_criteria,
        **kwargs
    )
    completion_tokens = tokens[0][inputs["input_ids"].size(1):]
    completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
    return CompletionResponse(text=completion)
To implement the complete method in the HuggingFaceLLM class to handle both text and image inputs, you can follow the approach used in the OpenAIMultiModal class. This involves preprocessing the images, combining them with the text prompt, and then generating the response using the Hugging Face model.
Here is the complete code for the complete method:
from typing import Sequence, Any
from llama_index.core.multi_modal_llms.base import CompletionResponse, ImageDocument
from transformers import BlipProcessor, BlipForConditionalGeneration
class HuggingFaceLLM:
def __init__(self, model_name: str, tokenizer_name: str, max_new_tokens: int, **kwargs):
self._model = BlipForConditionalGeneration.from_pretrained(model_name)
self._tokenizer = BlipProcessor.from_pretrained(tokenizer_name)
self.max_new_tokens = max_new_tokens
self._model.to(kwargs.get("device", "cpu"))
def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
# Validate inputs
if not isinstance(prompt, str):
raise ValueError("Prompt must be a string.")
if not isinstance(image_documents, list) or not all(isinstance(doc, ImageDocument) for doc in image_documents):
raise ValueError("image_documents must be a list of ImageDocument objects.")
# Process images
images = [self._process_image(doc.image) for doc in image_documents]
# Prepare the model inputs
inputs = self._tokenizer(images, prompt, return_tensors="pt").to(self._model.device)
# Generate the response
tokens = self._model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
**kwargs
)
completion_tokens = tokens[0][inputs["input_ids"].size(1):]
completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
return CompletionResponse(text=completion)
def _process_image(self, image_base64: str):
from PIL import Image
import base64
from io import BytesIO
# Decode the base64 image
image_data = base64.b64decode(image_base64)
image = Image.open(BytesIO(image_data))
if image.mode != "RGB":
image = image.convert("RGB")
return image
# Example usage
image_documents = [ImageDocument(image="base64_image_string_here", text="Image description")]
huggingface_llm = HuggingFaceLLM(model_name="blip-image-captioning-base", tokenizer_name="blip-image-captioning-base", max_new_tokens=150)
response = huggingface_llm.complete(prompt="Describe the image", image_documents=image_documents)
print(response.text)
Explanation:

1. Initialization:
   - The HuggingFaceLLM class is initialized with the model and tokenizer names, and the maximum number of new tokens to generate.
   - The model and tokenizer are loaded using the BlipForConditionalGeneration and BlipProcessor classes from Hugging Face.

2. Complete Method:
   - The complete method validates the inputs to ensure the prompt is a string and the image documents are a list of ImageDocument objects.
   - The images are processed by decoding the base64 strings and converting them to RGB format if necessary.
   - The model inputs are prepared by combining the images and the text prompt using the tokenizer.
   - The model generates the response tokens, which are then decoded to get the final completion text.

3. Image Processing:
   - The _process_image method decodes the base64 image string and converts it to an RGB image using the PIL library.
This implementation should provide the same functionality as the OpenAIMultiModal class, allowing you to process both text and image inputs using a local Hugging Face model.
Sources
- llama-index-legacy/llama_index/legacy/multi_modal_llms/openai.py
- llama-index-core/llama_index/core/multi_modal_llms/base.py
- llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py
- llama-index-core/llama_index/core/program/multi_modal_llm_program.py
- llama-index-legacy/llama_index/legacy/readers/file/image_vision_llm_reader.py
- docs/docs/module_guides/models/multi_modal.md
- docs/docs/use_cases/multimodal.md
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- llama-index-legacy/llama_index/legacy/program/multi_modal_llm_program.py
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-core/tests/program/test_multi_modal_llm_program.py
@dosubot Please rewrite the code, if my LLM is https://huggingface.co/llava-hf/llava-1.5-7b-hf. Please make the necessary changes.
To replace the OpenAI API usage with the llava-hf/llava-1.5-7b-hf model from Hugging Face for processing both text and image inputs, you can use the following code:
Updated Code:
from typing import Sequence, Any
from llama_index.core.multi_modal_llms.base import CompletionResponse, ImageDocument
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer
import torch
from PIL import Image
import base64
from io import BytesIO
class HuggingFaceLLM:
def __init__(self, model_name: str, tokenizer_name: str, max_new_tokens: int, **kwargs):
self._model = AutoModelForCausalLM.from_pretrained(model_name)
self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
self.max_new_tokens = max_new_tokens
self._model.to(kwargs.get("device", "cpu"))
# Initialize the BLIP processor and model for image captioning
self._blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
self._blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16)
self._blip_model.to(kwargs.get("device", "cpu"))
def complete(self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any) -> CompletionResponse:
# Validate inputs
if not isinstance(prompt, str):
raise ValueError("Prompt must be a string.")
if not isinstance(image_documents, list) or not all(isinstance(doc, ImageDocument) for doc in image_documents):
raise ValueError("image_documents must be a list of ImageDocument objects.")
# Process images and generate captions
image_captions = [self._generate_image_caption(doc.image) for doc in image_documents]
# Combine image captions with the prompt
combined_prompt = prompt + " " + " ".join(image_captions)
# Prepare the model inputs
inputs = self._tokenizer(combined_prompt, return_tensors="pt").to(self._model.device)
# Generate the response
tokens = self._model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
**kwargs
)
completion_tokens = tokens[0][inputs["input_ids"].size(1):]
completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)
return CompletionResponse(text=completion, raw={"model_output": tokens})
def _generate_image_caption(self, image_base64: str) -> str:
# Decode the base64 image
image_data = base64.b64decode(image_base64)
image = Image.open(BytesIO(image_data))
if image.mode != "RGB":
image = image.convert("RGB")
# Process the image and generate a caption
inputs = self._blip_processor(image, return_tensors="pt").to(self._blip_model.device)
out = self._blip_model.generate(**inputs)
caption = self._blip_processor.decode(out[0], skip_special_tokens=True)
return caption
# Example usage
image_documents = [ImageDocument(image="base64_image_string_here", text="Image description")]
huggingface_llm = HuggingFaceLLM(model_name="llava-hf/llava-1.5-7b-hf", tokenizer_name="llava-hf/llava-1.5-7b-hf", max_new_tokens=150)
prompt = "Describe the content of the images"
response = huggingface_llm.complete(prompt=prompt, image_documents=image_documents)
print(response.text)
Explanation:

1. Initialization:
   - The HuggingFaceLLM class initializes the AutoModelForCausalLM and AutoTokenizer for the llava-hf/llava-1.5-7b-hf model.
   - Additionally, it initializes the BlipProcessor and BlipForConditionalGeneration for image captioning.

2. Image Processing:
   - The _generate_image_caption method decodes the base64 image, processes it using the BlipProcessor, and generates a caption using the BlipForConditionalGeneration model.

3. Combining Prompts:
   - The complete method combines the generated image captions with the text prompt to create a combined prompt.

4. Generating the Response:
   - The combined prompt is tokenized and passed to the llava-hf/llava-1.5-7b-hf model to generate a response.
This setup ensures that the HuggingFaceLLM class can handle both text and image inputs using the llava-hf/llava-1.5-7b-hf model from Hugging Face.
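Since this approach feeds BLIP-generated captions rather than the frames themselves to the language model, here is an additional minimal sketch, assuming the transformers LLaVA classes (AutoProcessor and LlavaForConditionalGeneration, which are not part of the snippets above), of passing a retrieved image directly to llava-hf/llava-1.5-7b-hf together with the prompt:

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

def llava_complete(prompt, image_path, max_new_tokens=256):
    # LLaVA 1.5 expects an <image> placeholder inside a USER/ASSISTANT style prompt.
    image = Image.open(image_path).convert("RGB")
    llava_prompt = f"USER: <image>\n{prompt}\nASSISTANT:"
    inputs = processor(text=llava_prompt, images=image, return_tensors="pt").to(
        model.device, torch.float16
    )
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = processor.decode(output_ids[0], skip_special_tokens=True)
    # The decoded string contains the prompt as well; keep only the assistant's answer.
    return decoded.split("ASSISTANT:")[-1].strip()

# Example: answer the query with the top retrieved frame. qa_tmpl_str, context_str,
# query_str, metadata_str, and img come from the pipeline earlier in this thread.
answer = llava_complete(
    qa_tmpl_str.format(
        context_str=context_str, query_str=query_str, metadata_str=metadata_str
    ),
    img[0],
)
print(answer)

LLaVA 1.5 was tuned mainly on single-image prompts, so looping over a few retrieved frames and aggregating the answers may work better than stuffing every frame into one call.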
Sources
- local OpenAILike llm, when extracting database schema with ObjectIndex it always resorts to using OpenAI API
- [Question]: How to use an OpenAI compatible API?
- how can I use my model to predict answer?
- llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py
- llama-index-integrations/llms/llama-index-llms-huggingface/pyproject.toml
- llama-index-core/llama_index/core/schema.py
- llama-index-legacy/llama_index/legacy/schema.py
- llama-index-legacy/llama_index/legacy/llms/huggingface.py
- docs/docs/examples/multi_modal/llava_demo.ipynb
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_vision_llm/base.py
- llama-index-legacy/llama_index/legacy/readers/file/image_vision_llm_reader.py
- llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/image_caption/base.py
Hi, check here: I built a local multimodal RAG; it uses local LLaVA.