LLaVA icon indicating copy to clipboard operation
LLaVA copied to clipboard

[Error] following `model_kwargs` are not used by the model: ['image_sizes'

Open vishalkmr opened this issue 1 year ago • 1 comments

Describe the issue

Issue: i am using the bellow code, a modified version of cli.py code, to query a local image.

import sys
sys.path.append(os.getcwd()+ "/LLaVA/")
import argparse
import torch

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

from PIL import Image

import requests
from PIL import Image
from io import BytesIO
from transformers import TextStreamer


def load_image(image_file):
    if image_file.startswith('http://') or image_file.startswith('https://'):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    return image

model_path="liuhaotian/LLaVA-Lightning-MPT-7B-preview"
model_base=None
load_8bit=False
load_4bit=False
temperature = 0.2
max_new_tokens = 512
image_file = "/localhome/local-vishkumar/gen-ai-app/streams/llava_logo.png"

disable_torch_init()

model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name, load_8bit, load_4bit)

if "llama-2" in model_name.lower():
    conv_mode = "llava_llama_2"
elif "mistral" in model_name.lower():
    conv_mode = "mistral_instruct"
elif "v1.6-34b" in model_name.lower():
    conv_mode = "chatml_direct"
elif "v1" in model_name.lower():
    conv_mode = "llava_v1"
elif "mpt" in model_name.lower():
    conv_mode = "mpt"
else:
    conv_mode = "llava_v0"

if conv_mode is not None and conv_mode != conv_mode:
    print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, conv_mode, conv_mode))
else:
    conv_mode = conv_mode

conv = conv_templates[conv_mode].copy()
if "mpt" in model_name.lower():
    roles = ('user', 'assistant')
else:
    roles = conv.roles

image = load_image(image_file)
image_size = image.size

image_tensor = process_images([image], image_processor, model.config)
if type(image_tensor) is list:
    image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor]
else:
    image_tensor = image_tensor.to(model.device, dtype=torch.float16)


inp = "Describe the image"
print(f"{roles[0]}: {inp}")
print(f"{roles[1]}: ", end="")

if image is not None:
    # first message
    if model.config.mm_use_im_start_end:
        inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
    else:
        inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    image = None
else:
    conv.append_message(conv.roles[0], inp)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        image_sizes=[image_size],
        do_sample=True if temperature > 0 else False,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
        use_cache=True)

outputs = tokenizer.decode(output_ids[0]).strip()
conv.messages[-1][-1] = outputs

print("\n", {"prompt": prompt, "outputs": outputs}, "\n")

Error Message:

user: Describe the image
assistant: Traceback (most recent call last):
  File "/localhome/local-vishkumar/gen-ai-app/src/test.py", line 100, in <module>
    output_ids = model.generate(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/localhome/local-vishkumar/.local/lib/python3.10/site-packages/transformers/generation/utils.py", line 1307, in generate
    self._validate_model_kwargs(model_kwargs.copy())
  File "/localhome/local-vishkumar/.local/lib/python3.10/site-packages/transformers/generation/utils.py", line 1122, in _validate_model_kwargs
    raise ValueError(
ValueError: The following `model_kwargs` are not used by the model: ['image_sizes'] (note: typos in the generate arguments will also show up in this list)

Can anyone please tell me why i am getting this issue??

vishalkmr avatar Feb 14 '24 12:02 vishalkmr

changing the model to model_path="liuhaotian/llava-v1.5-13b" resolved the issue and I'm getting the image description now. Seems like there is bug in LLaVA-Lightning-MPT-7B-preview model path.

vishalkmr avatar Feb 15 '24 07:02 vishalkmr

getting this also to llava-hf/llava-v1.6-mistral-7b-hf

idan-tankel avatar Mar 22 '24 12:03 idan-tankel

getting this error in MPT-7b model too

chanangad avatar Apr 18 '24 15:04 chanangad