
Inference with the bitsandbytes 4-bit quantized Llama 3.2 vision model, loaded as described in the blog https://huggingface.co/blog/llama32, is taking more time than inference with the default bfloat16 model load.

aabbhishekksr opened this issue on Dec 4, 2024 · 1 comment


import torch
import time
import requests
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

model_id = 'my model path'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 4-bit quantized load
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    # torch_dtype=torch.bfloat16,
    # device_map="cuda",
    quantization_config=bnb_config,
)

# default bf16 load -- note this reassigns `model`, so only one of the two
# loads should be left uncommented when timing
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

processor = AutoProcessor.from_pretrained(model_id)

for i in range(5):
    start = time.time()
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Can you please describe this image in just one sentence?"}
        ]}
    ]

    input_text = processor.apply_chat_template(
        messages, add_generation_prompt=True,
    )
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=70)

    print("Took time: ", time.time() - start)
    print(processor.decode(output[0][inputs["input_ids"].shape[-1]:]))
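As a side note, reading time.time() right after model.generate() also counts CUDA warm-up and any lazy initialization that happens in the first pass, and GPU work is launched asynchronously. Below is a minimal timing sketch (assuming the model, processor, and inputs built in the snippet above, and a CUDA device) that synchronizes the GPU around each call and discards the first warm-up iteration. Even with that accounted for, some per-token slowdown relative to bf16 is generally expected with 4-bit NF4, since the weights are dequantized on the fly during inference.

import time
import torch

timings = []
for i in range(5):
    torch.cuda.synchronize()              # wait for any pending GPU work before starting the clock
    start = time.time()
    output = model.generate(**inputs, max_new_tokens=70)
    torch.cuda.synchronize()              # make sure generation has actually finished on the GPU
    if i > 0:                             # discard the first (warm-up) iteration
        timings.append(time.time() - start)

print("avg seconds per generate call:", sum(timings) / len(timings))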
