Inference with the bitsandbytes 4-bit quantized Llama 3.2 vision model, set up as per the blog https://huggingface.co/blog/llama32, is taking more time than inference with the default (bfloat16) model load.
import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor
from transformers import BitsAndBytesConfig
import time
from PIL import Image
import requests
model_id = 'my model path'
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Variant 1: 4-bit quantized load via bitsandbytes
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    # torch_dtype=torch.bfloat16,
    # device_map="cuda",
    quantization_config=bnb_config,
)
# Variant 2: default bfloat16 load (as written, this overwrites the quantized model above)
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

processor = AutoProcessor.from_pretrained(model_id)
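Only one of the two loads is meant to be active per run; something like the toggle below keeps the comparison clean (use_4bit is just a placeholder name, not something from the blog):

# Sketch: toggle so each run loads and benchmarks exactly one variant.
use_4bit = True  # assumed flag; set to False for the plain bfloat16 load

if use_4bit:
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,  # bitsandbytes places the 4-bit weights on GPU
    )
else:
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
    )

processor = AutoProcessor.from_pretrained(model_id)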
for i in range(5):
    start = time.time()
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Can you please describe this image in just one sentence?"}
        ]}
    ]
    input_text = processor.apply_chat_template(
        messages, add_generation_prompt=True,
    )
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=70)
    print("Took time: ", time.time() - start)
    print(processor.decode(output[0][inputs["input_ids"].shape[-1]:]))
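The wall-clock timing above also includes the image download and preprocessing, and CUDA kernels launch asynchronously, so the first iteration and the lack of torch.cuda.synchronize() can skew the numbers. A rough sketch that isolates generate() with a warm-up pass (the helper name time_generate is mine, not from the blog):

def time_generate(model, processor, image, messages, n_runs=5):
    # Build the inputs once so only generation is timed.
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

    # Warm-up pass: the first call pays one-off kernel/setup overhead, especially for the 4-bit model.
    model.generate(**inputs, max_new_tokens=70)

    times = []
    for _ in range(n_runs):
        torch.cuda.synchronize()
        start = time.time()
        model.generate(**inputs, max_new_tokens=70)
        torch.cuda.synchronize()  # wait for the GPU to finish before reading the clock
        times.append(time.time() - start)
    return times

print(time_generate(model, processor, image, messages))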