After using AutoAWQ to quantize the model, an error occurs when running inference with the quantized model.
quantization:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AwqConfig, AutoConfig
from awq import AutoAWQForCausalLM
# Use the Chinese version
model_id = './Llama-3.1-Nemotron-70B-Instruct-HF'
# Or, use the original version
# model_id = 'meta-llama/Llama-2-7b-chat-hf'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
model = AutoAWQForCausalLM.from_pretrained(model_id, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model.quantize(tokenizer, quant_config=quant_config)
quantization_config = AwqConfig(
    bits=quant_config["w_bit"],
    group_size=quant_config["q_group_size"],
    zero_point=quant_config["zero_point"],
    version=quant_config["version"].lower(),
).to_dict()
model.model.config.quantization_config = quantization_config
import os
output = "./Llama-3.1-Nemotron-70B-Instruct-HF-awq"
if not os.path.exists(output):
    os.mkdir(output)
model.save_quantized(output)
tokenizer.save_pretrained(output)
inference:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "./Llama-3.1-Nemotron-70B-Instruct-HF-awq"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{"role": "user", "content": "hello"}]
tokenized_message = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
response_token_ids = model.generate(tokenized_message['input_ids'].cuda(), attention_mask=tokenized_message['attention_mask'].cuda(), max_new_tokens=4096, pad_token_id=tokenizer.eos_token_id)
generated_tokens = response_token_ids[:, len(tokenized_message['input_ids'][0]):]
generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
print(generated_text)
But I got this error:
loc("/home/code/pytorch/AutoAWQ/awq/modules/triton/gemm.py":229:35): error: invalid element type in packLLEElements. Expected 'f16' but got 'f32'
loc("/home/code/pytorch/AutoAWQ/awq/modules/triton/gemm.py":229:35): error: invalid element type in packLLEElements. Expected 'f16' but got 'f32'
loc("/home/code/pytorch/AutoAWQ/awq/modules/triton/gemm.py":229:35): error: invalid element type in packLLEElements. Expected 'f16' but got 'f32'
loc("/home/code/pytorch/AutoAWQ/awq/modules/triton/gemm.py":229:35): error: invalid element type in packLLEElements. Expected 'f16' but got 'f32'
.
.
.
loc("/home/xuan/code/pytorch/AutoAWQ/awq/modules/triton/gemm.py":229:35): error: 'llvm.insertvalue' op Type mismatch: cannot insert 'f32' into '!llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>'
env info:
torch 2.4.1
triton 3.0.0
autoawq 0.2.6+cu121 /home/code/pytorch/AutoAWQ
transformers 4.46.0.dev0 /home/code/pytorch/hunningface/transformers
CUDA Version: 12.1
Hi, have you solved this issue?
I have the same problem. Have you solved it?
same error
This looks like a usage problem: you need to respect the values from the original model's config.json to properly configure the AWQ quantization pipeline.
The error suggests that the inference call is asking for fp16 while the model weights are fp32, even though you quantized it to 4-bit AWQ. Why load it that way?
Your line with torch_dtype=torch.float16 is specifically requesting fp16.
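If you want to stay on the plain transformers loading path, here is a minimal sketch of that side of the fix (my assumption: the forced dtype is the only problem; the path below just reuses your quantized output directory). torch_dtype="auto" tells from_pretrained to take the dtype stored in the checkpoint's config.json instead of forcing float16:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Reuse the quantized checkpoint written by save_quantized() above.
model_name = "./Llama-3.1-Nemotron-70B-Instruct-HF-awq"
# "auto" reads the dtype from config.json rather than requesting fp16 explicitly.
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype="auto",
                                             device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)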
Here is an example of proper inference, according to the AWQ documentation, which you should already have been able to reference:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
model_path = "./Llama-3.1-Nemotron-70B-Instruct-HF-awq"
system_message = "You are Llama-3.1-Nemotron-70B-Instruct, incarnated as a powerful AI. You were created by [xuanzhangyang](https://github.com/xuanzhangyang)."
# Load model
model = AutoAWQForCausalLM.from_quantized(model_path,
                                          fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          trust_remote_code=True)
streamer = TextStreamer(tokenizer,
                        skip_prompt=True,
                        skip_special_tokens=True)
# Convert prompt to tokens
prompt_template = """\
<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant"""
prompt = "You're standing on the surface of the Earth. "\
         "You walk one mile south, one mile west and one mile north. "\
         "You end up exactly where you started. Where are you?"
tokens = tokenizer(prompt_template.format(system_message=system_message, prompt=prompt),
                   return_tensors='pt').input_ids.cuda()
# Generate output
generation_output = model.generate(tokens,
                                   streamer=streamer,
                                   max_new_tokens=512)
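If you would rather keep the apply_chat_template flow from your original script, only the loading side needs to change; this is just a rough sketch of how the two pieces could be combined (untested, same paths as above):
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = "./Llama-3.1-Nemotron-70B-Instruct-HF-awq"
model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Same chat-template generation as in the question, driven by the AWQ loader.
messages = [{"role": "user", "content": "hello"}]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True,
                                       return_tensors="pt", return_dict=True)
output_ids = model.generate(inputs["input_ids"].cuda(),
                            attention_mask=inputs["attention_mask"].cuda(),
                            max_new_tokens=512,
                            pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))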
I hope this can help. Best of luck to you.