gpt-fast
Bug: convert HF model
Bug Report
Description:
I encountered a bug when attempting to convert a model from Hugging Face (HF) using the code below. The issue appears to be related to how the parameters are mapped into the PyTorch model.
Code Implementation:
import re
import torch
from transformers import LlamaForCausalLM, AutoTokenizer
from models.model_configs import transformer_configs
from models.llama import ModelArgs, LLama  # assuming LLama (the local implementation under test) lives alongside ModelArgs

# Load the HF model and tokenizer
model_path = "nickypro/tinyllama-15M-fp32"
model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype="auto", use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

tiny_llama_15M_config = ModelArgs.from_name('tinyllama-15M')
# ModelArgs(block_size=256, vocab_size=32000, n_layer=6, n_head=6, dim=288, intermediate_size=768,
#           n_local_heads=6, head_dim=48, rope_base=10000, norm_eps=1e-05)
mymodel = LLama(tiny_llama_15M_config)
# Convert HF state-dict keys to the local naming scheme
weight_map = {
    "model.embed_tokens.weight": "tok_embeddings.weight",
    "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
    "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
    "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
    "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
    "model.layers.{}.self_attn.rotary_emb.inv_freq": None,
    "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
    "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
    "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
    "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
    "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
    "model.norm.weight": "norm.weight",
    "lm_head.weight": "output.weight",
}
def permute(w, n_head, dim, head_dim):
    # Reorder the rows of the q/k projections so the rotary-embedding layout used by
    # HF (split halves of each head) matches the layout this implementation expects;
    # this mirrors permute() in gpt-fast's scripts/convert_hf_checkpoint.py.
    return (
        w.view(n_head, 2, head_dim // 2, dim)
        .transpose(1, 2)
        .reshape(head_dim * n_head, dim)
    )
hf_path = "tests/tiny_15M_fp32.pt"
torch.save(model.state_dict(), hf_path)
checkpoint = torch.load(hf_path)

final_result = {}
for key, value in checkpoint.items():
    if "layers" in key:
        abstract_key = re.sub(r'(\d+)', '{}', key)
        layer_num = re.search(r'\d+', key).group(0)
        new_key = weight_map[abstract_key]
        if new_key is None:
            continue
        new_key = new_key.format(layer_num)
    else:
        new_key = weight_map[key]
    final_result[new_key] = value
# Fuse the q, k, v projections into a single wqkv weight
for key in tuple(final_result.keys()):
    if "wq" in key:
        q = final_result[key]
        k = final_result[key.replace("wq", "wk")]
        v = final_result[key.replace("wq", "wv")]
        q = permute(q, tiny_llama_15M_config.n_head, tiny_llama_15M_config.dim, tiny_llama_15M_config.head_dim)
        k = permute(k, tiny_llama_15M_config.n_local_heads, tiny_llama_15M_config.dim, tiny_llama_15M_config.head_dim)
        final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
        del final_result[key]
        del final_result[key.replace("wq", "wk")]
        del final_result[key.replace("wq", "wv")]
torch.save(final_result, "tests/model_converted.pth")
mymodel.load_state_dict(torch.load("tests/model_converted.pth"))
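# (Quick sanity check, not in the original script) Compare total parameter counts
# on both sides: fusing q/k/v into wqkv does not change the count, so a mismatch
# here would point to a missed or extra entry in weight_map (tied embeddings, if
# present, could also account for a difference).
hf_params = sum(p.numel() for p in model.parameters())
my_params = sum(p.numel() for p in mymodel.parameters())
print(f"HF parameters: {hf_params}, converted parameters: {my_params}")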
# Input
inputs = tokenizer("I am a student at", return_tensors="pt", return_attention_mask=False)
inp = inputs['input_ids']
T = inp.shape[1]
input_pos = torch.arange(0, T)

# Run the HF model
hf_outputs = model(**inputs)

# Run the converted implementation
mymodel.setup_caches(1, tiny_llama_15M_config.block_size)
output = mymodel(inp, input_pos)
# Output
hf_outputs.logits
tensor([[[ -6.7908, 0.8281, -6.7904, ..., -6.7907, -6.7907, -6.7905],
[ -8.2606, -0.2434, -8.2608, ..., -8.2607, -8.2609, -8.2608],
[-10.8138, -3.1881, -10.8137, ..., -10.8138, -10.8139, -10.8138],
[-11.4940, -0.7831, -11.4936, ..., -11.4939, -11.4938, -11.4937],
[-11.8310, -2.4853, -11.8308, ..., -11.8310, -11.8310, -11.8310],
[ -6.9855, 0.3798, -6.9853, ..., -6.9855, -6.9853, -6.9854]]],
grad_fn=<UnsafeViewBackward0>)
output
tensor([[[ -6.7908, 0.8281, -6.7904, ..., -6.7907, -6.7907, -6.7905],
[ -8.2573, -0.2481, -8.2575, ..., -8.2574, -8.2576, -8.2575],
[-10.8115, -3.1968, -10.8114, ..., -10.8116, -10.8116, -10.8115],
[-11.4960, -0.7812, -11.4957, ..., -11.4959, -11.4959, -11.4957],
[-11.8348, -2.4808, -11.8346, ..., -11.8348, -11.8348, -11.8348],
[ -6.9774, 0.3842, -6.9772, ..., -6.9774, -6.9772, -6.9773]]],
grad_fn=<UnsafeViewBackward0>)
diff = torch.sum(abs(hf_outputs.logits - output), -1)
tensor([[ 0.0000, 99.5280, 80.2527, 58.3247, 99.8532, 236.9683]],
grad_fn=<SumBackward1>)
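For a sharper picture than the summed absolute error above, it may help to look at the per-token maximum difference and to compare against an explicit tolerance. A minimal sketch, using the hf_outputs and output tensors from the script:

# Per-token maximum absolute difference between the two sets of logits
print((hf_outputs.logits - output).abs().amax(dim=-1))

# Fails with a detailed report if the logits differ beyond float32 round-off
torch.testing.assert_close(output, hf_outputs.logits, rtol=1e-4, atol=1e-4)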
I think the bug lies in the key-value (KV) cache, since only the output for the first token matches the HF logits. One way to isolate the cache is sketched below.
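A rough sketch of that check, assuming the mymodel, inp, T, and tiny_llama_15M_config variables from the script above, and that the model accepts single-token slices during decode: run the prompt once as a full prefill and once token by token through the cache, then compare. If the two passes disagree, the cache bookkeeping (or the causal mask indexed by input_pos) is suspect; if they agree but still differ from HF, the problem is upstream of the cache, e.g. the q/k permutation.

# Pass 1: prefill the whole prompt at once
# (assumes setup_caches is safe to call again; in gpt-fast it is a no-op
#  when the caches are already large enough)
mymodel.setup_caches(1, tiny_llama_15M_config.block_size)
full_logits = mymodel(inp, torch.arange(0, T))

# Pass 2: feed one token at a time through the KV cache
# (positions 0..T-1 are simply overwritten in the existing cache)
step_logits = []
for i in range(T):
    step_logits.append(mymodel(inp[:, i:i + 1], torch.tensor([i])))
step_logits = torch.cat(step_logits, dim=1)

print((full_logits - step_logits).abs().amax())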
@Chillee @kit1980
Have you solved this problem? I also found that the tensors are different, and not only for the first token: the logits of all the tokens differ. @Chillee Can you help take a look at this problem?
@vinhtran2611
I have set AutoModelForCausalLM.from_pretrained(torch_dtype=torch.bfloat16) and _load_model(precision=torch.bfloat16), but I get hf_outputs.logits.dtype == torch.float32 and output.dtype == torch.bfloat16. Maybe it's a precision problem.
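If the dtypes really do disagree like that, one way to rule precision in or out (a rough sketch, reusing model_path, inputs, and output from the script above, and assuming the converted model was run in bfloat16) is to load the HF model in the same precision and compare the logits in float32:

import torch
from transformers import AutoModelForCausalLM

# Load the HF model in bfloat16 to match the converted model's precision
hf_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)

# Compare in float32 so the comparison itself does not add rounding error
hf_logits = hf_model(**inputs).logits.float()
my_logits = output.float()
print((hf_logits - my_logits).abs().amax())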