Parameter perturbation and full fine-tuning
Hi @subramen, I'm trying to perturb some dimensions of the parameter values in a few layers (for example, all parameters in layers 0-5), freeze those layers, and then do a full-parameter fine-tune.
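To make that concrete, the perturb-and-freeze step I have in mind looks roughly like this (a minimal sketch, assuming the standard Hugging Face LlamaForCausalLM parameter names; the checkpoint string and the 1e-3 noise scale are just placeholders):

import torch
from transformers import LlamaForCausalLM

model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")  # placeholder; I actually load args.model_name_or_path

# Add small noise to every parameter in layers 0-5, then freeze those layers.
frozen_prefixes = tuple(f"model.layers.{i}." for i in range(6))
with torch.no_grad():
    for name, param in model.named_parameters():
        if name.startswith(frozen_prefixes):
            param.add_(torch.randn_like(param) * 1e-3)  # perturb
            param.requires_grad = False                 # freeze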
I've run into two puzzling problems. The first is that after loading the pre-trained model, I print all of model.named_parameters() together with each parameter's size, and every size shows up as torch.Size([0]). Then, after training starts, I print them again and find (see below) that only input_layernorm.weight and post_attention_layernorm.weight update in each layer during fine-tuning, while parameters like the self_attn q, k, v, and o projections do not. Is this behavior normal?
(P.S. I tried this on both Llama and Llama 2 and got the same results.)
The code I used is shown at the bottom. Thanks in advance.
model.named_parameters() and each param.size():
20315 module.model.layers.9.self_attn.q_proj.weight torch.Size([0])
20316 module.model.layers.9.self_attn.k_proj.weight torch.Size([0])
20317 module.model.layers.9.self_attn.v_proj.weight torch.Size([0])
20318 module.model.layers.9.self_attn.o_proj.weight torch.Size([0])
20319 module.model.layers.9.mlp.gate_proj.weight torch.Size([0])
20320 module.model.layers.9.mlp.up_proj.weight torch.Size([0])
20321 module.model.layers.9.mlp.down_proj.weight torch.Size([0])
20322 module.model.layers.9.input_layernorm.weight torch.Size([5120])
20323 module.model.layers.9.post_attention_layernorm.weight torch.Size([5120])
20324 module.model.layers.10.self_attn.q_proj.weight torch.Size([0])
20325 module.model.layers.10.self_attn.k_proj.weight torch.Size([0])
20326 module.model.layers.10.self_attn.v_proj.weight torch.Size([0])
20327 module.model.layers.10.self_attn.o_proj.weight torch.Size([0])
20328 module.model.layers.10.mlp.gate_proj.weight torch.Size([0])
20329 module.model.layers.10.mlp.up_proj.weight torch.Size([0])
20330 module.model.layers.10.mlp.down_proj.weight torch.Size([0])
20331 module.model.layers.10.input_layernorm.weight torch.Size([5120])
20332 module.model.layers.10.post_attention_layernorm.weight torch.Size([5120])
My code for fine-tuning:
import math
import os

import torch
from torch.utils.data import DataLoader
from transformers import (DataCollatorForSeq2Seq, LlamaForCausalLM,
                          LlamaTokenizer, get_scheduler)
from accelerate import Accelerator
from accelerate.utils import DummyOptim

# set_random_seed, print_rank_0, print_trainable_parameters, create_dataset,
# evaluate and the parsed args come from elsewhere in my script.

def main():
    accelerator = Accelerator(log_with="wandb")
    hps = {"learning_rate": args.learning_rate}
    accelerator.init_trackers(args.wandb_name)
    set_random_seed(args.seed)

    tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path,
                                               fast_tokenizer=True)
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"

    # Load model
    model = LlamaForCausalLM.from_pretrained(args.model_name_or_path)

    # Change some parameters and freeze them.
    layer_name = "model.layers.11.input_layernorm.weight"
    # Loop through all named parameters to freeze the target parameter.
    for name, param in model.named_parameters():
        if name == layer_name:
            param.requires_grad = False
            break  # Stop the loop once the target parameter is found and modified.

    # Ensure the parameter is frozen.
    parameter_frozen = False
    for name, param in model.named_parameters():
        if name == layer_name:
            parameter_frozen = not param.requires_grad  # True if the parameter is frozen.
    # Assert to check that the parameter is indeed frozen.
    assert parameter_frozen, f"Parameter {layer_name} is not frozen"
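    # Sketch: a compact way to list everything that is currently frozen, as a sanity check.
    frozen_names = [n for n, p in model.named_parameters() if not p.requires_grad]
    print(f"frozen parameters: {frozen_names}")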
    with accelerator.main_process_first():
        train_dataset, eval_dataset = create_dataset(
            args.local_rank,  # invalid
            args.data_output_path,
            args.seed,
            args.model_name_or_path,
            args.max_seq_len,
        )
    accelerator.wait_for_everyone()

    train_dataloader = DataLoader(train_dataset,
                                  collate_fn=DataCollatorForSeq2Seq(tokenizer,
                                                                    pad_to_multiple_of=8,
                                                                    return_tensors="pt",
                                                                    padding=True),
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=DataCollatorForSeq2Seq(tokenizer,
                                                                   pad_to_multiple_of=8,
                                                                   return_tensors="pt",
                                                                   padding=True),
                                 batch_size=args.per_device_eval_batch_size)
    print(f'length of en_train: {len(train_dataloader)}\n{len(eval_dataloader)}')

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
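    # Sketch (assumption, not part of my actual run): frozen parameters could also be
    # excluded from the optimizer entirely by additionally filtering on p.requires_grad:
    trainable_grouped_parameters = [  # hypothetical alternative to the groups above
        {
            "params": [p for n, p in model.named_parameters()
                       if p.requires_grad and not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if p.requires_grad and any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]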
    # Adam optimizer
    optimizer_cls = (
        torch.optim.AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )
    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.num_train_epochs * num_update_steps_per_epoch,
    )

    model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, optimizer, lr_scheduler)

    # Train!
    print_rank_0("***** Running training *****", accelerator.process_index)
    for epoch in range(args.num_train_epochs):
        # for epoch in range(start_epoch, args.num_train_epochs):
        current_step = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch, use_cache=False)
            train_loss = outputs.loss
            accelerator.backward(train_loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            accelerator.log({"train_loss": train_loss})
            accelerator.log({"lr": lr_scheduler.get_lr()[0]})
            if step % 400 == 0:
                print_rank_0(f"Epoch is {epoch}, Step is {step}, train_loss is {train_loss.item()}",
                             accelerator.process_index)
                for name, param in model.named_parameters():
                    print(name, param.size())
                print_trainable_parameters(model)
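                # Sketch (assumption): under DeepSpeed ZeRO-3 each rank only holds a
                # flat shard, so param.size() reports torch.Size([0]); gathering the
                # parameter shows its full shape. GatheredParameters is a no-op when
                # weights are not ZeRO-3 partitioned.
                if accelerator.state.deepspeed_plugin is not None:
                    import deepspeed
                    for n, p in model.named_parameters():
                        with deepspeed.zero.GatheredParameters(p, modifier_rank=None):
                            print(n, p.size())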
        ppl, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
        accelerator.log({"eval_loss": eval_loss})
        if accelerator.is_main_process:
            print_rank_0(f"eval_loss: {eval_loss}, ppl: {ppl}", accelerator.process_index)

        if args.output_dir is not None:
            epoch_output_dir = os.path.join(args.output_dir,
                                            f"epoch_{epoch}_eval_loss_{eval_loss:.4f}")
            os.makedirs(epoch_output_dir, exist_ok=True)
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                epoch_output_dir,
                # args.output_dir_huggingface,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
                state_dict=accelerator.get_state_dict(model),
            )
            if accelerator.is_main_process:
                tokenizer.save_pretrained(epoch_output_dir)
                # tokenizer.save_pretrained(args.output_dir_huggingface)

    accelerator.end_training()


if __name__ == "__main__":
    main()