
Parameter perturbation and full finetuning

Open · waterluck opened this issue 11 months ago · 0 comments

Hi @subramen, I'm trying to perturb some dimensions of the parameter values in a few layers (e.g., all parameters in layers 0-5), freeze them, and then do a full-parameter finetuning.
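
For context, this is roughly the perturb-and-freeze step I have in mind before the full finetuning (a minimal sketch, not my actual script; the 0.01 noise scale, the layer range, and model_name_or_path are just placeholders):

    import torch
    from transformers import LlamaForCausalLM

    model = LlamaForCausalLM.from_pretrained(model_name_or_path)

    # Perturb every parameter in layers 0-5 with small Gaussian noise, then freeze them;
    # all remaining parameters stay trainable for the full finetuning.
    with torch.no_grad():
        for name, param in model.named_parameters():
            if any(name.startswith(f"model.layers.{i}.") for i in range(6)):
                param.add_(0.01 * torch.randn_like(param))  # illustrative perturbation
                param.requires_grad = False                 # freeze the perturbed layers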

I ran into two puzzling problems. The first is that after loading the pre-trained model, I print model.named_parameters() and each parameter's size, and they all show as torch.Size([0]); after training starts I print them again and find (see below) that only input_layernorm.weight and post_attention_layernorm.weight update in each layer during finetuning, while parameters like the self_attn q, k, v, and o projections do not. Is this behavior normal? (I tried this on both Llama and Llama 2 and got the same results.)

Also, the code I used is shown at the bottom. Thanks in advance.

model.named_parameters() and param.size() output (see the sketch after this listing for how I print and gather these):

20315 module.model.layers.9.self_attn.q_proj.weight torch.Size([0])
20316 module.model.layers.9.self_attn.k_proj.weight torch.Size([0])
20317 module.model.layers.9.self_attn.v_proj.weight torch.Size([0])
20318 module.model.layers.9.self_attn.o_proj.weight torch.Size([0])
20319 module.model.layers.9.mlp.gate_proj.weight torch.Size([0])
20320 module.model.layers.9.mlp.up_proj.weight torch.Size([0])
20321 module.model.layers.9.mlp.down_proj.weight torch.Size([0])
20322 module.model.layers.9.input_layernorm.weight torch.Size([5120])
20323 module.model.layers.9.post_attention_layernorm.weight torch.Size([5120])
20324 module.model.layers.10.self_attn.q_proj.weight torch.Size([0])
20325 module.model.layers.10.self_attn.k_proj.weight torch.Size([0])
20326 module.model.layers.10.self_attn.v_proj.weight torch.Size([0])
20327 module.model.layers.10.self_attn.o_proj.weight torch.Size([0])
20328 module.model.layers.10.mlp.gate_proj.weight torch.Size([0])
20329 module.model.layers.10.mlp.up_proj.weight torch.Size([0])
20330 module.model.layers.10.mlp.down_proj.weight torch.Size([0])
20331 module.model.layers.10.input_layernorm.weight torch.Size([5120])
20332 module.model.layers.10.post_attention_layernorm.weight torch.Size([5120])
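
(For what it's worth, I suspect the torch.Size([0]) entries are related to how the weights get sharded when the model is prepared with Accelerate/DeepSpeed, so each rank only holds a local slice, but I'm not sure. Below is how I print the sizes, plus a hedged sketch of gathering the full state dict via accelerator.get_state_dict to check the real shapes; this is a guess, not something I've confirmed.)

    # What I currently print: each rank only sees its local slice, hence torch.Size([0]).
    for name, param in model.named_parameters():
        print(name, param.size())

    # Sketch: consolidate the (possibly sharded) weights and print their full shapes.
    full_state = accelerator.get_state_dict(model)
    if accelerator.is_main_process and full_state is not None:
        for name, tensor in full_state.items():
            print(name, tuple(tensor.size()))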

My code for finetuning:

    accelerator = Accelerator(log_with="wandb")

    hps = {"learning_rate": args.learning_rate}
    accelerator.init_trackers(args.wandb_name, config=hps)  # pass the hyperparameters so they are logged

    set_random_seed(args.seed)

    tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path,
                                               fast_tokenizer=True)
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"

    # Load the pre-trained model
    model = LlamaForCausalLM.from_pretrained(args.model_name_or_path)

    # To change some parameters and freeze them (here: freeze a single target parameter).
    layer_name = "model.layers.11.input_layernorm.weight"

    # Loop through all named parameters to freeze the target parameter
    for name, param in model.named_parameters():
        if name == layer_name:
            param.requires_grad = False
            break  # stop once the target parameter is found and frozen

    # Verify that the parameter is actually frozen
    parameter_frozen = False
    for name, param in model.named_parameters():
        if name == layer_name:
            parameter_frozen = not param.requires_grad  # True if the parameter is frozen
            break

    # Assert to check that the parameter is indeed frozen
    assert parameter_frozen, f"Parameter {layer_name} is not frozen"

    with accelerator.main_process_first():
        train_dataset, eval_dataset = create_dataset(
            args.local_rank,  # invalid
            args.data_output_path,
            args.seed,
            args.model_name_or_path,
            args.max_seq_len,
        )

    accelerator.wait_for_everyone()

    train_dataloader = DataLoader(
        train_dataset,
        collate_fn=DataCollatorForSeq2Seq(tokenizer,
                                          pad_to_multiple_of=8,
                                          return_tensors="pt",
                                          padding=True),
        batch_size=args.per_device_train_batch_size)

    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=DataCollatorForSeq2Seq(tokenizer,
                                          pad_to_multiple_of=8,
                                          return_tensors="pt",
                                          padding=True),
        batch_size=args.per_device_eval_batch_size)

    print(f"len(train_dataloader): {len(train_dataloader)}, len(eval_dataloader): {len(eval_dataloader)}")

    # Optimizer
    # Split the trainable weights into two groups: one with weight decay and one without.
    # Note: Llama norm weights are named "*norm.weight" (e.g. input_layernorm.weight), not "LayerNorm.weight".
    no_decay = ["bias", "norm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if p.requires_grad and not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if p.requires_grad and any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # Adam Optimizer
    optimizer_cls = (
        torch.optim.AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )

    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.num_train_epochs * num_update_steps_per_epoch,
    )

    model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, optimizer, lr_scheduler)


    # Train!
    print_rank_0("***** Running training *****", accelerator.process_index)
    
    for epoch in range(args.num_train_epochs):
    #for epoch in range(start_epoch, args.num_train_epochs):
        current_step = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch, use_cache=False)
            train_loss = outputs.loss
            accelerator.backward(train_loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            accelerator.log({"train_loss": train_loss.item(),
                             "lr": lr_scheduler.get_last_lr()[0]})
            if step % 400 == 0:
                print_rank_0(f"Epoch is {epoch}, Step is {step}, train_loss is {train_loss.item()}", accelerator.process_index)
            # Debug output: print each parameter's (local) size and the trainable-parameter summary.
            for name, param in model.named_parameters():
                print(name, param.size())
            print_trainable_parameters(model)
        
        ppl, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
        accelerator.log({"eval_loss": eval_loss})

        if accelerator.is_main_process:
            print_rank_0(f"eval_loss: {eval_loss}, ppl: {ppl}", accelerator.process_index)
     

        if args.output_dir is not None:

            epoch_output_dir = os.path.join(args.output_dir, f"epoch_{epoch}_eval_loss_{eval_loss:.4f}")
            os.makedirs(epoch_output_dir, exist_ok=True)

            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)

            unwrapped_model.save_pretrained(
                epoch_output_dir,
                #args.output_dir_huggingface,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
                state_dict=accelerator.get_state_dict(model),
            )
            if accelerator.is_main_process:
                tokenizer.save_pretrained(epoch_output_dir)
                #tokenizer.save_pretrained(args.output_dir_huggingface)
            

    accelerator.end_training()

if __name__ == "__main__":
    main()
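
For completeness, print_rank_0 and print_trainable_parameters used above are small helpers along these lines (a reconstructed sketch, not my exact implementations):

    def print_rank_0(msg, rank=0):
        # Only print from one process to avoid duplicated logs.
        if rank == 0:
            print(msg)

    def print_trainable_parameters(model):
        # Count parameters that require gradients vs. the total
        # (local counts; sharded parameters may report 0 elements on a given rank).
        trainable, total = 0, 0
        for _, param in model.named_parameters():
            total += param.numel()
            if param.requires_grad:
                trainable += param.numel()
        print(f"trainable params: {trainable} || all params: {total} "
              f"|| trainable%: {100 * trainable / max(total, 1):.4f}")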

waterluck · Mar 05 '24 02:03