
trl SFTTrainer saves an empty adapter_model.safetensors file when training with LoRA and DeepSpeed ZeRO-3

maximus-21 opened this issue 3 weeks ago · 4 comments

System Info

When training with the trl SFTTrainer with PEFT and a DeepSpeed ZeRO-3 configuration, the resulting adapter_model.safetensors file is only 40 bytes, i.e. empty. When training with DeepSpeed ZeRO-2 instead, adapter_model.safetensors is saved correctly at around 20 MB. Also, when lora is false in the config and it is full finetuning, DeepSpeed saves the sharded weights correctly, as it should in the ZeRO-3 setting. I have already checked that the number of trainable parameters is not zero during training, and training (loss and validation generation) behaves as expected with DeepSpeed ZeRO-3. The issue is only with saving the adapter weights.
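
For reference, a minimal check (a sketch; the checkpoint path is a placeholder) that the saved file really contains no tensors:

import os
from safetensors import safe_open

path = "checkpoint-1000/adapter_model.safetensors"  # placeholder checkpoint path
print(os.path.getsize(path), "bytes")  # ~40 bytes means essentially only the header was written
with safe_open(path, framework="pt", device="cpu") as f:
    keys = list(f.keys())
print(len(keys), "tensors in file")  # 0 for the empty ZeRO-3 file, >0 when the LoRA weights were actually saved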

Who can help?

@BenjaminBossan

Reproduction

"""
LoRA SFT Trainer with periodic validation generation logging
"""
import os
import torch
import yaml
import json
import random
import ast
import argparse
from datasets import load_dataset
from utils import *
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainerCallback
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel
from huggingface_hub import login
from deepspeed.runtime.zero.config import ZeroStageEnum
from deepspeed.runtime.fp16.loss_scaler import LossScaler

os.environ["WANDB_DISABLED"] = "true"
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "true"

local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)


# DeepSpeed safe load
torch.serialization.add_safe_globals([ZeroStageEnum, LossScaler])

# ----------------------------
# Parse arguments
# ----------------------------
parser = argparse.ArgumentParser(description="LoRA SFT Trainer with YAML config")
parser.add_argument("--config", type=str, required=True, help="Path to train_config.yaml")
args = parser.parse_args()

# ----------------------------
# Load YAML config
# ----------------------------
with open(args.config, "r") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["model"]
data_cfg = cfg["datasets"]
run_cfg = cfg["run"]

# ----------------------------
# Logging directory
# ----------------------------
logging_dir = os.path.join(run_cfg["output_dir"], run_cfg["exp_name"])

if local_rank == 0:
    os.makedirs(logging_dir, exist_ok=True)
    print(f"[INFO] Logging directory created at: {logging_dir}")

CACHE_DIR = run_cfg.get("cache_dir", None)

# ----------------------------
# Model + tokenizer
# ----------------------------
base_model_path = model_cfg["llm_path"]
resume_ckpt_path = model_cfg.get("ckpt", None) if run_cfg.get("resume_from_checkpoint", False) else None

print(f"[INFO] Loading base model: {base_model_path}")
model_config = AutoConfig.from_pretrained(base_model_path, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    cache_dir=CACHE_DIR,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16 if run_cfg.get("amp", True) else torch.float32,
    config=model_config
)
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)

# ----------------------------
# Training dataset
# ----------------------------

dataset = load_dataset(
    "json",
    data_files=data_cfg["train_ann_path"],
    split="train",
    cache_dir=CACHE_DIR,
).shuffle(seed=run_cfg["seed"])

print("-" * 100)
print("Length of training dataset: ", len(dataset))
print("-" * 100)

# ----------------------------
# Validation dataset
# ----------------------------
val_dataset = None
if "valid_ann_path" in data_cfg:
    val_dataset = load_dataset(
        "json",
        data_files=data_cfg["valid_ann_path"],
        split="train",
        cache_dir=CACHE_DIR
    ).shuffle(seed=run_cfg["seed"])

    print("Length of validation dataset: ", len(val_dataset))
    
# ----------------------------
# LoRA / PEFT setup
# ----------------------------
peft_config = None
if model_cfg.get("lora", False):
    peft_config = LoraConfig(
        r=model_cfg["lora_rank"],
        lora_alpha=model_cfg["lora_alpha"],
        target_modules=model_cfg["target_modules"],
        lora_dropout=model_cfg["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    if resume_ckpt_path:
        try:
            print(f"[INFO] Resuming LoRA from checkpoint: {resume_ckpt_path}")
            model = PeftModel.from_pretrained(
                model,
                resume_ckpt_path,
                is_trainable=True,
                torch_dtype=torch.bfloat16
            )
            if not any(p.requires_grad for p in model.parameters()):
                raise ValueError("No trainable parameters found in PEFT model!")
        except Exception as e:
            print(f"[WARN] Failed to load PEFT checkpoint: {e}. Training LoRA from scratch using peft_config.")
else:
    if resume_ckpt_path:
        print(f"[INFO] Will resume full-model training from checkpoint: {resume_ckpt_path}")


# ----------------------------
# Training Arguments
# ----------------------------
optim_cfg = run_cfg["optims"]
training_arguments = SFTConfig(
    output_dir=logging_dir,
    bf16=run_cfg.get("amp", True),
    deepspeed=run_cfg.get("ds_config", "") if run_cfg.get("use_distributed", True) else None,
    optim=optim_cfg["optim"],
    per_device_train_batch_size=run_cfg["batch_size_train"],
    gradient_accumulation_steps=run_cfg["accum_grad_iters"],
    per_device_eval_batch_size=run_cfg["batch_size_eval"],
    log_level="debug",
    save_strategy="steps",
    save_steps=run_cfg["save_ckpt_freq_steps"],
    logging_steps=run_cfg["log_freq"],
    learning_rate=optim_cfg["init_lr"],
    weight_decay=optim_cfg["weight_decay"],
    num_train_epochs=optim_cfg["max_epoch"],
    lr_scheduler_type=optim_cfg["lr_scheduler"],
    warmup_steps=optim_cfg["warmup_steps"],
    #dataset_text_field="text",
    seed=run_cfg["seed"],
    max_length=model_cfg["max_seq_len"],
    dataset_kwargs={
        "add_special_tokens": False, # We template with special tokens
        "append_concat_token": True, # Add EOS token as separator token between examples
    }
)


# ----------------------------
# Trainer
# ----------------------------
trainer = SFTTrainer(
    model,
    args=training_arguments,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer
)

# ----------------------------
# Train
# ----------------------------
trainer.train()

deepspeed config:

{
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "none"
    },
    "offload_param": {
      "device": "none"
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto"
  },
  "bf16": {
    "enabled": true
  },
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "steps_per_print": 10,
  "wall_clock_breakdown": false,
  "gradient_clipping": 1.0
}

outputs:

***** Running training *****
  Num examples = 152,655
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9,542
  Number of trainable parameters = 10,186,752
 

System Info

Name: trl
Version: 0.24.0

Name: peft
Version: 0.17.1

Name: transformers
Version: 4.57.1

Expected behavior

Instead of an empty (40-byte) .safetensors file, the LoRA weights should be saved correctly, as they are with the DeepSpeed ZeRO-2 configuration. The adapter_model.safetensors file should be around 20 MB.
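
A rough size estimate, assuming the adapter is stored in bf16 (2 bytes per parameter), lines up with that figure:

# Back-of-the-envelope check of the expected adapter size (assumption: weights saved in bf16)
trainable_params = 10_186_752      # "Number of trainable parameters" from the training log above
print(trainable_params * 2 / 1e6)  # ~20.4 MB, consistent with the ~20 MB file produced under ZeRO-2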

maximus-21 · Nov 04 '25

Thanks for this report @maximus-21. For me to try to reproduce, could you please share your config yaml too?

BenjaminBossan · Nov 04 '25

Hi @BenjaminBossan, here's the config file:

model:
  # paths
  llm_path: "google/gemma-3-4b-it"

  # LoRA
  lora: True
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q_proj", "v_proj", "up_proj", "down_proj"]

  max_seq_len: 4096
  end_sym: "<end_of_turn>\n"

datasets:
  train_ann_path: ""
  valid_ann_path: ""
run:
  # log & settings
  exp_name: ''
  output_dir: "" 
  cache_dir: ""
  # resume_from_checkpoint: True
  save_ckpt_freq_steps: 1000
  seed: 42
  log_freq: 5
  accum_grad_iters: 1
  batch_size_train: 4
  batch_size_eval: 4

  device: "cuda"
  use_distributed: True
  ds_config: "configs/ds_zero3.json"
  amp: True

  # optimizer & scheduler
  optims:
    lr_scheduler: "cosine"
    optim: "paged_adamw_32bit"
    max_epoch: 2
    warmup_steps: 1000
    init_lr: 2.0e-5
    weight_decay: 0.0001

The training and validation datasets are private.
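
As a hypothetical stand-in (assuming a plain "text" field, which the commented-out dataset_text_field="text" in the script hints at), a tiny JSON-lines file like this should exercise the same loading path:

# Hypothetical stand-in for the private JSON annotations; the field name is an assumption.
import json

samples = [
    {"text": "<start_of_turn>user\nWhat is LoRA?<end_of_turn>\n<start_of_turn>model\nA low-rank adapter.<end_of_turn>\n"},
]
with open("train.json", "w") as f:
    for s in samples:
        f.write(json.dumps(s) + "\n")

# load_dataset("json", data_files="train.json", split="train") then works as in the script above.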

maximus-21 · Nov 06 '25

Thanks for the additional information. I had to make some modifications, as some info was still missing, but I could get it to run:

import os
import torch
import yaml
import argparse
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel
from deepspeed.runtime.zero.config import ZeroStageEnum
from deepspeed.runtime.fp16.loss_scaler import LossScaler


os.environ["WANDB_DISABLED"] = "true"
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "true"

local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)

# DeepSpeed safe load
torch.serialization.add_safe_globals([ZeroStageEnum, LossScaler])

# ----------------------------
# Parse arguments
# ----------------------------
parser = argparse.ArgumentParser(description="LoRA SFT Trainer with YAML config")
parser.add_argument("--config", type=str, required=False, help="Path to train_config.yaml")
args = parser.parse_args()

# ----------------------------
# Load YAML config
# ----------------------------

yaml_str = """model:
  # paths
  llm_path: "google/gemma-3-4b-it"

  # LoRA
  lora: True
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q_proj", "v_proj", "up_proj", "down_proj"]

  max_seq_len: 4096
  end_sym: "<end_of_turn>\n"

datasets:
  train_ann_path: ""
  valid_ann_path: ""
run:
  # log & settings
  exp_name: '2893'
  output_dir: "/tmp/peft"
  # resume_from_checkpoint: True
  save_ckpt_freq_steps: 1000
  seed: 42
  log_freq: 5
  accum_grad_iters: 1
  batch_size_train: 4
  batch_size_eval: 4

  device: "cuda"
  use_distributed: True
  ds_config: "configs/ds_zero3.json"
  amp: True

  # optimizer & scheduler
  optims:
    lr_scheduler: "cosine"
    optim: "paged_adamw_32bit"
    max_epoch: 2
    warmup_steps: 1000
    init_lr: 2.0e-5
    weight_decay: 0.0001"""
cfg = yaml.safe_load(yaml_str)

model_cfg = cfg["model"]
data_cfg = cfg["datasets"]
run_cfg = cfg["run"]

# ----------------------------
# Logging directory
# ----------------------------
logging_dir = os.path.join(run_cfg["output_dir"], run_cfg["exp_name"])

if local_rank == 0:
    os.makedirs(logging_dir, exist_ok=True)
    print(f"[INFO] Logging directory created at: {logging_dir}")

# ----------------------------
# Model + tokenizer
# ----------------------------
base_model_path = model_cfg["llm_path"]
resume_ckpt_path = model_cfg.get("ckpt", None) if run_cfg.get("resume_from_checkpoint", False) else None

print(f"[INFO] Loading base model: {base_model_path}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16 if run_cfg.get("amp", True) else torch.float32,
)
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)

# ----------------------------
# Training dataset
# ----------------------------

dataset = load_dataset("ybelkada/english_quotes_copy")
dataset = dataset.map(lambda samples: tokenizer(samples["quote"]), batched=True)

# ----------------------------
# LoRA / PEFT setup
# ----------------------------
peft_config = None
if model_cfg.get("lora", False):
    peft_config = LoraConfig(
        r=model_cfg["lora_rank"],
        lora_alpha=model_cfg["lora_alpha"],
        target_modules=model_cfg["target_modules"],
        lora_dropout=model_cfg["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    if resume_ckpt_path:
        try:
            print(f"[INFO] Resuming LoRA from checkpoint: {resume_ckpt_path}")
            model = PeftModel.from_pretrained(
                model,
                resume_ckpt_path,
                is_trainable=True,
                torch_dtype=torch.bfloat16
            )
            if not any(p.requires_grad for p in model.parameters()):
                raise ValueError("No trainable parameters found in PEFT model!")
        except Exception as e:
            print(f"[WARN] Failed to load PEFT checkpoint: {e}. Training LoRA from scratch using peft_config.")
else:
    if resume_ckpt_path:
        print(f"[INFO] Will resume full-model training from checkpoint: {resume_ckpt_path}")
# ----------------------------
# Training Arguments
# ----------------------------
optim_cfg = run_cfg["optims"]
training_arguments = SFTConfig(
    output_dir=logging_dir,
    bf16=run_cfg.get("amp", True),
    #deepspeed=ds_config,
    optim=optim_cfg["optim"],
    per_device_train_batch_size=run_cfg["batch_size_train"],
    gradient_accumulation_steps=run_cfg["accum_grad_iters"],
    per_device_eval_batch_size=run_cfg["batch_size_eval"],
    log_level="debug",
    save_strategy="steps",
    save_steps=run_cfg["save_ckpt_freq_steps"],
    logging_steps=run_cfg["log_freq"],
    learning_rate=optim_cfg["init_lr"],
    weight_decay=optim_cfg["weight_decay"],
    num_train_epochs=optim_cfg["max_epoch"],
    lr_scheduler_type=optim_cfg["lr_scheduler"],
    warmup_steps=optim_cfg["warmup_steps"],
    seed=run_cfg["seed"],
    max_length=model_cfg["max_seq_len"],
    max_steps=10,
)

# ----------------------------
# Trainer
# ----------------------------
trainer = SFTTrainer(
    model,
    args=training_arguments,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    processing_class=tokenizer,
)
# ----------------------------
# Train
# ----------------------------
trainer.train()

My DeepSpeed setup (via an accelerate config) is:

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 1
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

I called with:

accelerate launch --config_file <path-to-config> train.py

When I went to the logging directory, /tmp/peft/2893/checkpoint-10, everything looked good:

$ du -h adapter_model.safetensors 
20M	adapter_model.safetensors

I could load the safetensors file and it looked correct.
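
For example, a minimal sketch of such a check (using the checkpoint path above):

# Load the saved adapter and inspect a few tensors
from safetensors.torch import load_file

state = load_file("/tmp/peft/2893/checkpoint-10/adapter_model.safetensors")
for name, tensor in list(state.items())[:4]:
    print(name, tuple(tensor.shape), tensor.dtype)
# Expect lora_A weights of shape (8, in_features) and lora_B weights of shape (out_features, 8) for the r=8 config.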

My versions tested for this:

  • PEFT 0.17.1 and current main (both work)
  • transformers 4.57.1
  • deepspeed 0.17.1
  • torch 2.8.0

Maybe as a next step, you could check whether you can reproduce my findings. If so, we can look into what difference there could be.

BenjaminBossan · Nov 06 '25

@BenjaminBossan thanks for this, I'll try reproducing it and see if I can find any difference.

maximus-21 · Nov 07 '25