Using trl SFTTrainer creates an empty adapter_model.safetensors file when saving while training with LoRA and DeepSpeed ZeRO-3
System Info
When training with the trl SFTTrainer with PEFT and a DeepSpeed ZeRO-3 configuration, the resulting adapter_model.safetensors file is only 40 bytes, i.e. empty. When training with DeepSpeed ZeRO-2, the adapter_model.safetensors file is saved correctly with a size of about 20 MB. Also, when lora is set to false in the config (full fine-tuning), DeepSpeed saves the sharded weights correctly, as it should, in the ZeRO-3 setting. I have already checked that the number of trainable parameters is not zero during training and that training (loss and validation generation) behaves as expected with DeepSpeed ZeRO-3. The issue is only with saving the adapter weights.
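For reference, the saved adapter can be inspected directly to confirm the symptom. A minimal sketch (not part of the original report; the checkpoint path is a placeholder):

# Sketch: inspect a saved adapter checkpoint (path is hypothetical)
import os
from safetensors.torch import load_file

path = "output/checkpoint-1000/adapter_model.safetensors"  # placeholder path
print(os.path.getsize(path), "bytes")   # ~40 bytes with ZeRO-3, ~20 MB with ZeRO-2
state_dict = load_file(path)
print(len(state_dict), "tensors saved")  # 0 when the file is empty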
Who can help?
@BenjaminBossan
Reproduction
"""
LoRA SFT Trainer with periodic validation generation logging
"""
import os
import torch
import yaml
import json
import random
import ast
import argparse
from datasets import load_dataset
from utils import *
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainerCallback
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel
from huggingface_hub import login
from deepspeed.runtime.zero.config import ZeroStageEnum
from deepspeed.runtime.fp16.loss_scaler import LossScaler
os.environ["WANDB_DISABLED"] = "true"
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "true"
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
# DeepSpeed safe load
torch.serialization.add_safe_globals([ZeroStageEnum, LossScaler])
# ----------------------------
# Parse arguments
# ----------------------------
parser = argparse.ArgumentParser(description="LoRA SFT Trainer with YAML config")
parser.add_argument("--config", type=str, required=True, help="Path to train_config.yaml")
args = parser.parse_args()
# ----------------------------
# Load YAML config
# ----------------------------
with open(args.config, "r") as f:
    cfg = yaml.safe_load(f)
model_cfg = cfg["model"]
data_cfg = cfg["datasets"]
run_cfg = cfg["run"]
# ----------------------------
# Logging directory
# ----------------------------
logging_dir = os.path.join(run_cfg["output_dir"], run_cfg["exp_name"])
if local_rank == 0:
    os.makedirs(logging_dir, exist_ok=True)
    print(f"[INFO] Logging directory created at: {logging_dir}")
CACHE_DIR = run_cfg.get("cache_dir", None)
# ----------------------------
# Model + tokenizer
# ----------------------------
base_model_path = model_cfg["llm_path"]
resume_ckpt_path = model_cfg.get("ckpt", None) if run_cfg.get("resume_from_checkpoint", False) else None
print(f"[INFO] Loading base model: {base_model_path}")
model_config = AutoConfig.from_pretrained(base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    cache_dir=CACHE_DIR,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16 if run_cfg.get("amp", True) else torch.float32,
    config=model_config,
)
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)
# ----------------------------
# Training dataset
# ----------------------------
dataset = load_dataset(
    "json",
    data_files=data_cfg["train_ann_path"],
    split="train",
    cache_dir=CACHE_DIR,
).shuffle(seed=run_cfg["seed"])
print("-" * 100)
print("Length of training dataset: ", len(dataset))
print("-" * 100)
# ----------------------------
# Validation dataset
# ----------------------------
val_dataset = None
if "valid_ann_path" in data_cfg:
val_dataset = load_dataset(
"json",
data_files=data_cfg["valid_ann_path"],
split="train",
cache_dir=CACHE_DIR
).shuffle(seed=run_cfg["seed"])
print("Length of validation dataset: ", len(val_dataset))
# ----------------------------
# LoRA / PEFT setup
# ----------------------------
peft_config = None
if model_cfg.get("lora", False):
    peft_config = LoraConfig(
        r=model_cfg["lora_rank"],
        lora_alpha=model_cfg["lora_alpha"],
        target_modules=model_cfg["target_modules"],
        lora_dropout=model_cfg["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    if resume_ckpt_path:
        try:
            print(f"[INFO] Resuming LoRA from checkpoint: {resume_ckpt_path}")
            model = PeftModel.from_pretrained(
                model,
                resume_ckpt_path,
                is_trainable=True,
                torch_dtype=torch.bfloat16,
            )
            if not any(p.requires_grad for p in model.parameters()):
                raise ValueError("No trainable parameters found in PEFT model!")
        except Exception as e:
            print(f"[WARN] Failed to load PEFT checkpoint: {e}. Training LoRA from scratch using peft_config.")
else:
    if resume_ckpt_path:
        print(f"[INFO] Will resume full-model training from checkpoint: {resume_ckpt_path}")
# ----------------------------
# Training Arguments
# ----------------------------
optim_cfg = run_cfg["optims"]
training_arguments = SFTConfig(
    output_dir=logging_dir,
    bf16=run_cfg.get("amp", True),
    deepspeed=run_cfg.get("ds_config", "") if run_cfg.get("use_distributed", True) else None,
    optim=optim_cfg["optim"],
    per_device_train_batch_size=run_cfg["batch_size_train"],
    gradient_accumulation_steps=run_cfg["accum_grad_iters"],
    per_device_eval_batch_size=run_cfg["batch_size_eval"],
    log_level="debug",
    save_strategy="steps",
    save_steps=run_cfg["save_ckpt_freq_steps"],
    logging_steps=run_cfg["log_freq"],
    learning_rate=optim_cfg["init_lr"],
    weight_decay=optim_cfg["weight_decay"],
    num_train_epochs=optim_cfg["max_epoch"],
    lr_scheduler_type=optim_cfg["lr_scheduler"],
    warmup_steps=optim_cfg["warmup_steps"],
    # dataset_text_field="text",
    seed=run_cfg["seed"],
    max_length=model_cfg["max_seq_len"],
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": True,  # Add EOS token as separator token between examples
    },
)
# ----------------------------
# Trainer
# ----------------------------
trainer = SFTTrainer(
    model,
    args=training_arguments,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
)
# ----------------------------
# Train
# ----------------------------
trainer.train()
deepspeed config:
{
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none"
        },
        "offload_param": {
            "device": "none"
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto"
    },
    "bf16": {
        "enabled": true
    },
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "steps_per_print": 10,
    "wall_clock_breakdown": false,
    "gradient_clipping": 1.0
}
outputs:
***** Running training *****
Num examples = 152,655
Num Epochs = 2
Instantaneous batch size per device = 4
Total train batch size (w. parallel, distributed & accumulation) = 32
Gradient Accumulation steps = 1
Total optimization steps = 9,542
Number of trainable parameters = 10,186,752
System Info
Name: trl
Version: 0.24.0
Name: peft
Version: 0.17.1
Name: transformers
Version: 4.57.1
Expected behavior
Instead of an empty (40-byte) file, the adapter_model.safetensors file should contain the LoRA weights, as it does with the DeepSpeed ZeRO-2 configuration. The file should be around 20 MB.
Thanks for this report @maximus-21. For me to try to reproduce, could you please share your config yaml too?
Hi @BenjaminBossan, here's the config file:
model:
  # paths
  llm_path: "google/gemma-3-4b-it"
  # LoRA
  lora: True
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q_proj", "v_proj", "up_proj", "down_proj"]
  max_seq_len: 4096
  end_sym: "<end_of_turn>\n"
datasets:
  train_ann_path: ""
  valid_ann_path: ""
run:
  # log & settings
  exp_name: ''
  output_dir: ""
  cache_dir: ""
  # resume_from_checkpoint: True
  save_ckpt_freq_steps: 1000
  seed: 42
  log_freq: 5
  accum_grad_iters: 1
  batch_size_train: 4
  batch_size_eval: 4
  device: "cuda"
  use_distributed: True
  ds_config: "configs/ds_zero3.json"
  amp: True
  # optimizer & scheduler
  optims:
    lr_scheduler: "cosine"
    optim: "paged_adamw_32bit"
    max_epoch: 2
    warmup_steps: 1000
    init_lr: 2.0e-5
    weight_decay: 0.0001
The training and validation datasets are private.
Thanks for the additional information. I had to make some modifications, as some info was still missing, but I could get this to run:
import os
import torch
import yaml
import argparse
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel
from deepspeed.runtime.zero.config import ZeroStageEnum
from deepspeed.runtime.fp16.loss_scaler import LossScaler
os.environ["WANDB_DISABLED"] = "true"
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "true"
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
# DeepSpeed safe load
torch.serialization.add_safe_globals([ZeroStageEnum, LossScaler])
# ----------------------------
# Parse arguments
# ----------------------------
parser = argparse.ArgumentParser(description="LoRA SFT Trainer with YAML config")
parser.add_argument("--config", type=str, required=False, help="Path to train_config.yaml")
args = parser.parse_args()
# ----------------------------
# Load YAML config
# ----------------------------
yaml_str = """model:
  # paths
  llm_path: "google/gemma-3-4b-it"
  # LoRA
  lora: True
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q_proj", "v_proj", "up_proj", "down_proj"]
  max_seq_len: 4096
  end_sym: "<end_of_turn>\n"
datasets:
  train_ann_path: ""
  valid_ann_path: ""
run:
  # log & settings
  exp_name: '2893'
  output_dir: "/tmp/peft"
  # resume_from_checkpoint: True
  save_ckpt_freq_steps: 1000
  seed: 42
  log_freq: 5
  accum_grad_iters: 1
  batch_size_train: 4
  batch_size_eval: 4
  device: "cuda"
  use_distributed: True
  ds_config: "configs/ds_zero3.json"
  amp: True
  # optimizer & scheduler
  optims:
    lr_scheduler: "cosine"
    optim: "paged_adamw_32bit"
    max_epoch: 2
    warmup_steps: 1000
    init_lr: 2.0e-5
    weight_decay: 0.0001"""
cfg = yaml.safe_load(yaml_str)
model_cfg = cfg["model"]
data_cfg = cfg["datasets"]
run_cfg = cfg["run"]
# ----------------------------
# Logging directory
# ----------------------------
logging_dir = os.path.join(run_cfg["output_dir"], run_cfg["exp_name"])
if local_rank == 0:
    os.makedirs(logging_dir, exist_ok=True)
    print(f"[INFO] Logging directory created at: {logging_dir}")
# ----------------------------
# Model + tokenizer
# ----------------------------
base_model_path = model_cfg["llm_path"]
resume_ckpt_path = model_cfg.get("ckpt", None) if run_cfg.get("resume_from_checkpoint", False) else None
print(f"[INFO] Loading base model: {base_model_path}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16 if run_cfg.get("amp", True) else torch.float32,
)
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)
# ----------------------------
# Training dataset
# ----------------------------
dataset = load_dataset("ybelkada/english_quotes_copy")
dataset = dataset.map(lambda samples: tokenizer(samples["quote"]), batched=True)
# ----------------------------
# LoRA / PEFT setup
# ----------------------------
peft_config = None
if model_cfg.get("lora", False):
    peft_config = LoraConfig(
        r=model_cfg["lora_rank"],
        lora_alpha=model_cfg["lora_alpha"],
        target_modules=model_cfg["target_modules"],
        lora_dropout=model_cfg["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    if resume_ckpt_path:
        try:
            print(f"[INFO] Resuming LoRA from checkpoint: {resume_ckpt_path}")
            model = PeftModel.from_pretrained(
                model,
                resume_ckpt_path,
                is_trainable=True,
                torch_dtype=torch.bfloat16,
            )
            if not any(p.requires_grad for p in model.parameters()):
                raise ValueError("No trainable parameters found in PEFT model!")
        except Exception as e:
            print(f"[WARN] Failed to load PEFT checkpoint: {e}. Training LoRA from scratch using peft_config.")
else:
    if resume_ckpt_path:
        print(f"[INFO] Will resume full-model training from checkpoint: {resume_ckpt_path}")
# ----------------------------
# Training Arguments
# ----------------------------
optim_cfg = run_cfg["optims"]
training_arguments = SFTConfig(
    output_dir=logging_dir,
    bf16=run_cfg.get("amp", True),
    # deepspeed=ds_config,
    optim=optim_cfg["optim"],
    per_device_train_batch_size=run_cfg["batch_size_train"],
    gradient_accumulation_steps=run_cfg["accum_grad_iters"],
    per_device_eval_batch_size=run_cfg["batch_size_eval"],
    log_level="debug",
    save_strategy="steps",
    save_steps=run_cfg["save_ckpt_freq_steps"],
    logging_steps=run_cfg["log_freq"],
    learning_rate=optim_cfg["init_lr"],
    weight_decay=optim_cfg["weight_decay"],
    num_train_epochs=optim_cfg["max_epoch"],
    lr_scheduler_type=optim_cfg["lr_scheduler"],
    warmup_steps=optim_cfg["warmup_steps"],
    seed=run_cfg["seed"],
    max_length=model_cfg["max_seq_len"],
    max_steps=10,
)
# ----------------------------
# Trainer
# ----------------------------
trainer = SFTTrainer(
    model,
    args=training_arguments,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    processing_class=tokenizer,
)
# ----------------------------
# Train
# ----------------------------
trainer.train()
My DS config is:
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 1
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
I called with:
accelerate launch --config_file <path-to-config> train.py
When I went to the logging directory, /tmp/peft/2893/checkpoint-10, everything looked good:
$ du -h adapter_model.safetensors
20M adapter_model.safetensors
I could load the safetensors file and it looked correct.
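(A check along these lines can be used to verify the file; a small sketch, not the exact snippet used above:)

from safetensors.torch import load_file

sd = load_file("/tmp/peft/2893/checkpoint-10/adapter_model.safetensors")
print(len(sd))                              # number of LoRA tensors, non-zero here
print(sum(t.numel() for t in sd.values()))  # roughly matches the reported ~10M trainable parameters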
My versions tested for this:
- PEFT 0.17.1 and current main (both work)
- transformers 4.57.1
- deepspeed 0.17.1
- torch 2.8.0
Maybe as a next step, you could check whether you can reproduce my findings. If yes, we can look into what the difference could be.
@BenjaminBossan thanks for this, I'll try reproducing it and see if I can find any difference.