DeepSpeed
[BUG] ZeRO3 - Getting assert len(self.ckpt_list) > 0 while running validation code during fine tuning
@stas00, @tjruwase - Tagging you here since I have seen you working on ZeRO3 extensively. Apologies if I shouldn't do this.
Describe the bug
I am fine-tuning a LoRA model on top of BioBART-V2-Base using DeepSpeed and the Hugging Face PEFT library on a T4 instance. I am not using the Hugging Face Trainer class because I wanted to learn how to integrate DeepSpeed with arbitrary code. To benchmark how the different ZeRO configurations behave, I ran the code with the following configurations -
Baseline -
{
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_type": "linear",
            "total_num_steps": 6.497000e+03,
            "warmup_max_lr": 0.001,
            "warmup_num_steps": 650
        }
    },
    "optimizer": {
        "type": "Adam",
        "params": {
            "betas": [0.9, 0.999],
            "eps": 1e-06,
            "weight_decay": 0.01,
            "bias_correction": true
        }
    },
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 16,
    "gradient_clipping": 1.0
}
ZeRO 2 -
{
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_type": "linear",
            "total_num_steps": 6.497000e+03,
            "warmup_max_lr": 0.001,
            "warmup_num_steps": 650
        }
    },
    "optimizer": {
        "type": "Adam",
        "params": {
            "betas": [0.9, 0.999],
            "eps": 1e-06,
            "weight_decay": 0.01,
            "bias_correction": true
        }
    },
    "fp16": {
        "enabled": true,
        "auto_cast": false,
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 16,
    "gradient_clipping": 1.0
}
and ZeRO 3 -
{
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_type": "linear",
            "total_num_steps": 6.497000e+03,
            "warmup_max_lr": 0.001,
            "warmup_num_steps": 650
        }
    },
    "optimizer": {
        "type": "Adam",
        "params": {
            "betas": [0.9, 0.999],
            "eps": 1e-06,
            "weight_decay": 0.01,
            "bias_correction": true
        }
    },
    "fp16": {
        "enabled": true,
        "auto_cast": false,
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1.000000e+09,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1.000000e+09,
        "stage3_max_reuse_distance": 1.000000e+09,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 16,
    "gradient_clipping": 1.0
}
Validation learning curves match perfectly for the Baseline and ZeRO2 configurations, but I get an AssertionError when I try to use ZeRO3.
To Reproduce
Steps to reproduce the behavior:
- Simple inference script to reproduce:
# Imports added for completeness; seed_everything, tokenizer, label_pad_token_id,
# device, and code_config are defined elsewhere in the original script.
import os
import shutil
import deepspeed
import numpy as np
import torch
from pathlib import Path
from peft import get_peft_model
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

def create_model_optimizer(ds_config_json, peft_config):
    ds_config = ds_config_json
    seed_everything(42)
    model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-v2-base")
    model = get_peft_model(model, peft_config)
    if model.config.decoder_start_token_id is None:
        raise Exception("Ensure that config.decoder_start_token_id is set")
    ds_config["optimizer"]["params"]["eps"] = 1e-6
    ds_config["optimizer"]["params"]["weight_decay"] = 0.01
    ds_config["optimizer"]["params"]["bias_correction"] = True
    return model, ds_config
def validate_summarization(ds_config, peft_config, valid_ds, checkpoint_folder):
    world_size = int(os.getenv('WORLD_SIZE', '4'))
    model, ds_config = create_model_optimizer(ds_config, peft_config)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                           model=model,
                                           label_pad_token_id=label_pad_token_id)
    model_engine_train, _, valid_dl, _ = deepspeed.initialize(model=model,
                                                              model_parameters=model.parameters(),
                                                              training_data=valid_ds,
                                                              collate_fn=data_collator,
                                                              config=ds_config)
    checkpoint_dict = None
    if checkpoint_folder.is_dir() and checkpoint_folder.exists():
        load_path, client_state = model_engine_train.load_checkpoint(load_dir=checkpoint_folder)
        load_path = str(Path(load_path).parent)
        epoch = client_state['epoch']
        checkpoint_dict = {
            "dtype": torch.float,
            "checkpoint": load_path
        }
        model_engine = deepspeed.init_inference(model=model,
                                                mp_size=world_size,
                                                config=checkpoint_dict,
                                                replace_with_kernel_inject=False)
    else:
        raise Exception(f"{checkpoint_folder} does not exist")
    model_engine.eval()
    if model_engine.training is True:
        raise Exception("Model should not be trainable")
    total_loss = 0
    for valid_step, valid_batch in enumerate(valid_dl):
        input_ids = valid_batch["input_ids"].to(device)
        attention_mask = valid_batch["attention_mask"].to(device)
        labels = valid_batch["labels"].to(device)
        decoder_input_ids = valid_batch["decoder_input_ids"].to(device)
        with torch.no_grad():
            output = model_engine(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  decoder_input_ids=decoder_input_ids,
                                  labels=labels,
                                  use_cache=False,
                                  return_dict=True)
        loss = output.loss
        total_loss += loss.item()
    avg_eval_loss = total_loss / len(valid_dl)
    return avg_eval_loss
def train_summarization(ds_config, peft_config, train_ds, epoch, checkpoint_folder=None):
    seed_everything(code_config.TASKA_SUMMARY_SEED)
    model, ds_config = create_model_optimizer(ds_config, peft_config)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                           model=model,
                                           label_pad_token_id=label_pad_token_id)
    model_engine, _, train_dl, _ = deepspeed.initialize(model=model,
                                                        model_parameters=model.parameters(),
                                                        training_data=train_ds,
                                                        collate_fn=data_collator,
                                                        config=ds_config)
    if checkpoint_folder.is_dir() and checkpoint_folder.exists():
        _, client_state = model_engine.load_checkpoint(load_dir=checkpoint_folder)
        old_epoch = client_state['epoch']
    else:
        checkpoint_folder.mkdir(parents=True, exist_ok=False)
    model_engine.train()
    if model_engine.training is False:
        raise Exception("Model is not trainable")
    total_train_loss = 0
    for train_step, train_batch in enumerate(train_dl):
        if train_batch["input_ids"].shape[0] > ds_config["train_micro_batch_size_per_gpu"]:
            raise Exception("batch size is not equal to train_micro_batch_size_per_gpu")
        input_ids = train_batch["input_ids"].to(device)
        attention_mask = train_batch["attention_mask"].to(device)
        labels = train_batch["labels"].to(device)
        decoder_input_ids = train_batch["decoder_input_ids"].to(device)
        output = model_engine(input_ids=input_ids,
                              attention_mask=attention_mask,
                              decoder_input_ids=decoder_input_ids,
                              labels=labels,
                              output_hidden_states=True,
                              use_cache=False,
                              return_dict=True)
        loss = output.loss
        model_engine.backward(loss)
        model_engine.step()
        total_train_loss += loss.item()
        train_step_new = train_step + epoch * len(train_dl)
    model_engine.save_checkpoint(save_dir=checkpoint_folder,
                                 client_state={'epoch': epoch})
    avg_train_loss = total_train_loss / len(train_dl)
    return avg_train_loss
def training_loop(model_name,
                  ds_config,
                  peft_config,
                  train_ds,
                  valid_ds,
                  checkpoint_folder=None):
    best_loss = np.inf
    best_model = None
    best_epoch = 0
    for epoch in tqdm(range(code_config.TASKA_SUMMARY_EPOCHS)):
        avg_train_loss = train_summarization(ds_config,
                                             peft_config,
                                             train_ds,
                                             epoch,
                                             checkpoint_folder)
        new_loss = validate_summarization(ds_config,
                                          peft_config,
                                          valid_ds,
                                          checkpoint_folder)
        if new_loss < best_loss:
            best_loss = new_loss
    shutil.rmtree(checkpoint_folder)
    return best_loss
- I am using the latest versions of the Hugging Face Transformers, Hugging Face PEFT, and DeepSpeed libraries
- Executing the training_loop function runs the entire code. However, I am afraid this snippet can't be run as-is because it is missing code_config and the dataframes.
- Stack trace -
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] optimizer_legacy_fusion ...... False
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] optimizer_name ............... adam
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] optimizer_params ............. {'betas': [0.9, 0.999], 'eps': 1e-06, 'weight_decay': 0.01, 'bias_correction': True}
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] pld_enabled .................. False
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] pld_params ................... False
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] prescale_gradients ........... False
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] scheduler_name ............... WarmupDecayLR
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_type': 'linear', 'total_num_steps': 6497, 'warmup_max_lr': 0.001, 'warmup_num_steps': 650}
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] sparse_attention ............. None
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] sparse_gradients_enabled ..... False
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] steps_per_print .............. 10
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] train_batch_size ............. 16
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] train_micro_batch_size_per_gpu 1
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] use_node_local_storage ....... False
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] wall_clock_breakdown ......... False
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] world_size ................... 1
[2023-04-25 15:02:10,034] [INFO] [config.py:1022:print] zero_allow_untested_optimizer False
[2023-04-25 15:02:10,035] [INFO] [config.py:1022:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False
[2023-04-25 15:02:10,035] [INFO] [config.py:1022:print] zero_enabled ................. True
[2023-04-25 15:02:10,035] [INFO] [config.py:1022:print] zero_force_ds_cpu_optimizer .. True
[2023-04-25 15:02:10,035] [INFO] [config.py:1022:print] zero_optimization_stage ...... 3
[2023-04-25 15:02:10,035] [INFO] [config.py:1007:print_user_config] json = {
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": 0,
"warmup_type": "linear",
"total_num_steps": 6.497000e+03,
"warmup_max_lr": 0.001,
"warmup_num_steps": 650
}
},
"optimizer": {
"type": "Adam",
"params": {
"betas": [0.9, 0.999],
"eps": 1e-06,
"weight_decay": 0.01,
"bias_correction": true
}
},
"fp16": {
"enabled": true,
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1.000000e+09,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1.000000e+09,
"stage3_max_reuse_distance": 1.000000e+09,
"stage3_gather_16bit_weights_on_model_save": true
},
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps": 16,
"gradient_clipping": 1.0
}
Using /root/.cache/torch_extensions/py38_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.00040435791015625 seconds
[2023-04-25 15:02:10,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loading checkpoint from /workspace/3-fold-stratified-cv-biobart-v2-base-peft-deepspeed-zero3-0/global_step43/zero_pp_rank_0_mp_rank_00_model_states.pt...
[2023-04-25 15:02:10,048] [INFO] [torch_checkpoint_engine.py:25:load] [Torch] Loaded checkpoint from /workspace/3-fold-stratified-cv-biobart-v2-base-peft-deepspeed-zero3-0/global_step43/zero_pp_rank_0_mp_rank_00_model_states.pt.
[2023-04-25 15:02:10,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loading checkpoint from /workspace/3-fold-stratified-cv-biobart-v2-base-peft-deepspeed-zero3-0/global_step43/zero_pp_rank_0_mp_rank_00_model_states.pt...
[2023-04-25 15:02:10,055] [INFO] [torch_checkpoint_engine.py:25:load] [Torch] Loaded checkpoint from /workspace/3-fold-stratified-cv-biobart-v2-base-peft-deepspeed-zero3-0/global_step43/zero_pp_rank_0_mp_rank_00_model_states.pt.
[2023-04-25 15:02:10,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loading checkpoint from /workspace/3-fold-stratified-cv-biobart-v2-base-peft-deepspeed-zero3-0/global_step43/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2023-04-25 15:02:10,065] [INFO] [torch_checkpoint_engine.py:25:load] [Torch] Loaded checkpoint from /workspace/3-fold-stratified-cv-biobart-v2-base-peft-deepspeed-zero3-0/global_step43/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[2023-04-25 15:02:10,065] [INFO] [engine.py:3043:_get_all_zero_checkpoint_state_dicts] successfully read 1 ZeRO state_dicts for rank 0
[2023-04-25 15:02:10,085] [INFO] [engine.py:2983:_load_zero_checkpoint] loading 1 zero partition checkpoints for rank 0
[2023-04-25 15:02:10,086] [INFO] [logging.py:93:log_dist] [Rank 0] DeepSpeed info: version=0.8.3, git-hash=unknown, git-branch=unknown
[2023-04-25 15:02:10,086] [WARNING] [config_utils.py:75:_process_deprecated_field] Config parameter mp_size is deprecated use tensor_parallel.tp_size instead
[2023-04-25 15:02:10,086] [INFO] [logging.py:93:log_dist] [Rank 0] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
0%| | 0/150 [04:52<?, ?it/s]
Traceback (most recent call last):
File "Task A - Summarization - Sweep with Deepspeed wo wandb.py", line 580, in <module>
main()
File "Task A - Summarization - Sweep with Deepspeed wo wandb.py", line 570, in main
training_loop(model_name, \
File "Task A - Summarization - Sweep with Deepspeed wo wandb.py", line 499, in training_loop
validate_summarization(ds_config, \
File "Task A - Summarization - Sweep with Deepspeed wo wandb.py", line 339, in validate_summarization
model_engine = deepspeed.init_inference(model=model, \
File "/usr/local/lib/python3.8/dist-packages/deepspeed/__init__.py", line 311, in init_inference
engine = InferenceEngine(model, config=ds_inference_config)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/inference/engine.py", line 96, in __init__
self._load_checkpoint(config.checkpoint)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/inference/engine.py", line 405, in _load_checkpoint
sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, self.checkpoint_engine)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader
return MegatronSDLoader(ckpt_list, version, checkpoint_engine)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__
super().__init__(ckpt_list, version, checkpoint_engine)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__
self.check_ckpt_list()
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list
assert len(self.ckpt_list) > 0
AssertionError
[2023-04-25 15:02:13,180] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 143
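From the traceback, the assertion fires inside check_ckpt_list, i.e. the inference engine ends up with an empty ckpt_list for the "checkpoint" path passed via checkpoint_dict; the ZeRO-3 training checkpoint under global_step43 only contains partitioned zero_pp_rank_* files. A possible workaround (untested, a sketch only, with model, world_size, and checkpoint_folder as in validate_summarization above) is to consolidate the ZeRO-3 checkpoint into a single fp32 state dict using the zero_to_fp32 helpers that ship with DeepSpeed, load that into the PEFT model, and then build the inference engine without passing a checkpoint:

# Workaround sketch (untested): consolidate the partitioned ZeRO-3 checkpoint into a
# single fp32 state dict and load it into the model, instead of pointing
# deepspeed.init_inference at the training checkpoint folder.
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

state_dict = get_fp32_state_dict_from_zero_checkpoint(str(checkpoint_folder))
# strict=False because the consolidated dict may only cover the trainable (LoRA) weights.
model.load_state_dict(state_dict, strict=False)
model_engine = deepspeed.init_inference(model=model,
                                        mp_size=world_size,
                                        dtype=torch.float,
                                        replace_with_kernel_inject=False)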
Expected behavior
The validation code should run under ZeRO3 just as it does with the Baseline and ZeRO2 configurations.
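As additional context, validation does not strictly need deepspeed.init_inference; the failing code path can be sidestepped by evaluating directly on the ZeRO engine returned by deepspeed.initialize. A sketch only, reusing tokenizer, label_pad_token_id, device, and create_model_optimizer from the reproduction script above:

def validate_on_training_engine(ds_config, peft_config, valid_ds, checkpoint_folder):
    # Sketch: evaluate on the engine returned by deepspeed.initialize instead of
    # building a separate inference engine.
    model, ds_config = create_model_optimizer(ds_config, peft_config)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                           model=model,
                                           label_pad_token_id=label_pad_token_id)
    engine, _, valid_dl, _ = deepspeed.initialize(model=model,
                                                  model_parameters=model.parameters(),
                                                  training_data=valid_ds,
                                                  collate_fn=data_collator,
                                                  config=ds_config)
    engine.load_checkpoint(load_dir=checkpoint_folder)  # restore the partitioned ZeRO-3 weights
    engine.eval()
    total_loss = 0.0
    for valid_batch in valid_dl:
        with torch.no_grad():
            output = engine(input_ids=valid_batch["input_ids"].to(device),
                            attention_mask=valid_batch["attention_mask"].to(device),
                            decoder_input_ids=valid_batch["decoder_input_ids"].to(device),
                            labels=valid_batch["labels"].to(device),
                            use_cache=False,
                            return_dict=True)
        total_loss += output.loss.item()
    return total_loss / len(valid_dl)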
Did you ever figure out a solution for this?
@suri-kunal Hi, how did you fix the bug? Can you tell me?
I haven't. Just didn't try it further.
No Petros. I didn't try it further.
same problem
I think I'm hitting the same bug, although I'm using ZeRO1; I'm trying to load from a checkpoint and hitting this assert.
I'm facing the same situation as @gordicaleksa
I'm finding that some of my processes are able to load the model states while others fail to do so.
Edit: in my case I realized I am going from one node to two nodes, and the ranks that were previously not there do not have model states to pull from.
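If it helps with the multi-node case: a quick way to check which ranks actually have model state files in a checkpoint tag before calling load_checkpoint across a larger world size (a sketch based on the zero_pp_rank_*_mp_rank_*_model_states.pt naming visible in the log above; the path and tag below are placeholders):

from pathlib import Path

def ranks_with_model_states(checkpoint_folder, tag):
    # List the data-parallel ranks that have a model_states file in this checkpoint tag.
    tag_dir = Path(checkpoint_folder) / tag
    files = sorted(tag_dir.glob("zero_pp_rank_*_mp_rank_*_model_states.pt"))
    # File names look like zero_pp_rank_<rank>_mp_rank_<mp>_model_states.pt
    return [int(f.name.split("_")[3]) for f in files]

# Placeholder path and tag - substitute your own checkpoint folder and global step.
print(ranks_with_model_states("/path/to/checkpoint_folder", "global_step43"))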