[BUG] batch_size check failed with zero 2 (deepspeed v0.9.0)
Describe the bug
AssertionError: Check batch related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size 16 != 2 * 1 * 1
This error only occurs when using deepspeed v0.9.0 with zero stage 2. My code trains normally with either (deepspeed v0.9.0 + zero stage 3) or (deepspeed v0.8.3 + zero stage 2).
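For reference, the failing check is the consistency relation DeepSpeed enforces between the batch parameters. Below is a minimal sketch of that relation using the numbers from the error above; world_size = 8 is what the 8-process launch described further down should produce.
# Sketch of the relation DeepSpeed asserts, with the values reported in the error above.
train_batch_size = 16            # left-hand side of the failing assertion
micro_batch_per_gpu = 2
gradient_acc_steps = 1
world_size = 1                   # what DeepSpeedConfig computed; the launch actually uses 8 processes
expected = micro_batch_per_gpu * gradient_acc_steps * world_size
print(train_batch_size == expected)  # False: 2 * 1 * 1 == 2, not 16
# With world_size = 8 (one machine, 8 GPUs) the product would be 16 and the check would pass.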
ds_report output
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /data/alpaca-lora/finetune_accelerate.py:520 in <module> │
│ │
│ 517 │
│ 518 │
│ 519 if __name__ == "__main__": │
│ ❱ 520 │ main() │
│ 521 │
│ │
│ /data/alpaca-lora/finetune_accelerate.py:347 in main │
│ │
│ 344 │ │ eval_dataloader, │
│ 345 │ │ optimizer, │
│ 346 │ │ lr_scheduler, │
│ ❱ 347 │ ) = accelerator.prepare( │
│ 348 │ │ model, train_dataloader, eval_dataloader, optimizer, lr_scheduler │
│ 349 │ ) │
│ 350 │ accelerator.print(model) │
│ │
│ /home/chenmingrui/.local/lib/python3.10/site-packages/accelerate/accelerator.py:1118 in prepare │
│ │
│ 1115 │ │ │ old_named_params = self._get_named_parameters(*args) │
│ 1116 │ │ │
│ 1117 │ │ if self.distributed_type == DistributedType.DEEPSPEED: │
│ ❱ 1118 │ │ │ result = self._prepare_deepspeed(*args) │
│ 1119 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │
│ 1120 │ │ │ result = self._prepare_megatron_lm(*args) │
│ 1121 │ │ else: │
│ │
│ /home/chenmingrui/.local/lib/python3.10/site-packages/accelerate/accelerator.py:1415 in │
│ _prepare_deepspeed │
│ │
│ 1412 │ │ │ │ │ │ if type(scheduler).__name__ in deepspeed.runtime.lr_schedules.VA │
│ 1413 │ │ │ │ │ │ │ kwargs["lr_scheduler"] = scheduler │
│ 1414 │ │ │ │
│ ❱ 1415 │ │ │ engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs) │
│ 1416 │ │ │ if optimizer is not None: │
│ 1417 │ │ │ │ optimizer = DeepSpeedOptimizerWrapper(optimizer) │
│ 1418 │ │ │ if scheduler is not None: │
│ │
│ /home/chenmingrui/.local/lib/python3.10/site-packages/deepspeed/__init__.py:142 in initialize
│ │
│ 139 │ assert config != None, "DeepSpeed requires --deepspeed_config to specify configurati │
│ 140 │ │
│ 141 │ if not isinstance(model, PipelineModule): │
│ ❱ 142 │ │ config_class = DeepSpeedConfig(config, mpu) │
│ 143 │ │ if config_class.hybrid_engine.enabled: │
│ 144 │ │ │ engine = DeepSpeedHybridEngine(args=args, │
│ 145 │ │ │ │ │ │ │ │ │ │ model=model, │
│ │
│ /home/chenmingrui/.local/lib/python3.10/site-packages/deepspeed/runtime/config.py:764 in │
│ __init__ │
│ │
│ 761 │ │ │
│ 762 │ │ # Pass a copy so that user json is unmodified, e.g. for logging │
│ 763 │ │ self._initialize_params(copy.copy(self._param_dict)) │
│ ❱ 764 │ │ self._configure_train_batch_size() │
│ 765 │ │ self._do_sanity_check() │
│ 766 │ │
│ 767 │ def _initialize_params(self, param_dict): │
│ │
│ /home/chenmingrui/.local/lib/python3.10/site-packages/deepspeed/runtime/config.py:935 in │
│ _configure_train_batch_size │
│ │
│ 932 │ │
│ 933 │ def _configure_train_batch_size(self): │
│ 934 │ │ self._set_batch_related_parameters() │
│ ❱ 935 │ │ self._batch_assertion() │
│ 936 │ │
│ 937 │ def _do_sanity_check(self): │
│ 938 │ │ self._do_error_check() │
│ │
│ /home/chenmingrui/.local/lib/python3.10/site-packages/deepspeed/runtime/config.py:883 in │
│ _batch_assertion │
│ │
│ 880 │ │ │
│ 881 │ │ assert (grad_acc > 0), f"Gradient accumulation steps: {grad_acc} has to be great │
│ 882 │ │ │
│ ❱ 883 │ │ assert train_batch == micro_batch * grad_acc * self.world_size, ( │
│ 884 │ │ │ f"Check batch related parameters. train_batch_size is not equal " │
│ 885 │ │ │ "to micro_batch_per_gpu * gradient_acc_step * world_size " │
│ 886 │ │ │ f"{train_batch} != {micro_batch} * {grad_acc} * {self.world_size}") │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AssertionError: Check batch related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size 16 != 2 * 1 * 1
System info (please complete the following information):
- OS: Ubuntu 22.04
- GPU count and types: 1 machine with 8 GPUs (types not specified)
- Python version: 3.10
- deepspeed: 0.9.0
- accelerate: 0.18.0
Launcher context
accelerate launch --config_file $ACCELERATE_CONFIG_NAME --deepspeed_config_file $DEEPSPEED_CONFIG_NAME --num_processes 8 --zero3_init_flag true alpaca-lora/finetune_accelerate.py
Additional context
deepspeed config:
{
"fp16": {
"enabled": true,
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": false
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"reduce_bucket_size": 205520896,
"stage3_prefetch_bucket_size": 184968807,
"stage3_param_persistence_threshold": 143360,
"sub_group_size": 1e9,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": 1,
"wall_clock_breakdown": false
}
accelerate config:
compute_environment: LOCAL_MACHINE
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
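As a quick sanity check on what accelerate is about to hand to deepspeed.initialize, the batch-related fields can be printed just before accelerator.prepare. This is only a sketch: it assumes the script is started with accelerate launch and that accelerate's DeepSpeedPlugin exposes the loaded config as the deepspeed_config dict (true for recent accelerate releases).
# Hedged diagnostic sketch: show the batch fields accelerate read from $DEEPSPEED_CONFIG_NAME
# before the "auto" values are resolved inside accelerator.prepare(...).
from accelerate import Accelerator

accelerator = Accelerator()
ds_config = accelerator.state.deepspeed_plugin.deepspeed_config  # plain dict holding the JSON above
for key in ("train_batch_size", "train_micro_batch_size_per_gpu", "gradient_accumulation_steps"):
    accelerator.print(key, "=", ds_config.get(key))
accelerator.print("num_processes =", accelerator.num_processes)  # should report 8 for this launch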
Same problem with zero 2 and deepspeed==0.9.0, same accelerate config (1 machine and 8 GPUs). And I have a doubt: in (..deepspeed/runtime/config.py), shouldn't self.world_size equal num_machines * num_processes = 8 instead of 1?
Following the thread: same issue with zero stage 2 + deepspeed==0.9.0. Zero stage 3 does not have this issue. (Falling back to deepspeed==0.8.2 fixed it.)
same problem with zero 2 and deepspeed=0.9.0
I have investigated the bug, and here are my findings:
- accelerate calls deepspeed.initialize
- deepspeed.initialize creates a DeepSpeedConfig object and then passes it to the DeepSpeedEngine
- DeepSpeedEngine initializes DeepSpeed communications through deepspeed.dist.init_distributed
if not isinstance(model, PipelineModule):
    config_class = DeepSpeedConfig(config, mpu)
    if config_class.hybrid_engine.enabled:
        engine = DeepSpeedHybridEngine(args=args,
                                       model=model,
                                       optimizer=optimizer,
                                       model_parameters=model_parameters,
                                       training_data=training_data,
                                       lr_scheduler=lr_scheduler,
                                       mpu=mpu,
                                       dist_init_required=dist_init_required,
                                       collate_fn=collate_fn,
                                       config=config,
                                       config_class=config_class)
    else:
        engine = DeepSpeedEngine(args=args,
                                 model=model,
                                 optimizer=optimizer,
                                 model_parameters=model_parameters,
                                 training_data=training_data,
                                 lr_scheduler=lr_scheduler,
                                 mpu=mpu,
                                 dist_init_required=dist_init_required,
                                 collate_fn=collate_fn,
                                 config=config,
                                 config_class=config_class)
The error is caused by DeepSpeedConfig expecting init_distributed to have already been called, but this only happens in the next step. This always raises an exception here:
try:
    self.global_rank = dist.get_rank()
    if mpu is None:
        self.world_size = dist.get_world_size()
    else:
        self.world_size = mpu.get_data_parallel_world_size()
except:
    self.global_rank = 0
    self.world_size = 1
which means that the world_size is always set to 1, no matter what.
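Based on this analysis, one possible stopgap until a proper fix lands (a sketch only, not the official fix, and untested against this exact setup) is to make sure the default process group exists before DeepSpeedConfig is constructed, so that dist.get_world_size() succeeds instead of falling into the except branch:
# Hedged workaround sketch: initialize torch.distributed early in the training script,
# before accelerator.prepare(...) / deepspeed.initialize(...) is reached, so the
# try/except shown above reads the real world size instead of defaulting to 1.
import deepspeed

deepspeed.init_distributed()  # reuses the existing process group if one is already set up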
@Yard1 thank you for investigating this. This is related to recent changes we made for the release of DeepSpeed-Chat. I'll find a solution and get that merged soon. In the meantime, please use deepspeed<0.9.0.
@chenmingjiong (or @Yard1 ) do you have a script I can use to replicate this error? I believe I have a fix, but I want to test if it addresses the error that each of you were seeing. Thanks!
@mrwyattii We have caught this bug as part of our CI suite. If you can provide me with a link to a PR/branch containing your changes that I can install, I can run it on my end.
Alternatively, you can install Ray and run https://github.com/ray-project/ray/blob/master/python/ray/train/tests/test_accelerate_trainer_gpu.py
Thank you @Yard1 I will test on my end. PR is #3324 if you would like to try as well
@Yard1 could you provide some more direction on how to run these unit tests? I tried following the docs (https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#testing) and did:
pip install -c python/requirements.txt -r python/requirements_test.txt
python -m pytest -v -s python/ray/train/tests/test_accelerate_trainer_gpu.py
But I'm seeing the following import error:
ImportError while loading conftest '/home/michaelwyatt/repos/DeepSpeed/ray/python/ray/train/tests/conftest.py'.
python/ray/train/tests/conftest.py:2: in <module>
from ray.tests.conftest import pytest_runtest_makereport # noqa
E ModuleNotFoundError: No module named 'ray.tests'
Make sure to install Ray itself as well
@chenmingjiong this issue should now be resolved after #3324 was merged. Please update to the latest release of DeepSpeed. Closing the issue, but please reopen if the issue persists.
With deepspeed 0.9.2, stage 3 + offload, multi-GPU, this error also occurs:
AssertionError: Check batch related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size 256 != 2 * 10 * 12
"""deepspeed --hostfile hostfile --num_nodes 2 main.py --train_batch_size 2 --ppo_train_batch_size 2 --gradient_accumulation_steps 16 --actor_zero_stage 3 --critic_zero_stage 3 -- """
Same for me
same here
The same error occurs even with DeepSpeed 0.10.0 using a Ray cluster. I submitted a job to 2 workers with micro-batch per worker = 2 and grad_acc_step = 2. The number on the left of the equality (8) seems to be workers * batch per GPU * acc steps. On the right of the equality, the number '1' at the rightmost position remains the same no matter what.
AssertionError: Check batch-related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size 8 != 2 * 2 * 1
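That matches the failure mode described earlier in the thread: the rightmost factor is the world_size that DeepSpeedConfig falls back to when the process group is not initialized yet. A small diagnostic sketch (run inside each training worker just before deepspeed.initialize is reached; the environment variable names are the standard torch.distributed ones):
# Hedged diagnostic: if this prints False, DeepSpeedConfig will hit its except branch
# and use world_size = 1, which is the trailing "1" in the assertion message.
import os
import torch.distributed as dist

print("torch.distributed initialized:", dist.is_initialized())
print("WORLD_SIZE =", os.environ.get("WORLD_SIZE"), "RANK =", os.environ.get("RANK"))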
I am experiencing the same issue with zero3 on a single node with 4x 3090s.
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
With the zero stage 2 config below I do not have this problem:
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
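For anyone hitting this with all-"auto" configs like the two above, a small hypothetical pre-check (not part of DeepSpeed; names and numbers below are illustrative) can surface the mismatch with concrete values before deepspeed.initialize is reached, assuming the three fields have already been resolved to integers:
# Hypothetical helper: validate the resolved batch fields against the intended world size
# and fail with the actual numbers instead of deep inside DeepSpeedConfig.
def check_batch_fields(ds_config: dict, world_size: int) -> None:
    micro = ds_config["train_micro_batch_size_per_gpu"]
    acc = ds_config["gradient_accumulation_steps"]
    train = ds_config["train_batch_size"]
    expected = micro * acc * world_size
    if train != expected:
        raise ValueError(
            f"train_batch_size={train}, but micro({micro}) * acc({acc}) * world_size({world_size}) = {expected}"
        )

# Illustrative values for a single node with 4 GPUs, micro batch 2, 4 accumulation steps:
check_batch_fields(
    {"train_micro_batch_size_per_gpu": 2, "gradient_accumulation_steps": 4, "train_batch_size": 32},
    world_size=4,
)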
same here
same here
Also getting wrong self.world_size when using zero3 while zero2 is alright in the same environment (8xA100)
i have deepspeed==0.11.1, transformers==4.34.1, accelerate==0.24.0 and ray==2.7.1
same here @mrwyattii
Same
With the latest version: AssertionError: Check batch related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size 10 != 1 * 1 * 5
Seeing the same issue as well
> Also getting wrong self.world_size when using zero3 while zero2 is alright in the same environment (8xA100)
> i have deepspeed==0.11.1, transformers==4.34.1, accelerate==0.24.0 and ray==2.7.1

Hi, did you solve this problem? I still encounter this issue with zero3 while using the latest version of deepspeed.
same question with deepspeed==0.14.0, accelerate==0.27.2