OutOfMemoryError: CUDA out of memory. -- train dolly v2
command: python training/trainer.py --input-model /home/xx/pythia-2.8b --local-output-dir /home/xx/dolly/train --deepspeed config/ds_z3_bf16_config.json --warmup-steps 0 env: Ubuntu 20.04 + 8 × A100 (80 GB) + Python 3.10.10
Traceback (most recent call last):
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/xx/tasks/dolly/dolly/training/trainer.py:339 in <module> │
│ │
│ 336 │ │ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, d │
│ 337 │ ) │
│ 338 │ try: │
│ ❱ 339 │ │ main() │
│ 340 │ except Exception: │
│ 341 │ │ logger.exception("main failed") │
│ 342 │ │ raise │
│ │
│ /home/user/.local/lib/python3.10/site-packages/click/core.py:1130 in __call__ │
│ │
│ 1127 │ │
│ 1128 │ def __call__(self, *args: t.Any, **kwargs: t.Any) -> t.Any: │
│ 1129 │ │ """Alias for :meth:`main`.""" │
│ ❱ 1130 │ │ return self.main(*args, **kwargs) │
│ 1131 │
│ 1132 │
│ 1133 class Command(BaseCommand): │
│ │
│ /home/user/.local/lib/python3.10/site-packages/click/core.py:1055 in main │
│ │
│ 1052 │ │ try: │
│ 1053 │ │ │ try: │
│ 1054 │ │ │ │ with self.make_context(prog_name, args, **extra) as ctx: │
│ ❱ 1055 │ │ │ │ │ rv = self.invoke(ctx) │
│ 1056 │ │ │ │ │ if not standalone_mode: │
│ 1057 │ │ │ │ │ │ return rv │
│ 1058 │ │ │ │ │ # it's not safe to `ctx.exit(rv)` here! │
│ │
│ /home/user/.local/lib/python3.10/site-packages/click/core.py:1404 in invoke │
│ │
│ 1401 │ │ │ echo(style(message, fg="red"), err=True) │
│ 1402 │ │ │
│ 1403 │ │ if self.callback is not None: │
│ ❱ 1404 │ │ │ return ctx.invoke(self.callback, **ctx.params) │
│ 1405 │ │
│ 1406 │ def shell_complete(self, ctx: Context, incomplete: str) -> t.List["CompletionItem"]: │
│ 1407 │ │ """Return a list of completions for the incomplete value. Looks │
│ │
│ /home/user/.local/lib/python3.10/site-packages/click/core.py:760 in invoke │
│ │
│ 757 │ │ │
│ 758 │ │ with augment_usage_errors(__self): │
│ 759 │ │ │ with ctx: │
│ ❱ 760 │ │ │ │ return __callback(*args, **kwargs) │
│ 761 │ │
│ 762 │ def forward( │
│ 763 │ │ __self, __cmd: "Command", *args: t.Any, **kwargs: t.Any # noqa: B902 │
│ │
│ /home/xx/tasks/dolly/dolly/training/trainer.py:331 in main │
│ │
│ 328 ) │
│ 329 @click.option("--bf16", type=bool, default=True, help="Whether to use bf16 (preferred on │
│ 330 def main(**kwargs): │
│ ❱ 331 │ train(**kwargs) │
│ 332 │
│ 333 │
│ 334 if __name__ == "__main__": │
│ │
│ /home/xx/tasks/dolly/dolly/training/trainer.py:287 in train │
│ │
│ 284 │ ) │
│ 285 │ │
│ 286 │ logger.info("Training") │
│ ❱ 287 │ trainer.train() │
│ 288 │ │
│ 289 │ logger.info(f"Saving Model to {local_output_dir}") │
│ 290 │ trainer.save_model(output_dir=local_output_dir) │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/transformers/trainer.py:1527 in │
│ train │
│ │
│ 1524 │ │ inner_training_loop = find_executable_batch_size( │
│ 1525 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1526 │ │ ) │
│ ❱ 1527 │ │ return inner_training_loop( │
│ 1528 │ │ │ args=args, │
│ 1529 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1530 │ │ │ trial=trial, │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/transformers/trainer.py:1775 in │
│ _inner_training_loop │
│ │
│ 1772 │ │ │ │ │ with model.no_sync(): │
│ 1773 │ │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1774 │ │ │ │ else: │
│ ❱ 1775 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1776 │ │ │ │ │
│ 1777 │ │ │ │ if ( │
│ 1778 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/transformers/trainer.py:2539 in │
│ training_step │
│ │
│ 2536 │ │ │ │ scaled_loss.backward() │
│ 2537 │ │ elif self.deepspeed: │
│ 2538 │ │ │ # loss gets scaled under gradient_accumulation_steps in deepspeed │
│ ❱ 2539 │ │ │ loss = self.deepspeed.backward(loss) │
│ 2540 │ │ else: │
│ 2541 │ │ │ loss.backward() │
│ 2542 │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/deepspeed/utils/nvtx.py:11 in │
│ wrapped_fn │
│ │
│ 8 │ │ │
│ 9 │ │ def wrapped_fn(*args, **kwargs): │
│ 10 │ │ │ with torch.cuda.nvtx.range(func.__qualname__): │
│ ❱ 11 │ │ │ │ return func(*args, **kwargs) │
│ 12 │ │ │
│ 13 │ │ return wrapped_fn │
│ 14 │ else: │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/deepspeed/runtime/engine.py:1969 in │
│ backward │
│ │
│ 1966 │ │ if self.zero_optimization(): │
│ 1967 │ │ │ self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumula │
│ 1968 │ │ │ ) │
│ ❱ 1969 │ │ │ self.optimizer.backward(loss, retain_graph=retain_graph) │
│ 1970 │ │ elif self.amp_enabled(): │
│ 1971 │ │ │ # AMP requires delaying unscale when inside gradient accumulation boundaries │
│ 1972 │ │ │ # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-i │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/deepspeed/utils/nvtx.py:11 in │
│ wrapped_fn │
│ │
│ 8 │ │ │
│ 9 │ │ def wrapped_fn(*args, **kwargs): │
│ 10 │ │ │ with torch.cuda.nvtx.range(func.__qualname__): │
│ ❱ 11 │ │ │ │ return func(*args, **kwargs) │
│ 12 │ │ │
│ 13 │ │ return wrapped_fn │
│ 14 │ else: │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py:20 │
│ 83 in backward │
│ │
│ 2080 │ │ │ scaled_loss = self.external_loss_scale * loss │
│ 2081 │ │ │ scaled_loss.backward() │
│ 2082 │ │ else: │
│ ❱ 2083 │ │ │ self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) │
│ 2084 │ │ │
│ 2085 │ │ self._get_param_coordinator(training=True).reset_step() │
│ 2086 │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler. │
│ py:51 in backward │
│ │
│ 48 │ │
│ 49 │ def backward(self, loss, retain_graph=False): │
│ 50 │ │ scaled_loss = loss * self.loss_scale │
│ ❱ 51 │ │ scaled_loss.backward(retain_graph=retain_graph) │
│ 52 │
│ 53 │
│ 54 class LossScaler(LossScalerBase): │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/torch/_tensor.py:488 in backward │
│ │
│ 485 │ │ │ │ create_graph=create_graph, │
│ 486 │ │ │ │ inputs=inputs, │
│ 487 │ │ │ ) │
│ ❱ 488 │ │ torch.autograd.backward( │
│ 489 │ │ │ self, gradient, retain_graph, create_graph, inputs=inputs │
│ 490 │ │ ) │
│ 491 │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/torch/autograd/__init__.py:197 in │
│ backward │
│ │
│ 194 │ # The reason we repeat same the comment below is that │
│ 195 │ # some Python versions print out the first line of a multi-line function │
│ 196 │ # calls in the traceback and some print out the last line │
│ ❱ 197 │ Variable._execution_engine.run_backward( # Calls into the C++ engine to run the bac │
│ 198 │ │ tensors, grad_tensors_, retain_graph, create_graph, inputs, │
│ 199 │ │ allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to ru │
│ 200 │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/torch/autograd/function.py:267 in │
│ apply │
│ │
│ 264 │ │ │ │ │ │ │ "Function is not allowed. You should only implement one " │
│ 265 │ │ │ │ │ │ │ "of them.") │
│ 266 │ │ user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn │
│ ❱ 267 │ │ return user_fn(self, *args) │
│ 268 │ │
│ 269 │ def apply_jvp(self, *args): │
│ 270 │ │ # _forward_cls is defined by derived class │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/torch/utils/checkpoint.py:157 in │
│ backward │
│ │
│ 154 │ │ │ raise RuntimeError( │
│ 155 │ │ │ │ "none of output has requires_grad=True," │
│ 156 │ │ │ │ " this checkpoint() is not necessary") │
│ ❱ 157 │ │ torch.autograd.backward(outputs_with_grad, args_with_grad) │
│ 158 │ │ grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else None │
│ 159 │ │ │ │ │ for inp in detached_inputs) │
│ 160 │
│ │
│ /home/xx/conda/envs/dolly/lib/python3.10/site-packages/torch/autograd/__init__.py:197 in │
│ backward │
│ │
│ 194 │ # The reason we repeat same the comment below is that │
│ 195 │ # some Python versions print out the first line of a multi-line function │
│ 196 │ # calls in the traceback and some print out the last line │
│ ❱ 197 │ Variable._execution_engine.run_backward( # Calls into the C++ engine to run the bac │
│ 198 │ │ tensors, grad_tensors_, retain_graph, create_graph, inputs, │
│ 199 │ │ allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to ru │
│ 200 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
OutOfMemoryError: CUDA out of memory. Tried to allocate 716.00 MiB (GPU 0; 79.15 GiB total capacity; 44.98 GiB already allocated;
320.44 MiB free; 46.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to
avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Your problem may be that you are missing the deepspeed command. Check the notebook for an example of how it is called.
Your problem may be that you are missing the deepspeed command. Check the notebook for an example of how it is called.
I am using the default config/ds_z3_bf16_config.json file from the repository. I am a novice at this; could you give me a configuration example? Thank you.
You are running with python, not deepspeed. See train_dolly.py and just follow it exactly.
python training/trainer.py --input-model /home/xx/pythia-2.8b --local-output-dir /home/xx/dolly/train --deepspeed config/ds_z3_bf16_config.json --warmup-steps 0
Changing python to deepspeed worked for me. Thanks (´▽`) @srowen @matthayes
deepspeed training/trainer.py --input-model /home/xx/pythia-2.8b --local-output-dir /home/xx/dolly/train --deepspeed config/ds_z3_bf16_config.json --warmup-steps 0