DeepSpeed
DeepSpeed copied to clipboard
RuntimeError: cannot pin 'CUDABFloat16Type' only dense CPU tensors can be pinned
Describe the bug
self.param_groups_fp16_flat_cpu_memory.append(get_accelerator().pin_memory(
File "/usr/local/python/lib/python3.8/site-packages/deepspeed/accelerator/cuda_accelerator.py", line 291, in pin_memory
self.param_groups_fp16_flat_cpu_memory.append(get_accelerator().pin_memory(
File "/usr/local/python/lib/python3.8/site-packages/deepspeed/accelerator/cuda_accelerator.py", line 291, in pin_memory
return tensor.pin_memory()
RuntimeError: cannot pin 'CUDABFloat16Type' only dense CPU tensors can be pinned
return tensor.pin_memory()
When I use DeepSpeed ZeRO-3 to train a model, I run into this problem. Can someone help me?
@cooper12121, please share repro steps, such as scripts, full stack trace, and ds_config.
@cooper12121, please share repro steps, such as scripts, full stack trace, and ds_config.
I use the transformers Trainer to train my model. Here is the full error report:
11.219.19.47: File "pretrain.py", line 591, in train
11.219.19.47: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.133: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: return inner_training_loop(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.133: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47:
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: return inner_training_loop(trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.47: return inner_training_loop(
11.219.19.47:
11.219.19.133: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.133: return inner_training_loop(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.133: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.133: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.133:
11.219.19.47: return inner_training_loop(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.133: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.47: return inner_training_loop(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.133: trainer.train(resume_from_checkpoint = resume_from_checkpoint_dir)return inner_training_loop(
11.219.19.133:
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1859, in train
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: return inner_training_loop(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: result = self._prepare_deepspeed(*args)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: return inner_training_loop(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.133: return inner_training_loop(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: return inner_training_loop(
11.219.19.133: return inner_training_loop(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.133: return inner_training_loop(return inner_training_loop(return inner_training_loop(
11.219.19.133:
11.219.19.133:
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.47: result = self._prepare_deepspeed(*args)engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.47:
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.133: return inner_training_loop(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 2015, in _inner_training_loop
11.219.19.133: result = self._prepare_deepspeed(*args)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: result = self._prepare_deepspeed(*args)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: engine = DeepSpeedEngine(args=args,
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(result = self._prepare_deepspeed(*args)
11.219.19.47:
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: self._configure_optimizer(optimizer, model_parameters)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.133: result = self._prepare_deepspeed(*args)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: result = self._prepare_deepspeed(*args)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: result = self._prepare_deepspeed(*args)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.47:
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.133: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: self.optimizer = self._configure_zero_optimizer(basic_optimizer)result = self._prepare_deepspeed(*args)
11.219.19.47:
11.219.19.47: engine = DeepSpeedEngine(args=args,engine = DeepSpeedEngine(args=args, File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: result = self._prepare_deepspeed(*args)
11.219.19.47:
11.219.19.47:
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133: engine = DeepSpeedEngine(args=args,
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.133: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1220, in prepare
11.219.19.47: self._configure_optimizer(optimizer, model_parameters)self._configure_optimizer(optimizer, model_parameters)
11.219.19.47:
11.219.19.133: result = self._prepare_deepspeed(*args)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: result = self._prepare_deepspeed(*args)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: result = self._prepare_deepspeed(*args)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: result = self._prepare_deepspeed(*args)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.47: engine = DeepSpeedEngine(args=args,
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.133: result = self._prepare_deepspeed(*args)engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133:
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: self._configure_optimizer(optimizer, model_parameters)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47: engine = DeepSpeedEngine(args=args,
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.133: engine = DeepSpeedEngine(args=args,
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: engine = DeepSpeedEngine(args=args,
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: optimizer = DeepSpeedZeroOptimizer_Stage3(
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 317, in __init__
11.219.19.133: result = self._prepare_deepspeed(*args)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/accelerate/accelerator.py", line 1605, in _prepare_deepspeed
11.219.19.133: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: engine = DeepSpeedEngine(args=args,
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.133: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.47: self._configure_optimizer(optimizer, model_parameters) self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.47: self._configure_optimizer(optimizer, model_parameters)
11.219.19.47:
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47: self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 708, in _create_fp16_partitions_with_defragmentation
11.219.19.47: engine = DeepSpeedEngine(args=args,
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.133: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.133: engine = DeepSpeedEngine(args=args,
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47: self._configure_optimizer(optimizer, model_parameters)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47: engine = DeepSpeedEngine(args=args,
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: self._create_param_groups_fp16_flat_cpu_memory()
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_param_groups_fp16_flat_cpu_memory
11.219.19.133: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.133: engine = DeepSpeedEngine(args=args, File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.133:
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.133: engine = DeepSpeedEngine(args=args,
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: self._configure_optimizer(optimizer, model_parameters)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: engine = DeepSpeedEngine(args=args,
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.47: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
11.219.19.47: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/__init__.py", line 176, in initialize
11.219.19.47: optimizer = DeepSpeedZeroOptimizer_Stage3(self.param_groups_fp16_flat_cpu_memory.append(get_accelerator().pin_memory(optimizer = DeepSpeedZeroOptimizer_Stage3(
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.47:
11.219.19.47:
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 317, in __init__
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/accelerator/cuda_accelerator.py", line 291, in pin_memory
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 317, in __init__
11.219.19.133: engine = DeepSpeedEngine(args=args,
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 307, in __init__
11.219.19.133: optimizer = DeepSpeedZeroOptimizer_Stage3(
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 317, in __init__
11.219.19.47: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.47: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: self.optimizer = self._configure_zero_optimizer(basic_optimizer)
11.219.19.47: return tensor.pin_memory()
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1579, in _configure_zero_optimizer
11.219.19.133: self._configure_optimizer(optimizer, model_parameters)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1256, in _configure_optimizer
11.219.19.133: self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
11.219.19.133: File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 708, in _create_fp16_partitions_with_defragmentation
11.219.19.47: optimizer = DeepSpeedZeroOptimizer_Stage3(
11.219.19.47: self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups) RuntimeError File "/usr/local/python/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 317, in __init__
11.219.19.47: self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
11.219.19.47: : cannot pin 'CUDABFloat16Type' only dense CPU tensors can be pinned
This error occurred when I added the following code to my .py file:
torch.set_default_tensor_type(torch.cuda.HalfTensor)
My DeepSpeed ZeRO-3 config is as follows:
{
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 20,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
My package versions are as follows:
torch:2.0.1
transformers: 4.40.0
deepspeed: 0.14.0
My OS and machine information is as follows:
OS: CST 2020 x86_64 x86_64 x86_64 GNU/Linux
GPU: A100-SXM4-40GB