
[DeepSpeedExamples/training/HelloDeepSpeed] Fails to run if onnxruntime-training is installed

Open · wschin opened this issue 1 year ago · 0 comments

Repro: execute `bash run.sh` from `DeepSpeedExamples/training/HelloDeepSpeed`.
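The environment in the logs below has the repo checked out under /home, so concretely the commands were roughly the following (the path is taken from the log; adjust it to your checkout):

```bash
cd /home/DeepSpeedExamples/training/HelloDeepSpeed
bash run.sh   # the log below was captured with `sh run_ds.sh` from the same directory
```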

The error:

```
root@9824d79a444b:/home/DeepSpeedExamples/training/HelloDeepSpeed# sh run_ds.sh
[2024-04-15 21:59:04,124] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Traceback (most recent call last):
  File "/usr/local/bin/deepspeed", line 3, in <module>
    from deepspeed.launcher.runner import main
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 25, in <module>
    from . import ops
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/__init__.py", line 6, in <module>
    from . import adam
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/__init__.py", line 6, in <module>
    from .cpu_adam import DeepSpeedCPUAdam
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/cpu_adam.py", line 8, in <module>
    from deepspeed.utils import logger
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/__init__.py", line 10, in <module>
    from .groups import *
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/groups.py", line 28, in <module>
    from deepspeed import comm as dist
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/__init__.py", line 7, in <module>
    from .comm import *
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 31, in <module>
    from deepspeed.comm.ccl import CCLBackend
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/ccl.py", line 12, in <module>
    from .torch import TorchBackend
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 80, in <module>
    class TorchBackend(Backend):
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 105, in TorchBackend
    def get_all_gather_function(self):
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/compiler.py", line 21, in disable
    return torch.compiler.disable(func)
  File "/home/pytorch/torch/compiler/__init__.py", line 96, in disable
    import torch._dynamo
  File "/home/pytorch/torch/_dynamo/__init__.py", line 2, in <module>
    from . import convert_frame, eval_frame, resume_execution
  File "/home/pytorch/torch/_dynamo/convert_frame.py", line 41, in <module>
    from . import config, exc, trace_rules
  File "/home/pytorch/torch/_dynamo/trace_rules.py", line 51, in <module>
    from .variables import (
  File "/home/pytorch/torch/_dynamo/variables/__init__.py", line 38, in <module>
    from .higher_order_ops import (
  File "/home/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 13, in <module>
    import torch.onnx.operators
  File "/home/pytorch/torch/onnx/__init__.py", line 61, in <module>
    from ._internal.onnxruntime import (
  File "/home/pytorch/torch/onnx/_internal/onnxruntime.py", line 37, in <module>
    import onnxruntime  # type: ignore[import]
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/__init__.py", line 54, in <module>
    from onnxruntime.capi import onnxruntime_validation
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 145, in <module>
    has_ortmodule, package_name, version, cuda_version = validate_build_package_info()
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 140, in validate_build_package_info
    raise import_ortmodule_exception
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 70, in validate_build_package_info
    from onnxruntime.training.ortmodule import ORTModule  # noqa: F401
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/__init__.py", line 26, in <module>
    from .ortmodule import ORTModule  # noqa: F401
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/__init__.py", line 132, in <module>
    from .ortmodule import ORTModule  # noqa: E402, F401
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/ortmodule.py", line 8, in <module>
    from ._torch_module_factory import TorchModuleFactory
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_torch_module_factory.py", line 8, in <module>
    from ._torch_module_ort import TorchModuleORT
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_torch_module_ort.py", line 13, in <module>
    from ._graph_execution_manager_factory import GraphExecutionManagerFactory
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_graph_execution_manager_factory.py", line 10, in <module>
    from ._inference_manager import InferenceManager
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_inference_manager.py", line 17, in <module>
    from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_graph_execution_manager.py", line 23, in <module>
    from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/utils/hooks/__init__.py", line 19, in <module>
    from ._zero_offload_subscriber import ZeROOffloadSubscriber, configure_ort_compatible_zero_stage3
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/utils/hooks/_zero_offload_subscriber.py", line 141, in <module>
    from deepspeed.runtime.zero.parameter_offload import *  # noqa: F403
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/__init__.py", line 6, in <module>
    from .partition_parameters import ZeroParamType
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 22, in <module>
    from .linear import zero3_linear_wrap
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/linear.py", line 25, in <module>
    from deepspeed.runtime.utils import noop_decorator
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/utils.py", line 12, in <module>
    from deepspeed.moe.utils import is_moe_param
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/utils.py", line 12, in <module>
    from .layer import MoE
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/layer.py", line 14, in <module>
    from .sharded_moe import MOELayer, TopKGate
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/sharded_moe.py", line 96, in <module>
    class _AllToAll(torch.autograd.Function):
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/sharded_moe.py", line 99, in _AllToAll
    def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor) -> Tensor:  # type: ignore
AttributeError: partially initialized module 'deepspeed.comm' has no attribute 'ProcessGroup' (most likely due to a circular import)
```

After uninstalling onnxruntime-training, the import error goes away, but the example then fails with an NCCL error.
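For reference, the workaround was nothing more than removing the package (assuming it was installed via pip under this name):

```bash
# Removing the training build of onnxruntime breaks the import cycle: torch.onnx
# then no longer pulls in onnxruntime.training.ortmodule, which imports deepspeed
# back while deepspeed is still being imported.
pip uninstall -y onnxruntime-training
```

The error that shows up afterwards: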

```
[rank7]: Traceback (most recent call last):
[rank7]:   File "/home/DeepSpeedExamples/training/pipeline_parallelism/train.py", line 159, in <module>
[rank7]:     train_pipe(args)
[rank7]:   File "/home/DeepSpeedExamples/training/pipeline_parallelism/train.py", line 139, in train_pipe
[rank7]:     engine, _, _, _ = deepspeed.initialize(
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 196, in initialize
[rank7]:     engine = PipelineEngine(args=args,
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/pipe/engine.py", line 69, in __init__
[rank7]:     super().__init__(*super_args, **super_kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 262, in __init__
[rank7]:     self._configure_distributed_model(model)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1157, in _configure_distributed_model
[rank7]:     self._broadcast_model()
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1077, in _broadcast_model
[rank7]:     dist.broadcast(p.data, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
[rank7]:     return func(*args, **kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 224, in broadcast
[rank7]:     return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank7]:   File "/home/pytorch/torch/_dynamo/eval_frame.py", line 410, in _fn
[rank7]:     return fn(*args, **kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 185, in broadcast
[rank7]:     return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank7]:   File "/home/pytorch/torch/distributed/c10d_logger.py", line 78, in wrapper
[rank7]:     return func(*args, **kwargs)
[rank7]:   File "/home/pytorch/torch/distributed/distributed_c10d.py", line 2144, in broadcast
[rank7]:     work = group.broadcast([tensor], opts)
[rank7]: torch.distributed.DistBackendError: NCCL error in: /home/pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2028, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.20.5
[rank7]: ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
[rank7]: Last error:
[rank7]: Error while creating shared memory segment /dev/shm/nccl-JFI57y (size 5767520)
```

Ranks 0-6 fail with identical tracebacks; only the shared-memory segment name differs (/dev/shm/nccl-8AJ7Mw, /dev/shm/nccl-xaFDUZ, /dev/shm/nccl-c8G2vd, /dev/shm/nccl-ByCsYh, /dev/shm/nccl-RJem58, /dev/shm/nccl-NloqKs, /dev/shm/nccl-V76VEU).
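The second failure does not look specific to DeepSpeedExamples: "Error while creating shared memory segment /dev/shm/nccl-*" is the usual symptom of a container with too little /dev/shm for NCCL. A few hedged checks and workarounds (the 8g value below is an arbitrary example, not a project recommendation):

```bash
# How much shared memory does the container actually have? Docker defaults to 64MB.
df -h /dev/shm

# Re-run the failing script with NCCL debug logging, as the error message suggests.
NCCL_DEBUG=INFO bash run.sh

# Start the container with a larger /dev/shm, e.g.:
#   docker run --shm-size=8g ...
# or, as a slower fallback, disable NCCL's shared-memory transport:
NCCL_SHM_DISABLE=1 bash run.sh
```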

wschin · Apr 15 '24 22:04