Repro: run bash run.sh from DeepSpeedExamples/training/HelloDeepSpeed.

Error:
root@9824d79a444b:/home/DeepSpeedExamples/training/HelloDeepSpeed# sh run_ds.sh
[2024-04-15 21:59:04,124] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Traceback (most recent call last):
File "/usr/local/bin/deepspeed", line 3, in
from deepspeed.launcher.runner import main
File "/usr/local/lib/python3.10/dist-packages/deepspeed/init.py", line 25, in
from . import ops
File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/init.py", line 6, in
from . import adam
File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/init.py", line 6, in
from .cpu_adam import DeepSpeedCPUAdam
File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/cpu_adam.py", line 8, in
from deepspeed.utils import logger
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/init.py", line 10, in
from .groups import *
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/groups.py", line 28, in
from deepspeed import comm as dist
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/init.py", line 7, in
from .comm import *
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 31, in
from deepspeed.comm.ccl import CCLBackend
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/ccl.py", line 12, in
from .torch import TorchBackend
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 80, in
class TorchBackend(Backend):
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 105, in TorchBackend
def get_all_gather_function(self):
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/compiler.py", line 21, in disable
return torch.compiler.disable(func)
File "/home/pytorch/torch/compiler/init.py", line 96, in disable
import torch._dynamo
File "/home/pytorch/torch/_dynamo/init.py", line 2, in
from . import convert_frame, eval_frame, resume_execution
File "/home/pytorch/torch/_dynamo/convert_frame.py", line 41, in
from . import config, exc, trace_rules
File "/home/pytorch/torch/_dynamo/trace_rules.py", line 51, in
from .variables import (
File "/home/pytorch/torch/_dynamo/variables/init.py", line 38, in
from .higher_order_ops import (
File "/home/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 13, in
import torch.onnx.operators
File "/home/pytorch/torch/onnx/init.py", line 61, in
from ._internal.onnxruntime import (
File "/home/pytorch/torch/onnx/_internal/onnxruntime.py", line 37, in
import onnxruntime # type: ignore[import]
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/init.py", line 54, in
from onnxruntime.capi import onnxruntime_validation
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 145, in
has_ortmodule, package_name, version, cuda_version = validate_build_package_info()
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 140, in validate_build_package_info
raise import_ortmodule_exception
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 70, in validate_build_package_info
from onnxruntime.training.ortmodule import ORTModule # noqa: F401
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/init.py", line 26, in
from .ortmodule import ORTModule # noqa: F401
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/init.py", line 132, in
from .ortmodule import ORTModule # noqa: E402, F401
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/ortmodule.py", line 8, in
from ._torch_module_factory import TorchModuleFactory
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_torch_module_factory.py", line 8, in
from ._torch_module_ort import TorchModuleORT
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_torch_module_ort.py", line 13, in
from ._graph_execution_manager_factory import GraphExecutionManagerFactory
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_graph_execution_manager_factory.py", line 10, in
from ._inference_manager import InferenceManager
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_inference_manager.py", line 17, in
from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_graph_execution_manager.py", line 23, in
from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/utils/hooks/init.py", line 19, in
from ._zero_offload_subscriber import ZeROOffloadSubscriber, configure_ort_compatible_zero_stage3
File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/utils/hooks/_zero_offload_subscriber.py", line 141, in
from deepspeed.runtime.zero.parameter_offload import * # noqa: F403
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/init.py", line 6, in
from .partition_parameters import ZeroParamType
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 22, in
from .linear import zero3_linear_wrap
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/linear.py", line 25, in
from deepspeed.runtime.utils import noop_decorator
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/utils.py", line 12, in
from deepspeed.moe.utils import is_moe_param
File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/utils.py", line 12, in
from .layer import MoE
File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/layer.py", line 14, in
from .sharded_moe import MOELayer, TopKGate
File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/sharded_moe.py", line 96, in
class _AllToAll(torch.autograd.Function):
File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/sharded_moe.py", line 99, in _AllToAll
def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor) -> Tensor: # type: ignore
AttributeError: partially initialized module 'deepspeed.comm' has no attribute 'ProcessGroup' (most likely due to a circular import)
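The failure happens while the deepspeed package is still being imported: onnxruntime's build validation pulls in ORTModule, whose ZeRO offload subscriber imports deepspeed.runtime.zero and, through it, re-enters deepspeed.comm before dist.ProcessGroup has been defined. Since the error is purely import-time, it should be reproducible without launching any training job; a minimal check (my own suggestion, not part of the original repro):

python -c "import deepspeed"                       # fails with the same AttributeError while onnxruntime-training is installed
pip list | grep -Ei "deepspeed|onnxruntime|torch"  # record the exact versions involved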
After uninstalling onnxruntime-training, the example starts, but it then fails with a different error. All eight ranks hit the same NCCL failure; the rank 7 traceback is shown below, and the others differ only in the rank index and the /dev/shm/nccl-* segment name.
[rank7]: Traceback (most recent call last):
[rank7]:   File "/home/DeepSpeedExamples/training/pipeline_parallelism/train.py", line 159, in <module>
[rank7]:     train_pipe(args)
[rank7]:   File "/home/DeepSpeedExamples/training/pipeline_parallelism/train.py", line 139, in train_pipe
[rank7]:     engine, _, _, _ = deepspeed.initialize(
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 196, in initialize
[rank7]:     engine = PipelineEngine(args=args,
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/pipe/engine.py", line 69, in __init__
[rank7]:     super().__init__(*super_args, **super_kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 262, in __init__
[rank7]:     self._configure_distributed_model(model)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1157, in _configure_distributed_model
[rank7]:     self._broadcast_model()
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1077, in _broadcast_model
[rank7]:     dist.broadcast(p.data, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
[rank7]:     return func(*args, **kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 224, in broadcast
[rank7]:     return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank7]:   File "/home/pytorch/torch/_dynamo/eval_frame.py", line 410, in _fn
[rank7]:     return fn(*args, **kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 185, in broadcast
[rank7]:     return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank7]:   File "/home/pytorch/torch/distributed/c10d_logger.py", line 78, in wrapper
[rank7]:     return func(*args, **kwargs)
[rank7]:   File "/home/pytorch/torch/distributed/distributed_c10d.py", line 2144, in broadcast
[rank7]:     work = group.broadcast([tensor], opts)
[rank7]: torch.distributed.DistBackendError: NCCL error in: /home/pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2028, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.20.5
[rank7]: ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
[rank7]: Last error:
[rank7]: Error while creating shared memory segment /dev/shm/nccl-JFI57y (size 5767520)
(Identical tracebacks from ranks 0-6 omitted; they differ only in the rank index and the /dev/shm/nccl-* segment name.)
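The second failure is in NCCL's shared-memory transport: every rank aborts while creating a segment under /dev/shm. One plausible cause (an assumption on my part, not verified here) is that the container's /dev/shm is too small for NCCL; Docker's default is only 64 MB. If that is the case, the following workarounds may help narrow it down (placeholders in angle brackets are hypothetical):

# Re-create the container with a larger /dev/shm, or share the host IPC namespace
docker run --shm-size=8g <image> ...
docker run --ipc=host <image> ...

# Or, as a quick diagnostic inside the existing container, disable NCCL's SHM transport
NCCL_SHM_DISABLE=1 NCCL_DEBUG=INFO <original launch command>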