[Bug] test_internode.py timeout on 4 * 8 H800
Environment:
Script:
export NCCL_DEBUG=INFO
export TORCH_CPP_LOG_LEVEL=INFO
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=bond0
export GLOO_SOCKET_IFNAME=bond0
export NCCL_IB_HCA='^=mlx5_bond_0'
export NCCL_IB_TC=128
export NCCL_IB_TIMEOUT=22
export NCCL_IB_RETRY_CNT=15
export DISABLE_SM90_FEATURES=1
export TORCH_CUDA_ARCH_LIST="9.0"
export DISABLE_AGGRESSIVE_PTX_INSTRS=0
export NCCL_IB_QPS_PER_CONNECTION=128
# node 0:
MASTER_ADDR=my_master_node MASTER_PORT=my_master_port WORLD_SIZE=4 RANK=0 python test_internode.py
# node 1:
MASTER_ADDR=my_master_node MASTER_PORT=my_master_port WORLD_SIZE=4 RANK=1 python test_internode.py
# node 2:
MASTER_ADDR=my_master_node MASTER_PORT=my_master_port WORLD_SIZE=4 RANK=2 python test_internode.py
# node 3:
MASTER_ADDR=my_master_node MASTER_PORT=my_master_port WORLD_SIZE=4 RANK=3 python test_internode.py
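For context, WORLD_SIZE and RANK in this launch count nodes rather than GPUs: each of the 4 launchers spawns 8 local processes, matching the 32 global ranks (nranks 32) reported in the NCCL logs below. A minimal sketch of the assumed mapping, for illustration only (variable names follow the traceback further down; the actual logic in tests/test_internode.py may differ):

import os
import torch

def test_loop(local_rank: int, num_local_ranks: int):
    # Assumption: WORLD_SIZE/RANK from the launch command are node count / node index.
    num_nodes = int(os.environ['WORLD_SIZE'])          # 4 nodes
    node_rank = int(os.environ['RANK'])                # 0..3
    num_ranks = num_nodes * num_local_ranks            # 4 * 8 = 32 global ranks
    rank = node_rank * num_local_ranks + local_rank    # global rank of this process
    # ... init the process group against MASTER_ADDR:MASTER_PORT and run the tests ...

if __name__ == '__main__':
    num_processes = 8  # one process per local H800 GPU
    torch.multiprocessing.spawn(test_loop, args=(num_processes,), nprocs=num_processes)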
Logs on node 0:
DeepEP-test:3081458:3082602 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/6/GDRDMA
DeepEP-test:3081462:3082590 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA
DeepEP-test:3081458:3082602 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [receive] via NET/IB/6/GDRDMA
DeepEP-test:3081462:3082590 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA
DeepEP-test:3081458:3082602 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/6/GDRDMA
DeepEP-test:3081462:3082590 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA
DeepEP-test:3081458:3082602 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [receive] via NET/IB/6/GDRDMA
DeepEP-test:3081462:3082590 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA
DeepEP-test:3081458:3082602 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [receive] via NET/IB/6/GDRDMA
DeepEP-test:3081462:3082590 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA
DeepEP-test:3081458:3082602 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [receive] via NET/IB/6/GDRDMA
DeepEP-test:3081458:3082602 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [receive] via NET/IB/6/GDRDMA
DeepEP-test:3081460:3082601 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [receive] via NET/IB/2/GDRDMA
DeepEP-test:3081460:3082601 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/2/GDRDMA
DeepEP-test:3081460:3082601 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [receive] via NET/IB/2/GDRDMA
DeepEP-test:3081464:3082593 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [receive] via NET/IB/7/GDRDMA
DeepEP-test:3081460:3082601 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/2/GDRDMA
DeepEP-test:3081464:3082593 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [receive] via NET/IB/7/GDRDMA
DeepEP-test:3081460:3082601 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [receive] via NET/IB/2/GDRDMA
DeepEP-test:3081464:3082593 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [receive] via NET/IB/7/GDRDMA
DeepEP-test:3081464:3082593 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [receive] via NET/IB/7/GDRDMA
DeepEP-test:3081460:3082601 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [receive] via NET/IB/2/GDRDMA
DeepEP-test:3081464:3082593 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [receive] via NET/IB/7/GDRDMA
DeepEP-test:3081460:3082601 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [receive] via NET/IB/2/GDRDMA
DeepEP-test:3081464:3082593 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [receive] via NET/IB/7/GDRDMA
DeepEP-test:3081464:3082593 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [receive] via NET/IB/7/GDRDMA
DeepEP-test:3081461:3082596 [4] NCCL INFO Connected NVLS tree
DeepEP-test:3081461:3082596 [4] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081461:3082596 [4] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081459:3082607 [2] NCCL INFO Connected NVLS tree
DeepEP-test:3081459:3082607 [2] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081459:3082607 [2] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081457:3082608 [0] NCCL INFO Connected NVLS tree
DeepEP-test:3081457:3082608 [0] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081457:3082608 [0] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081464:3082593 [7] NCCL INFO Connected NVLS tree
DeepEP-test:3081464:3082593 [7] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081464:3082593 [7] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081460:3082601 [3] NCCL INFO Connected NVLS tree
DeepEP-test:3081460:3082601 [3] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081460:3082601 [3] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081463:3082611 [6] NCCL INFO Connected NVLS tree
DeepEP-test:3081463:3082611 [6] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081463:3082611 [6] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081462:3082590 [5] NCCL INFO Connected NVLS tree
DeepEP-test:3081462:3082590 [5] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081462:3082590 [5] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081458:3082602 [1] NCCL INFO Connected NVLS tree
DeepEP-test:3081458:3082602 [1] NCCL INFO threadThresholds 8/8/64 | 256/8/64 | 512 | 512
DeepEP-test:3081458:3082602 [1] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer
DeepEP-test:3081458:3082602 [1] NCCL INFO ncclCommSplit comm 0x88506c0 rank 1 nranks 32 cudaDev 1 nvmlDev 1 busId 38000 parent 0x863f950 color 698429859 key 1 commId 0x725286ef14dca1f4 - Init COMPLETE
DeepEP-test:3081462:3082590 [5] NCCL INFO ncclCommSplit comm 0x71d3420 rank 5 nranks 32 cudaDev 5 nvmlDev 5 busId bb000 parent 0x6fc26a0 color 698429859 key 5 commId 0x725286ef14dca1f4 - Init COMPLETE
DeepEP-test:3081460:3082601 [3] NCCL INFO ncclCommSplit comm 0x73d21c0 rank 3 nranks 32 cudaDev 3 nvmlDev 3 busId 59000 parent 0x71c1410 color 698429859 key 3 commId 0x725286ef14dca1f4 - Init COMPLETE
DeepEP-test:3081464:3082593 [7] NCCL INFO ncclCommSplit comm 0x8af9980 rank 7 nranks 32 cudaDev 7 nvmlDev 7 busId da000 parent 0x88e8470 color 698429859 key 7 commId 0x725286ef14dca1f4 - Init COMPLETE
DeepEP-test:3081461:3082596 [4] NCCL INFO ncclCommSplit comm 0x7ad94b0 rank 4 nranks 32 cudaDev 4 nvmlDev 4 busId 9b000 parent 0x78c8730 color 698429859 key 4 commId 0x725286ef14dca1f4 - Init COMPLETE
DeepEP-test:3081457:3082608 [0] NCCL INFO ncclCommSplit comm 0x72ab6d0 rank 0 nranks 32 cudaDev 0 nvmlDev 0 busId 18000 parent 0x709b0f0 color 698429859 key 0 commId 0x725286ef14dca1f4 - Init COMPLETE
DeepEP-test:3081463:3082611 [6] NCCL INFO ncclCommSplit comm 0x898dc00 rank 6 nranks 32 cudaDev 6 nvmlDev 6 busId ca000 parent 0x877ce70 color 698429859 key 6 commId 0x725286ef14dca1f4 - Init COMPLETE
DeepEP-test:3081459:3082607 [2] NCCL INFO ncclCommSplit comm 0x8327690 rank 2 nranks 32 cudaDev 2 nvmlDev 2 busId 49000 parent 0x8116900 color 698429859 key 2 commId 0x725286ef14dca1f4 - Init COMPLETE
[rank5]:[I702 21:26:24.214150102 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 5] ProcessGroupNCCL created ncclComm_ 0x71d3420 on CUDA device:
[rank5]:[I702 21:26:24.214206655 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 5] NCCL_DEBUG: INFO
[rank7]:[I702 21:26:24.214185277 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 7] ProcessGroupNCCL created ncclComm_ 0x8af9980 on CUDA device:
[rank1]:[I702 21:26:24.214184812 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 1] ProcessGroupNCCL created ncclComm_ 0x88506c0 on CUDA device:
[rank4]:[I702 21:26:24.214206895 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 4] ProcessGroupNCCL created ncclComm_ 0x7ad94b0 on CUDA device:
[rank3]:[I702 21:26:24.214220130 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 3] ProcessGroupNCCL created ncclComm_ 0x73d21c0 on CUDA device:
[rank7]:[I702 21:26:24.214239687 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 7] NCCL_DEBUG: INFO
[rank1]:[I702 21:26:24.214253689 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 1] NCCL_DEBUG: INFO
[rank4]:[I702 21:26:24.214265286 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 4] NCCL_DEBUG: INFO
[rank3]:[I702 21:26:24.214274454 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 3] NCCL_DEBUG: INFO
[rank6]:[I702 21:26:24.214295010 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 6] ProcessGroupNCCL created ncclComm_ 0x898dc00 on CUDA device:
[rank2]:[I702 21:26:24.214303675 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 2] ProcessGroupNCCL created ncclComm_ 0x8327690 on CUDA device:
[rank6]:[I702 21:26:24.214355397 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 6] NCCL_DEBUG: INFO
[rank2]:[I702 21:26:24.214364662 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 2] NCCL_DEBUG: INFO
[rank0]:[I702 21:26:24.214582605 ProcessGroupNCCL.cpp:2301] [PG ID 1 PG GUID 1 Rank 0] ProcessGroupNCCL created ncclComm_ 0x72ab6d0 on CUDA device:
[rank0]:[I702 21:26:24.214675581 ProcessGroupNCCL.cpp:2306] [PG ID 1 PG GUID 1 Rank 0] NCCL_DEBUG: INFO
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
[config] num_tokens=4096, hidden=7168, num_topk_groups=4, num_topk=8
[layout] Kernel performance: 0.057 ms
[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
[testing] Running with FP8, without top-k (async=False, previous=False) ... passed
[testing] Running with FP8, with top-k (async=False, previous=False) ... passed
[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
[testing] Running with FP8, without top-k (async=True, previous=False) ... passed
[testing] Running with FP8, with top-k (async=True, previous=False) ... passed
[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
[testing] Running with BF16, with top-k (async=False, previous=True) ... passed
[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
[testing] Running with BF16, with top-k (async=False, previous=True) ... passed
[testing] Running with FP8, without top-k (async=False, previous=True) ... passed
[testing] Running with FP8, with top-k (async=False, previous=True) ... passed
[testing] Running with BF16, without top-k (async=True, previous=True) ... passed
[testing] Running with BF16, with top-k (async=True, previous=True) ...
DeepEP timeout check failed: rank = 7, thread = 0)
DeepEP timeout check failed: rank = 0, thread = 0)
DeepEP RDMA sender coordinator timeout, channel: 4, IB: 0, nvl 4, dst IB: 0, tail: 309, remaining: 0
DeepEP RDMA sender coordinator timeout, channel: 4, IB: 0, nvl 4, dst IB: 1, tail: 297, remaining: 0
DeepEP RDMA sender coordinator timeout, channel: 4, IB: 0, nvl 4, dst IB: 2, tail: 240, remaining: 70
DeepEP RDMA sender coordinator timeout, channel: 4, IB: 0, nvl 4, dst IB: 3, tail: 224, remaining: 75
DeepEP timeout check failed: rank = 3, thread = 0)
DeepEP timeout check failed: rank = 2, thread = 0)
DeepEP timeout check failed: rank = 6, thread = 0)
DeepEP timeout check failed: rank = 5, thread = 0)
DeepEP timeout check failed: rank = 1, thread = 0)
W0702 21:28:25.026000 3081433 site-packages/torch/multiprocessing/spawn.py:160] Terminating process 3081457 via signal SIGTERM
W0702 21:28:25.027000 3081433 site-packages/torch/multiprocessing/spawn.py:160] Terminating process 3081458 via signal SIGTERM
W0702 21:28:25.027000 3081433 site-packages/torch/multiprocessing/spawn.py:160] Terminating process 3081459 via signal SIGTERM
W0702 21:28:25.027000 3081433 site-packages/torch/multiprocessing/spawn.py:160] Terminating process 3081460 via signal SIGTERM
W0702 21:28:25.027000 3081433 site-packages/torch/multiprocessing/spawn.py:160] Terminating process 3081462 via signal SIGTERM
W0702 21:28:25.027000 3081433 site-packages/torch/multiprocessing/spawn.py:160] Terminating process 3081463 via signal SIGTERM
W0702 21:28:25.028000 3081433 site-packages/torch/multiprocessing/spawn.py:160] Terminating process 3081464 via signal SIGTERM
Traceback (most recent call last):
File "/root/hfy/DeepEP-main/tests/test_internode.py", line 254, in <module>
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)
File "/root/miniconda3/envs/test_deepep_py310/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 328, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
File "/root/miniconda3/envs/test_deepep_py310/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 284, in start_processes
while not context.join():
File "/root/miniconda3/envs/test_deepep_py310/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 203, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 4 terminated with the following error:
Traceback (most recent call last):
File "/root/miniconda3/envs/test_deepep_py310/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
fn(i, *args)
File "/root/hfy/DeepEP-main/tests/test_internode.py", line 238, in test_loop
test_main(i, local_rank, num_local_ranks, num_ranks, num_nodes, rank, buffer, group)
File "/root/hfy/DeepEP-main/tests/test_internode.py", line 116, in test_main
assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(0), f'{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}'
File "/root/miniconda3/envs/test_deepep_py310/lib/python3.10/site-packages/torch/utils/_device.py", line 106, in __torch_function__
return func(*args, **kwargs)
RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
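As the traceback notes, the 'unspecified launch failure' is reported asynchronously, so the Python stack above may not point at the failing kernel. A hedged debugging sketch (not part of the original script): force synchronous launches before any CUDA context is created, either by exporting CUDA_LAUNCH_BLOCKING=1 in the launch shell or at the very top of the test, e.g.

import os
# Must run before torch initializes CUDA; synchronous launches make the reported
# stack trace point at the kernel that actually failed.
os.environ.setdefault('CUDA_LAUNCH_BLOCKING', '1')
# Optional: more NCCL detail on the init/network path (standard NCCL env vars).
os.environ.setdefault('NCCL_DEBUG', 'INFO')
os.environ.setdefault('NCCL_DEBUG_SUBSYS', 'INIT,NET')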
I met the same problem on 4 * 8 H800: after the FP8 dispatch test, "DeepEP dispatch RDMA sender timeout" happened in the BF16 dispatch test. More info:
Same error with RoCE. 2 nodes (16 GPUs) under the same ToR switch work fine, but 4 nodes (32 GPUs) fail.
It might be the same issue as https://github.com/deepseek-ai/DeepEP/issues/270. I will try to debug it.
I met the same problem on 4 * 8 H800.
This repo works fine, at about 45 GB/s: https://github.com/Infrawaves/DeepEP_ibrc_dual-ports_multiQP
Hi, I have the same issue.
Same error on 4 * 8 H800. Any progress on this, @newfarmer? Thanks!
Try this PR #310.
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
DeepEP dispatch NVL receiver timeout, channel: 6, RDMA: 0, nvl: 0, src NVL: 5, head: 315, tail: 315
DeepEP dispatch NVL receiver timeout, channel: 10, RDMA: 0, nvl: 0, src NVL: 5, head: 300, tail: 300
terminate called after throwing an instance of 'EPException'
what(): Failed: CUDA error /workspace/DeepEP/csrc/deep_ep.cpp:83 'unspecified launch failure'
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
WARN: IB: read failed in ib_roce_get_version_num: Invalid argument
DeepEP dispatch NVL receiver timeout, channel: 8, RDMA: 0, nvl: 3, src NVL: 5, head: 324, tail: 324
DeepEP dispatch NVL receiver timeout, channel: 6, RDMA: 0, nvl: 3, src NVL: 5, head: 330, tail: 330
terminate called after throwing an instance of 'EPException'
what(): Failed: CUDA error /workspace/DeepEP/csrc/deep_ep.cpp:83 'unspecified launch failure'
terminate called after throwing an instance of 'EPException'
what(): Failed: CUDA error /workspace/DeepEP/csrc/deep_ep.cpp:83 'unspecified launch failure'
W1022 04:10:47.910000 808 torch/multiprocessing/spawn.py:169] Terminating process 873 via signal SIGTERM
W1022 04:10:47.911000 808 torch/multiprocessing/spawn.py:169] Terminating process 874 via signal SIGTERM
W1022 04:10:47.911000 808 torch/multiprocessing/spawn.py:169] Terminating process 875 via signal SIGTERM
W1022 04:10:47.911000 808 torch/multiprocessing/spawn.py:169] Terminating process 877 via signal SIGTERM
W1022 04:10:47.911000 808 torch/multiprocessing/spawn.py:169] Terminating process 878 via signal SIGTERM
W1022 04:10:47.912000 808 torch/multiprocessing/spawn.py:169] Terminating process 879 via signal SIGTERM
W1022 04:10:47.912000 808 torch/multiprocessing/spawn.py:169] Terminating process 880 via signal SIGTERM
Traceback (most recent call last):
File "/workspace/DeepEP/tests/test_internode.py", line 244, in <module>
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)
File "/usr/local/lib/python3.12/dist-packages/torch/multiprocessing/spawn.py", line 340, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/multiprocessing/spawn.py", line 296, in start_processes
while not context.join():
^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/multiprocessing/spawn.py", line 215, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 3 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
fn(i, *args)
File "/workspace/DeepEP/tests/test_internode.py", line 232, in test_loop
test_main(i, local_rank, num_local_ranks, num_ranks, num_nodes, rank, buffer, group)
File "/workspace/DeepEP/tests/test_internode.py", line 109, in test_main
recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, event = buffer.dispatch(**dispatch_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/deep_ep-1.0.0+a84a248-py3.12-linux-x86_64.egg/deep_ep/buffer.py", line 283, in dispatch
return self.internode_dispatch(x, handle, num_tokens_per_rank, num_tokens_per_rdma_rank, is_token_in_rank, num_tokens_per_expert,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/deep_ep-1.0.0+a84a248-py3.12-linux-x86_64.egg/deep_ep/buffer.py", line 391, in internode_dispatch
recv_src_meta, send_rdma_head, send_nvl_head, event = self.runtime.internode_dispatch(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Failed: CUDA error /workspace/DeepEP/csrc/kernels/internode.cu:1068 'unspecified launch failure'
--- Inter-Node Test Finished on Node 0 ---
Same issue here on 20 H200 nodes.