DeepEP
Timeout always happens at num_tokens = 128
I have tested DeepEP in normal mode with node2/node4/node4 configurations, and I always encounter a "DeepEP timeout check failed" error when num_tokens = 128.
Here is my test code.
def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
    """Sweep DeepEP normal-mode tests over SM counts and token counts.

    For each SM count, a fresh ``deep_ep.Buffer`` is created (one QP per SM),
    ``test_main`` is run for every token count in ``tokens``, and the buffer is
    explicitly destroyed before moving to the next SM count.

    NOTE(review): the original paste lost all indentation; the nesting below is
    reconstructed from the code's semantics (buffer lifecycle per ``num_sm``,
    token sweep inside) — confirm against the original test script.

    Args:
        local_rank: Rank of this process within the node.
        num_local_ranks: Number of ranks per node (expected to be 8).
        args: Parsed CLI arguments; ``args.num_tokens`` is overwritten per
            iteration and ``args.test_ll_compatibility`` gates the sweep.
    """
    num_nodes = int(os.getenv('WORLD_SIZE', 1))
    rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
    num_sms = [4, 8, 12, 16, 20, 24]
    tokens = [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 160, 256, 512,
              1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]

    if not args.test_ll_compatibility:
        for num_sm in num_sms:
            # One RDMA QP per SM; explicitly_destroy=True so the buffer can be
            # torn down deterministically before the next allocation.
            num_qps_per_rank = num_sm
            buffer = deep_ep.Buffer(group, int(2e9), int(1e9),
                                    low_latency_mode=args.test_ll_compatibility,
                                    num_qps_per_rank=num_qps_per_rank,
                                    explicitly_destroy=True)
            # Multi-node normal mode requires full 8-GPU nodes and >1 node.
            assert num_local_ranks == 8 and num_ranks > 8
            torch.manual_seed(rank)

            for num_tokens in tokens:
                args.num_tokens = num_tokens
                test_main(args, num_sm, local_rank, num_local_ranks,
                          num_ranks, num_nodes, rank, buffer, group)
                if local_rank == 0:
                    print('', flush=True)

            # Release NVLink/RDMA resources before the next buffer is built;
            # barrier keeps ranks in lockstep across buffer lifetimes.
            buffer.destroy()
            dist.barrier()

    dist.destroy_process_group()