DeepEP icon indicating copy to clipboard operation
DeepEP copied to clipboard

Timeout always happens at num_tokens = 128

Open xutizhou opened this issue 4 months ago • 0 comments

I have tested DeepEP in normal mode on node2/node4/node4 setups, and I always hit a DeepEP timeout check failure when num_tokens=128.

Image

Here is my test code.

def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
    """Run `test_main` over a grid of SM counts and token counts.

    The process group is set up through `init_dist`. Unless
    `args.test_ll_compatibility` is truthy, one `deep_ep.Buffer` is created
    per SM count (the same value is also passed as `num_qps_per_rank`), and
    `test_main` is invoked once per token count with `args.num_tokens`
    overwritten for each run. Every buffer is destroyed explicitly after its
    token sweep. The function always ends by calling `dist.barrier()` and
    `dist.destroy_process_group()`.

    Args:
        local_rank: Forwarded to `init_dist` and `test_main`; local rank 0
            prints a blank separator line after each run.
        num_local_ranks: Forwarded to `init_dist` and `test_main`; asserted
            to be exactly 8.
        args: Parsed command-line options. `test_ll_compatibility` is read
            and `num_tokens` is mutated in place.
    """
    # Node count is taken from the WORLD_SIZE environment variable (default 1).
    num_nodes = int(os.getenv('WORLD_SIZE', 1))
    rank, num_ranks, group = init_dist(local_rank, num_local_ranks)

    sm_counts = (4, 8, 12, 16, 20, 24)
    token_counts = (
        1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 160, 256, 512,
        1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072,
    )

    # With the low-latency compatibility flag set, nothing is swept and the
    # function goes straight to the barrier/teardown below.
    sweep = () if args.test_ll_compatibility else sm_counts
    for num_sm in sweep:
        buffer = deep_ep.Buffer(
            group,
            int(2e9),
            int(1e9),
            low_latency_mode=args.test_ll_compatibility,
            num_qps_per_rank=num_sm,
            explicitly_destroy=True,
        )
        assert num_local_ranks == 8 and num_ranks > 8
        torch.manual_seed(rank)

        for token_count in token_counts:
            args.num_tokens = token_count
            test_main(args, num_sm, local_rank, num_local_ranks, num_ranks, num_nodes, rank, buffer, group)
            if local_rank == 0:
                print('', flush=True)

        # The buffer was created with explicitly_destroy=True, so release it here.
        buffer.destroy()

    dist.barrier()
    dist.destroy_process_group()


xutizhou avatar Aug 28 '25 10:08 xutizhou