mpich
mpich copied to clipboard
RDMA /MPIR_CVAR_CH4_OFI_ENABLE_HMEM + 2 Rank per Tile doesn't scale (22 GB/s, when expecting 40 GB/s)
(It's assumed in the reproducer that people know PCIe is 50 GB/s, hence it's not the bottleneck when used correctly -- 50 GB/s PCIe > 20 GB/s NIC * 2)
The reproducer assumes 2 nodes (-n 4 --ppn 2)
applenco@x4506c3s2b0n0:~/tmp> cat rdma_mpi_contension.cpp
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <vector>

#include <mpi.h>
#include <omp.h>
// Relative/absolute tolerance comparison (same contract as Python's
// math.isclose): true when |x - gold| is within rel_tol of the larger
// magnitude of the two values, or within abs_tol absolutely.
// Fix: rel_tol was declared `float` in an otherwise all-double signature,
// silently narrowing any double tolerance a caller passes; widened to
// `double` (backward compatible -- existing calls pass literals or nothing).
bool almost_equal(double x, double gold, double rel_tol = 0.20, double abs_tol = 0.0) {
  return std::abs(x - gold) <= std::max(rel_tol * std::max(std::abs(x), std::abs(gold)), abs_tol);
}
// Compile-time map from a C++ element type to its MPI datatype handle;
// used as mpi_type<T>::value. Instantiating the primary template for an
// unmapped T fails to compile: sizeof(T) == 0 is never true, but being
// dependent on T it only fires on instantiation.
// Default template: no mapping
template <typename T> struct mpi_type {
static_assert(sizeof(T) == 0, "Unsupported type for MPI mapping");
};
// int -> MPI_INT
// NOTE(review): `constexpr` assumes MPI_Datatype handles are compile-time
// constants. True for MPICH (integer handles); not portable to MPI
// implementations whose handles are pointers (e.g. Open MPI) -- confirm if
// this reproducer is ever built against another MPI.
template <> struct mpi_type<int> {
static constexpr MPI_Datatype value = MPI_INT;
};
// double -> MPI_DOUBLE
template <> struct mpi_type<double> {
static constexpr MPI_Datatype value = MPI_DOUBLE;
};
// Runs `f` num_iteration times on every rank of `comm` and reports the best
// aggregate bandwidth over the cross-rank window (earliest start -> latest
// end), printing on world rank 0 and broadcasting the result to all ranks.
//
// Fixes vs. original:
//  * The GB/s math divides bytes by the raw time_since_epoch().count(),
//    i.e. it assumes the clock ticks in nanoseconds -- unspecified by the
//    standard. Now cast explicitly to std::chrono::nanoseconds.
//  * Guard against a zero-length window before dividing.
//  * `bw` zero-initialized (non-root ranks previously held an indeterminate
//    value until MPI_Bcast overwrote it).
// Kept high_resolution_clock deliberately: the MIN/MAX reduction compares
// epoch timestamps ACROSS nodes, which requires wall-clock-synchronized
// clocks (NTP/PTP); steady_clock is per-boot and would break that.
template <typename Func>
double bench(const std::string &label, size_t N_byte, int num_iteration, Func &&f, MPI_Comm comm) {
  const auto now_ns = []() -> unsigned long {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::high_resolution_clock::now().time_since_epoch())
        .count();
  };
  unsigned long min_time = std::numeric_limits<unsigned long>::max();
  for (int r = 0; r < num_iteration; r++) {
    MPI_Barrier(comm); // line the ranks up so the timing windows are comparable
    const unsigned long l_start = now_ns();
    f(); // execute the benchmarked lambda
    const unsigned long l_end = now_ns();
    // Cross-rank window on the root of comm: earliest start to latest end.
    unsigned long start, end;
    MPI_Reduce(&l_start, &start, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, comm);
    MPI_Reduce(&l_end, &end, 1, MPI_UNSIGNED_LONG, MPI_MAX, 0, comm);
    const unsigned long time = end - start;
    if (time > 0) // never record a zero-length window (division guard)
      min_time = std::min(time, min_time);
  }
  int mpi_size;
  MPI_Comm_size(comm, &mpi_size);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  // bytes / ns == GB/s. The 0.5 factor: only half the ranks send N_byte each.
  double bw = 0.0;
  if (world_rank == 0) {
    bw = .5 * mpi_size * N_byte / static_cast<double>(min_time);
    std::cout << label << " BW " << bw << " GB/s" << std::endl;
  }
  // NOTE(review): root 0 of `comm` must coincide with world rank 0 -- true
  // for MPI_COMM_WORLD and for the even-ranks split used in this reproducer.
  MPI_Bcast(&bw, 1, MPI_DOUBLE, 0, comm);
  return bw;
}
// GPU -> GPU point-to-point bandwidth test on `comm`: each rank in the lower
// half MPI_Sends a device buffer of N elements of T to its partner in the
// upper half (rank i -> rank i + size/2); upper-half ranks MPI_Recv into
// their own device buffer. Returns the aggregate bandwidth from bench().
// Precondition: `comm` has an even number of ranks (enforced by main()).
//
// Fixes vs. original:
//  * Check the omp_target_alloc_device result -- a null device pointer was
//    previously passed straight to omp_target_memcpy/MPI.
//  * The MPI count parameter is `int`; make the uint64_t -> int narrowing
//    explicit (value-preserving for the N < 2^31 used here).
template <typename T>
double run(MPI_Comm comm, uint64_t N, std::string prefix, int num_iteration = 10) {
  const MPI_Datatype mtype = mpi_type<T>::value;
  // Find out rank, size
  int mpi_rank;
  MPI_Comm_rank(comm, &mpi_rank);
  int mpi_size;
  MPI_Comm_size(comm, &mpi_size);
  const int device_id = omp_get_default_device();
  const int host_id = omp_get_initial_device();
  const uint64_t N_byte = N * sizeof(T);
  // Fill the host buffer with random values (faster than std::rand).
  std::vector<T> A(N);
  {
    std::minstd_rand rng{std::random_device{}()};
    std::uniform_int_distribution<> dist(0, 100);
    std::generate(A.begin(), A.end(), [&]() { return dist(rng); });
  }
  // Device allocation + host-to-device copy.
  T *A2_gpu = (T *)omp_target_alloc_device(N_byte, device_id);
  if (A2_gpu == nullptr) {
    std::cerr << "omp_target_alloc_device failed (" << N_byte << " bytes)" << std::endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  omp_target_memcpy(A2_gpu, A.data(), N_byte, 0, 0, device_id, host_id);
  const int count = static_cast<int>(N); // MPI counts are int
  const auto mpi_gpu_gpu = bench(
      prefix + " GPU -> GPU", N_byte, num_iteration,
      [&]() {
        if (mpi_rank < mpi_size / 2) {
          // Lower half: send the device buffer to the partner rank.
          MPI_Send(
              /* data = */ A2_gpu,
              /* count = */ count,
              /* datatype = */ mtype,
              /* destination = */ mpi_rank + mpi_size / 2,
              /* tag = */ 0,
              /* communicator = */ comm);
        } else {
          // Upper half: receive into this rank's device buffer.
          MPI_Recv(
              /* data = */ A2_gpu,
              /* count = */ count,
              /* datatype = */ mtype,
              /* source = */ mpi_rank - mpi_size / 2,
              /* tag = */ 0,
              /* communicator = */ comm,
              /* status = */ MPI_STATUS_IGNORE);
        }
      },
      comm);
  omp_target_free(A2_gpu, device_id);
  return mpi_gpu_gpu;
}
int main() {
MPI_Init(NULL, NULL);
int world_rank, world_size;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
assert(world_size == 4);
auto bw_2ppn = run<double>(MPI_COMM_WORLD, 1 << 28, "2 NIC");
// Use color = 1 for even ranks, MPI_UNDEFINED for odd
int color = (world_rank % 2 == 0) ? 1 : MPI_UNDEFINED;
MPI_Comm even_comm;
// Split the communicator
double bw_1ppn;
MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &even_comm);
if (color == 1) {
bw_1ppn = run<double>(even_comm, 1 << 28, "1 NIC");
MPI_Comm_free(&even_comm);
}
MPI_Finalize();
if (world_rank == 0) {
if (almost_equal(bw_2ppn, 2 * bw_1ppn, 0.20)) {
std::cout << "Success: Using 2 NIC, is 2x faster than using 1 NIC!" << std::endl;
} else {
std::cout << "Fail: Using 2 NIC, is NOT 2x faster than using 1 NIC!" << std::endl;
exit(1);
}
}
}
applenco@x4506c3s2b0n0:~/tmp> mpicxx -fiopenmp -fopenmp-targets=spir64 rdma_mpi_contension.cpp
applenco@x4506c3s2b0n0:~/tmp> NEOReadDebugKeys=1 EnableImplicitScaling=0 MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 mpirun --cpu-bind=list:1-8:9-16 --mem-bind=list:2:2 -n 4 --ppn 2 gpu_dev_compact.sh ./a.out
2 NIC GPU -> GPU BW 38.6542 GB/s
1 NIC GPU -> GPU BW 21.2663 GB/s
Success: Using 2 NIC, is 2x faster than using 1 NIC!
applenco@x4506c3s2b0n0:~/tmp> NEOReadDebugKeys=1 EnableImplicitScaling=0 MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 mpirun --cpu-bind=list:1-8:9-16 --mem-bind=list:2:2 -n 4 --ppn 2 gpu_tile_compact.sh ./a.out
2 NIC GPU -> GPU BW 20.5591 GB/s
1 NIC GPU -> GPU BW 21.254 GB/s
Fail: Using 2 NIC, is NOT 2x faster than using 1 NIC!
x4506c3s2b0n0.hostmgmt2506.cm.aurora.alcf.anl.gov: rank 0 exited with code 1
We will schedule a meeting the week of 6/30 to discuss further and decide on technical direction.
@TApplencourt Add timeline debug to this issue
@TApplencourt is this resolved with the new pipelining code in the latest module?
Give me aurora back and then I will test :p -- I did put that on my "Todo list when the toy is back"