
Kokkos::deep_copy hangs

Open • brian-kelley opened this issue 4 months ago • 1 comment

In this example, the Kokkos::deep_copy call marked below causes the program to hang when running with multiple MPI ranks. Commenting out the deep_copy and uncommenting the for-loop to copy the equivalent entries makes the program work as expected.

Kokkos::deep_copy is not a collective operation when one of the arguments is a view in a remote space, correct?

// clang-format off
#include <mpi.h>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <unistd.h>

#include <Kokkos_RemoteSpaces.hpp>
// clang-format on

int main(int argc, char *argv[]) {
  using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace;
  constexpr size_t M = 8;
  int mpi_thread_level_available;
  int mpi_thread_level_required = MPI_THREAD_MULTIPLE;
  MPI_Init_thread(&argc, &argv, mpi_thread_level_required,
                  &mpi_thread_level_available);
  assert(mpi_thread_level_available >= mpi_thread_level_required);
  if (!(mpi_thread_level_available >= mpi_thread_level_required)) {
    // if asserts are disabled, don't want to move forward.
    std::cout << "mpi_thread_level_available >= mpi_thread_level_required failed\n";
    exit(1);
  }

  Kokkos::initialize(argc, argv);
  {
    using namespace Kokkos;
    using PartitionedView1D =
        Kokkos::View<double **, PartitionedLayoutRight, RemoteSpace_t>;
    using Local1DView = typename PartitionedView1D::HostMirror;
    using TeamPolicy_t = Kokkos::TeamPolicy<>;

    int size, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) std::cout << "MPI_COMM_WORLD size: " << size << '\n';

    auto A = PartitionedView1D("RemoteView", size, M);
    RemoteSpace_t().fence();
    auto Alocal = Local1DView("LocalView", 1, M);
    auto lr = Experimental::get_local_range(M);
    // Each rank fills its own partition (row `rank`) of the remote view.
    parallel_for(
        "init", (A.extent(1)),
        KOKKOS_LAMBDA(auto i) { A(rank, i) = rank * M + i; });
    RemoteSpace_t().fence();
    // Rank 0 pulls each rank's partition into its own, copies that into
    // the host mirror, and prints it; all ranks fence at the end of each pass.
    for (int i = 0; i < size; i++) {
      if (rank == 0) {
        std::cout << "MPI_COMM_WORLD rank: " << i << '\n';
        auto range = std::make_pair(size_t(0), M);
        auto ar = Kokkos::subview(A, std::make_pair(i, i+1), range);
        auto al = Kokkos::subview(A, std::make_pair(rank, rank+1), range);
        Kokkos::parallel_for(
            "Team", TeamPolicy_t(1, 1),
            KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) {
              Kokkos::single(Kokkos::PerTeam(team), [&]() {
                Kokkos::Experimental::RemoteSpaces::local_deep_copy(al, ar);
              });
            });
        //for(int i = 0; i < al.extent_int(1); i++)
        //  Alocal(0, i) = al(0, i);
        Kokkos::deep_copy(Alocal, al); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< HERE
        for (size_t j = range.first; j < range.second; j++)
          std::cout << Alocal(0, j) << ' ';
        std::cout << '\n';
      }
      RemoteSpace_t().fence();
    }
  }
  Kokkos::finalize();
  MPI_Finalize();

  return 0;
}

Output with 4 ranks, with the deep_copy in place as shown above:

MPI_COMM_WORLD size: 4
MPI_COMM_WORLD rank: 0
0 1 2 3 4 5 6 7 
MPI_COMM_WORLD rank: 1
8 9 10 11 12 13 14 15 
MPI_COMM_WORLD rank: 2
16 17 18 19 20 21 22 23 
<hang>

With the deep_copy commented out and the for-loop uncommented instead, the program terminates normally and prints the correct output:

MPI_COMM_WORLD size: 4
MPI_COMM_WORLD rank: 0
0 1 2 3 4 5 6 7 
MPI_COMM_WORLD rank: 1
8 9 10 11 12 13 14 15 
MPI_COMM_WORLD rank: 2
16 17 18 19 20 21 22 23 
MPI_COMM_WORLD rank: 3
24 25 26 27 28 29 30 31 

brian-kelley • Feb 27, 2024

Thanks. We do a global fence in the deep_copy, which makes it a collective operation and hence the deadlock. This behavior is an artifact from previous versions; I will add the fix to one of the PRs.

janciesko • May 8, 2024
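
To make the mismatch concrete, below is a minimal sketch of the same pattern in plain MPI. MPI_Barrier stands in for the global fence that deep_copy issues on remote-space views; this is an illustration under that assumption, not kokkos-remote-spaces code. Rank 0 issues two collectives per loop iteration while the other ranks issue only one, so the ranks drift out of step and the program eventually blocks in a collective that not all ranks enter.

// Sketch only: MPI_Barrier models the global fence inside Kokkos::deep_copy.
#include <mpi.h>
#include <cstdio>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  for (int i = 0; i < size; i++) {
    if (rank == 0) {
      // Stand-in for the rank-0-only Kokkos::deep_copy: it hides a
      // collective (the global fence), but only rank 0 reaches it.
      MPI_Barrier(MPI_COMM_WORLD);
      std::printf("iteration %d handled on rank 0\n", i);
    }
    // Stand-in for RemoteSpace_t().fence(): every rank reaches this
    // collective once per iteration.
    MPI_Barrier(MPI_COMM_WORLD);
  }
  // Rank 0 has issued 2*size collectives, the other ranks only size,
  // so rank 0 eventually blocks waiting for partners that never arrive.
  MPI_Finalize();
  return 0;
}

The element-wise for-loop in the reproducer avoids this because it performs no collective, which is why that variant runs to completion.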