Cabana icon indicating copy to clipboard operation
Cabana copied to clipboard

Data wrong after Cabana::migrate

Open LonelyCat124 opened this issue 2 years ago • 1 comments

I've manage to replicate this issue I mentioned on slack in a standalone piece of code of 150 lines.

Setup: [email protected] (installed via spack) OpenMPI 4.0.4 (used from module system) Cabana with OpenMP, MPI (installed via spack) Kokkos@master with OpenMP (installed via spack 2 weeks ago)

2 MPI ranks, 32 threads per rank.

Reproducer code base loop pseudocode:

initialisation()

for(int step = 0; step < nsteps; step++){
  //create all the slices and functors

  //compute new ranks for particles, in this case 2/3rds of particles end up on rank 0 and the rest on rank 1

  //migrate the particles
}

I think the bug only occurs with "enough" particles (I'll try to investigate), but after the 3rd migration a bunch of the data is no longer correct after creating the slices in the next step:

ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137776
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137776
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137776
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137777
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137777
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137777
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137778
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137778
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137778
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137779
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137779
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137779
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137780
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137780
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137780
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137781
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137781
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137781

The values at some point are all 0 - this matches what I saw in my production code, which eventually crashes because the masses become 0 suddenly (despite being fixed values).

If I reduce the (global) number of particles to 1137780, the issue instead starts with particle at position 505680, and node 0 expects to have 758516 particles.

If I reduce further to 500000 then it runs to completion with correct data (at least for weight/charge/mass which I check).

Am I missing some necessary component for large datasets or is something else going wrong?

The code I'm running is:

//Include Cabana Header and Kokkos headers

#include <Cabana_Core.hpp>

#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

#include <iostream>

// Include the standard C++ headers
#include <cmath>
#include <cstdio>
#include <fstream>
#include <vector>
#include <random>
#include <sys/stat.h>
#include <sys/time.h>
#include "hdf5.h"
#include "mpi.h"

enum FieldNames{ id = 0,
                 weight,
                 mass,
                 charge,
                 part_pos,
                 part_p,
                 rank
               };

//Setting default Memory/Host/Execution spaces
//using MemorySpace = Kokkos::CudaSpace;
using MemorySpace = Kokkos::HostSpace;
using ExecutionSpace = Kokkos::DefaultExecutionSpace; /*Kokkos::Serial;*/
using DeviceType = Kokkos::Device<Kokkos::DefaultExecutionSpace, MemorySpace/*ExecutionSpace, MemorySpace*/>;
using HostType = Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>;
using field_type = Kokkos::View<double* , MemorySpace>;
using host_mirror_type = field_type::HostMirror;
//Types used in the tuple (corresponds to the enum before)
using DataTypes = Cabana::MemberTypes<int64_t, double, double,
                                      double, double[3], double[3],
                                      int>;
const int VectorLength = 16;

template<class RankSlice>
struct particle_bcs_functor{
    RankSlice _rank;
    int _nranks;
    //TODO The slice?

    KOKKOS_INLINE_FUNCTION
    particle_bcs_functor(RankSlice rank):
        _rank(rank){
        MPI_Comm_size( MPI_COMM_WORLD, &_nranks );
    }

    //SIMD Parallelised implementation for now
    KOKKOS_INLINE_FUNCTION
    void operator()(const int ix, const int ij) const{

        int rank = ix % 3;
        if(rank == 2) rank = 0;
        if((rank != 0) && (rank != 1)){
            std::cout << "Particle going to incorrect rank: " << rank << "\n";
        }
        _rank.access(ix, ij) = rank;
    }

};

void init_aosoa(Cabana::AoSoA<DataTypes, HostType, VectorLength> &particle_aosoa){

    int myrank = 0; MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
    auto id_slice = Cabana::slice<id>(particle_aosoa);
    auto mass_slice = Cabana::slice<mass>(particle_aosoa);
    auto charge_slice = Cabana::slice<charge>(particle_aosoa);
    auto weight_slice = Cabana::slice<weight>(particle_aosoa);
    auto part_pos_slice = Cabana::slice<part_pos>(particle_aosoa);
    auto part_p_slice = Cabana::slice<part_p>(particle_aosoa);
    auto rank_slice = Cabana::slice<rank>(particle_aosoa);
    for(int i = 0; i < particle_aosoa.size(); i++){
        id_slice(i) = i;
        mass_slice(i) = 1.5;
        charge_slice(i) = -1.0;
        weight_slice(i) = 2.52341e-15;
        part_pos_slice(i, 0) = 0.67;
        part_p_slice(i, 0) = 0.;
        part_p_slice(i, 1) = 1.;
        part_p_slice(i, 2) = 2.;
        rank_slice(i) = myrank;
    }

}

int main(int argc, char* argv[] ){
    //TODO

    Kokkos::ScopeGuard scope_guard(argc, argv);
    MPI_Init( &argc, &argv );
    {
        int myrank = 0; MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
        int nranks = 1; MPI_Comm_size( MPI_COMM_WORLD, &nranks );
        int npart = 2560000;
        Cabana::AoSoA<DataTypes, DeviceType, VectorLength> particle_aosoa( "particle_list", npart/2);
        Cabana::AoSoA<DataTypes, HostType, VectorLength> host_particle_aosoa( "particles_host",npart/2);
        init_aosoa(host_particle_aosoa);
        Cabana::deep_copy(particle_aosoa, host_particle_aosoa);
        //Taken from cabana example to construct neighbour list
        int previous_rank = ( myrank == 0 ) ? nranks - 1 : myrank - 1;
        int next_rank = ( myrank == nranks - 1 ) ? 0 : myrank + 1;
        std::vector<int> neighbors = { previous_rank, myrank, next_rank };
        std::sort( neighbors.begin(), neighbors.end() );
        auto unique_end = std::unique( neighbors.begin(), neighbors.end() );
        neighbors.resize( std::distance( neighbors.begin(), unique_end ) );

        for(int step = 0; step < 256; step++){
            bool continu = true;
            auto id_s = Cabana::slice<id>(particle_aosoa);
            auto weight_s = Cabana::slice<weight>(particle_aosoa);
            auto mass_s = Cabana::slice<mass>(particle_aosoa);
            auto charge_s = Cabana::slice<charge>(particle_aosoa);
            auto part_pos_s = Cabana::slice<part_pos>(particle_aosoa);
            auto part_p_s = Cabana::slice<part_p>(particle_aosoa);
            auto rank_slice = Cabana::slice<rank>(particle_aosoa);
            for(int i = 0; i < particle_aosoa.size(); i++){
                if( weight_s(i) != 2.52341e-15){
                    std::cout << "ERROR: RECEIEVE NODE: " << myrank << " Step: " << step << " Weight " << weight_s(i) << " wrong at: " << i << "\n";
                    continu = false;
                }
                if( charge_s(i) != -1.0){
                    std::cout << "ERROR: RECEIEVE NODE: " << myrank << " Step: " << step << " Charge " << charge_s(i) << " wrong at: " << i << "\n";
                    continu = false;
                }
                if( mass_s(i) != 1.5){
                    std::cout << "ERROR: RECEIVED NODE: " << myrank <<  " Step: " << step << " Mass " << mass_s(i) << " wrong at: " << i << "\n";
                    continu = false;
                }
            }
            if(!continu){ abort(); }
            particle_bcs_functor<decltype(rank_slice)> pbf(rank_slice);
            Cabana::SimdPolicy<VectorLength, ExecutionSpace> simd_policy( 0,
                                                            particle_aosoa.size());
            Cabana::simd_parallel_for(simd_policy, pbf, "update_ranks");
            Kokkos::fence();
            Cabana::Distributor<DeviceType> distributor( MPI_COMM_WORLD, rank_slice, neighbors);
            Cabana::migrate( distributor, particle_aosoa );
        }
    }
    MPI_Finalize();
}

LonelyCat124 avatar Apr 22 '22 09:04 LonelyCat124

Any progress on this?

sslattery avatar Jun 03 '22 00:06 sslattery