Cabana
Cabana copied to clipboard
Data wrong after Cabana::migrate
I've manage to replicate this issue I mentioned on slack in a standalone piece of code of 150 lines.
Setup: [email protected] (installed via spack) OpenMPI 4.0.4 (used from module system) Cabana with OpenMP, MPI (installed via spack) Kokkos@master with OpenMP (installed via spack 2 weeks ago)
2 MPI ranks, 32 threads per rank.
Reproducer code base loop pseudocode:
initialisation()
for(int step = 0; step < nsteps; step++){
//create all the slices and functors
//compute new ranks for particles, in this case 2/3rds of particles end up on rank 0 and the rest on rank 1
//migrate the particles
}
I think the bug only occurs with "enough" particles (I'll try to investigate), but after the 3rd migration a bunch of the data is no longer correct after creating the slices in the next step:
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137776
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137776
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137776
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137777
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137777
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137777
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137778
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137778
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137778
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137779
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137779
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137779
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137780
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137780
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137780
ERROR: RECEIEVE NODE: 0 Step: 3 Weight 0 wrong at: 1137781
ERROR: RECEIEVE NODE: 0 Step: 3 Charge 0 wrong at: 1137781
ERROR: RECEIVED NODE: 0 Step: 3 Mass 0 wrong at: 1137781
The values at some point are all 0 - this matches what I saw in my production code, which eventually crashes because the masses become 0 suddenly (despite being fixed values).
If I reduce the (global) number of particles to 1137780
, the issue instead starts with particle at position 505680
, and node 0 expects to have 758516
particles.
If I reduce further to 500000
then it runs to completion with correct data (at least for weight/charge/mass which I check).
Am I missing some necessary component for large datasets or is something else going wrong?
The code I'm running is:
//Include Cabana Header and Kokkos headers
#include <Cabana_Core.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <iostream>
// Include the standard C++ headers
#include <cmath>
#include <cstdio>
#include <fstream>
#include <vector>
#include <random>
#include <sys/stat.h>
#include <sys/time.h>
#include "hdf5.h"
#include "mpi.h"
enum FieldNames{ id = 0,
weight,
mass,
charge,
part_pos,
part_p,
rank
};
//Setting default Memory/Host/Execution spaces
//using MemorySpace = Kokkos::CudaSpace;
using MemorySpace = Kokkos::HostSpace;
using ExecutionSpace = Kokkos::DefaultExecutionSpace; /*Kokkos::Serial;*/
using DeviceType = Kokkos::Device<Kokkos::DefaultExecutionSpace, MemorySpace/*ExecutionSpace, MemorySpace*/>;
using HostType = Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>;
using field_type = Kokkos::View<double* , MemorySpace>;
using host_mirror_type = field_type::HostMirror;
//Types used in the tuple (corresponds to the enum before)
using DataTypes = Cabana::MemberTypes<int64_t, double, double,
double, double[3], double[3],
int>;
const int VectorLength = 16;
template<class RankSlice>
struct particle_bcs_functor{
RankSlice _rank;
int _nranks;
//TODO The slice?
KOKKOS_INLINE_FUNCTION
particle_bcs_functor(RankSlice rank):
_rank(rank){
MPI_Comm_size( MPI_COMM_WORLD, &_nranks );
}
//SIMD Parallelised implementation for now
KOKKOS_INLINE_FUNCTION
void operator()(const int ix, const int ij) const{
int rank = ix % 3;
if(rank == 2) rank = 0;
if((rank != 0) && (rank != 1)){
std::cout << "Particle going to incorrect rank: " << rank << "\n";
}
_rank.access(ix, ij) = rank;
}
};
void init_aosoa(Cabana::AoSoA<DataTypes, HostType, VectorLength> &particle_aosoa){
int myrank = 0; MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
auto id_slice = Cabana::slice<id>(particle_aosoa);
auto mass_slice = Cabana::slice<mass>(particle_aosoa);
auto charge_slice = Cabana::slice<charge>(particle_aosoa);
auto weight_slice = Cabana::slice<weight>(particle_aosoa);
auto part_pos_slice = Cabana::slice<part_pos>(particle_aosoa);
auto part_p_slice = Cabana::slice<part_p>(particle_aosoa);
auto rank_slice = Cabana::slice<rank>(particle_aosoa);
for(int i = 0; i < particle_aosoa.size(); i++){
id_slice(i) = i;
mass_slice(i) = 1.5;
charge_slice(i) = -1.0;
weight_slice(i) = 2.52341e-15;
part_pos_slice(i, 0) = 0.67;
part_p_slice(i, 0) = 0.;
part_p_slice(i, 1) = 1.;
part_p_slice(i, 2) = 2.;
rank_slice(i) = myrank;
}
}
int main(int argc, char* argv[] ){
//TODO
Kokkos::ScopeGuard scope_guard(argc, argv);
MPI_Init( &argc, &argv );
{
int myrank = 0; MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
int nranks = 1; MPI_Comm_size( MPI_COMM_WORLD, &nranks );
int npart = 2560000;
Cabana::AoSoA<DataTypes, DeviceType, VectorLength> particle_aosoa( "particle_list", npart/2);
Cabana::AoSoA<DataTypes, HostType, VectorLength> host_particle_aosoa( "particles_host",npart/2);
init_aosoa(host_particle_aosoa);
Cabana::deep_copy(particle_aosoa, host_particle_aosoa);
//Taken from cabana example to construct neighbour list
int previous_rank = ( myrank == 0 ) ? nranks - 1 : myrank - 1;
int next_rank = ( myrank == nranks - 1 ) ? 0 : myrank + 1;
std::vector<int> neighbors = { previous_rank, myrank, next_rank };
std::sort( neighbors.begin(), neighbors.end() );
auto unique_end = std::unique( neighbors.begin(), neighbors.end() );
neighbors.resize( std::distance( neighbors.begin(), unique_end ) );
for(int step = 0; step < 256; step++){
bool continu = true;
auto id_s = Cabana::slice<id>(particle_aosoa);
auto weight_s = Cabana::slice<weight>(particle_aosoa);
auto mass_s = Cabana::slice<mass>(particle_aosoa);
auto charge_s = Cabana::slice<charge>(particle_aosoa);
auto part_pos_s = Cabana::slice<part_pos>(particle_aosoa);
auto part_p_s = Cabana::slice<part_p>(particle_aosoa);
auto rank_slice = Cabana::slice<rank>(particle_aosoa);
for(int i = 0; i < particle_aosoa.size(); i++){
if( weight_s(i) != 2.52341e-15){
std::cout << "ERROR: RECEIEVE NODE: " << myrank << " Step: " << step << " Weight " << weight_s(i) << " wrong at: " << i << "\n";
continu = false;
}
if( charge_s(i) != -1.0){
std::cout << "ERROR: RECEIEVE NODE: " << myrank << " Step: " << step << " Charge " << charge_s(i) << " wrong at: " << i << "\n";
continu = false;
}
if( mass_s(i) != 1.5){
std::cout << "ERROR: RECEIVED NODE: " << myrank << " Step: " << step << " Mass " << mass_s(i) << " wrong at: " << i << "\n";
continu = false;
}
}
if(!continu){ abort(); }
particle_bcs_functor<decltype(rank_slice)> pbf(rank_slice);
Cabana::SimdPolicy<VectorLength, ExecutionSpace> simd_policy( 0,
particle_aosoa.size());
Cabana::simd_parallel_for(simd_policy, pbf, "update_ranks");
Kokkos::fence();
Cabana::Distributor<DeviceType> distributor( MPI_COMM_WORLD, rank_slice, neighbors);
Cabana::migrate( distributor, particle_aosoa );
}
}
MPI_Finalize();
}
Any progress on this?