sst-macro
sst-macro copied to clipboard
MVAPICH AllGather test deadlocks
The MPI_Allgather test app in skeletons/tests (source below):
#include <mpi.h>
#include <stddef.h>
#include <cstdio>
#include <iostream>
#include <vector>
/**
 * MPI_Allgather reproducer: every rank contributes `nelems` ints and
 * receives `nelems` ints from each rank of MPI_COMM_WORLD.
 *
 * By default the send/receive buffers are the sstmac_nullptr_send /
 * sstmac_nullptr_recv placeholders, so no payload is checked. Uncomment
 * VALIDATE_BUFFERS below to use real buffers: each rank sends its own rank
 * id and, after the collective, verifies that slot i of the receive buffer
 * holds the value i in every element.
 *
 * Rank 0 prints a marker before the collective, after the collective and
 * after MPI_Finalize.
 *
 * @param argc forwarded to MPI_Init
 * @param argv forwarded to MPI_Init
 * @return always 0
 */
int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  // std::endl flushes, so each marker is written out before the next MPI call.
  if (rank == 0){
    std::cout << "Starting collective" << std::endl;
  }

  // Single source of truth for the per-rank element count; it sizes the
  // buffers, is passed to MPI_Allgather and drives the validation loop.
  const int nelems = 100;

//#define VALIDATE_BUFFERS
#ifdef VALIDATE_BUFFERS
  // Vectors own the storage so nothing leaks on exit. The send buffer is
  // filled with this rank's id; the receive buffer starts at -1 so that any
  // slot the collective fails to write is detectable.
  std::vector<int> send_storage(nelems, rank);
  std::vector<int> recv_storage(static_cast<std::size_t>(nelems) * size, -1);
  int* send_buf = send_storage.data();
  int* recv_buf = recv_storage.data();
#else
  void* send_buf = sstmac_nullptr_send;
  void* recv_buf = sstmac_nullptr_recv;
#endif

  MPI_Allgather(send_buf, nelems, MPI_INT,
                recv_buf, nelems, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0){
    std::cout << "Cleared collective" << std::endl;
  }

#ifdef VALIDATE_BUFFERS
  // Slot i of the receive buffer must contain rank i's contribution, i.e.
  // the value i repeated nelems times. Report every mismatch.
  for (int i=0; i < size; ++i){
    const int* values = recv_buf + i*nelems;
    for (int j=0; j < nelems; ++j){
      if (values[j] != i){
        printf("V[%d][%d] = %d != %d\n", i, j, values[j], i);
      }
    }
  }
#endif

  MPI_Finalize();

  if (rank == 0){
    std::cout << "Cleared finalize" << std::endl;
  }
  return 0;
}
This app deadlocks with the following parameter file when run over MVAPICH:
# Node settings: OS, application launch, NIC, memory and processor.
node {
  os {
    stack_size = 1MB
  }
  app1 {
    exe = ./run
    argv =
    launch_cmd = aprun -n 32 -N 1
    apis = [libfabric, pmi:libfabric]
    env {
      SLURM_NPROCS = 32
    }
  }
  nic {
    name = snappr
    credits = 64KB
    mtu = 4096
    bandwidth = 10.0GB/s
    injection {
      bandwidth = 10.0GB/s
      latency = 50ns
      mtu = 1024
      credits = 64KB
      # send_state and recv_state use identical ftq_calendar settings.
      send_state {
        group = state
        type = ftq_calendar
        output = ftq
        epoch_length = 1us
      }
      recv_state {
        group = state
        type = ftq_calendar
        output = ftq
        epoch_length = 1us
      }
    }
    ejection {
      latency = 50ns
    }
  }
  memory {
    name = snappr
    channel_bandwidth = 7GB/s
    num_channels = 10
    latency = 10ns
  }
  proc {
    ncores = 4
    frequency = 2GHz
  }
  name = simple
}

# Switch settings: link parameters with three accumulator statistics,
# plus a logp section.
switch {
  name = snappr
  credits = 64KB
  link {
    bandwidth = 10.0GB/s
    latency = 100ns
    credits = 64KB
    xmit_active {
      group = test
      type = accumulator
    }
    xmit_idle {
      group = test
      type = accumulator
    }
    xmit_stall {
      group = test
      type = accumulator
    }
  }
  logp {
    bandwidth = 1GB/s
    out_in_latency = 100ns
    hop_latency = 100ns
  }
}

topology {
  name = dragonfly
  geometry = [32,9]
  h = 16
  inter_group = circulant
  concentration = 16
}

# Second switch block; it holds only the router selection.
switch {
  router {
    name = dragonfly_minimal
  }
}
I used the following Makefile:
# Builds the allgather reproducer (allgather.cc -> ./run) using the
# sst++ / sstcc compiler drivers and an MVAPICH install under
# /home/jpkenny/install/mv2-ofi-netmod.

TARGET := run
SRC := allgather.cc

CXX := sst++
CC := sstcc
CXXFLAGS := --disable-mpi -fPIC -O0 -g
CPPFLAGS := -I. -I/home/jpkenny/install/mv2-ofi-netmod/include
LIBDIR :=
# NOTE(review): PREFIX is empty, so $(PREFIX)/lib is /lib and `make install`
# copies into /bin. Set PREFIX on the command line if that is not intended.
PREFIX :=

# The last rpath used to be written `$/home/...`. Make treats `$/` as a
# reference to a variable named `/`, which is empty, so the entry silently
# became the relative path `home/...`. The stray `$` is removed.
# NOTE(review): both libmpi.so and libmpi.a are listed, as in the original;
# confirm that linking both is intended.
LDFLAGS := /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.so /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.a /usr/lib64/libhwloc.so -Wl,-rpath,$(PREFIX)/lib -Wl,-rpath,/home/jpkenny/install/mv2-ofi-netmod/lib -Wl,-rpath,/home/jpkenny/install/sst-transports/lib

# Map every source suffix (.cc, .cpp, .c) to its object file.
OBJ := $(SRC:.cc=.o)
OBJ := $(OBJ:.cpp=.o)
OBJ := $(OBJ:.c=.o)

# `all` is listed too, so a stray file named "all" cannot mask the default goal.
.PHONY: all clean install

all: $(TARGET)

# LIBS is not set in this file; it can be supplied on the command line.
$(TARGET): $(OBJ)
	$(CXX) -o $@ $+ $(LDFLAGS) $(LIBS) $(CXXFLAGS)

%.o: %.cc
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

%.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@

clean:
	rm -f $(TARGET) $(OBJ)

install: $(TARGET)
	cp $< $(PREFIX)/bin