mercury
mercury copied to clipboard
NA OFI: Client crashing with segfault when server becomes unavailable
Describe the bug
I have a simple Mercury client and server (code available at the end of this issue). The client sends RPCs to the server indefinitely and the server responds to those RPCs. When I use `ofi+tcp` and kill the server, the client blocks for about 5 seconds and then crashes with a segfault. This does not happen with `na+sm` (the client blocks indefinitely). I haven't tried other protocols.
This bug is particularly problematic because in SSG we periodically ping servers to detect whether they have crashed, using a timeout after which we cancel the RPC. We expect the RPC to simply block; we can't afford to have the sender crash as well when the receiver has crashed.
It's most likely a libfabric problem so you may be able to better track it down and escalate it to the libfabric developers.
To Reproduce
Compile the sources provided at the end of this issue. Start the server using `./server ofi+tcp`, then copy the address it prints, and start the client using `./client ofi+tcp <server-address>`. Finally, kill the server and wait a few seconds. The client will crash as well with a segfault.
Expected behavior
An RPC to a crashed server should either fail with an error code or block indefinitely.
Stack trace
This is the stack trace I get when running the client inside GDB:
#0 0x00007ffff7f0d7bd in rxm_conn_close ()
from /projects/spack/opt/spack/linux-debian10-sandybridge/gcc-8.3.0/libfabric-1.11.1-j4nile6breiplr442hhyvbup5bgbp4u7/lib/libfabric.so.1
#1 0x00007ffff7f0f16d in rxm_conn_handle_event ()
from /projects/spack/opt/spack/linux-debian10-sandybridge/gcc-8.3.0/libfabric-1.11.1-j4nile6breiplr442hhyvbup5bgbp4u7/lib/libfabric.so.1
#2 0x00007ffff7f100ab in rxm_msg_eq_progress ()
from /projects/spack/opt/spack/linux-debian10-sandybridge/gcc-8.3.0/libfabric-1.11.1-j4nile6breiplr442hhyvbup5bgbp4u7/lib/libfabric.so.1
#3 0x00007ffff7f101dd in rxm_cmap_connect ()
from /projects/spack/opt/spack/linux-debian10-sandybridge/gcc-8.3.0/libfabric-1.11.1-j4nile6breiplr442hhyvbup5bgbp4u7/lib/libfabric.so.1
#4 0x00007ffff7f1067b in rxm_get_conn ()
from /projects/spack/opt/spack/linux-debian10-sandybridge/gcc-8.3.0/libfabric-1.11.1-j4nile6breiplr442hhyvbup5bgbp4u7/lib/libfabric.so.1
#5 0x00007ffff7f14ad2 in rxm_ep_tsend ()
from /projects/spack/opt/spack/linux-debian10-sandybridge/gcc-8.3.0/libfabric-1.11.1-j4nile6breiplr442hhyvbup5bgbp4u7/lib/libfabric.so.1
#6 0x00007ffff7f8fab9 in fi_tsend (context=0x5555556afcc8, tag=<optimized out>, dest_addr=<optimized out>, desc=<optimized out>,
len=<optimized out>, buf=<optimized out>, ep=<optimized out>)
at /projects/spack/opt/spack/linux-debian10-sandybridge/gcc-8.3.0/libfabric-1.11.1-j4nile6breiplr442hhyvbup5bgbp4u7/include/rdma/fi_tagged.h:114
#7 na_ofi_cq_process_retries (context=0x555555579860)
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/na/na_ofi.c:3272
#8 na_ofi_progress (na_class=<optimized out>, context=<optimized out>, timeout=<optimized out>)
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/na/na_ofi.c:5051
#9 0x00007ffff7f87493 in NA_Progress (na_class=na_class@entry=0x555555559310, context=context@entry=0x555555579860, timeout=timeout@entry=0)
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/na/na.c:1170
#10 0x00007ffff7fb86b7 in hg_core_progress_na (na_class=0x555555559310, na_context=0x555555579860, timeout=0,
progressed_ptr=progressed_ptr@entry=0x7fffffffdd30 "")
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/mercury_core.c:3888
#11 0x00007ffff7fbaae3 in hg_core_poll (progressed_ptr=<synthetic pointer>, timeout=<optimized out>, context=0x555555577620)
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/mercury_core.c:3830
#12 hg_core_progress (context=0x555555577620, timeout=100)
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/mercury_core.c:3685
#13 0x00007ffff7fc037b in HG_Core_progress (context=<optimized out>, timeout=<optimized out>)
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/mercury_core.c:5046
#14 0x00007ffff7fb228e in HG_Progress (context=<optimized out>, timeout=<optimized out>)
at /tmp/mdorier/spack-stage/spack-stage-mercury-master-zwubttmx76ltv3dmw6i4lrgpwauszhkd/spack-src/src/mercury.c:2019
#15 0x0000555555555279 in main (argc=<optimized out>, argv=<optimized out>) at /projects/mochi/colza/examples/hg/client.c:62
Platform (please complete the following information):
- System description: Debian virtual machine
- Compiler version: gcc 8.3.0
- Plugin and protocol used: ofi+tcp
- Dependency version: libfabric 1.11.1
Additional context
Code for types.h:
/*
 * types.h -- argument types shared by the "sum" RPC client and server.
 *
 * NOTE(review): the include guard is named PARAM_H although the file is
 * called types.h; harmless, but worth aligning if this is reused.
 */
#ifndef PARAM_H
#define PARAM_H
#include <mercury.h>
#include <mercury_macros.h>
/*
 * Input of the "sum" RPC: two 32-bit signed operands, x and y.
 * Presumably MERCURY_GEN_PROC generates both the struct and its
 * serialization routine -- confirm against the Mercury macro documentation.
 */
MERCURY_GEN_PROC(sum_in_t,
((int32_t)(x))\
((int32_t)(y)))
/* Output of the "sum" RPC: a single 32-bit signed field, ret. */
MERCURY_GEN_PROC(sum_out_t, ((int32_t)(ret)))
#endif
Code for server.c:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <mercury.h>
#include "types.h"
/* Mercury objects the server creates in main() and uses in its progress loop. */
typedef struct {
hg_class_t* hg_class; /* value returned by HG_Init() */
hg_context_t* hg_context; /* value returned by HG_Context_create() */
} server_state;
hg_return_t sum(hg_handle_t h);
int main(int argc, char** argv)
{
hg_return_t ret;
if(argc != 2) {
printf("Usage: %s <server address>\n", argv[0]);
exit(0);
}
const char* server_address = argv[1];
server_state state;
state.hg_class = HG_Init(server_address, HG_TRUE);
assert(state.hg_class != NULL);
char hostname[128];
hg_size_t hostname_size = 128;
hg_addr_t self_addr;
HG_Addr_self(state.hg_class, &self_addr);
HG_Addr_to_string(state.hg_class, hostname, &hostname_size, self_addr);
printf("Server running at address %s\n", hostname);
HG_Addr_free(state.hg_class, self_addr);
state.hg_context = HG_Context_create(state.hg_class);
assert(state.hg_context != NULL);
MERCURY_REGISTER(state.hg_class, "sum", sum_in_t, sum_out_t, sum);
do
{
unsigned int count;
do {
ret = HG_Trigger(state.hg_context, 0, 1, &count);
} while((ret == HG_SUCCESS) && count);
HG_Progress(state.hg_context, 100);
} while(1);
ret = HG_Context_destroy(state.hg_context);
assert(ret == HG_SUCCESS);
ret = HG_Finalize(state.hg_class);
assert(ret == HG_SUCCESS);
return 0;
}
/**
 * RPC handler for "sum": decodes the two operands, prints the addition,
 * sends the result back to the caller and releases the handle.
 *
 * Failures are reported on stderr instead of aborting the whole server,
 * and the handle is destroyed on every path.
 *
 * @param handle  handle of the incoming RPC; destroyed before returning
 * @return HG_SUCCESS when every step succeeded, otherwise the code of the
 *         first Mercury call that failed
 */
hg_return_t sum(hg_handle_t handle)
{
    hg_return_t ret;
    hg_return_t cleanup_ret;
    sum_in_t in;
    sum_out_t out;

    ret = HG_Get_input(handle, &in);
    if (ret != HG_SUCCESS) {
        fprintf(stderr, "HG_Get_input failed (hg_return_t %d)\n", (int) ret);
        /* Nothing was decoded, so there is no input to free. */
        goto destroy;
    }

    out.ret = in.x + in.y;
    printf("%d + %d = %d\n", in.x, in.y, out.ret);

    /* No completion callback is requested for the response. */
    ret = HG_Respond(handle, NULL, NULL, &out);
    if (ret != HG_SUCCESS)
        fprintf(stderr, "HG_Respond failed (hg_return_t %d)\n", (int) ret);

    cleanup_ret = HG_Free_input(handle, &in);
    if (cleanup_ret != HG_SUCCESS) {
        fprintf(stderr, "HG_Free_input failed (hg_return_t %d)\n",
            (int) cleanup_ret);
        if (ret == HG_SUCCESS)
            ret = cleanup_ret;
    }

destroy:
    cleanup_ret = HG_Destroy(handle);
    if (cleanup_ret != HG_SUCCESS) {
        fprintf(stderr, "HG_Destroy failed (hg_return_t %d)\n",
            (int) cleanup_ret);
        if (ret == HG_SUCCESS)
            ret = cleanup_ret;
    }

    return ret;
}
Code for client.c:
#include <assert.h>
#include <stdio.h>
#include <mercury.h>
#include "types.h"
/* State shared between the client's main loop and its RPC completion callback. */
typedef struct {
hg_class_t* hg_class; /* value returned by HG_Init() */
hg_context_t* hg_context; /* value returned by HG_Context_create() */
hg_id_t sum_rpc_id; /* id returned by MERCURY_REGISTER for the "sum" RPC */
int completed; /* set to 1 by sum_completed(); main() waits on it and resets it to 0 */
} client_state_t;
hg_return_t sum_completed(const struct hg_cb_info *info);
int main(int argc, char** argv)
{
hg_return_t ret;
if(argc != 3) {
printf("Usage: %s <protocol> <server_address>\n",argv[0]);
printf("Example: %s tcp tcp://1.2.3.4:1234\n",argv[0]);
exit(0);
}
char* protocol = argv[1];
char* server_address = argv[2];
client_state_t state;
state.completed = 0;
state.hg_class = HG_Init(protocol, HG_FALSE);
assert(state.hg_class != NULL);
state.hg_context = HG_Context_create(state.hg_class);
assert(state.hg_context != NULL);
state.sum_rpc_id = MERCURY_REGISTER(state.hg_class, "sum", sum_in_t, sum_out_t, NULL);
hg_addr_t addr = HG_ADDR_NULL;
ret = HG_Addr_lookup2(state.hg_class, server_address, &addr);
while(1) {
hg_handle_t handle;
ret = HG_Create(state.hg_context, addr, state.sum_rpc_id, &handle);
assert(ret == HG_SUCCESS);
sum_in_t in;
in.x = 42;
in.y = 23;
ret = HG_Forward(handle, sum_completed, &state, &in);
assert(ret == HG_SUCCESS);
while(!state.completed)
{
unsigned int count;
do {
ret = HG_Trigger(state.hg_context, 0, 1, &count);
} while((ret == HG_SUCCESS) && count && !state.completed);
HG_Progress(state.hg_context, 100);
}
state.completed = 0;
}
ret = HG_Addr_free(state.hg_class, addr);
assert(ret == HG_SUCCESS);
ret = HG_Context_destroy(state.hg_context);
assert(ret == HG_SUCCESS);
hg_return_t err = HG_Finalize(state.hg_class);
assert(err == HG_SUCCESS);
return 0;
}
hg_return_t sum_completed(const struct hg_cb_info *info)
{
hg_return_t ret;
client_state_t* state = (client_state_t*)(info->arg);
sum_out_t out;
assert(info->ret == HG_SUCCESS);
ret = HG_Get_output(info->info.forward.handle, &out);
assert(ret == HG_SUCCESS);
printf("Got response: %d\n", out.ret);
ret = HG_Free_output(info->info.forward.handle, &out);
assert(ret == HG_SUCCESS);
ret = HG_Destroy(info->info.forward.handle);
assert(ret == HG_SUCCESS);
state->completed = 1;
return HG_SUCCESS;
}