mercury icon indicating copy to clipboard operation
mercury copied to clipboard

bus error happens at various locations

Open otatebe opened this issue 2 years ago • 0 comments

Describe the bug: A bus error happens at various locations.

Screenshots: https://github.com/mercury-hpc/mercury/blob/v2.1.0/src/util/mercury_poll.c#L355

Program terminated with signal 7, Bus error.
#0  0x00002ab2d2ab1cc6 in hg_poll_wait (poll_set=0x29abd00,
    timeout=timeout@entry=100, max_events=max_events@entry=1,
    events=events@entry=0x29a8950,
    actual_events=actual_events@entry=0x2ab30fd1afb4)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/util/mercury_poll.c:355
355     /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/util/mercury_poll.c: No such file or directory.
Missing separate debuginfos, use: debuginfo-install glibc-2.17-325.el7_9.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-51.el7_9.x86_64 libatomic-4.8.5-44.el7.x86_64 libcom_err-1.42.9-19.el7.x86_64 libgcc-4.8.5-44.el7.x86_64 libibverbs-55mlnx37-1.55103.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-55mlnx37-1.55103.x86_64 libselinux-2.5-15.el7.x86_64 lustre-client-2.12.6_ddn66-1.el7.x86_64 ncurses-libs-5.9-14.20130511.el7_4.x86_64 numactl-libs-2.0.12-5.el7.x86_64 openssl-libs-1.0.2k-25.el7_9.x86_64 pcre-8.32-17.el7.x86_64 readline-6.2-11.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  0x00002ab2d2ab1cc6 in hg_poll_wait (poll_set=0x29abd00,
    timeout=timeout@entry=100, max_events=max_events@entry=1,
    events=events@entry=0x29a8950,
    actual_events=actual_events@entry=0x2ab30fd1afb4)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/util/mercury_poll.c:355
#1  0x00002ab2d2697900 in hg_core_poll_wait (
    progressed_ptr=<synthetic pointer>, timeout_ms=100, context=0x29a8850)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3718
#2  hg_core_progress (context=0x29a8850, timeout_ms=timeout_ms@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3660
#3  0x00002ab2d269c88b in HG_Core_progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:5029
#4  0x00002ab2d268e8d3 in HG_Progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:2022
#5  0x00002ab2d226c43c in __margo_hg_progress_fn (foo=0x2c5f350)
    at src/margo-core.c:1482
#6  0x00002ab2d2ef246a in ABTD_ythread_func_wrapper ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#7  0x00002ab2d2ef25f1 in make_fcontext ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#8  0x0000000000000000 in ?? ()
(gdb) p *poll_set
$2 = {lock = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0,
      __kind = 512, __spins = 0, __elision = 0, __list = {__prev = 0x0,
        __next = 0x0}},
    __size = '\000' <repeats 17 times>, "\002", '\000' <repeats 21 times>,
    __align = 0}, events = 0x29a4980, max_events = 32, nfds = 2, fd = 117}
(gdb) p *poll_set->events
$3 = {events = 1, data = {ptr = 0x3, fd = 3, u32 = 3, u64 = 3}}

https://github.com/mercury-hpc/mercury/blob/v2.1.0/src/mercury_proc.c#L380

Program terminated with signal 7, Bus error.
#0  hg_proc_checksum_update (proc=proc@entry=0x30d1f40,
    data=data@entry=0x7ffc33402a10, data_size=data_size@entry=4)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_proc.c:380
380     /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_proc.c: No such file or directory.
Missing separate debuginfos, use: debuginfo-install glibc-2.17-325.el7_9.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-51.el7_9.x86_64 libatomic-4.8.5-44.el7.x86_64 libcom_err-1.42.9-19.el7.x86_64 libgcc-4.8.5-44.el7.x86_64 libibverbs-55mlnx37-1.55103.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-55mlnx37-1.55103.x86_64 libselinux-2.5-15.el7.x86_64 lustre-client-2.12.6_ddn66-1.el7.x86_64 ncurses-libs-5.9-14.20130511.el7_4.x86_64 numactl-libs-2.0.12-5.el7.x86_64 openssl-libs-1.0.2k-25.el7_9.x86_64 pcre-8.32-17.el7.x86_64 readline-6.2-11.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  hg_proc_checksum_update (proc=proc@entry=0x30d1f40,
    data=data@entry=0x7ffc33402a10, data_size=data_size@entry=4)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_proc.c:380
#1  0x00002b9b73da3ff4 in hg_proc_hg_int32_t (data=<optimized out>,
    proc=0x30d1f40)
    at /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/include/mercury_proc.h:745
#2  hg_proc_kv_get_rdma_out_t (proc=0x30d1f40, data=0x7ffc33402a10)
    at kv_types.h:45
#3  0x00002b9b743d8718 in hg_get_struct (hg_handle=0x30d1d70,
    hg_proc_info=<optimized out>, op=op@entry=HG_OUTPUT,
    struct_ptr=struct_ptr@entry=0x7ffc33402a10)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:505
#4  0x00002b9b743daf4b in HG_Get_output (handle=<optimized out>,
    out_struct=out_struct@entry=0x7ffc33402a10)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:1774
#5  0x00002b9b73da736a in fs_rpc_inode_write_rdma_bulk (
    server=server@entry=0x30dc960 "ofi+verbs;ofi_rxm://10.10.10.9:42020",
    key=key@entry=0x30d7cc0, key_size=key_size@entry=84,
    client=client@entry=0x2b9b73fad4a0 <chfs_client> "ofi+verbs;ofi_rxm://10.10.0.14:36935", buf=0x34b4d80, size=size@entry=0x7ffc33402b60,
    offset=offset@entry=0, mode=mode@entry=33204,
    chunk_size=chunk_size@entry=1048576, errp=errp@entry=0x7ffc33402b5c)
    at fs_client.c:230
#6  0x00002b9b73da74cb in fs_rpc_inode_write_rdma (
    server=server@entry=0x30dc960 "ofi+verbs;ofi_rxm://10.10.10.9:42020",
    key=key@entry=0x30d7cc0, key_size=key_size@entry=84,
    client=client@entry=0x2b9b73fad4a0 <chfs_client> "ofi+verbs;ofi_rxm://10.10.0.14:36935", buf=buf@entry=0x2b9bb340c000, size=size@entry=0x7ffc33402b60,
    offset=offset@entry=0, mode=mode@entry=33204,
    chunk_size=chunk_size@entry=1048576, errp=errp@entry=0x7ffc33402b5c)
    at fs_client.c:264
#7  0x00002b9b73da1afb in chfs_rpc_inode_write (errp=0x7ffc33402b5c,
    chunk_size=1048576, mode=33204, offset=0, size=0x7ffc33402b60,
    buf=0x2b9bb340c000, key_size=84, key=0x30d7cc0) at chfs.c:455
#8  chfs_pwrite (fd=0, buf=0x2b9bb340c000, size=1048576, offset=7877951488)
    at chfs.c:699
#9  0x00000000004286ae in WriteOrReadSingle (offset=7877951488,
    pretendRank=pretendRank@entry=219, transfer=1048576,
    transferCount=transferCount@entry=0x7ffc33402c88,
    errors=errors@entry=0x7ffc33402c7c, test=test@entry=0x2bdd9e0,
    fd=0x30e1dd0, access=0, ioBuffers=<optimized out>) at ior.c:1639
#10 0x000000000042ad75 in WriteOrRead (test=test@entry=0x2bdd9e0,
    results=results@entry=0x2bddbf0, fd=fd@entry=0x30e1dd0,
    access=access@entry=0, ioBuffers=ioBuffers@entry=0x7ffc33402e50)
    at ior.c:1772
#11 0x000000000042bd54 in TestIoSys (test=0x2bdd9e0) at ior.c:1260
#12 0x000000000042d15e in ior_run (argc=28, argv=0x2beb2f0,
    world_com=<optimized out>, world_out=<optimized out>) at ior.c:168
#13 0x000000000040a6cf in ior_process_write (argv=0x2bdfa50,
    out=0x2b9b76508400 <_IO_2_1_stdout_>, res_out=0x65ff90 <o+16>)
    at src/phase_ior.c:6
#14 0x000000000040ac4c in run () at src/phase_ior_easy_write.c:48
#15 0x0000000000406031 in main (argc=4, argv=0x7ffc33403fa8) at src/main.c:389
(gdb) p *proc
$2 = {proc_buf = {buf = 0x2b9b8f5e61f0, buf_ptr = 0x2b9b8f5e61f4, size = 4072,
    size_left = 4068, is_mine = 0 '\000'}, extra_buf = {buf = 0x0,
    buf_ptr = 0x0, size = 0, size_left = 0, is_mine = 0 '\000'},
  hg_class = 0x2bde8e0, current_buf = 0x30d1f40, checksum = 0x30d1fd0,
  checksum_hash = 0x30d2030, checksum_size = 4, op = HG_DECODE,
  flags = 0 '\000'}

https://github.com/mercury-hpc/mercury/blob/v2.1.0/src/mercury_core.c#L3842

Program terminated with signal 7, Bus error.
#0  0x00002b32cf14f600 in NA_Trigger@plt ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/lib/libmercury.so.2
Missing separate debuginfos, use: debuginfo-install glibc-2.17-325.el7_9.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-51.el7_9.x86_64 libatomic-4.8.5-44.el7.x86_64 libcom_err-1.42.9-19.el7.x86_64 libgcc-4.8.5-44.el7.x86_64 libibverbs-55mlnx37-1.55103.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-55mlnx37-1.55103.x86_64 libselinux-2.5-15.el7.x86_64 lustre-client-2.12.6_ddn66-1.el7.x86_64 ncurses-libs-5.9-14.20130511.el7_4.x86_64 numactl-libs-2.0.12-5.el7.x86_64 openssl-libs-1.0.2k-25.el7_9.x86_64 pcre-8.32-17.el7.x86_64 readline-6.2-11.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  0x00002b32cf14f600 in NA_Trigger@plt ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/lib/libmercury.so.2
#1  0x00002b32cf159f8d in hg_core_progress_na (na_class=0x21f26c0,
    na_context=0x2432b00, timeout_ms=timeout_ms@entry=0,
    progressed_ptr=progressed_ptr@entry=0x2b330cf3cfb3 "")
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3842
#2  0x00002b32cf15cbf6 in hg_core_poll_wait (
    progressed_ptr=<synthetic pointer>, timeout_ms=100, context=0x24329c0)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3760
#3  hg_core_progress (context=0x24329c0, timeout_ms=timeout_ms@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3660
#4  0x00002b32cf16188b in HG_Core_progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:5029
#5  0x00002b32cf1538d3 in HG_Progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:2022
#6  0x00002b32ced3143c in __margo_hg_progress_fn (foo=0x26c7b90)
    at src/margo-core.c:1482
#7  0x00002b32cf9b746a in ABTD_ythread_func_wrapper ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#8  0x00002b32cf9b75f1 in make_fcontext ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#9  0x0000000000000000 in ?? ()
(gdb) p *na_context
$1 = {plugin_context = 0x240c9a0}
(gdb) p *na_class
$2 = {ops = 0x2b32cf99ba20 <na_ofi_class_ops_g>, plugin_class = 0x243cdf0,
  protocol_name = 0x243d170 "verbs;ofi_rxm", progress_mode = 0,
  listen = 0 '\000'}

https://github.com/mercury-hpc/mercury/blob/v2.1.0/src/na/na.c#L1208

Program terminated with signal 7, Bus error.
#0  NA_Progress (na_class=na_class@entry=0x13096d0,
    context=context@entry=0x1527ef0, timeout_ms=0)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na.c:1208
1208    /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na.c: No such file or directory.
Missing separate debuginfos, use: debuginfo-install glibc-2.17-325.el7_9.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-51.el7_9.x86_64 libatomic-4.8.5-44.el7.x86_64 libcom_err-1.42.9-19.el7.x86_64 libgcc-4.8.5-44.el7.x86_64 libibverbs-55mlnx37-1.55103.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-55mlnx37-1.55103.x86_64 libselinux-2.5-15.el7.x86_64 lustre-client-2.12.6_ddn66-1.el7.x86_64 ncurses-libs-5.9-14.20130511.el7_4.x86_64 numactl-libs-2.0.12-5.el7.x86_64 openssl-libs-1.0.2k-25.el7_9.x86_64 pcre-8.32-17.el7.x86_64 readline-6.2-11.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  NA_Progress (na_class=na_class@entry=0x13096d0,
    context=context@entry=0x1527ef0, timeout_ms=0)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na.c:1208
#1  0x00002aacecb050a9 in hg_core_progress_na (na_class=0x13096d0,
    na_context=0x1527ef0, timeout_ms=timeout_ms@entry=0,
    progressed_ptr=progressed_ptr@entry=0x2aad2ac04fb3 "")
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/util/mercury_time.h:445
#2  0x00002aacecb07bf6 in hg_core_poll_wait (
    progressed_ptr=<synthetic pointer>, timeout_ms=99, context=0x1527db0)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3760
#3  hg_core_progress (context=0x1527db0, timeout_ms=timeout_ms@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3660
#4  0x00002aacecb0c88b in HG_Core_progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:5029
#5  0x00002aacecafe8d3 in HG_Progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:2022
#6  0x00002aacec6dc43c in __margo_hg_progress_fn (foo=0x17ded50)
    at src/margo-core.c:1482
#7  0x00002aaced36246a in ABTD_ythread_func_wrapper ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#8  0x00002aaced3625f1 in make_fcontext ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#9  0x0000000000000000 in ?? ()
(gdb) p *na_class
$1 = {ops = 0x2aaced346a20 <na_ofi_class_ops_g>, plugin_class = 0x1553e40,
  protocol_name = 0x152f0c0 "verbs;ofi_rxm", progress_mode = 0,
  listen = 0 '\000'}
(gdb) p *context
$2 = {plugin_context = 0x1527fe0}

Program terminated with signal 7, Bus error.
#0  0x00002b996f7d9190 in ofi_cq_progress ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
Missing separate debuginfos, use: debuginfo-install glibc-2.17-325.el7_9.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-51.el7_9.x86_64 libatomic-4.8.5-44.el7.x86_64 libcom_err-1.42.9-19.el7.x86_64 libgcc-4.8.5-44.el7.x86_64 libibverbs-55mlnx37-1.55103.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-55mlnx37-1.55103.x86_64 libselinux-2.5-15.el7.x86_64 lustre-client-2.12.6_ddn66-1.el7.x86_64 ncurses-libs-5.9-14.20130511.el7_4.x86_64 numactl-libs-2.0.12-5.el7.x86_64 openssl-libs-1.0.2k-25.el7_9.x86_64 pcre-8.32-17.el7.x86_64 readline-6.2-11.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  0x00002b996f7d9190 in ofi_cq_progress ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#1  0x00002b996f7d860f in ofi_cq_readfrom ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#2  0x00002b996daa6823 in fi_cq_readfrom (src_addr=0x2b99b3259ad0, count=16,
    buf=0x2b99b3259b50, cq=0x2899880)
    at /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/include/rdma/fi_eq.h:400
#3  na_ofi_cq_read (max_count=16, context=0x286b150,
    actual_count=<synthetic pointer>, src_err_addrlen=<synthetic pointer>,
    src_err_addr=<synthetic pointer>, src_addrs=0x2b99b3259ad0,
    cq_events=0x2b99b3259b50)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na_ofi.c:3221
#4  na_ofi_progress (na_class=0x264dcc0, context=0x286b150, timeout_ms=0)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na_ofi.c:5208
#5  0x00002b996da9e133 in NA_Progress (na_class=na_class@entry=0x264dcc0,
    context=context@entry=0x286b150, timeout_ms=<optimized out>)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na.c:1272
#6  0x00002b996d4750a9 in hg_core_progress_na (na_class=0x264dcc0,
    na_context=0x286b150, timeout_ms=timeout_ms@entry=0,
    progressed_ptr=progressed_ptr@entry=0x2b99b3259fb3 "")
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/util/mercury_time.h:445
#7  0x00002b996d477bf6 in hg_core_poll_wait (
    progressed_ptr=<synthetic pointer>, timeout_ms=100, context=0x286b010)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3760
#8  hg_core_progress (context=0x286b010, timeout_ms=timeout_ms@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3660
#9  0x00002b996d47c88b in HG_Core_progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:5029
#10 0x00002b996d46e8d3 in HG_Progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:2022
#11 0x00002b996d04c43c in __margo_hg_progress_fn (foo=0x2b21cd0)
    at src/margo-core.c:1482
#12 0x00002b996dcd246a in ABTD_ythread_func_wrapper ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#13 0x00002b996dcd25f1 in make_fcontext ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#14 0x0000000000000000 in ?? ()

https://github.com/mercury-hpc/mchecksum/blob/57d9eeeb433c4c2a6b2b9a0572822c1beb9515d6/src/mchecksum.c#L158

Program terminated with signal 7, Bus error.
#0  mchecksum_update (checksum=0x233ab90, data=0x233aa50, size=1)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mchecksum/src/mchecksum.c:158
158     /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mchecksum/src/mchecksum.c: No such file or directory.
Missing separate debuginfos, use: debuginfo-install glibc-2.17-325.el7_9.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-51.el7_9.x86_64 libatomic-4.8.5-44.el7.x86_64 libcom_err-1.42.9-19.el7.x86_64 libgcc-4.8.5-44.el7.x86_64 libibverbs-55mlnx37-1.55103.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-55mlnx37-1.55103.x86_64 libselinux-2.5-15.el7.x86_64 lustre-client-2.12.6_ddn66-1.el7.x86_64 ncurses-libs-5.9-14.20130511.el7_4.x86_64 numactl-libs-2.0.12-5.el7.x86_64 openssl-libs-1.0.2k-25.el7_9.x86_64 pcre-8.32-17.el7.x86_64 readline-6.2-11.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  mchecksum_update (checksum=0x233ab90, data=0x233aa50, size=1)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mchecksum/src/mchecksum.c:158
#1  0x00002b84738abdd6 in hg_core_header_response_proc (op=op@entry=HG_DECODE,
    buf=0x2b84ad2c0228, buf_size=<optimized out>,
    hg_core_header=hg_core_header@entry=0x233aa50)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core_header.c:237
#2  0x00002b84738a5ac4 in hg_core_proc_header_response (op=HG_DECODE,
    hg_core_header=0x233aa50, hg_core_handle=0x233a990)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:838
#3  hg_core_process_output (hg_core_handle=0x233a990,
    completed=0x2b84b1686dff "\001",
    done_callback=0x2b84738a36b0 <hg_core_send_ack>)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3307
#4  0x00002b84738a86cc in hg_core_recv_output_cb (
    callback_info=<optimized out>)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3256
#5  0x00002b8473ecd4f3 in NA_Trigger (context=context@entry=0x1e4c7a0,
    timeout_ms=timeout_ms@entry=0, max_count=max_count@entry=1,
    callback_ret=callback_ret@entry=0x2b84b1686f3c,
    actual_count=actual_count@entry=0x2b84b1686f38)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na.c:1407
#6  0x00002b84738a3f8d in hg_core_progress_na (na_class=0x1e4c770,
    na_context=0x1e4c7a0, timeout_ms=timeout_ms@entry=0,
    progressed_ptr=progressed_ptr@entry=0x2b84b1686fb3 "")
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3842
#7  0x00002b84738a6bf6 in hg_core_poll_wait (
    progressed_ptr=<synthetic pointer>, timeout_ms=100, context=0x2096670)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3760
#8  hg_core_progress (context=0x2096670, timeout_ms=timeout_ms@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3660
#9  0x00002b84738ab88b in HG_Core_progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:5029
#10 0x00002b847389d8d3 in HG_Progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:2022
#11 0x00002b847347b43c in __margo_hg_progress_fn (foo=0x2321e10)
    at src/margo-core.c:1482
#12 0x00002b847410146a in ABTD_ythread_func_wrapper ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#13 0x00002b84741015f1 in make_fcontext ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#14 0x0000000000000000 in ?? ()
(gdb) p checksum_class
$2 = (struct mchecksum_class *) 0x233ab90
(gdb) p *checksum_class
$3 = {data = 0x233abd0, destroy = 0x2b8473ab7fc0 <mchecksum_crc16_destroy>,
  reset = 0x2b8473ab7f60 <mchecksum_crc16_reset>,
  get_size = 0x2b8473ab7f70 <mchecksum_crc16_get_size>,
  get = 0x2b8473ab7fe0 <mchecksum_crc16_get>,
  update = 0x2b8473ab7f80 <mchecksum_crc16_update>}

Program terminated with signal 7, Bus error.
#0  0x00002abd1c691e10 in ofi_mutex_lock_op ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
Missing separate debuginfos, use: debuginfo-install glibc-2.17-325.el7_9.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-51.el7_9.x86_64 libatomic-4.8.5-44.el7.x86_64 libcom_err-1.42.9-19.el7.x86_64 libgcc-4.8.5-44.el7.x86_64 libibverbs-55mlnx37-1.55103.x86_64 libnl3-3.2.28-4.el7.x86_64 librdmacm-55mlnx37-1.55103.x86_64 libselinux-2.5-15.el7.x86_64 lustre-client-2.12.6_ddn66-1.el7.x86_64 ncurses-libs-5.9-14.20130511.el7_4.x86_64 numactl-libs-2.0.12-5.el7.x86_64 openssl-libs-1.0.2k-25.el7_9.x86_64 pcre-8.32-17.el7.x86_64 readline-6.2-11.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  0x00002abd1c691e10 in ofi_mutex_lock_op ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#1  0x00002abd1c6f4470 in rxm_cq_write_tx_comp.isra.14.part.15 ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#2  0x00002abd1c6f46a2 in rxm_finish_eager_send ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#3  0x00002abd1c6f6f3c in rxm_handle_comp ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#4  0x00002abd1c6f8504 in rxm_ep_do_progress ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#5  0x00002abd1c6f86d1 in rxm_ep_progress ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#6  0x00002abd1c6a81bd in ofi_cq_progress ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#7  0x00002abd1c6a760f in ofi_cq_readfrom ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/lib/libfabric.so.1
#8  0x00002abd1a975823 in fi_cq_readfrom (src_addr=0x2abd59004ad0, count=16,
    buf=0x2abd59004b50, cq=0x2278550)
    at /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/libfabric-main-j4mgzyne5655z5vye4pvkncq36thjk6y/include/rdma/fi_eq.h:400
#9  na_ofi_cq_read (max_count=16, context=0x229efd0,
    actual_count=<synthetic pointer>, src_err_addrlen=<synthetic pointer>,
    src_err_addr=<synthetic pointer>, src_addrs=0x2abd59004ad0,
    cq_events=0x2abd59004b50)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na_ofi.c:3221
#10 na_ofi_progress (na_class=0x2058330, context=0x229efd0, timeout_ms=0)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na_ofi.c:5208
#11 0x00002abd1a96d133 in NA_Progress (na_class=na_class@entry=0x2058330,
    context=context@entry=0x229efd0, timeout_ms=<optimized out>)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/na/na.c:1272
#12 0x00002abd1a3440a9 in hg_core_progress_na (na_class=0x2058330,
    na_context=0x229efd0, timeout_ms=timeout_ms@entry=0,
    progressed_ptr=progressed_ptr@entry=0x2abd59004fb3 "")
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/util/mercury_time.h:445
#13 0x00002abd1a346bf6 in hg_core_poll_wait (
    progressed_ptr=<synthetic pointer>, timeout_ms=100, context=0x229ee90)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3760
#14 hg_core_progress (context=0x229ee90, timeout_ms=timeout_ms@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:3660
#15 0x00002abd1a34b88b in HG_Core_progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury_core.c:5029
#16 0x00002abd1a33d8d3 in HG_Progress (context=<optimized out>,
    timeout=timeout@entry=100)
    at /.../spack-stage-mercury-2.1.0-otei3gni3ceongkg3rbtpzvrddjnvjqv/spack-src/src/mercury.c:2022
#17 0x00002abd19f1b43c in __margo_hg_progress_fn (foo=0x252b410)
    at src/margo-core.c:1482
#18 0x00002abd1aba146a in ABTD_ythread_func_wrapper ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#19 0x00002abd1aba15f1 in make_fcontext ()
   from /.../spack/linux-rhel7-skylake_avx512/gcc-8.3.1/argobots-1.1-4gyndgmftboa6lwbq745v4nmeesjrtoj/lib/libabt.so.1
#20 0x0000000000000000 in ?? ()

Platform (please complete the following information):

System description

  • Compiler version: gcc-8.3.1
  • Plugin and protocol used: ofi+verbs
  • Dependency version: libfabric@main

otatebe avatar Apr 16 '22 11:04 otatebe