libvma
Ceph with libvma crashes in both v6.9.1 and tag 7.0.5
Hi,
I recently tried to use libvma in my Ceph cluster. However, I hit the same crash in both version 6.9.1 (installed via RPM) and 7.0.5 (built from source with debug enabled). The core trace is pasted below. Could someone help look into this issue? Thanks very much!
(gdb) bt
#0 0x00007f4c566cf65b in raise () from /lib64/libpthread.so.0
#1 0x00000000009bd1c4 in ?? ()
#2 <signal handler called>
#3 0x00007f4c52c88d71 in ?? () from /usr/lib64/libmlx4-rdmav2.so
#4 0x00007f4c5536612f in ibv_create_cq () from /usr/lib64/libibverbs.so.1
#5 0x00007f4c5762ed08 in cq_mgr::cq_mgr (this=0x3819340, p_ring=0x15358000, p_ib_ctx_handler=0x373a200, cq_size=3000, p_comp_event_channel=0x94639f0, is_rx=false) at dev/cq_mgr.cpp:69
#6 0x00007f4c57633090 in qp_mgr::configure (this=0x37fb000, p_rx_comp_event_channel=0x94639e0) at dev/qp_mgr.cpp:104
#7 0x00007f4c5763cbae in qp_mgr_eth::qp_mgr_eth (this=0x37fb000, p_ring=0x15358000, p_context=0x373a200, port_num=1 '\001', p_rx_comp_event_channel=0x94639e0, tx_num_wr=3000,
vlan=101) at ../../src/vma/dev/qp_mgr.h:162
#8 0x00007f4c576365af in ring_eth::create_qp_mgr (this=0x15358000, key=..., p_rx_comp_event_channel=0x94639e0) at dev/ring.cpp:70
#9 0x00007f4c57637790 in ring::create_resources (this=0x15358000, p_ring_info=0x7f4c1d2ded60, active=0) at dev/ring.cpp:209
#10 0x00007f4c57652b5c in ring_eth::ring_eth (this=0x15358000, local_if=168758798, p_ring_info=0x7f4c1d2ded60, count=1, active=0, vlan=101) at ../../src/vma/dev/ring.h:382
#11 0x00007f4c57651abd in net_device_val_eth::create_ring (this=0x37a85a0) at dev/net_device_val.cpp:500
#12 0x00007f4c57650168 in net_device_val::reserve_ring (this=0x37a85a0, key=0) at dev/net_device_val.cpp:267
#13 0x00007f4c576ac9b5 in dst_entry::resolve_ring (this=0x36fad00) at proto/dst_entry.cpp:242
#14 0x00007f4c576ad896 in dst_entry::prepare_to_send (this=0x36fad00, skip_rules=false) at proto/dst_entry.cpp:482
#15 0x00007f4c576d70b9 in sockinfo_tcp::connect (this=0x6283f00, __to=0x6310310, __tolen=16) at sock/sockinfo_tcp.cpp:1631
#16 0x00007f4c576efb0c in connect (__fd=66, __to=0x6310310, __tolen=16) at sock/sock-redirect.cpp:569
#17 0x0000000000b4f037 in Pipe::connect() ()
#18 0x0000000000b53743 in Pipe::writer() ()
#19 0x0000000000b5f16d in Pipe::Writer::entry() ()
#20 0x00007f4c566c7a51 in start_thread () from /lib64/libpthread.so.0
#21 0x00007f4c556579ad in clone () from /lib64/libc.so.6
(gdb) f 5
#5 0x00007f4c5762ed08 in cq_mgr::cq_mgr (this=0x3819340, p_ring=0x15358000, p_ib_ctx_handler=0x373a200, cq_size=3000, p_comp_event_channel=0x94639f0, is_rx=false) at dev/cq_mgr.cpp:69
69 m_p_ibv_cq = ibv_create_cq(m_p_ib_ctx_handler->get_ibv_context(), cq_size, (void*)this, m_comp_event_channel, 0);
(gdb) p *this
$3 = {m_p_ring = 0x15358000, m_p_ib_ctx_handler = 0x373a200, m_b_is_rx = false, m_b_is_rx_csum_on = false, m_comp_event_channel = 0x94639f0, m_p_ibv_cq = 0x0,
m_b_notification_armed = false, m_b_was_drained = false, m_n_wce_counter = 0, m_qp_rec = {qp = 0x0, debth = 0}, m_p_next_rx_desc_poll = 0x0, m_rx_queue = {m_list = {head = {
next = 0x3819388, prev = 0x3819388}, obj_ptr = 0x0}, m_size = 0}, m_rx_pool = {m_list = {head = {next = 0x38193a8, prev = 0x38193a8}, obj_ptr = 0x0}, m_size = 0},
m_n_out_of_free_bufs_warning = 0, m_p_cq_stat = 0x0, m_cq_stat_static = {n_rx_pkt_drop = 0, n_rx_sw_queue_len = 0, n_rx_drained_at_once_max = 0, n_buffer_pool_len = 0,
buffer_miss_rate = 0}, m_cq_id = 5, m_n_cq_poll_sn = 0, m_transport_type = VMA_TRANSPORT_ETH, m_sz_transport_header = 0, m_buffer_miss_count = 0, m_buffer_total_count = 0,
m_buffer_prev_id = 0, static m_n_cq_id_counter = {counter = 6}, static m_n_global_sn = 0}
(gdb) p m_comp_event_channel
$4 = (ibv_comp_channel *) 0x94639f0
(gdb) p *m_comp_event_channel
$5 = {context = 0x3828218, fd = 69, refcnt = 0}
(gdb) p *p_ib_ctx_handler
$6 = {<event_handler_ibverbs> = {_vptr.event_handler_ibverbs = 0x7f4c5797ee90}, m_p_ibv_context = 0x3828218, m_ibv_port_attr = {state = IBV_PORT_ACTIVE, max_mtu = IBV_MTU_4096,
active_mtu = IBV_MTU_1024, gid_tbl_len = 128, port_cap_flags = 201392128, max_msg_sz = 1073741824, bad_pkey_cntr = 0, qkey_viol_cntr = 0, pkey_tbl_len = 1, lid = 0, sm_lid = 0,
lmc = 0 '\000', max_vl_num = 2 '\002', sm_sl = 0 '\000', subnet_timeout = 0 '\000', init_type_reply = 0 '\000', active_width = 2 '\002', active_speed = 4 '\004',
phys_state = 5 '\005', link_layer = 2 '\002', reserved = 0 '\000'}, m_p_ibv_device = 0x37d8300, m_ibv_device_attr = {fw_ver = "2.34.5000", '\000' <repeats 54 times>,
node_guid = 6942324114412346852, sys_image_guid = 6942324114412346852, max_mr_size = 18446744073709551615, page_size_cap = 4294966784, vendor_id = 713, vendor_part_id = 4103,
hw_ver = 0, max_qp = 392632, max_qp_wr = 16351, reserved = 6069350, max_sge = 32, max_sge_rd = 0, max_cq = 65408, max_cqe = 4194303, max_mr = 524032, max_pd = 32764,
max_qp_rd_atom = 16, max_ee_rd_atom = 0, max_res_rd_atom = 6282112, max_qp_init_rd_atom = 128, max_ee_init_rd_atom = 0, exp_atomic_cap = IBV_EXP_ATOMIC_HCA, max_ee = 0,
max_rdd = 0, max_mw = 0, max_raw_ipv6_qp = 0, max_raw_ethy_qp = 0, max_mcast_grp = 131072, max_mcast_qp_attach = 244, max_total_mcast_qp_attach = 31981568, max_ah = 2147483647,
max_fmr = 0, max_map_per_fmr = 8191, max_srq = 65472, max_srq_wr = 16383, max_srq_sge = 31, max_pkeys = 128, local_ca_ack_delay = 15 '\017', phys_port_cnt = 2 '\002',
comp_mask = 9, calc_cap = {data_types = 7, data_sizes = 1, int_ops = 29, uint_ops = 29, fp_ops = 29}, timestamp_mask = 0, hca_core_clock = 0,
exp_device_cap_flags = 8152104758268435558, max_dc_req_rd_atom = 0, max_dc_res_rd_atom = 0, inline_recv_sz = 0, max_rss_tbl_sz = 0, ext_atom = {log_atomic_arg_sizes = 0,
max_fa_bit_boundary = 0, log_max_atomic_inline = 0}, umr_caps = {max_klm_list_size = 0, max_send_wqe_inline_klms = 0, max_umr_recursion_depth = 0, max_umr_stride_dimension = 0},
odp_caps = {general_odp_caps = 0, per_transport_caps = {rc_odp_caps = 0, uc_odp_caps = 0, ud_odp_caps = 0, dc_odp_caps = 0, xrc_odp_caps = 0, raw_eth_odp_caps = 0}}, max_dct = 0,
max_ctx_res_domain = 0}, m_p_ibv_pd = 0x36b6100, m_channel = 0, m_removed = false, m_conf_attr_rx_num_wre = 16000, m_conf_attr_tx_num_post_send_notify = 64,
m_conf_attr_tx_max_inline = 220, m_conf_attr_tx_num_wre = 3000}
(gdb) p *p_ring
$7 = {<mem_buf_desc_owner> = {_vptr.mem_buf_desc_owner = 0x7f4c5797ea70}, m_p_ring_stat = 0x0, m_local_if = 168758798, m_transport_type = VMA_TRANSPORT_ETH, m_n_num_resources = 1,
m_ring_resources_map = std::tr1::unordered_map with 0 elements,
m_ring_active_resource = {<std::tr1::__detail::_Hashtable_iterator_base<std::pair<ring_resource_definition const, ring_resources_info_t>, false>> = {_M_cur_node = 0x1000,
_M_cur_bucket = 0x1523d758}, <No data fields>}, m_rx_channel_fd_to_ring_resources_map = std::tr1::unordered_map with 0 elements,
m_l2_mc_ip_attach_map = std::tr1::unordered_map with 0 elements, m_tcp_dst_port_attach_map = std::tr1::unordered_map with 0 elements, m_p_tx_comp_event_channel = 0x94639f0,
m_flow_tcp_map = {_vptr.hash_map = 0x7f4c5797eb10, m_hash_table = {0x0 <repeats 4096 times>}, m_last = 0x0}, m_flow_udp_mc_map = {_vptr.hash_map = 0x7f4c5797eaf0, m_hash_table = {
0x0 <repeats 4096 times>}, m_last = 0x0}, m_flow_udp_uc_map = {_vptr.hash_map = 0x7f4c5797ead0, m_hash_table = {0x0 <repeats 4096 times>}, m_last = 0x0},
m_lock_ring_rx = {<lock_mutex> = {<lock_base> = {_vptr.lock_base = 0x7f4c5797eb70, m_lock_name = 0x7f4c577198ab "ring:lock_rx"}, m_lock = {__data = {__lock = 0, __count = 0,
__owner = 0, __nusers = 0, __kind = 1, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 16 times>, "\001", '\000' <repeats 22 times>,
__align = 0}}, <No data fields>}, m_lock_ring_tx = {<lock_mutex> = {<lock_base> = {_vptr.lock_base = 0x7f4c5797eb70, m_lock_name = 0x7f4c577198b8 "ring:lock_tx"}, m_lock = {
__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 1, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 16 times>, "\001", '\000' <repeats 22 times>, __align = 0}}, <No data fields>}, m_lock_ring_tx_buf_wait = {<lock_base> = {
_vptr.lock_base = 0x7f4c5797ebb0, m_lock_name = 0x7f4c577198c5 "ring:lock_tx_buf_wait"}, m_lock = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0,
__spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}}, m_p_n_rx_channel_fds = 0x11410f60, m_tx_pool = {m_list = {head = {
next = 0x153701d8, prev = 0x153701d8}, obj_ptr = 0x0}, m_size = 0}, m_tx_num_bufs = 0, m_tx_num_wr = 3000, m_tx_num_wr_free = 3000,
m_b_qp_tx_first_flushed_completion_handled = false, m_missing_buf_ref_count = 0, m_cq_moderation_info = {period = 0, count = 0, packets = 0, bytes = 0, prev_packets = 0,
prev_bytes = 0, missed_rounds = 0}, m_tx_lkey = 0, m_partition = 101, m_gro_mgr = {_vptr.gro_mgr = 0x7f4c5797e970, m_n_flow_max = 32, m_n_buf_max = 32, m_n_flow_count = 0,
m_p_rfs_arr = 0x36f2500}, m_ring_stat_static = {n_rx_pkt_count = 0, n_rx_byte_count = 0, n_rx_interrupt_requests = 0, n_rx_interrupt_received = 0, n_rx_cq_moderation_count = 0,
n_rx_cq_moderation_period = 0, n_tx_retransmits = 0}}
From the trace I see that a failure occurred in the low-level libverbs API 'ibv_create_cq()'. Did you run any basic network test tool with VMA before this? Are your system and NIC up to date with the latest FW and MLNX_OFED stack?
Please verify that netperf or sockperf works as expected with VMA. It would be best if you can paste the output.
You can follow the example on the VMA User Manual which can be downloaded from http://www.mellanox.com/downloads/Accelerator/VMA_7.0.4_package.zip
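To help isolate whether the failure is below VMA or in VMA's own resource setup, you could also compile a minimal standalone CQ-creation test. This is only a sketch using the public libibverbs API — the first-device selection, the CQ size of 3000 (taken from frame #5 of your trace), and the error handling are illustrative assumptions, not VMA's actual code path. Build with something like: gcc cq_check.c -o cq_check -libverbs

/* cq_check.c: create a completion channel and a CQ on the first verbs device. */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
    int num = 0;
    struct ibv_device **dev_list = ibv_get_device_list(&num);
    if (!dev_list || num == 0) {
        fprintf(stderr, "no verbs devices found\n");
        return 1;
    }

    struct ibv_context *ctx = ibv_open_device(dev_list[0]);
    if (!ctx) {
        fprintf(stderr, "ibv_open_device failed\n");
        return 1;
    }

    struct ibv_comp_channel *ch = ibv_create_comp_channel(ctx);
    if (!ch) {
        fprintf(stderr, "ibv_create_comp_channel failed\n");
        return 1;
    }

    /* The same call that fails in frame #5 of the backtrace above. */
    struct ibv_cq *cq = ibv_create_cq(ctx, 3000, NULL, ch, 0);
    if (!cq) {
        fprintf(stderr, "ibv_create_cq failed\n");
        return 1;
    }

    printf("CQ of size 3000 created OK on %s\n", ibv_get_device_name(dev_list[0]));

    ibv_destroy_cq(cq);
    ibv_destroy_comp_channel(ch);
    ibv_close_device(ctx);
    ibv_free_device_list(dev_list);
    return 0;
}

If this small program also crashes or fails, the problem is below VMA (driver/FW level); if it passes, the issue is more likely in how VMA sets up its resources.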
Yes, both sockperf and iperf work, and the ~2 µs latency looks pretty good.
Here is the output:
svr:
#LD_PRELOAD=libvma.so sockperf sr -i 14.14.15.12
sockperf: == version #2.5.254 ==
sockperf: [SERVER] listen on:
[ 0] IP = 14.14.15.12 PORT = 11111 # UDP
sockperf: Warmup stage (sending a few dummy messages)...
sockperf: [tid 6200] using recvfrom() to block on socket(s)
Client:
#LD_PRELOAD=libvma.so sockperf pp -i 14.14.15.12
sockperf: == version #2.5.254 ==
sockperf: [CLIENT] send on:
sockperf: using recvfrom() to block on socket(s)
[ 0] IP = 14.14.15.12 PORT = 11111 # UDP
sockperf: Warmup stage (sending a few dummy messages)...
sockperf: Starting test...
sockperf: Test end (interrupted by timer)
sockperf: Test ended
sockperf: [Total Run] RunTime=1.100 sec; SentMessages=242818; ReceivedMessages=242817
sockperf: ========= Printing statistics for Server No: 0
sockperf: [Valid Duration] RunTime=1.000 sec; SentMessages=224628; ReceivedMessages=224628
sockperf: ====> avg-lat= 2.154 (std-dev=1.429)
sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0
sockperf: Summary: Latency is 2.154 usec
sockperf: Total 224628 observations; each percentile contains 2246.28 observations
sockperf: ---> <MAX> observation = 178.810
sockperf: ---> percentile 99.99 = 13.762
sockperf: ---> percentile 99.90 = 6.445
sockperf: ---> percentile 99.50 = 5.190
sockperf: ---> percentile 99.00 = 5.009
sockperf: ---> percentile 95.00 = 2.689
sockperf: ---> percentile 90.00 = 2.390
sockperf: ---> percentile 75.00 = 2.147
sockperf: ---> percentile 50.00 = 1.998
sockperf: ---> percentile 25.00 = 1.962
sockperf: ---> <MIN> observation = 1.769
And here are some local configurations for your reference. Please let me know if you need more. Thanks!
[root@node-26 ~]# ibv_devices
device node GUID
------ ----------------
mlx4_0 e41d2d0300175860
[root@node-26 ~]# ibv_devinfo
hca_id: mlx4_0
transport: InfiniBand (0)
fw_ver: 2.34.5000
node_guid: e41d:2d03:0017:5860
sys_image_guid: e41d:2d03:0017:5860
vendor_id: 0x02c9
vendor_part_id: 4103
hw_ver: 0x0
board_id: MT_1090111023
phys_port_cnt: 2
port: 1
state: PORT_ACTIVE (4)
max_mtu: 4096 (5)
active_mtu: 1024 (3)
sm_lid: 0
port_lid: 0
port_lmc: 0x00
link_layer: Ethernet
port: 2
state: PORT_ACTIVE (4)
max_mtu: 4096 (5)
active_mtu: 1024 (3)
sm_lid: 0
port_lid: 0
port_lmc: 0x00
link_layer: Ethernet
[root@node-26 ~]# cat /etc/modprobe.d/mlnx.conf
# Module parameters for MLNX_OFED kernel modules
options ib_uverbs disable_raw_qp_enforcement=1
options mlx4_core fast_drop=1
options mlx4_core log_num_mgm_entry_size=-1
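By the way, the parameters can be read back from sysfs after module load to confirm they took effect (a generic check; per the config above, log_num_mgm_entry_size should read -1):
[root@node-26 ~]# cat /sys/module/mlx4_core/parameters/log_num_mgm_entry_size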
Hi all,
An update on this issue: the crash disappeared after I upgraded MLNX_OFED from 3.0.1 to 3.0.2. This time I am using VMA version 7.0.4. However, it now hangs instead of crashing. The trace is below. Any ideas? It seems VMA sent out the connection request but never got an ACK back. Thanks very much!
(gdb) bt
#0 0x00007fe4d3cc6fa3 in epoll_wait () from /lib64/libc.so.6
#1 0x00007fe4d5c4e6e9 in sockinfo_tcp::rx_wait_helper(int&, bool) () from /usr/lib64/libvma.so
#2 0x00007fe4d5c4f08b in sockinfo_tcp::wait_for_conn_ready() () from /usr/lib64/libvma.so
#3 0x00007fe4d5c52e38 in sockinfo_tcp::connect(sockaddr const*, unsigned int) () from /usr/lib64/libvma.so
#4 0x00007fe4d5c6342b in connect () from /usr/lib64/libvma.so
#5 0x0000000000b4f037 in Pipe::connect() ()
#6 0x0000000000b53743 in Pipe::writer() ()
#7 0x0000000000b5f16d in Pipe::Writer::entry() ()
#8 0x00007fe4d4d36a51 in start_thread () from /lib64/libpthread.so.0
#9 0x00007fe4d3cc69ad in clone () from /lib64/libc.so.6
We have not tested VMA with Ceph until now; it will take some time to reach that goal.
Can you try running without the high polling factor, i.e. with VMA_RX_POLL=0 VMA_SELECT_POLL=0?
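For example, when launching the OSD directly (the command line here is illustrative — adapt it to how your OSD is actually started):
#LD_PRELOAD=libvma.so VMA_RX_POLL=0 VMA_SELECT_POLL=0 /usr/bin/ceph-osd -i 0 -c /etc/ceph/ceph.conf --cluster ceph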
For now I'll update the issue title to name Ceph as the target application.
Thanks for your response.
The hang issue is still there after setting VMA_RX_POLL=0 VMA_SELECT_POLL=0. The process carries the locked-pages flag (L) in its state; this flag was also present in my previous run.
root 10698 0.5 2.7 1770236 904424 ? SLsl 01:32 0:01 /usr/bin/ceph-osd -i 0 --pid-file /var/run/ceph/osd.0.pid -c /etc/ceph/ceph.conf --cluster ceph
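For what it's worth, the amount of memory pinned for the process can be read from /proc (using the PID from the ps line above):
# grep VmLck /proc/10698/status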
And the hang trace is:
(gdb) t 148
[Switching to thread 148 (Thread 0x7f853a5be700 (LWP 10701))]#0 0x00007f853c1a9fa3 in epoll_wait () from /lib64/libc.so.6
(gdb) bt
#0 0x00007f853c1a9fa3 in epoll_wait () from /lib64/libc.so.6
#1 0x00007f853e330501 in event_handler_manager::thread_loop() () from /usr/lib64/libvma.so
#2 0x00007f853e330cd6 in event_handler_thread(void*) () from /usr/lib64/libvma.so
#3 0x00007f853d219a51 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f853c1a99ad in clone () from /lib64/libc.so.6
(gdb) t 64
[Switching to thread 64 (Thread 0x7f84f47cc700 (LWP 11764))]#0 0x00007f853c1a0183 in poll () from /lib64/libc.so.6
(gdb) bt
#0 0x00007f853c1a0183 in poll () from /lib64/libc.so.6
#1 0x00007f853e33ad51 in poll_call::wait_os(bool) () from /usr/lib64/libvma.so
#2 0x00007f853e339c3a in io_mux_call::call() () from /usr/lib64/libvma.so
#3 0x00007f853e38c55b in poll_helper(pollfd*, unsigned long, int, __sigset_t const*) () from /usr/lib64/libvma.so
#4 0x0000000000b77451 in Pipe::tcp_read_wait() ()
#5 0x0000000000b7f2a0 in Pipe::tcp_read(char*, int) ()
#6 0x0000000000b8ee8f in Pipe::reader() ()
#7 0x0000000000b92d2d in Pipe::Reader::entry() ()
#8 0x00007f853d219a51 in start_thread () from /lib64/libpthread.so.0
#9 0x00007f853c1a99ad in clone () from /lib64/libc.so.6