vpp
vpp copied to clipboard
ip: route APIs are not thread safe
This is the reason why high-scale IPsec policy tests are failing in CSIT, as tracked in [0]. With a debug build (with ASAN), VPP crashes [1], generating a core that looks like this. On the main thread, the API handler is parked in the middle of adding a FIB entry, waiting in the worker-thread barrier sync called from load_balance_alloc_i:
#0 0x00007ffff51564de in clib_time_now_internal (c=0x7ffe72000740, n=11953583642511806) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vppinfra/time.h:227
#1 0x00007ffff5156385 in clib_time_now (c=0x7ffe72000740) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vppinfra/time.h:241
#2 0x00007ffff514e2a4 in vlib_time_now (vm=0x7ffe72000740) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/main.h:334
#3 0x00007ffff514ebeb in vlib_worker_thread_barrier_sync_int (vm=0x7ffe72000740, func_name=0x7ffff7758780 <__FUNCTION__.load_balance_alloc_i> "load_balance_alloc_i") at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/threads.c:1398
#4 0x00007ffff680dfa9 in load_balance_alloc_i () at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/dpo/load_balance.c:106
#5 0x00007ffff6807551 in load_balance_create_i (num_buckets=0, lb_proto=DPO_PROTO_IP4, fhc=(IP_FLOW_HASH_SRC_ADDR | IP_FLOW_HASH_DST_ADDR | IP_FLOW_HASH_SRC_PORT | IP_FLOW_HASH_DST_PORT | IP_FLOW_HASH_PROTO | IP_FLOW_HASH_FL)) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/dpo/load_balance.c:255
#6 0x00007ffff68074a5 in load_balance_create (n_buckets=0, lb_proto=DPO_PROTO_IP4, fhc=(IP_FLOW_HASH_SRC_ADDR | IP_FLOW_HASH_DST_ADDR | IP_FLOW_HASH_SRC_PORT | IP_FLOW_HASH_DST_PORT | IP_FLOW_HASH_PROTO | IP_FLOW_HASH_FL)) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/dpo/load_balance.c:278
#7 0x00007ffff6755b5b in fib_entry_src_mk_lb (fib_entry=0x7ffe73bd1bc0, source=FIB_SOURCE_API, fct=FIB_FORW_CHAIN_TYPE_UNICAST_IP4, dpo_lb=0x7ffe73bd1be8) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_entry_src.c:644
#8 0x00007ffff6757bee in fib_entry_src_action_install (fib_entry=0x7ffe73bd1bc0, source=FIB_SOURCE_API) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_entry_src.c:715
#9 0x00007ffff6758c6d in fib_entry_src_action_activate (fib_entry=0x7ffe73bd1bc0, source=FIB_SOURCE_API) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_entry_src.c:1093
#10 0x00007ffff674bb9c in fib_entry_create (fib_index=0, prefix=0x7ffff47ff580, source=FIB_SOURCE_API, flags=FIB_ENTRY_FLAG_NONE, paths=0x7ffe739030e8) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_entry.c:704
#11 0x00007ffff67297fb in fib_table_entry_update (fib_index=0, prefix=0x7ffff47ff580, source=FIB_SOURCE_API, flags=FIB_ENTRY_FLAG_NONE, paths=0x7ffe739030e8) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_table.c:806
#12 0x00007ffff67af56c in fib_api_route_add_del (is_add=1 '\001', is_multipath=0 '\000', fib_index=0, prefix=0x7ffff47ff580, src=FIB_SOURCE_API, entry_flags=FIB_ENTRY_FLAG_NONE, rpaths=0x7ffe739030e8) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_api.c:481
#13 0x00007ffff5e1c501 in ip_route_add_del_t_handler (mp=0x7ffe739d0de8, stats_index=0x7ffff47ff780) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/ip/ip_api.c:732
#14 0x00007ffff5e1b9bb in vl_api_ip_route_add_del_t_handler (mp=0x7ffe739d0de8) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/ip/ip_api.c:803
#15 0x00007ffff7ef4a55 in msg_handler_internal (am=0x7ffff7f10340 <api_global_main>, the_msg=0x7ffe739d0de8, msg_len=206, trace_it=0, do_it=1, free_it=0) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlibapi/api_shared.c:570
#16 0x00007ffff7ef5722 in vl_msg_api_socket_handler (the_msg=0x7ffe739d0de8, msg_len=206) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlibapi/api_shared.c:721
#17 0x00007ffff7f3f484 in vl_socket_process_api_msg (rp=0x7ffe7385fc38, input_v=0x7ffe739d0dd8 "") at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlibmemory/socket_api.c:205
#18 0x00007ffff7f5118c in vl_api_clnt_process (vm=0x7ffe72000740, node=0x7ffe752a6480, f=0x0) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlibmemory/memclnt_api.c:464
#19 0x00007ffff50d0bdb in vlib_process_bootstrap (_a=140730676805984) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/main.c:1208
At the same time, a worker thread is processing ARP in arp_reply and crashes with an ASAN use-after-poison error:
#13 0x000055555563435c in __asan_report_load4 ()
#14 0x00007ffff677234d in __vec_len (v=0x7ffe73a2dc20) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vppinfra/vec_bootstrap.h:129
#15 0x00007ffff6779d79 in pool_is_free_index (p=0x7ffe73a2dc20, index=941) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vppinfra/pool.h:279
#16 0x00007ffff677226e in fib_path_list_get (index=941) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_path_list.c:102
#17 0x00007ffff6772983 in fib_path_list_get_resolving_interface (path_list_index=941) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_path_list.c:619
#18 0x00007ffff675d488 in fib_entry_get_resolving_interface_for_source (entry_index=934, source=FIB_SOURCE_INTERFACE) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_entry_src.c:1843
#19 0x00007ffff674f418 in fib_entry_get_any_resolving_interface (entry_index=934) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/fib/fib_entry.c:1571
#20 0x00007ffff69948ac in arp_reply (vm=0x7ffe750915c0, node=0x7ffe73933bc0, frame=0x7ffe74fb5c80) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vnet/arp/arp.c:428
#21 0x00007ffff50d3a0a in dispatch_node (vm=0x7ffe750915c0, node=0x7ffe73933bc0, type=VLIB_NODE_TYPE_INTERNAL, dispatch_state=VLIB_NODE_STATE_POLLING, frame=0x7ffe74fb5c80, last_time_stamp=11953583605887018) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/main.c:949
#22 0x00007ffff50d5d3c in dispatch_pending_node (vm=0x7ffe750915c0, pending_frame_index=2, last_time_stamp=11953583605887018) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/main.c:1106
#23 0x00007ffff50c951c in vlib_main_or_worker_loop (vm=0x7ffe750915c0, is_main=0) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/main.c:1581
#24 0x00007ffff50c81e7 in vlib_worker_loop (vm=0x7ffe750915c0) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/main.c:1712
#25 0x00007ffff51519d6 in vlib_worker_thread_fn (arg=0x7ffe725d71c0) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/threads.c:1643
#26 0x00007ffff5144502 in vlib_worker_thread_bootstrap_fn (arg=0x7ffe725d71c0) at /w/workspace/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/src/vlib/threads.c:448
The obvious explanation is that the path list vector has grown and was reallocated while the worker was still reading it, but that step is not guarded as carefully as the subsequent load-balance DPO allocation in which the main thread is parked.
[0] csit-4020 [1] https://logs.fd.io/vex-yul-rot-jenkins-1/vpp-csit-verify-perf-master-ubuntu2404-x86_64-3n-snr/48/csit_current/0/log.html.gz#s1-s1-s1-s1-s1-t1-k3-k4-k1