pprof-rs icon indicating copy to clipboard operation
pprof-rs copied to clipboard

Panic on aarch64

Open YangKeao opened this issue 4 years ago • 46 comments

tikv/tikv#10658

TiKV on HUAWEI,Kunpeng 920 failed to profile and got an panic.

#0  0x0000fffd7b6aceb4 in ?? () from /lib64/libgcc_s.so.1
#1  0x0000fffd7b6ae534 in _Unwind_Backtrace () from /lib64/libgcc_s.so.1
#2  0x0000aaac01eedb58 in backtrace::backtrace::libunwind::trace (cb=...) at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.37/src/backtrace/libunwind.rs:88
#3  backtrace::backtrace::trace_unsynchronized (cb=...) at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.37/src/backtrace/mod.rs:66
#4  pprof::profiler::perf_signal_handler (_signal=<optimized out>) at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/pprof-0.4.2/src/profiler.rs:128
#5  <signal handler called>

YangKeao avatar Aug 03 '21 08:08 YangKeao

Here is a full backtrace from a libgcc compiled with -g -ggdb (running the example flamegraph: gdb target/debug/examples/flamegraph):

Program received signal SIGSEGV, Segmentation fault.
0x0000007fbf6b1a78 in aarch64_fallback_frame_state (context=0x7fffffbbf0, 
    context=0x7fffffbbf0, fs=0x7fffffbfb0) at ./md-unwind-support.h:74
74      ./md-unwind-support.h: No such file or directory.
(gdb) bt
#0  0x0000007fbf6b1a78 in aarch64_fallback_frame_state (context=0x7fffffbbf0, 
    context=0x7fffffbbf0, fs=0x7fffffbfb0) at ./md-unwind-support.h:74
#1  uw_frame_state_for (context=context@entry=0x7fffffbbf0, fs=fs@entry=0x7fffffbfb0)
    at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1271
#2  0x0000007fbf6b31a0 in _Unwind_Backtrace (
    trace=0x55556de09c <backtrace::backtrace::libunwind::trace::trace_fn>, 
    trace_argument=0x7fffffc650)
    at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind.inc:302
#3  0x000000555557893c in backtrace::backtrace::libunwind::trace (cb=...)
    at /data/gentoo64/home/yangkeao/.cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.61/src/backtrace/libunwind.rs:90
#4  backtrace::backtrace::trace_unsynchronized<pprof::profiler::perf_signal_handler::{closure#0}> (cb=...)
    at /data/gentoo64/home/yangkeao/.cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.61/src/backtrace/mod.rs:66
#5  0x000000555557af20 in pprof::profiler::perf_signal_handler (_signal=27, 
    _siginfo=0x7fffffd3c0, ucontext=0x7fffffd440) at src/profiler.rs:248
#6  <signal handler called>
#7  0x0000005555680b8c in core::ptr::non_null::NonNull<usize>::as_ptr<usize> (self=...)
    at /rustc/09c42c45858d5f3aedfa670698275303a3d19afa/library/core/src/ptr/non_null.rs:270
#8  0x00000055556812a4 in core::slice::iter::Iter<usize>::post_inc_start<usize> (
    self=0x555582aec8, offset=246)
    at /rustc/09c42c45858d5f3aedfa670698275303a3d19afa/library/core/src/slice/iter/macros.rs:93
#9  core::slice::iter::{impl#172}::next<usize> (self=0x7fffffe700)
    at /rustc/09c42c45858d5f3aedfa670698275303a3d19afa/library/core/src/slice/iter/macros.rs:149
#10 0x00000000000004cd in ?? ()

The signal frame has been resolved successfully. But the following codes get wrong pc value:

 /* A signal frame will have a return address pointing to
     __default_sa_restorer. This code is hardwired as:

     0xd2801168         movz x8, #0x8b
     0xd4000001         svc  0x0
   */
  if (pc[0] != MOVZ_X8_8B || pc[1] != SVC_0)
    {
      return _URC_END_OF_STACK;
    }

In this stack frame, the pc value is 0x4cd, which causes this bug.

YangKeao avatar Nov 11 '21 06:11 YangKeao

image

It is only 64bit away from the correct parent frame.

YangKeao avatar Nov 12 '21 09:11 YangKeao

The corrupted frame located at 0x12c524 in the file, the related FDE is :

000280a0 00000014 0002801c FDE cie=00000088 pc=0012c48c...0012c584
  Format:       DWARF32
  DW_CFA_advance_loc: 8
  DW_CFA_def_cfa_offset: +144
  DW_CFA_offset: reg30 -16
  DW_CFA_nop:

  0x12c48c: CFA=reg31
  0x12c494: CFA=reg31+144: reg30=[CFA-16]

Then we should calculate 0x0000007fffffe190 + 144 - 16 = 0x7fffffe210, which is actually the correct address. The dwarf information is actually correct. It seems that the problem locates in the libgcc?

YangKeao avatar Nov 12 '21 10:11 YangKeao

Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256    in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93369 = (void *) 0x7fffffc2d0
$93370 = (_Unwind_Context_Reg_Val) 0x7fffffb7f8

Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256    in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93371 = (void *) 0x7fffffc310
$93372 = (_Unwind_Context_Reg_Val) 0x7fffffc300

Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256    in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93373 = (void *) 0x7fffffd060
$93374 = (_Unwind_Context_Reg_Val) 0x7fffffd058

Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256    in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93375 = (void *) 0x7fffffd190
$93376 = (_Unwind_Context_Reg_Val) 0x7fffffd288

Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256    in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93377 = (void *) 0x7fffffe350
$93378 = (_Unwind_Context_Reg_Val) 0x7fffffe340

Here is the last several CFA and SP.

0x0000007fffffc2f8│+0x0b68: 0x000000555581ab40  →  0x000000555556da18  →  <core::ptr::drop_in_
place<pprof::profiler::perf_signal_handler::{{closure}}>+0> sub sp,  sp,  #0x10

0x0000007fffffc300│+0x0b70: 0x000000555557b1a0  →  <perf_signal_handler+400> bl 0x5555568540 <pthread_self@plt> 

0x0000007fffffd058│+0x18c8: 0x0000007fbf6fc668  →  <__kernel_rt_sigreturn+0> mov x8,  #0x8b   
                // #139

0x0000007fffffd288│+0x1af8: 0x000000555556b400  →  <flamegraph::main+836> str x0,  [sp,  #48]

0x0000007fffffe350│+0x2bc0: 0x00000000000004cd   ← $x6

Related dwarf information:

00000154 00000020 00000048 FDE cie=00000110 pc=000160bc...00016700
  Format:       DWARF32
  LSDA Address: 00000000002a8c70
  DW_CFA_advance_loc: 8
  DW_CFA_def_cfa_offset: +1024
  DW_CFA_offset: reg30 -8
  DW_CFA_offset: reg29 -16
  DW_CFA_nop:
  DW_CFA_nop:
  DW_CFA_nop:

  0x160bc: CFA=reg31
  0x160c4: CFA=reg31+1024: reg29=[CFA-16], reg30=[CFA-8]

0x000000555556b400 is 0x16400.

YangKeao avatar Nov 12 '21 12:11 YangKeao

Thanks for @SchrodingerZhu . I have tried to use nongnu libunwind and the unw_xxx API to get the backtrace in arm. It works surprisingly well!

The modification on backtrace-rs can be found in https://github.com/YangKeao/backtrace-rs/commit/ee71341ca6a1ea68e2c60677a4d9b31f0042378b

YangKeao avatar Jan 28 '22 08:01 YangKeao

Thanks for @SchrodingerZhu . I have tried to use nongnu libunwind and the unw_xxx API to get the backtrace in arm. It works surprisingly well!

The modification on backtrace-rs can be found in YangKeao/backtrace-rs@ee71341

Great progress! I will also continue to help test nongnu/linunwind based backtrace in signal handler.

mornyx avatar Jan 28 '22 09:01 mornyx

Hi @YangKeao, what is your test env? Have you tested it on darwin+aarch64 (m1)? If not, I will make up for this part.

mornyx avatar Jan 28 '22 09:01 mornyx

glad to see this

SchrodingerZhu avatar Jan 28 '22 12:01 SchrodingerZhu

@mornyx

No. I only test it on raspberry pi.

YangKeao avatar Jan 28 '22 13:01 YangKeao

nongnu-libunwind seems to be a good solution then. it will also not break TiFlash (where rust and c++ transits), while llvm-libunwind has problem handling that solution.

cc @YangKeao

SchrodingerZhu avatar Feb 12 '22 01:02 SchrodingerZhu

Cool. Then I could move on fixing this issue in these steps:

  1. Add support to unwind with unw_xxx APIs, and allow users to (dynamically) link with nongnu-libunwind.
  2. Allow statically compile with nongu-libunwind, which could be more convenient for tikv-server to distribute. In this case, the other functions in libgcc_s will also be overridden (e.g. _Unwind_Backtrace).

BTW, some magic (like rewriting the symbol name with objcopy) can also be tried, but I'm not sure it will work perfectly. As a .a file has many .o files, and they could call each other, it means we should also rewrite the calling symbol (relocation rules), not only the exposed symbol, which could be hard to do perfectly.

/cc @BusyJay @sticnarf

What's your opinion on linking nongu-libunwind with tikv-server?

YangKeao avatar Feb 14 '22 05:02 YangKeao

Why Rust choose llvm libuwind in the first place?

BusyJay avatar Feb 14 '22 05:02 BusyJay

Why Rust choose llvm libuwind in the first place?

Default Rust unwind implementation is libgcc_s. I think this is because the C runtime libgcc_s is provided by almost all Linux distributions.

sticnarf avatar Feb 14 '22 05:02 sticnarf

BTW, some magic (like rewriting the symbol name with objcopy) can also be tried, but I'm not sure it will work perfectly. As a .a file has many .o files, and they could call each other, it means we should also rewrite the calling symbol (relocation rules), not only the exposed symbol, which could be hard to do perfectly.

Is it possible to patch the nongnu libunwind code? For example, to avoid overriding _Unwind_Backtrace, just remove the _Unwind_Backtrace provided by nongnu libunwind. And the unw_ prefix might also be patched?

sticnarf avatar Feb 14 '22 05:02 sticnarf

Is it possible to patch the nongnu libunwind code? For example, to avoid override _Unwind_Backtrace, just remove the _Unwind_Backtrace provided by nongnu libunwind. And the unw_ prefix might be also be patched?

I could try.

YangKeao avatar Feb 14 '22 05:02 YangKeao

@sticnarf @mornyx Building nongnu-libunwind without enable-cxx-exceptions will not build the function _Unwind_XXX (e.g. _Unwind_Backtrace), which is also the default behavior of the nongnu-libunwind shipped by ubuntu. I think it's fine to directly static-link with it.

The libunwind-sys needs to be modified to support static link. I will try to modify it and submit a PR.

YangKeao avatar Feb 24 '22 09:02 YangKeao

nongnu/libunwind does not support macOS. I implemented a custom unwind library on macOS using pure Rust. I avoid handling signal frame by using ucontext in the perf_signal_handler's param list (which contains the register context before the signal frame). It seems to work well in TiKV so far, I will clean up the code and submit a PR soon. /cc @YangKeao @sticnarf

mornyx avatar Feb 24 '22 16:02 mornyx

Additional info: I previously wrote a minimal Rust demo to test nongnu/libunwind on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems). So please help to confirm that nongnu/libunwind works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeao

mornyx avatar Feb 24 '22 16:02 mornyx

Additional info: I previously wrote a minimal Rust demo to test nongnu/libunwind on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems). So please help to confirm that nongnu/libunwind works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeao

The debug vs release mode difference may be related to inlining..

sticnarf avatar Feb 25 '22 03:02 sticnarf

Additional info: I previously wrote a minimal Rust demo to test nongnu/libunwind on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems). So please help to confirm that nongnu/libunwind works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeao

libunwind-rs inits its Cursor with unw_init_local:

https://github.com/xcoldhandsx/libunwind-rs/blob/8d5e97730f012405cdea0676d18ba884553f8b4f/src/lib.rs#L302

But from the nongnu libunwind doc, we need to use unw_init_local2 and pass special flag if we are using it in a signal handler. I guess if the problem is related to this.

sticnarf avatar Feb 25 '22 03:02 sticnarf

Additional info: I previously wrote a minimal Rust demo to test nongnu/libunwind on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems). So please help to confirm that nongnu/libunwind works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeao

libunwind-rs inits its Cursor with unw_init_local:

https://github.com/xcoldhandsx/libunwind-rs/blob/8d5e97730f012405cdea0676d18ba884553f8b4f/src/lib.rs#L302

But from the nongnu libunwind doc, we need to use unw_init_local2 and pass special flag if we are using it in a signal handler. I guess if the problem is related to this.

I use nongnu/libunwind bindings provided by Keao and it looks like UNW_INIT_SIGNAL_FRAME is set correctly (https://github.com/YangKeao/backtrace-rs/blob/master/src/backtrace/libunwind.rs#L453).

Here is my demo:

use nix::libc;
use nix::sys::signal;
use rand::Rng;
use parking_lot::RwLock;
use smallvec::SmallVec;

// same as pprof-rs/src/timer.rs
mod timer;

const MAX_DEPTH: usize = 32;

lazy_static::lazy_static! {
    static ref PROFILER: RwLock<()> = RwLock::new(());
}

fn main() {
    // Register signal handler.
    let h = signal::SigHandler::SigAction(perf_signal_handler);
    let a = signal::SigAction::new(h, signal::SaFlags::SA_SIGINFO, signal::SigSet::empty());
    unsafe {
        signal::sigaction(signal::SIGPROF, &a).unwrap();
    }

    // Register SIGPROF.
    let _t = timer::Timer::new(99);

    // Run some workloads.
    loop {
        let mut rng = rand::thread_rng();
        let mut vec: Vec<i32> = vec![];
        for _ in 0..1000000 {
            vec.push(rng.gen())
        }
        vec.sort();
    }
}

#[no_mangle]
pub extern "C" fn perf_signal_handler(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) {
    if let Some(mut guard) = PROFILER.try_write() {
        let mut bt: SmallVec<[_; MAX_DEPTH]> = SmallVec::with_capacity(MAX_DEPTH);
        let mut index = 0;
        unsafe {
            // in Cargo.toml:
            // backtrace = { git = "https://github.com/YangKeao/backtrace-rs.git", features = ["nongnu-unwind"], branch = "master" }
            backtrace::trace_unsynchronized_external_api(|frame| {
                if index < MAX_DEPTH {
                    bt.push(frame.clone());
                    index += 1;
                    true
                } else {
                    false
                }
            }, true); // signal_frame = true
        }
    }
}

Crash stack:

(gdb) r
Starting program: /home/ubuntu/demo/target/debug/demo 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/aarch64-linux-gnu/atomics/libthread_db.so.1".

Program received signal SIGSEGV, Segmentation fault.
0x0000fffff7f8a078 in access_mem (as=<optimized out>, addr=726228, val=0xffffffffa350, write=0, arg=<optimized out>) at aarch64/Ginit.c:337
337	      *val = *(unw_word_t *) addr;
(gdb) bt
#0  0x0000fffff7f8a078 in access_mem (as=<optimized out>, addr=726228, val=0xffffffffa350, write=0, arg=<optimized out>) at aarch64/Ginit.c:337
#1  0x0000fffff7f8ae9c in _ULaarch64_is_signal_frame (cursor=cursor@entry=0xffffffffad48) at aarch64/Gis_signal_frame.c:51
#2  0x0000fffff7f8b8cc in _ULaarch64_step (cursor=0xffffffffad48) at aarch64/Gstep.c:147
#3  0x0000aaaaaaaaecdc in backtrace::backtrace::libunwind::external_unwind::UnwCursor::step (self=0xffffffffad48)
    at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/libunwind.rs:470
#4  0x0000aaaaaaaa9c64 in backtrace::backtrace::libunwind::trace_external_api::{{closure}} (x=...)
    at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/libunwind.rs:136
#5  0x0000aaaaaaaa8d38 in core::result::Result<T,E>::and_then (self=..., op=...) at /rustc/db9d1b20bba1968c1ec1fc49616d4742c1725b4b/library/core/src/result.rs:966
#6  0x0000aaaaaaaa9288 in backtrace::backtrace::libunwind::trace_external_api (f=0xffffffffbd80, signal_frame=true)
    at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/libunwind.rs:121
#7  backtrace::backtrace::trace_unsynchronized_external_api (cb=..., signal_frame=true)
    at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/mod.rs:72
#8  0x0000aaaaaaaad794 in demo::perf_signal_handler (_signal=27, _siginfo=0xffffffffdc50, _ucontext=0xffffffffdcd0) at src/main.rs:43
#9  <signal handler called>
#10 0x0000aaaaaaab0564 in core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt (self=0xfffffffff200, other=0x376726befffff1f8)
    at /rustc/db9d1b20bba1968c1ec1fc49616d4742c1725b4b/library/core/src/cmp.rs:1327
#11 0x0000aaaaaaab0678 in <core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next (
    self=0xaaaaaaab062c <core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next+20>)
    at /rustc/db9d1b20bba1968c1ec1fc49616d4742c1725b4b/library/core/src/iter/range.rs:621
#12 0x00000000000b14d4 in ?? ()
Backtrace stopped: previous frame identical to this frame (corrupt stack?)

uname -a:

Linux ubuntu 5.11.0-1025-aws #27~20.04.1-Ubuntu SMP Fri Jan 7 13:14:39 UTC 2022 aarch64 aarch64 aarch64 GNU/Linux

mornyx avatar Feb 25 '22 03:02 mornyx

nongnu/libunwind does not support macOS.

@sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than libgcc 😢 . If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.

@siddontang told me he cannot reproduce this issue (with the examples in pprof-rs) on M1 Mac.

YangKeao avatar Feb 25 '22 05:02 YangKeao

@sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than libgcc . If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.

Actually https://github.com/tikv/tikv/issues/9957#issuecomment-813368686 still segfaults on latest macOS :(

@siddontang told me he cannot reproduce this issue (with the examples in pprof-rs) on M1 Mac.

Maybe he was using macOS 10.x. I remember the issue appears after macOS 11

sticnarf avatar Feb 25 '22 05:02 sticnarf

nongnu/libunwind does not support macOS.

@sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than libgcc 😢 . If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.

@siddontang told me he cannot reproduce this issue (with the examples in pprof-rs) on M1 Mac.

This problem can be reproduced by executing the demo I posted above on my m1 mac.

(lldb) r
Process 41331 launched: '/Users/mornyx/Desktop/repos/segfault-demo/target/debug/segfault-demo' (arm64)
Process 41331 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BREAKPOINT (code=1, subcode=0x1b7dada18)
    frame #0: 0x00000001b7dada18 libunwind.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::step() + 680
libunwind.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::step:
->  0x1b7dada18 <+680>: brk    #0xc471
    0x1b7dada1c <+684>: pacib  x16, x8
    0x1b7dada20 <+688>: str    x16, [x8]
    0x1b7dada24 <+692>: ldr    x16, [x19]
Target 0: (segfault-demo) stopped.
(lldb) bt
error: need to add support for DW_TAG_base_type '()' encoded with DW_ATE = 0x7, bit_size = 0
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BREAKPOINT (code=1, subcode=0x1b7dada18)
  * frame #0: 0x00000001b7dada18 libunwind.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::step() + 680
    frame #1: 0x00000001b7db010c libunwind.dylib`_Unwind_Backtrace + 284
    frame #2: 0x0000000100004470 segfault-demo`backtrace::backtrace::trace_unsynchronized::hc3e1b7c2884d399c [inlined] backtrace::backtrace::libunwind::trace::hf29b36d39a2f0171(cb=&mut dyn core::ops::function::FnMut<(&backtrace::backtrace::Frame), Output=bool> @ 0x000000016fdfe3a0) at libunwind.rs:93:5
    frame #3: 0x0000000100004464 segfault-demo`backtrace::backtrace::trace_unsynchronized::hc3e1b7c2884d399c(cb={closure#0} @ 0x000000016fdfe388) at mod.rs:66:5
    frame #4: 0x0000000100006158 segfault-demo`perf_signal_handler((null)=27, (null)=0x000000016fdfe450, ucontext=0x000000016fdfe4b8) at main.rs:43:13
    frame #5: 0x00000001ad3984e4 libsystem_platform.dylib`_sigtramp + 56
    frame #6: 0x00000001ad196594 libsystem_malloc.dylib`szone_malloc_should_clear + 92
    frame #7: 0x00000001ad196594 libsystem_malloc.dylib`szone_malloc_should_clear + 92
    frame #8: 0x00000001ad19876c libsystem_malloc.dylib`nanov2_realloc + 88
    frame #9: 0x00000001ad198658 libsystem_malloc.dylib`malloc_zone_realloc + 140
    frame #10: 0x00000001ad1973c8 libsystem_malloc.dylib`realloc + 328
    frame #11: 0x0000000100034d18 segfault-demo`alloc::alloc::realloc::h74f14f2ce4ab84b0(ptr="@B\n", layout=Layout @ 0x000000016fdfeab0, new_size=512) at alloc.rs:124:14
    frame #12: 0x0000000100034ab4 segfault-demo`alloc::alloc::Global::grow_impl::hb5673b9e9d83a1b7(self=0x000000016fdff2e8, ptr=(pointer = "@B\n"), old_layout=Layout @ 0x000000016fdfeb90, new_layout=Layout @ 0x000000016fdfeba0, zeroed=false) at alloc.rs:201:31
    frame #13: 0x0000000100034dc8 segfault-demo`_$LT$alloc..alloc..Global$u20$as$u20$core..alloc..Allocator$GT$::grow::h9a808bc86915a00c(self=0x000000016fdff2e8, ptr=(pointer = "@B\n"), old_layout=Layout @ 0x000000016fdfecd0, new_layout=Layout @ 0x000000016fdfece0) at alloc.rs:254:18
    frame #14: 0x00000001000344a0 segfault-demo`alloc::raw_vec::finish_grow::h4ae967a0979b21be(new_layout=Result<core::alloc::layout::Layout, core::alloc::layout::LayoutError> @ 0x000000016fdfedc8, current_memory=Option<(core::ptr::non_null::NonNull<u8>, core::alloc::layout::Layout)> @ 0x000000016fdfef20, alloc=0x000000016fdff2e8) at raw_vec.rs:457:13
    frame #15: 0x00000001000319c4 segfault-demo`alloc::raw_vec::RawVec$LT$T$C$A$GT$::grow_amortized::hf32eb34c9a647fa0(self=0x000000016fdff2e8, len=16, additional=1) at raw_vec.rs:393:19
    frame #16: 0x0000000100062874 segfault-demo`alloc::raw_vec::RawVec$LT$T$C$A$GT$::reserve::do_reserve_and_handle::hf1a1ca5b41c0d04a(slf=0x000000016fdff2e8, len=16, additional=1) at raw_vec.rs:284:28
    frame #17: 0x0000000100031b40 segfault-demo`alloc::raw_vec::RawVec$LT$T$C$A$GT$::reserve::h31a07e947ecd1269(self=0x000000016fdff2e8, len=16, additional=1) at raw_vec.rs:288:13
    frame #18: 0x0000000100033280 segfault-demo`alloc::vec::Vec$LT$T$C$A$GT$::reserve::h2fb49bd2d09baade(self=0x000000016fdff2e8, additional=1) at mod.rs:809:9
    frame #19: 0x0000000100033068 segfault-demo`alloc::vec::Vec$LT$T$C$A$GT$::push::h7a8573a64b7fabd6(self=0x000000016fdff2e8, value=(start = 344640, len = 10)) at mod.rs:1729:13
    frame #20: 0x0000000100004e20 segfault-demo`alloc::slice::merge_sort::h5090a8047527648e(v=&mut [i32] @ 0x000000016fdff3b8, is_less={closure#0} @ 0x000000016fdff297) at slice.rs:1131:9
    frame #21: 0x0000000100005440 segfault-demo`alloc::slice::_$LT$impl$u20$$u5b$T$u5d$$GT$::sort::hab2ae5faf8805e55(self=&mut [i32] @ 0x000000016fdff430) at slice.rs:276:9
    frame #22: 0x00000001000060c8 segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:34:9
    frame #23: 0x0000000100004050 segfault-demo`core::ops::function::FnOnce::call_once::h769b14bc6391e915((null)=(segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:16), (null)=<unavailable>) at function.rs:227:5
    frame #24: 0x00000001000059b0 segfault-demo`std::sys_common::backtrace::__rust_begin_short_backtrace::h212780ff714d2704(f=(segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:16)) at backtrace.rs:123:18
    frame #25: 0x000000010000678c segfault-demo`std::rt::lang_start::_$u7b$$u7b$closure$u7d$$u7d$::he43552e542027ddc at rt.rs:145:18
    frame #26: 0x000000010004bd88 segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] core::ops::function::impls::_$LT$impl$u20$core..ops..function..FnOnce$LT$A$GT$$u20$for$u20$$RF$F$GT$::call_once::h10f2582b16e2b13c at function.rs:259:13 [opt]
    frame #27: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::do_call::hd3dfc31f9ced2f42 at panicking.rs:406:40 [opt]
    frame #28: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::h584945b02ec0e15d at panicking.rs:370:19 [opt]
    frame #29: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panic::catch_unwind::h1138cecd37279bb6 at panic.rs:133:14 [opt]
    frame #30: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::rt::lang_start_internal::_$u7b$$u7b$closure$u7d$$u7d$::hf94f7401539e24a6 at rt.rs:128:48 [opt]
    frame #31: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::do_call::ha8b5def05088e3d3 at panicking.rs:406:40 [opt]
    frame #32: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::h3ce579dae5f3a6fb at panicking.rs:370:19 [opt]
    frame #33: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panic::catch_unwind::h29ecbe0d385e9017 at panic.rs:133:14 [opt]
    frame #34: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 at rt.rs:128:20 [opt]
    frame #35: 0x0000000100006758 segfault-demo`std::rt::lang_start::h969a743b4698eeaf(main=(segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:16), argc=1, argv=0x000000016fdff810) at rt.rs:144:17
    frame #36: 0x0000000100006270 segfault-demo`main + 32
    frame #37: 0x00000001001090f4 dyld`start + 520

mornyx avatar Feb 25 '22 06:02 mornyx

On ARM+macOS, we can be confident that the frame-pointer exists and we can take advantage of this. And because our stacktrace scene only exists in signal handler, we can use the ucontext provided by kernel to skip the signal frame. Here is my implementation: https://github.com/mornyx/unwind

Then I introduced this implementation in pprof-rs: https://github.com/mornyx/pprof-rs/commit/a68772cb4093c8e31064f5516520afb1cb692aa3

And it worked fine in TiKV (based on sysbench stress): 截屏2022-02-25 14 25 04

mornyx avatar Feb 25 '22 06:02 mornyx

In fact I also tried parsing compact unwind info (the __unwind_info section of the binary on macOS). It does not contain the function index in the darwin-xnu kernel (as discussed in https://github.com/tikv/tikv/issues/9957).

mornyx avatar Feb 25 '22 06:02 mornyx

on macos, I think libunwind is referring to llvm's libunwind. As I early mentioned in the internal chat, at least on linux, llvm's libunwind would parse rust's debug CFI wrongly. This actually forbids the tiflash team from using the lib when migrating to LLVM toolchain.

Sent from ProtonMail mobile

-------- Original Message -------- On Feb 25, 2022, 1:28 PM, Yilin Chen < @.***> wrote:

@.***sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than libgcc . If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.

Actually tikv/tikv#9957 (comment) still segfaults on latest macOS :(

— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you were mentioned.AEZNMJJG54C5LAG2KMJLS3DU44HQDA5CNFSM5BOJFMC2YY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGOH2O63MQ.gifMessage ID: @.***>

SchrodingerZhu avatar Feb 25 '22 08:02 SchrodingerZhu

On ARM+macOS, we can be confident that the frame-pointer exists and we can take advantage of this. And because our stacktrace scene only exists in signal handler, we can use the ucontext provided by kernel to skip the signal frame. Here is my implementation: https://github.com/mornyx/unwind

Then I introduced this implementation in pprof-rs: mornyx@a68772c

And it worked fine in TiKV (based on sysbench stress): 截屏2022-02-25 14 25 04

From the graph you posted, the result looks distorted.

For example, I never see _get_attr_list so frequently sampled. And a lot of stacks seem missing in the grpc part.

I guess many frame pointers are not actually emitted or they are not properly parsed.

sticnarf avatar Feb 25 '22 08:02 sticnarf

I suppose frame pointers cannot be used for inlined functions while dwarf section can still be used locate inline functions.

SchrodingerZhu avatar Feb 25 '22 09:02 SchrodingerZhu

I suppose frame pointers cannot be used for inlined functions while dwarf section can still be used locate inline functions.

I tested it on newer versions of macOS, and in fact the binary only contains the __unwind_info segment, no longer the __eh_frame segment. The point is that __unwind_info does not contain inline information either. However backtrace::resolve() can correctly call callback for inlined functions.

The demo below confirms this point:

use unwind::{init_unwind_context, UnwindContext};

fn main() {
    func1_inlined();
}

#[inline(always)]
fn func1_inlined() {
    func2();
}

fn func2() {
    unsafe {
        let mut context = UnwindContext::default();
        init_unwind_context(&mut context as _);
        show(context.pc);
        // jump to next frame
        context.pc = *std::mem::transmute::<_, *const u64>(context.fp + 8);
        context.fp = *std::mem::transmute::<_, *const u64>(context.fp);
        show(context.pc);
    }
}

unsafe fn show(pc: u64) {
    println!("{:#x}", pc);
    backtrace::resolve(std::mem::transmute(pc), |s| {
        println!("{:?}", s.name());
    });
}

output:

0x100a82ffc
Some(demo::func2::ha91bc1d34ffee8cf)
0x100a82fd4
Some(demo::func1_inlined::h45123ba64dc5a176)
Some(demo::main::h9c57e81ec6be060b)

Looking at 0x100a82fd4, we got two functions for one pointer, including the one which is inlined.

mornyx avatar Feb 25 '22 09:02 mornyx