bcachefs-tools icon indicating copy to clipboard operation
bcachefs-tools copied to clipboard

__bch2_fs_read_only in ininite loop

Open jpsollie opened this issue 4 years ago • 0 comments

bcachefs fsck never completes, it hangs after flushing journal and stopping allocators complete I investigated it with gdb:

(gdb) info threads
  Id   Target Id                                           Frame 
* 1    Thread 0x7fb07c46db80 (LWP 21551) "bcachefs"        0x0000555c9826db2b in __bch2_fs_read_only (c=c@entry=0x7fb07c428000) at libbcachefs/super.c:283
  2    Thread 0x7fb07c8eb700 (LWP 21552) "aio_completion"  0x00007fb07c5ff24a in ?? () from /lib64/libaio.so.1
  3    Thread 0x7fb07c8e2700 (LWP 21553) "events"          0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  4    Thread 0x7fb07c8d9700 (LWP 21554) "events_highpri"  0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  5    Thread 0x7fb07c46c700 (LWP 21555) "events_long"     0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  6    Thread 0x7fb07c463700 (LWP 21556) "events_unbound"  0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  7    Thread 0x7fb07c45a700 (LWP 21557) "events_freezabl" 0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  8    Thread 0x7fb07c451700 (LWP 21558) "timers"          0x00007fb07c62c9e7 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
  9    Thread 0x7fb07c426700 (LWP 21559) "bcachefs"        0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  10   Thread 0x7fb07c41d700 (LWP 21560) "bcache_copygc"   0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  11   Thread 0x7fb07c414700 (LWP 21561) "bcache_journal"  0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  12   Thread 0x7fb07bbd9700 (LWP 21562) "btree_update"    0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  13   Thread 0x7fb058eca700 (LWP 21589) "events"          0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  14   Thread 0x7fb07c209700 (LWP 21615) "bch_alloc[/dev/" preempt_disable () at linux/preempt.c:31
  15   Thread 0x7fb07c200700 (LWP 21616) "bch_alloc[/dev/" 0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
  16   Thread 0x7fb07c1f7700 (LWP 21617) "bch_alloc[/dev/" 0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6

I suspect the error to be at super.c:283, as I set a breakpoint at 288 and it never reached it.

#0  0x00007fb07c565bd9 in syscall () from /lib64/libc.so.6
#1  0x0000555c982dc809 in futex (val3=0, uaddr2=0x0, timeout=0x0, val=<optimized out>, op=128, uaddr=<optimized out>) at /usr/include/urcu/futex.h:66
#2  schedule () at linux/sched.c:41
#3  0x0000555c982db51d in wait_for_completion (x=x@entry=0x555c98579fa8) at linux/wait.c:235
#4  0x0000555c982db82c in kthread_stop (p=p@entry=0x555c98579f80) at linux/kthread.c:121
#5  0x0000555c9828c705 in bch2_dev_allocator_stop (ca=<optimized out>) at libbcachefs/alloc_background.c:1394
#6  0x0000555c9826db2b in __bch2_fs_read_only (c=c@entry=0x7fb07c428000) at libbcachefs/super.c:283
#7  0x0000555c982703f6 in bch2_fs_read_only (c=c@entry=0x7fb07c428000) at libbcachefs/super.c:351
#8  0x0000555c98270700 in bch2_fs_stop (c=0x7fb07c428000) at libbcachefs/super.c:571
#9  0x0000555c982e2f07 in cmd_fsck (argc=<optimized out>, argv=0x7ffd28d3b580) at cmd_fsck.c:90
#10 0x0000555c98251f6e in main (argc=<optimized out>, argv=0x7ffd28d3b558) at bcachefs.c:165

further digging into the main tread may have shown the error:

gdb) info frame
Stack level 6, frame at 0x7ffd28d3b2f0:
 rip = 0x555c9826db2b in __bch2_fs_read_only (libbcachefs/super.c:283); saved rip = 0x555c982703f6
 called by frame at 0x7ffd28d3b340, caller of frame at 0x7ffd28d3b250
 source language c.
 Arglist at 0x7ffd28d3b248, args: c=c@entry=0x7fb07c428000
 Locals at 0x7ffd28d3b248, Previous frame's sp is 0x7ffd28d3b2f0
 Saved registers:
  rbx at 0x7ffd28d3b2b8, rbp at 0x7ffd28d3b2c0, r12 at 0x7ffd28d3b2c8, r13 at 0x7ffd28d3b2d0, r14 at 0x7ffd28d3b2d8, r15 at 0x7ffd28d3b2e0, rip at 0x7ffd28d3b2e8
(gdb) info locals
ca = 0x555c984f8000
wrote = true
i = 0
clean_passes = <optimized out>
ret = <optimized out>
(gdb) info args
c = 0x7fb07c428000
(gdb) print ca
$1 = (struct bch_dev *) 0x555c984f8000
(gdb) print *ca
$2 = {kobj = {parent = 0x0, kset = 0x0, ktype = 0x555c9833e5e0 <bch2_dev_ktype>, sd = 0x0, ref = {counter = 1}, state_initialized = 1, state_in_sysfs = 0, 
    state_add_uevent_sent = 0, state_remove_uevent_sent = 0, uevent_suppress = 0}, ref = {count = {counter = 2}, release = 0x555c9826ceb0 <bch2_dev_ref_complete>, 
    confirm_switch = 0x0}, ref_completion = {done = 0, wait = {lock = {count = 0}, task_list = {next = 0x555c984f8050, prev = 0x555c984f8050}}}, io_ref = {count = {
      counter = 1}, release = 0x555c9826cea0 <bch2_dev_io_ref_complete>, confirm_switch = 0x0}, io_ref_completion = {done = 0, wait = {lock = {count = 0}, task_list = {
        next = 0x555c984f8088, prev = 0x555c984f8088}}}, fs = 0x7fb07c428000, dev_idx = 0 '\000', mi = {nbuckets = 366304, first_bucket = 0, bucket_size = 512, group = 1, 
    state = 0 '\000', replacement = 0 '\000', discard = 1 '\001', data_allowed = 28 '\034', durability = 1 '\001', valid = 1 '\001'}, uuid = {
    b = "<ҧ\251!\270E\220\217`!\326tu\215)"}, name = "/dev/disk/by-partuuid/07bb2c48-", disk_sb = {sb = 0x555c98447000, bdev = 0x555c98444150, bio = 0x555c98444300, 
    page_order = 2, mode = 131, have_layout = 1, have_bio = 1, fs_sb = 0, seq = 82}, sb_read_scratch = 0x555c984fa000, sb_write_error = 0, self = {d = {1}}, 
  replica_set = {front_pad = 40}, buckets = {0x7fb07b3d0000, 0x0}, buckets_nouse = 0x555c984fc000, bucket_lock = {lock = {__data = {__readers = 0, __writers = 0, 
        __wrphase_futex = 0, __writers_futex = 0, __pad3 = 0, __pad4 = 0, __cur_writer = 0, __shared = 0, __rwelision = 0 '\000', __pad1 = "\000\000\000\000\000\000", 
        __pad2 = 0, __flags = 0}, __size = '\000' <repeats 55 times>, __align = 0}}, usage = {0x555c9845cf40, 0x0}, alloc_thread = 0x0, free = {{front = 0, back = 7, 
      size = 7, mask = 7, data = 0x555c9844ef40}, {front = 0, back = 1501, size = 2861, mask = 4095, data = 0x555c9850d000}, {front = 185, back = 900, size = 715, 
      mask = 1023, data = 0x555c98516000}}, free_inc = {front = 2408, back = 2495, size = 89, mask = 127, data = 0x555c98498400}, open_buckets_partial = {
    0 <repeats 1024 times>}, open_buckets_partial_nr = 0, fifo_last_bucket = 0, max_last_bucket_io = {62, 89}, inc_gen_needs_gc = 0, inc_gen_really_needs_gc = 0, 
  allocator_state = ALLOCATOR_RUNNING, alloc_heap = {size = 1430, used = 2, data = 0x555c98519000}, copygc_thread = 0x0, copygc_heap = {size = 2861, used = 0, 
    data = 0x555c9852a000}, copygc_pd = {rate = {next = 13306909197746, rate = 4294967295}, last_update = 13227428, last_actual = 0, smoothed_derivative = 0, 
    p_term_inverse = 6000, d_smooth = 30, d_term = 0, last_derivative = 0, last_proportional = 0, last_change = 0, last_target = 0, backpressure = true}, 
  copygc_write_point = {node = {next = 0x0, prev = 0x0}, lock = {lock = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, 
          __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}}, last_used = 0, write_point = 0, type = BCH_DATA_USER, 
    is_ec = false, sectors_free = 0, ptrs = {nr = 0, v = {0 <repeats 15 times>}}, stripe = {next_alloc = {0 <repeats 64 times>}}}, copygc_threshold = 1469952, 
  rebalance_work = {counter = 0}, journal = {bucket_seq = 0x555c98572000, sectors_free = 75, discard_idx = 654, dirty_idx_ondisk = 654, dirty_idx = 654, cur_idx = 654, 
    nr = 1024, buckets = 0x555c9857a000, bio = 0x555c98575000, read = {{{wq = 0x0, s = 0x555c984f8dd0, list = {next = 0x555c984f8dd0}, fn = 0x0}, work = {data = {
            counter = 0}, entry = {next = 0x555c984f8dd0, prev = 0x555c984f8dd0}, func = 0x0}}, parent = 0x7ffd28d3b0c0, remaining = {counter = 0}}}, io_error_work = {
    data = {counter = 0}, entry = {next = 0x555c984f8e00, prev = 0x555c984f8e00}, func = 0x555c9827fb20 <bch2_io_error_work>}, cur_latency = {{counter = 0}, {
      counter = 221382}}, io_latency = {{lock = {count = 0}, count = 0, average_duration = 0, average_frequency = 0, max_duration = 0, last_event = 0, quantiles = {
        entries = {{m = 0, step = 0} <repeats 15 times>}}, buffer = 0x0}, {lock = {count = 0}, count = 293, average_duration = 216274, average_frequency = 133702799802, 
      max_duration = 907092, last_event = 13307762416075, quantiles = {entries = {{m = 205779, step = 96}, {m = 102947, step = 6434}, {m = 0, step = 0}, {m = 11986, 
            step = 27}, {m = 0, step = 0}, {m = 0, step = 0}, {m = 0, step = 0}, {m = 9798, step = 150}, {m = 0, step = 0}, {m = 0, step = 0}, {m = 0, step = 0}, {m = 0, 
            step = 0}, {m = 0, step = 0}, {m = 0, step = 0}, {m = 0, step = 0}}}, buffer = 0x0}}, congested = {counter = 1024}, congested_last = 13307762416075, 
  io_done = 0x555c984b6390}

the ref->completion->wait->task_list values seem inconsistent:

(gdb) print ca->ref_completion->wait->task_list->next->next
$9 = (struct cds_list_head *) 0x555c984f8050
(gdb) print ca->ref_completion->wait->task_list->next
$10 = (struct cds_list_head *) 0x555c984f8050
(gdb) print ca->ref_completion->wait->task_list->next->next->next
$11 = (struct cds_list_head *) 0x555c984f8050
(gdb) print ca->ref_completion->wait->task_list->prev
$12 = (struct cds_list_head *) 0x555c984f8050
(gdb) print ca->ref_completion->wait->task_list->prev->prev
$13 = (struct cds_list_head *) 0x555c984f8050

EDIT: I don't like ca->journal->read either:

print ca->journal->read
$4 = {{{wq = 0x0, s = 0x555a2a85ddf0, list = {next = 0x555a2a85ddf0}, fn = 0x0}, work = {data = {counter = 0}, entry = {next = 0x555a2a85ddf0, prev = 0x555a2a85ddf0}, 
      func = 0x0}}, parent = 0x7ffdb89ca830, remaining = {counter = 0}}

@koverstreet : should I look further? or do you have enough info?

jpsollie avatar Jun 16 '20 11:06 jpsollie