Cygwin select() returns -1 with errno 0
The problem is that select returns -1 with errno 0, so perror says select: No error.
Original seen in https://github.com/fish-shell/fish-shell/pull/11238/commits/0ad62cea2a9a6606128b182d11b3aac4a62b3b9d in https://github.com/fish-shell/fish-shell/pull/11238
It occurs randomly.
However when using strace this occurs more often. The following example uses strace -f -m syscall,select ./fish -c 'sha256sum fish.exe' &> output.txt.
Trimmed output
627 479850 [fish] fish 1628 pselect: pselect (15, 0x7FFD0C690, 0x7FFD0C680, 0x7FFD0C670, 0x0, 0x0)
104 479954 [fish] fish 1628 pselect: to NULL, us -1
770 480724 [fish] fish 1628 select: sel.always_ready 0
3427 484151 [main] fish 1628 write: write(15, 0x7FFFF9E4C, 1)
89 484240 [main] fish 1628 write: 1 = write(15, 0x7FFFF9E4C, 1)
53 484293 [pipesel] fish 1628 peek_pipe: read: pipe:[12884907236], ready for read: avail 1
108 484401 [fish] fish 1628 select_stuff::wait: wait_ret 2, m = 3. verifying
101 484502 [fish] fish 1628 set_bits: me 0xA00046280, testing fd 14 (pipe:[12884907236])
57 484559 [main] fish 1628 read: read(12, 0x7FFFF5DF8, 16384) nonblocking
155 484714 [fish] fish 1628 set_bits: ready 1
114 484828 [fish] fish 1628 set_bits: me 0xA00045C20, testing fd 12 (pipe:[8589939940])
117 484945 [fish] fish 1628 set_bits: ready 0
52 484997 [main] fish 1628 read: -1 = read(12, 0x7FFFF5DF8, -1), errno 11
53 485050 [fish] fish 1628 select_stuff::wait: res after verify 0
102 485152 [fish] fish 1628 select_stuff::wait: returning 0
51 485203 [main] fish 1628 close: close(12)
52 485255 [fish] fish 1628 select: sel.wait returns 0
100 485355 [main] fish 1628 fhandler_base::close: closing 'pipe:[8589939940]' handle 0x2F0
49 485404 [main] fish 1628 close: 0 = close(12)
45 485449 [fish] fish 1628 peek_pipe: pipe:[12884907236], already ready for read
52 485501 [main] fish 1628 close: close(13)
51 485552 [fish] fish 1628 set_bits: me 0xA00046280, testing fd 14 (pipe:[12884907236])
108 485660 [fish] fish 1628 set_bits: ready 1
56 485716 [main] fish 1628 fhandler_base::close: closing 'pipe:[8589939940]' handle 0x2D8
53 485769 [main] fish 1628 close: 0 = close(13)
45 485814 [fish] fish 1628 select_stuff::cleanup: calling cleanup routines
293 486107 [fish] fish 1628 select_stuff::destroy: deleting select records
310 486417 [fish] fish 1628 select_stuff::cleanup: calling cleanup routines
101 486518 [fish] fish 1628 select_stuff::destroy: deleting select records
107 486625 [fish] fish 1628 pselect: -1 = select (15, 0x7FFD0C690, 0x7FFD0C680, 0x7FFD0C670, 0x0), errno 0
select 890 487515 [fish] fish 1628 write: 6 = write(2, 0x100C27640, 6)
: 208 487723 [fish] fish 1628 write: 2 = write(2, 0x100C5BA30, 2)
No error 206 487929 [fish] fish 1628 write: 8 = write(2, 0x2102D4D35, 8)
197 488126 [fish] fish 1628 write: 1 = write(2, 0x100C5C791, 1)
The binary can be compiled using following commands.
pacman -S gcc mingw-w64-x86_64-rustup
rustup component add rust-src --toolchain nightly-x86_64-pc-windows-gnu
cargo +nightly-x86_64-pc-windows-gnu install -Z build-std --target x86_64-pc-cygwin --profile release-with-debug --git https://github.com/fish-shell/fish-shell.git --rev d8e5821 --bin fish
Note that --rev d8e5821 is required as workaround is added in https://github.com/fish-shell/fish-shell/commit/4508b5b0db7b012ae2c19f4f7aa9df74ac89ea3c. Commit history is available here.
Code that trigger perror is here.
https://github.com/fish-shell/fish-shell/blob/d8e5821a3b8f944cb071b98781680f15ba4effd6/src/fd_monitor.rs#L390-L395
let ret = fds.check_readable(timeout.map(Timeout::Duration).unwrap_or(Timeout::Forever));
if ret < 0 && !matches!(errno().0, libc::EINTR | libc::EBADF) {
// Surprising error
perror("select");
}
p.s. This might be an upstream issue but website of cygwin is temporary unavailable for me.
An analysis based on the output of strace.
do {
...
// select: sel.always_ready 0
select_printf("sel.always_ready %d", sel.always_ready);
if (sel.always_ready || us == 0)
...
else
wait_state = sel.wait(readfds, writefds, exceptfds, us);
// select: sel.wait returns 0 (select_ok)
select_printf("sel.wait returns %d", wait_state);
if (wait_state == select_stuff::select_ok)
{
...
// ret < 0 here?
ret = sel.poll(readfds, writefds, exceptfds);
if (ret < 0)
wait_state = select_stuff::select_signalled;
else if (!ret)
wait_state = select_stuff::select_set_zero;
}
sel.cleanup();
sel.destroy();
if (us != -1LL && wait_state == select_stuff::select_set_zero)
{
// not reached
select_printf("recalculating us");
...
}
} // wait_state != select_set_zero, exit loop
while (wait_state == select_stuff::select_set_zero);
// wait_state == select_signalled
if (wait_state < select_stuff::select_ok)
ret = -1;
// return -1 with errno 0
return ret;
https://github.com/msys2/msys2-runtime/blob/23a25d49e3b8c18f12ce588869d4cb24de9f2454/winsup/cygwin/select.cc#L204-L208
And I noticed that thread main got EAGAIN from read(12) and closed fd 12 before select() return. So maybe it should be EBADF but we forgot to set errno.
Could you please provide simple C reproducer?
Currently I can't reproduce the issue without using fish.
Fish use non-blocking pipe to implement eventfd on cygwin.
https://github.com/fish-shell/fish-shell/blob/d8e5821a3b8f944cb071b98781680f15ba4effd6/src/fd_monitor.rs#L58-L67
Here I use diff to colorize thread main and fish.
# pipe [12, 13]
- 43 438756 [main] fish 1174 pipe2: 0 = pipe2([4, 5], 0x40000)
# ...
- 45 438843 [main] fish 1174 dtable::dup3: dup3 (4, 12, 0x40000)
# ...
- 67 439425 [main] fish 1174 dtable::dup3: dup3 (5, 13, 0x40000)
# pipe [14, 15]
- 41 441303 [main] fish 1174 pipe2: 0 = pipe2([4, 5], 0x40000)
# ...
- 39 441381 [main] fish 1174 dtable::dup3: dup3 (4, 14, 0x40000)
# ...
- 45 442033 [main] fish 1174 dtable::dup3: dup3 (5, 15, 0x40000)
# ...
+ 366 450157 [fish] fish 1174 pselect: pselect (15, 0x7FFD0C6D0, 0x7FFD0C6C0, 0x7FFD0C6B0, 0x0, 0x0)
+ 89 450246 [fish] fish 1174 pselect: to NULL, us -1
+ 265 450511 [fish] fish 1174 dtable::select_read: pipe:[8589945136] fd 12
+ 179 450690 [fish] fish 1174 dtable::select_read: pipe:[12884912432] fd 14
+ 91 450781 [fish] fish 1174 select: sel.always_ready 0
+ 290 451071 [fish] fish 1174 select_stuff::wait: m 3, us 18446744073709551615, wmfo_timeout -1
# on post fish write one byte to pipew
- 3063 454134 [main] fish 1174 write: write(15, 0x7FFFF9E7C, 1)
- 48 454182 [main] fish 1174 fhandler_pipe::release_select_sem: raw_write(PIPEW) release 2
- 43 454225 [main] fish 1174 write: 1 = write(15, 0x7FFFF9E7C, 1)
43 454268 [pipesel] fish 1174 peek_pipe: read: pipe:[12884912432], ready for read: avail 1
+ 99 454367 [fish] fish 1174 select_stuff::wait: wait_ret 2, m = 3. verifying
+ 95 454462 [fish] fish 1174 set_bits: me 0xA000460B0, testing fd 14 (pipe:[12884912432])
- 46 454508 [main] fish 1174 read: read(12, 0x7FFFF5E28, 16384) nonblocking
+ 55 454563 [fish] fish 1174 set_bits: ready 1
- 45 454608 [main] fish 1174 __set_errno: virtual void fhandler_pipe::raw_read(void*, size_t&):345 setting errno 11
+ 51 454659 [fish] fish 1174 set_bits: me 0xA00045AD0, testing fd 12 (pipe:[8589945136])
- 46 454705 [main] fish 1174 fhandler_base::read: returning -1, binary mode
+ 49 454754 [fish] fish 1174 set_bits: ready 0
- 46 454800 [main] fish 1174 read: -1 = read(12, 0x7FFFF5E28, -1), errno 11
+ 52 454852 [fish] fish 1174 select_stuff::wait: res after verify 0
+ 88 454940 [fish] fish 1174 select_stuff::wait: returning 0
- 50 454990 [main] fish 1174 close: close(12)
+ 49 455039 [fish] fish 1174 select: sel.wait returns 0
- 44 455083 [main] fish 1174 fhandler_pipe::release_select_sem: close(PIPER) release 1
- 49 455132 [main] fish 1174 fhandler_base::close: closing 'pipe:[8589945136]' handle 0x2DC
- 48 455180 [main] fish 1174 close: 0 = close(12)
# pipe [12, 13] is closed
# pipe [14, 15] is ready
+ 42 455222 [fish] fish 1174 peek_pipe: pipe:[12884912432], already ready for read
- 48 455270 [main] fish 1174 close: close(13)
+ 46 455316 [fish] fish 1174 set_bits: me 0xA000460B0, testing fd 14 (pipe:[12884912432])
- 44 455360 [main] fish 1174 fhandler_pipe::release_select_sem: close(PIPEW) release 0
+ 49 455409 [fish] fish 1174 set_bits: ready 1
- 46 455455 [main] fish 1174 fhandler_base::close: closing 'pipe:[8589945136]' handle 0x240
- 55 455510 [main] fish 1174 close: 0 = close(13)
+ 49 455559 [fish] fish 1174 select_stuff::cleanup: calling cleanup routines
+ 262 455821 [fish] fish 1174 select_stuff::destroy: deleting select records
+ 285 456106 [fish] fish 1174 select_stuff::cleanup: calling cleanup routines
+ 98 456204 [fish] fish 1174 select_stuff::destroy: deleting select records
+ 99 456303 [fish] fish 1174 pselect: -1 = select (15, 0x7FFD0C6D0, 0x7FFD0C6C0, 0x7FFD0C6B0, 0x0), errno 0
However when I close the pipe in another thread, select sets errno to EBADF as expected.
Currently I can't reproduce the issue without using fish.
we should start by reducing fish source code
(delete as much as possible from main() while it still reproduces).
Once we have a minimal Rust reproducer, it should be easy to extract a C reproducer.
Currently I can't reproduce the issue without using fish. we should start by reducing fish source code (delete as much as possible from
main()while it still reproduces). Once we have a minimal Rust reproducer, it should be easy to extract a C reproducer.
Right. It's just that someone has to do it. And since everyone in this thread seems to be happy to let someone else be that someone, I predict that it won't ever happen.
I thought that not everyone was aware how to best do this sort of thing (reducing inputs is always easy, though sometimes time-consuming) and I didn't know how much fish-specific knowledge was needed to do this quickly (quite a bit in hindsight).
Here's my current reproduction. It should be somewhat close to minimal. I can try to simplify further later.
I used the tmate github action to get a shell on a windows system (via ssh):
name: Cygwin repro
on: [push]
permissions:
contents: read
jobs:
windows:
runs-on: windows-latest
steps:
- name: Shell
shell: C:\cygwin64\bin\bash.exe -l {0}
run: false
- name: Setup tmate session
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
This command will loop until the problem reproduces
(i.e. it prints poll: No error).
It sometimes takes a lot of iterations, but usually it's very quick.
pacman -S gcc --noconfirm --needed &&
cat >repro.c <<'EOF'
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h> // perror
#include <unistd.h>
// May tweak these for faster reproduction.
enum {
NUM_ITEMS = 3 // 3 seems to be the absolute minimum.
};
enum {
POLL_TIMEOUT_MSEC = 2
};
typedef struct {
int fd;
} FdMonitorItem;
typedef struct {
FdMonitorItem items[NUM_ITEMS];
int item_count;
pthread_mutex_t mutex;
} SharedData;
static SharedData g_shared_data;
static pthread_t g_background_thread;
static int g_thread_started = 0;
static void io_buffer_read_once(int fd) {
assert(fd != -1);
char data[1];
ssize_t amt = read(fd, data, 1);
assert(amt >= 0 && "expecting no IO error or interrupt");
assert(amt == 0); // no write
}
static void *background_fd_monitor_run(void *_arg) {
(void)_arg;
SharedData *shared_data = &g_shared_data;
while (1) {
struct pollfd pollfds[NUM_ITEMS] = {0};
int pollfds_index_to_item_index[NUM_ITEMS];
int nfds = 0;
pthread_mutex_lock(&shared_data->mutex);
for (int i = 0; i < shared_data->item_count; i++) {
int fd = shared_data->items[i].fd;
if (fd == -1) continue;
pollfds[nfds].fd = fd;
pollfds[nfds].events = POLLIN;
pollfds_index_to_item_index[nfds] = i; // Map back to item index
nfds++;
}
pthread_mutex_unlock(&shared_data->mutex);
int ret = poll(pollfds, nfds, POLL_TIMEOUT_MSEC);
if (ret == -1) {
perror("poll");
if (errno == 0) {
// Hit the bug!
_exit(1);
}
assert(0 && "expecting no IO error or interrupt");
}
pthread_mutex_lock(&shared_data->mutex);
for (int i = 0; i < nfds; i++) {
if (!(pollfds[i].revents & POLLIN)) continue;
int item_idx = pollfds_index_to_item_index[i];
assert(item_idx < shared_data->item_count);
FdMonitorItem *item = &shared_data->items[item_idx];
if (item->fd == -1) continue;
io_buffer_read_once(item->fd);
close(item->fd);
item->fd = -1;
}
pthread_mutex_unlock(&shared_data->mutex);
}
assert(0);
return NULL;
}
static int fd_monitor_add(int fd) {
SharedData *shared_data = &g_shared_data;
pthread_mutex_lock(&shared_data->mutex);
assert(shared_data->item_count < NUM_ITEMS);
int item_id = shared_data->item_count;
shared_data->items[item_id].fd = fd;
shared_data->item_count++;
pthread_mutex_unlock(&shared_data->mutex);
// Start background thread. It only reproduces if this happens after the first item was added.
if (!g_thread_started) {
pthread_create(&g_background_thread, NULL, background_fd_monitor_run, NULL);
g_thread_started = 1;
}
return item_id;
}
static int fd_monitor_remove_item(int item_id) {
SharedData *shared_data = &g_shared_data;
pthread_mutex_lock(&shared_data->mutex);
assert(item_id >= 0);
assert(item_id < shared_data->item_count);
int fd = shared_data->items[item_id].fd;
shared_data->items[item_id].fd = -1;
pthread_mutex_unlock(&shared_data->mutex);
return fd;
}
int main() {
pthread_mutex_init(&g_shared_data.mutex, NULL);
int item_ids[NUM_ITEMS];
for (int i = 0; i < NUM_ITEMS; i++) {
int fds[2];
int ok = pipe(fds) != -1;
assert(ok);
int read_fd = fds[0];
close(fds[1]);
int flags = fcntl(read_fd, F_GETFL, 0);
assert(flags != -1);
ok = fcntl(read_fd, F_SETFL, flags | O_NONBLOCK) != -1;
assert(ok);
item_ids[i] = fd_monitor_add(read_fd);
}
for (int i = 0; i < NUM_ITEMS; i++) {
int item_id = item_ids[i];
int removed_fd = fd_monitor_remove_item(item_id);
if (removed_fd != -1) {
io_buffer_read_once(removed_fd);
close(removed_fd);
}
}
// Did not reproduce.
_exit(0);
}
EOF
gcc repro.c -o repro -Wall -Wextra -pthread -g -O0 && ./repro &&
while ! ./repro 2>&1 | grep -i no.error; do printf .; done
For the record, some of my steps:
- it took me a bit to find out how to build a working fish in debug mode; because the
embed-datais broken on Windows I needed to enable rust-embed'sdebug-embedfeature. - I found that it doesn't happen with
fish --no-config -c'', but it does happen withfish --no-config -c'source /usr/share/fish/config.fish' - I bisected it to
fish --no-config -c ': $(:)' - I realized that we must have been crashing around fish's
exec_subshell_internal()function, and extracted a Rust unit test - For test minimization, Claude was pretty helpful for quickly moving the test and all its dependencies into a standalone, single-file Rust crate. Also for translating the test to C. (single-file is important because I'm copy-pasting every change over to Windows)