binaryen Asyncify runtime fails when enabling `-Oz` optimization

trafficstars

I was reading @kripken's slides and blog post about Asyncify, and I tried to implement a minimal coroutine system in pure C on top of it that works in standalone WASM. However, while it works fine when compiled with -O0 or -O1, it fails when optimizations are enabled with -O2 or -Oz.

Here is the most minimal test case, with comments, that I could come up with; it's really just a minimal coroutine system based on Asyncify:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* Import asyncify */
typedef struct _asyncify_stack_region {
  /* Beginning of the memory range handed to the asyncify imports. */
  void *start;
  /* End of that range (mco_init sets it to one past the buffer's last byte). */
  void *limit;
} _asyncify_stack_region;
/*
 * Asyncify control functions, imported from the "asyncify" module.
 *
 *   start_unwind(region) - begin unwinding; locals and call stack are saved
 *                          into `region` so they can be rewound later.
 *   stop_unwind()        - stop saving the coroutine stack.
 *   start_rewind(region) - begin rewinding from the state saved in `region`.
 *   stop_rewind()        - stop rewinding and continue from the yield point.
 *
 * The parameterless functions are declared with `(void)` so that they are
 * real prototypes; an empty `()` list leaves the parameters unspecified in C.
 */
__attribute__((import_module("asyncify"), import_name("start_unwind")))
void _asyncify_start_unwind(_asyncify_stack_region *region);
__attribute__((import_module("asyncify"), import_name("stop_unwind")))
void _asyncify_stop_unwind(void);
__attribute__((import_module("asyncify"), import_name("start_rewind")))
void _asyncify_start_rewind(_asyncify_stack_region *region);
__attribute__((import_module("asyncify"), import_name("stop_rewind")))
void _asyncify_stop_rewind(void);

/* Lifecycle states of a coroutine. */
typedef enum mco_state {
  MCO_DEAD      = 0, /* Entry function has returned; set by _mco_run_entry. */
  MCO_RUNNING   = 1, /* Currently executing; set by mco_resume. */
  MCO_SUSPENDED = 2  /* Not yet started, or yielded; set by mco_init and mco_yield. */
} mco_state;

/* Everything needed to run, suspend and resume a single coroutine. */
typedef struct mco_coro {
  /* Current lifecycle state (see mco_state). */
  mco_state state;
  /* Bounds of `stack`, passed to the asyncify start_unwind/start_rewind imports. */
  _asyncify_stack_region stack_region;
  /* Current point to rewind to when resuming. */
  int rewind_id;
  /* Entry point of the coroutine. */
  void (*entry)(struct mco_coro *co);
  /* Buffer for saving coroutine stack locals and call stack. */
  uint8_t stack[8192];
} mco_coro;

/*
 * Second half of a jump-out: decides, from the caller's `rewind_id`, whether
 * this call is a fresh suspension or a replay of an earlier one.
 *
 *   co        - coroutine being jumped out of.
 *   rewind_id - value the caller (_mco_jumpout) computed as
 *               `co->rewind_id + 1` when it was first executed. According to
 *               the comment in _mco_jumpout, this local is expected to be
 *               restored to its old value when the call is replayed during a
 *               rewind, so it may then differ from `co->rewind_id + 1`.
 *
 * Three cases:
 *   - rewind_id == co->rewind_id + 1: fresh jump-out; record the new rewind
 *     point in `co` and start unwinding into `co->stack_region`.
 *   - rewind_id == co->rewind_id: this is the jump-out that was recorded
 *     last; stop rewinding so execution continues after it.
 *   - otherwise: do nothing and let the rewind proceed.
 *
 * NOTE(review): `noinline` is only honoured by the C compiler; per the
 * discussion accompanying this code, Binaryen may still inline the function.
 */
static __attribute__((noinline)) void _mco_finish_jumpout(mco_coro* co, volatile int rewind_id) {
  int next_rewind_id = co->rewind_id + 1;
  if(rewind_id == next_rewind_id) { /* Begins unwinding the stack (save locals and call stack to rewind later) */
    co->rewind_id = next_rewind_id;
    _asyncify_start_unwind(&co->stack_region);
  } else if(rewind_id == co->rewind_id) { /* Continue from yield point. */
    _asyncify_stop_rewind();
  } else {
    /* Let it continue rewinding... */
  }
}

/*
 * Called when jumping out of a coroutine (from mco_yield, and from
 * _mco_run_entry after the entry function has returned).
 *
 * Captures the rewind point for this particular call site in a local and
 * hands it to _mco_finish_jumpout, which does the actual unwind/rewind work.
 */
static __attribute__((noinline)) void _mco_jumpout(mco_coro* co) {
  /*
  Save rewind point into a local, that should be restored when rewinding.
  That is "rewind_id != co->rewind_id + 1" may be true when rewinding.
  Use volatile here just to be safe from compiler optimizing this out.
  */
  volatile int rewind_id = co->rewind_id + 1;
  _mco_finish_jumpout(co, rewind_id);
}

/*
 * Run the coroutine's entry function and handle its completion.
 *
 * After `co->entry` returns, the coroutine is marked MCO_DEAD and a final
 * jump-out is performed.
 *
 * NOTE(review): the MCO_DEAD assignment is meant to run only when the entry
 * function really finished, not when control comes back because of an unwind.
 * That relies on the asyncify instrumentation; the failure reported with this
 * code (state not MCO_SUSPENDED after the first resume) suggests optimized
 * builds can reach it anyway.
 */
static __attribute__((noinline)) void _mco_run_entry(mco_coro* co) {
  co->entry(co);
  co->state = MCO_DEAD; /* Coroutine finished, it should now be dead. */
  _mco_jumpout(co); /* Jump out anyway, because `_mco_jumpin` will always call `_asyncify_stop_unwind`. */
}

/*
 * Called when jumping into a coroutine (from mco_resume).
 *
 * On the first entry `co->rewind_id` is 0, so the entry function simply
 * starts from the beginning. Once at least one jump-out has been recorded
 * (`rewind_id > 0`), a rewind from `co->stack_region` is started first so that
 * the call below replays back to the last yield point. When _mco_run_entry
 * comes back, stop_unwind is always called.
 */
static __attribute__((noinline)) void _mco_jumpin(mco_coro* co) {
  if(co->rewind_id > 0) {  /* Begin rewinding until last yield point. */
    _asyncify_start_rewind(&co->stack_region);
  }
  _mco_run_entry(co); /* Execute the coroutine entry. */
  _asyncify_stop_unwind(); /* Stop saving coroutine stack. */
}

/*
 * Resume a coroutine: mark it running and jump into it. Returns once the
 * coroutine jumps out again.
 *
 *   co - coroutine to resume; must have been set up with mco_init.
 *
 * Only a coroutine in the MCO_SUSPENDED state is resumed. A null pointer, or
 * a coroutine that is already running or dead, is left untouched; otherwise
 * its state would be overwritten with MCO_RUNNING before jumping in.
 */
static void mco_resume(mco_coro *co) {
  if (!co || co->state != MCO_SUSPENDED) {
    return;
  }
  co->state = MCO_RUNNING;
  _mco_jumpin(co);
}

/*
 * Suspend the coroutine: mark it MCO_SUSPENDED, then jump out of it via
 * _mco_jumpout. Intended to be called from inside the coroutine's entry
 * function (see coro_entry).
 *
 *   co - the coroutine to suspend.
 */
static void mco_yield(mco_coro* co) {
  co->state = MCO_SUSPENDED;
  _mco_jumpout(co);
}

/*
 * Initialize a coroutine so that the first mco_resume starts `entry`.
 *
 *   co    - coroutine storage to initialize.
 *   entry - function to run as the coroutine body; it receives `co`.
 *
 * The coroutine starts in the MCO_SUSPENDED state with no rewind point
 * recorded, and its stack region covers the whole `co->stack` buffer.
 */
static void mco_init(mco_coro *co, void (*entry)(mco_coro *co)) {
  co->state = MCO_SUSPENDED;
  co->rewind_id = 0;
  co->entry = entry;
  /*
   * Derive both bounds from the array itself, so the limit cannot drift from
   * the buffer's declared size and no pointer/integer round trip is needed.
   */
  co->stack_region.start = co->stack;
  co->stack_region.limit = co->stack + sizeof co->stack;
}

// Test coroutine body: announces its start, then yields ten times, printing
// the step number each time it gets control back, and finally announces that
// it is done.
void coro_entry(mco_coro *co) {
  int step = 0;

  printf("coroutine started\n");
  while (step < 10) {
    mco_yield(co); // Hand control back to the resumer before each report.
    printf("coroutine %d\n", step);
    step += 1;
  }
  printf("coroutine finished\n");
}

// The coroutine instance driven by main(). As a file-scope object it has
// static storage, which includes the save buffer embedded in the struct.
mco_coro co;

// Test driver: initializes the global coroutine, resumes it until its entry
// function has finished, and asserts the expected state at each stage.
// Returns 0 on success; a failed expectation trips an assert.
int main(void) {
  // Call `mco_init` with the coroutine storage and its entry function.
  mco_init(&co, coro_entry);
  // The coroutine should be now in suspended state.
  assert(co.state == MCO_SUSPENDED);
  // Call `mco_resume` to start for the first time, switching to its context.
  mco_resume(&co); // Should print "coroutine started".
  // We get back from coroutine context in suspended state (because it's unfinished).
  assert(co.state == MCO_SUSPENDED);
  // Call `mco_resume` 10 more times, once per yield in `coro_entry`.
  for (int i = 0; i < 10; ++i) {
    mco_resume(&co); // Should print "coroutine X".
  }
  // The coroutine finished and should be now dead.
  assert(co.state == MCO_DEAD);
  return 0;
}

Sorry for sharing a test case this big, it's the most I could minimize the issue, but it should not be hard to understand what is going on and it is commented.

When I compile with

emcc t.c -o t.wasm -s ERROR_ON_UNDEFINED_SYMBOLS=0 -O0
wasm-opt --asyncify t.wasm -o t.wasm
wasmtime t.wasm

Things work fine and I get the expected output:

coroutine started
coroutine 0
coroutine 1
coroutine 2
coroutine 3
coroutine 4
coroutine 5
coroutine 6
coroutine 7
coroutine 8
coroutine 9
coroutine finished

However when I enable optimizations with -Oz it breaks:

emcc t.c -o t.wasm -s ERROR_ON_UNDEFINED_SYMBOLS=0 -Oz
wasm-opt --asyncify t.wasm -o t.wasm
wasmtime t.wasm

coroutine started
Assertion failed: co.state == MCO_SUSPENDED (t.c: main: 112)

It seems that with -Oz the unwinding does not work as expected at the first yield point, leaving the coroutine in an invalid state. What am I doing wrong here, or is this a Binaryen bug? I've tried to use noinline on the functions that make up the asyncify runtime; this made things work with -O0, but it still fails with -Oz.

Environment:

$ emcc -v
emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.1-git (1934a98e709b57d3592b8272d3f1264a72c089e4)
clang version 14.0.0 (/srcdest/llvm-project 50fb44eebb0397f9b5f45a44239d6b53faf07c3b)
Target: wasm32-unknown-emscripten
Thread model: posix
InstalledDir: /opt/emscripten-llvm/bin

$ wasm-opt --version
wasm-opt version 104 (version_104-23-g1ef8f1f2c)

Side note: the shared test case is a minimal version of what is used in the https://github.com/edubart/minicoro project, a cross-platform coroutine library for C. I hope to improve its support for standalone WASM with Asyncify by sorting out this issue.

Jan 27 '22 12:01 edubart

I've noticed that using emcc -Oz -g makes the test case work as expected, only -g added, why would debug information change the runtime behavior of asyncify?

Jan 27 '22 13:01 edubart

However while it has worked fine when I compiled with -O0 or -O1, it fails when enabling optimizations -O2 or -Oz. [..] I've noticed that using emcc -Oz -g makes the test case work as expected, only -g added, why would debug information change the runtime behavior of asyncify?

Those two things together suggest that inlining is the issue here. It's a little complicated, sadly, for a few reasons:

- Preventing inlining in LLVM does not prevent inlining in binaryen. We do hope to add a custom section with optimization hints like noinline to wasm eventually, which would allow that.
- Debug info prevents inlining, because atm binaryen disables some optimization features when it sees DWARF (some passes can break DWARF, so it avoids them).

To check that theory, you can try disabling inlining in the binaryen flags. I see we don't have a simple "no-inline" flag atm, which maybe we should add. Meanwhile, --flexible-inline-max-function-size=0 and --one-caller-inline-max-function-size=0 should do that.

Jan 27 '22 19:01 kripken

To check that theory, you can try disabling inlining in the binaryen flags. I see we don't have a simple "no-inline" flag atm, which maybe we should add. Meanwhile, --flexible-inline-max-function-size=0 and --one-caller-inline-max-function-size=0 should do that.

I've tried that:

emcc t.c -o t.wasm -s ERROR_ON_UNDEFINED_SYMBOLS=0 -Oz
wasm-opt t.wasm -o t.wasm --asyncify  --flexible-inline-max-function-size=0 --one-caller-inline-max-function-size=0
wasmtime t.wasm

The test case still fails. I think the issue occurs before the wasm-opt pass: it seems emcc is aggressively optimizing and eliminating code that it should not. My theory is that its control-flow analysis cannot deal with jumping code (both rewinding and unwinding), so it optimizes out ifs/calls that should be kept (that is just a guess; I don't have a good picture of what is happening behind the scenes in either emcc or wasm-opt). What leads me to think that is the assertion failure Assertion failed: co.state == MCO_SUSPENDED (t.c: main: 112): this should never happen unless co->state = MCO_DEAD; is executed, but that line should never be executed while unwinding, so it seems unwinding was ignored when using -Oz.

Jan 27 '22 20:01 edubart

seems like emcc is aggressively optimizing and eliminating code that it should not

Yes, it could be the emcc invocation of binaryen. Try emcc -s BINARYEN_EXTRA_PASSES=--flexible-inline-max-function-size=0,--one-caller-inline-max-function-size=0

(If so, this is another case that should have "just worked" if we had wasm annotations of "noinline"... we really need that)

Jan 27 '22 21:01 kripken

emcc -s BINARYEN_EXTRA_PASSES=--flexible-inline-max-function-size=0,--one-caller-inline-max-function-size=0

That seems to solve the issue! Do you know if this is possible with WASI SDK clang? Or an emcc only thing? Now I want to try to get it working with WASI SDK, just to make sure everything is working fine.

Jan 27 '22 21:01 edubart

Great!

Hmm, one problem you may run into is that AFAIK wasi-sdk's clang (or wasm-ld?) will run wasm-opt if it is in the path. So you'd need to find out how to pass these no-inline params to it. Or, avoid wasm-opt being in your path I guess.

Aside from that, this should all just work with wasi-sdk, I think.

Jan 27 '22 22:01 kripken

Or, avoid wasm-opt being in your path I guess.

Just did that, and things seem to work now with WASI SDK! Unfortunately I could not find a way to disable wasm-opt without removing it from the path, but that's their issue.

Now with the workarounds things are working, thanks!

Jan 27 '22 22:01 edubart

binaryen binaryen copied to clipboard

Asyncify runtime fails when enabling `-Oz` optimization

binaryen
binaryen copied to clipboard