emscripten icon indicating copy to clipboard operation
emscripten copied to clipboard

evaluation of std::sin in C++ is slower than in JS directly

Open adam-ce opened this issue 10 months ago • 5 comments

When executing the code below, the c++ execution is way slower than the JS one:

chrome on linux:

cpp_time: 26.043s
js_time: 2.685s

firefox on linux:

cpp_time: 28.458s
js_time: 2.711s

when compiling with gcc 11.4, i get cpp_time: 4.217s (yes, that's slower than using JS). timings on windows are similar.

I asked about it on stackoverflow before.

There, user21489919 reported, that in his tiny-wasm-runtime, the c++ sin is almost as fast as the JS version (on chrome, on FF the C++ version is faster). The reason seems to be, that tiny-wasm-runtime calls JS in its implementation of sin.

So this makes me believe, that there is some performance issue with emscripten, hence the bugreport.

Code:

#include <chrono>
#include <iostream>
#include <cmath>

#ifdef __EMSCRIPTEN__
#include <emscripten.h>
#endif

int main() {
    const auto n_iter = 50u;
    
    double cpp_sum = 0;
    const auto cpp_begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < n_iter; ++i) {
        double sum = 1.0;
        for (int t = 0; t < 5000000; ++t) {
            sum += std::sin(double(t));
        }
        cpp_sum = sum;
    };
    const auto cpp_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_begin).count();
    std::cout << "cpp_sum: " << cpp_sum << std::endl;
    std::cout << "cpp_time: " << double(cpp_time) / 1000.0 << "s" << std::endl;
 
#ifdef __EMSCRIPTEN__
    double js_sum = 0;
    const auto js_begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < n_iter; ++i) {
        js_sum = EM_ASM_DOUBLE({
            let sum = 1;
            for (let i = 0; i < 5000000; i++) {
                sum = sum + Math.sin(i);
            }
            return sum;
        });
    };
    const auto js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - js_begin).count();
    std::cout << "js_sum: " << js_sum << std::endl;
    std::cout << "js_time: " << double(js_time) / 1000.0 << "s" << std::endl;
    
#endif
    return 0;
}

Version of emscripten/emsdk: I ran the code with the version 3.1.56 now, but several older version have the same performance hit.

emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.56 (cf90417346b78455089e64eb909d71d091ecc055) clang version 19.0.0git (https:/github.com/llvm/llvm-project 34ba90745fa55777436a2429a51a3799c83c6d4c) Target: wasm32-unknown-emscripten Thread model: posix InstalledDir: /home/madam/bin/emsdk/upstream/bin

Full link command and output with -v appended:

em++ ./main.cpp -o main.html -v "/home/madam/bin/emsdk/upstream/bin/clang++" -target wasm32-unknown-emscripten -fignore-exceptions -fvisibility=default -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr --sysroot=/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot -DEMSCRIPTEN -Xclang -iwithsysroot/include/fakesdl -Xclang -iwithsysroot/include/compat -v ./main.cpp -c -o /tmp/emscripten_temp_lpd1_wvw/main_0.o clang version 19.0.0git (https:/github.com/llvm/llvm-project 34ba90745fa55777436a2429a51a3799c83c6d4c) Target: wasm32-unknown-emscripten Thread model: posix InstalledDir: /home/madam/bin/emsdk/upstream/bin (in-process) "/home/madam/bin/emsdk/upstream/bin/clang-19" -cc1 -triple wasm32-unknown-emscripten -emit-obj -mrelax-all -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name main.cpp -mrelocation-model static -mframe-pointer=none -ffp-contract=on -fno-rounding-math -mconstructor-aliases -target-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/madam/Documents/emscripten_perf_bug -v -fcoverage-compilation-dir=/home/madam/Documents/emscripten_perf_bug -resource-dir /home/madam/bin/emsdk/upstream/lib/clang/19 -D EMSCRIPTEN -isysroot /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten/c++/v1 -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/c++/v1 -internal-isystem /home/madam/bin/emsdk/upstream/lib/clang/19/include -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include -fdeprecated-macro -ferror-limit 19 -fvisibility=default -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fcxx-exceptions -fignore-exceptions -fexceptions -fcolor-diagnostics -iwithsysroot/include/fakesdl -iwithsysroot/include/compat -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr -o /tmp/emscripten_temp_lpd1_wvw/main_0.o -x c++ ./main.cpp clang -cc1 version 19.0.0git based upon LLVM 19.0.0git default target x86_64-unknown-linux-gnu ignoring nonexistent directory "/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten/c++/v1" ignoring nonexistent directory "/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten" #include "..." search starts here: #include <...> search starts here: /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/fakesdl /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/compat /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/c++/v1 /home/madam/bin/emsdk/upstream/lib/clang/19/include /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include End of search list. /home/madam/bin/emsdk/upstream/bin/clang --version /home/madam/bin/emsdk/upstream/bin/wasm-ld -o main.wasm /tmp/emscripten_temp_lpd1_wvw/main_0.o -L/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/lib/wasm32-emscripten -lGL-getprocaddr -lal -lhtml5 -lstubs-debug -lnoexit -lc-debug -ldlmalloc -lcompiler_rt -lc++-noexcept -lc++abi-debug-noexcept -lsockets -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr /tmp/tmpzvrhcznelibemscripten_js_symbols.so --strip-debug --export=emscripten_stack_get_end --export=emscripten_stack_get_free --export=emscripten_stack_get_base --export=emscripten_stack_get_current --export=emscripten_stack_init --export=stackAlloc --export=stackSave --export=stackRestore --export=__get_temp_ret --export=__set_temp_ret --export=__wasm_call_ctors --export-if-defined=__start_em_asm --export-if-defined=__stop_em_asm --export-if-defined=__start_em_lib_deps --export-if-defined=__stop_em_lib_deps --export-if-defined=__start_em_js --export-if-defined=__stop_em_js --export-if-defined=main --export-if-defined=__main_argc_argv --export-if-defined=fflush --export-table -z stack-size=65536 --no-growable-memory --initial-heap=16777216 --no-entry --stack-first --table-base=1 /home/madam/bin/emsdk/upstream/bin/llvm-objcopy main.wasm main.wasm --remove-section=.debug* --remove-section=producers /home/madam/bin/emsdk/upstream/bin/wasm-emscripten-finalize --dyncalls-i64 --pass-arg=legalize-js-interface-exported-helpers main.wasm -o main.wasm --detect-features /home/madam/bin/emsdk/node/16.20.0_64bit/bin/node /home/madam/bin/emsdk/upstream/emscripten/src/compiler.mjs /tmp/tmpmv73rwfy.json /home/madam/bin/emsdk/node/16.20.0_64bit/bin/node /home/madam/bin/emsdk/upstream/emscripten/tools/preprocessor.mjs /tmp/emscripten_temp_lpd1_wvw/settings.js shell.html

adam-ce avatar Mar 31 '24 08:03 adam-ce

I suppose its possible that JS Math.sin is faster than the native sin. However it seems unlikely that the it would be bigger than the cost of call out to JS for each call to std::sin.

For a real test here you would need to write both the outer and inner loop in native code and then call out to JS for each sin operation (i.e. make 500,000 JS calls). I would imagine that if you restruture your test like this you will get different results. You would mind trying that?

sbc100 avatar Apr 01 '24 15:04 sbc100

are you talking about using EM_ASM_DOUBLE for each sin call? that takes cpp_js_time: 83.201s

it might be, that EM_ASM_DOUBLE is loosing extra time (recompiling the js every time? extra safety tests?). I'm also not saying, that that is the best approach. Only that the implementation of sin (and likely other trig functions) is slower in c++ than it should be.

the 10% that is lost by tiny-wasm-runtime in chrome in this test: https://twiddlingbits.dev/examples/dist/tests/index.html show that it's possible. i don't understand what tiny-wasm-runtime is doing exactly, but it's faster :)

adam-ce avatar Apr 02 '24 15:04 adam-ce

I looks like you have shown that calling out to JS is much slower. EM_ASM_DOUBLE shouldn't be doing any runtime JS compilation of anything like that. I assume you are building with -O3 when doing these measurements?

I actually forgot that we do have JS implementations of these math functions if you want to compare performance. See: https://github.com/emscripten-core/emscripten/blob/a8b489dc20684cede4f9819cbc6f9e75ea6d50ef/src/library_math.js#L31.

So you could try replacing your EM_ASM call with simply emscripten_math_sin(), to see what kind of results you get. As wouldn't be surprised of the call to JS has a measurable cost. If tiny wasm is something able to reduce this cost or be in some way faster than emscripten I would be very curious to know how/why.

sbc100 avatar Apr 02 '24 15:04 sbc100

It looks for me that JavaScript function call optimization does matter rather than difference between std::sin and Math.sin.

Here is disassembled wat of cpp part and JS part in the sample code.

cpp

loop  ;; label = @1
  f64.const 0x1p+0 (;=1;)
  local.set 3
  i32.const 0
  local.set 0
  loop  ;; label = @2
    local.get 3
    local.get 0
    f64.convert_i32_u ;; loop unrolled code, nice ;-)
    call 21
    f64.add
    local.get 0
    i32.const 1
    i32.add
    f64.convert_i32_u
    call 21
    f64.add
    local.get 0
    i32.const 2
    i32.add
    f64.convert_i32_u
    call 21
    f64.add
    local.get 0
    i32.const 3
    i32.add
    f64.convert_i32_u
    call 21
    f64.add
    local.get 0
    i32.const 4
    i32.add
    f64.convert_i32_u
    call 21
    f64.add
    local.set 3
    local.get 0
    i32.const 5
    i32.add
    local.tee 0
    i32.const 5000000
    i32.ne
    br_if 0 (;@2;)
  end
  local.get 1
  i32.const 1
  i32.add
  local.tee 1
  i32.const 50
  i32.ne
  br_if 0 (;@1;)
end

JS

    loop  ;; label = @1
      local.get 3
      i32.const 17908 ;; offset of string to be evaluated
      i32.const 1610
      i32.const 0
      call 0
      f64.add
      local.set 3
      local.get 0
      i32.const 1
      i32.add
      local.tee 0
      i32.const 50
      i32.ne
      br_if 0 (;@1;)
    end
n=3 n=10 n=20 n=40
cpp 0.551s 0.574s 8.392s 32.479s
js 0.123s 0.439s 0.716s 1.329s

When executing EM_ASM_DOUBLE code, the same string will be passed into JavaScript side. thus JavaScript engine can memorize the result of EM_ASM_DOUBLE code and just returns the cached result after first EM_ASM_DOUBLE function call.

When moved cpp loop into a separated function, the cpp code performs with much better speed:

    loop  ;; label = @1
      local.get 3
      local.get 0
      f64.convert_i32_u
      call 21
      f64.add
      local.get 0
      i32.const 1
      i32.add
      f64.convert_i32_u
      call 21
      f64.add
      local.get 0
      i32.const 2
      i32.add
      f64.convert_i32_u
      call 21
      f64.add
      local.get 0
      i32.const 3
      i32.add
      f64.convert_i32_u
      call 21
      f64.add
      local.get 0
      i32.const 4
      i32.add
      f64.convert_i32_u
      call 21
      f64.add
      local.set 3
      local.get 0
      i32.const 5
      i32.add
      local.tee 0
      i32.const 5000000
      i32.ne
      br_if 0 (;@1;)
    end
    ;; outer iteration loop is eliminated!
cpp_sum: 2.60059
cpp_time: 0.413s
js_sum: 2.60059
js_time: 0.368s
__attribute__((noinline))
double cpp_iter() {
    double sum = 1.0;
    for (int t = 0; t < 5000000; ++t) {
        sum += std::sin(double(t));
    }
    return sum;
}

nokotan avatar Apr 12 '24 05:04 nokotan

thanks a ton for the analysis. i can confirm the caching as using an extra function speeds up the c++ version (in fact, on my machine it's way faster than js now, but that's probably a different problem).

i also tried to confirm, that without caching it's about the same speed. I did that by removing the outer loop and increasing the sample count:

0-50000000 cpp_time: 5.02s
0-50000000 js_time: 0.529s
0-50000000 cpp_js_time: 5.365s

so webassembly is still slower.

then, for a reason i don't remember, i tried to change the range of values:

0-2pi cpp_time: 0.443s
0-2pi js_time: 0.594s
0-2pi cpp_js_time: 5.687s

with clang++ 15 native (linux) and g++ 11 both cpp loops take about half a second. so now i'm puzzled why using large numbers in sin takes so long in webassembly.

i'm compiling with em++ -O3 ./main.cpp -o main.html now.

this is the code (oh, and it didn't compile with emscripten_math_sin, i also didn't find it in the docs):

#include <chrono>
#include <iostream>
#include <cmath>

#ifdef __EMSCRIPTEN__
#include <emscripten.h>
#endif

double cpp_sum = 0;
double js_sum = 0;
double cpp_js_sum = 0;

int main() {
    
    { // warm up
        double sum = 1;
        for (int t = 0; t < 50000000; ++t) {
            sum += std::sin(double(t));
        }
        cpp_sum = sum;
    }
    
    if (true) {
        const auto cpp_begin = std::chrono::high_resolution_clock::now();
        double sum = 1;
        for (int t = 0; t < 50000000; ++t) {
            sum += std::sin(double(t));
        }
        cpp_sum = sum;
        const auto cpp_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_begin).count();
        std::cout << "0-50000000 cpp_time: " << double(cpp_time) / 1000.0 << "s" << std::endl;
    }
    
#ifdef __EMSCRIPTEN__
    if (true) {
        const auto js_begin = std::chrono::high_resolution_clock::now();
        js_sum = EM_ASM_DOUBLE({
            let sum = 1;
            for (let i = 0; i < 50000000; i++) {
                sum = sum + Math.sin(i);
            }
            return sum;
        });
        const auto js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - js_begin).count();
        std::cout << "0-50000000 js_time: " << double(js_time) / 1000.0 << "s" << std::endl;
    }
    
    
    if (true) {
        const auto cpp_js_begin = std::chrono::high_resolution_clock::now();
        double sum = 1.0;
        for (int t = 0; t < 50000000; ++t) {
            sum += EM_ASM_DOUBLE({ return Math.sin($0); }, t);
        }
        cpp_js_sum = sum;
        const auto cpp_js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_js_begin).count();
        std::cout << "0-50000000 cpp_js_time: " << double(cpp_js_time) / 1000.0 << "s" << std::endl;
    }
    
    
    //if (true) {
        //double cpp_js_sum = 0;
        //const auto cpp_js_begin = std::chrono::high_resolution_clock::now();
        //double sum = 1.0;
        //for (int t = 0; t < 50000000; ++t) {
            //sum += emscripten_math_sin(double(t));
        //}
        //cpp_js_sum = sum;
        //const auto cpp_js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_js_begin).count();
        //std::cout << "emscripten_math_sin_sum: " << cpp_js_sum << std::endl;
        //std::cout << "emscripten_math_sin_time: " << double(cpp_js_time) / 1000.0 << "s" << std::endl;
    //}
    
#endif

    std::cout << std::endl;
    std::cout << "js_sum: " << js_sum << std::endl;
    std::cout << "cpp_sum: " << cpp_sum << std::endl;
    std::cout << "cpp_js_sum: " << cpp_js_sum << std::endl;
    std::cout << std::endl;
    std::cout << std::endl;
    
    if (true) {
        const auto cpp_begin = std::chrono::high_resolution_clock::now();
        double sum = 1;
        for (int t = 1; t < 50000000; ++t) {
            sum += std::sin(6.283185307179586 * double(t) / 50000000);
        }
        cpp_sum = sum;
        const auto cpp_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_begin).count();
        std::cout << "0-2pi cpp_time: " << double(cpp_time) / 1000.0 << "s" << std::endl;
    }
 
#ifdef __EMSCRIPTEN__
    if (true) {
        const auto js_begin = std::chrono::high_resolution_clock::now();
        js_sum = EM_ASM_DOUBLE({
            let sum = 1;
            for (let i = 1; i < 50000000; i++) {
                sum = sum + Math.sin(6.283185307179586 * i / 50000000);
            }
            return sum;
        });
        const auto js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - js_begin).count();
        std::cout << "0-2pi js_time: " << double(js_time) / 1000.0 << "s" << std::endl;
    }
    
    if (true) {
        const auto cpp_js_begin = std::chrono::high_resolution_clock::now();
        double sum = 1.0;
        for (int t = 1; t < 50000000; ++t) {
            sum += EM_ASM_DOUBLE({ return Math.sin(6.283185307179586 * $0 / 50000000); }, t);
        }
        cpp_js_sum = sum;
        const auto cpp_js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_js_begin).count();
        std::cout << "0-2pi cpp_js_time: " << double(cpp_js_time) / 1000.0 << "s" << std::endl;
    }
#endif

    std::cout << std::endl;
    std::cout << "js_sum: " << js_sum << std::endl;
    std::cout << "cpp_sum: " << cpp_sum << std::endl;
    std::cout << "cpp_js_sum: " << cpp_js_sum << std::endl;
    std::cout << std::endl;
    std::cout << std::endl;
    
    return 0;
}

adam-ce avatar Apr 19 '24 14:04 adam-ce