emscripten
emscripten copied to clipboard
evaluation of std::sin in C++ is slower than in JS directly
When executing the code below, the c++ execution is way slower than the JS one:
chrome on linux:
cpp_time: 26.043s
js_time: 2.685s
firefox on linux:
cpp_time: 28.458s
js_time: 2.711s
when compiling with gcc 11.4, i get cpp_time: 4.217s (yes, that's slower than using JS). timings on windows are similar.
I asked about it on stackoverflow before.
There, user21489919 reported, that in his tiny-wasm-runtime, the c++ sin is almost as fast as the JS version (on chrome, on FF the C++ version is faster). The reason seems to be, that tiny-wasm-runtime calls JS in its implementation of sin.
So this makes me believe, that there is some performance issue with emscripten, hence the bugreport.
Code:
#include <chrono>
#include <iostream>
#include <cmath>
#ifdef __EMSCRIPTEN__
#include <emscripten.h>
#endif
int main() {
const auto n_iter = 50u;
double cpp_sum = 0;
const auto cpp_begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < n_iter; ++i) {
double sum = 1.0;
for (int t = 0; t < 5000000; ++t) {
sum += std::sin(double(t));
}
cpp_sum = sum;
};
const auto cpp_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_begin).count();
std::cout << "cpp_sum: " << cpp_sum << std::endl;
std::cout << "cpp_time: " << double(cpp_time) / 1000.0 << "s" << std::endl;
#ifdef __EMSCRIPTEN__
double js_sum = 0;
const auto js_begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < n_iter; ++i) {
js_sum = EM_ASM_DOUBLE({
let sum = 1;
for (let i = 0; i < 5000000; i++) {
sum = sum + Math.sin(i);
}
return sum;
});
};
const auto js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - js_begin).count();
std::cout << "js_sum: " << js_sum << std::endl;
std::cout << "js_time: " << double(js_time) / 1000.0 << "s" << std::endl;
#endif
return 0;
}
Version of emscripten/emsdk: I ran the code with the version 3.1.56 now, but several older version have the same performance hit.
emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.56 (cf90417346b78455089e64eb909d71d091ecc055) clang version 19.0.0git (https:/github.com/llvm/llvm-project 34ba90745fa55777436a2429a51a3799c83c6d4c) Target: wasm32-unknown-emscripten Thread model: posix InstalledDir: /home/madam/bin/emsdk/upstream/bin
Full link command and output with -v
appended:
em++ ./main.cpp -o main.html -v "/home/madam/bin/emsdk/upstream/bin/clang++" -target wasm32-unknown-emscripten -fignore-exceptions -fvisibility=default -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr --sysroot=/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot -DEMSCRIPTEN -Xclang -iwithsysroot/include/fakesdl -Xclang -iwithsysroot/include/compat -v ./main.cpp -c -o /tmp/emscripten_temp_lpd1_wvw/main_0.o clang version 19.0.0git (https:/github.com/llvm/llvm-project 34ba90745fa55777436a2429a51a3799c83c6d4c) Target: wasm32-unknown-emscripten Thread model: posix InstalledDir: /home/madam/bin/emsdk/upstream/bin (in-process) "/home/madam/bin/emsdk/upstream/bin/clang-19" -cc1 -triple wasm32-unknown-emscripten -emit-obj -mrelax-all -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name main.cpp -mrelocation-model static -mframe-pointer=none -ffp-contract=on -fno-rounding-math -mconstructor-aliases -target-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/madam/Documents/emscripten_perf_bug -v -fcoverage-compilation-dir=/home/madam/Documents/emscripten_perf_bug -resource-dir /home/madam/bin/emsdk/upstream/lib/clang/19 -D EMSCRIPTEN -isysroot /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten/c++/v1 -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/c++/v1 -internal-isystem /home/madam/bin/emsdk/upstream/lib/clang/19/include -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten -internal-isystem /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include -fdeprecated-macro -ferror-limit 19 -fvisibility=default -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fcxx-exceptions -fignore-exceptions -fexceptions -fcolor-diagnostics -iwithsysroot/include/fakesdl -iwithsysroot/include/compat -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr -o /tmp/emscripten_temp_lpd1_wvw/main_0.o -x c++ ./main.cpp clang -cc1 version 19.0.0git based upon LLVM 19.0.0git default target x86_64-unknown-linux-gnu ignoring nonexistent directory "/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten/c++/v1" ignoring nonexistent directory "/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/wasm32-emscripten" #include "..." search starts here: #include <...> search starts here: /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/fakesdl /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/compat /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include/c++/v1 /home/madam/bin/emsdk/upstream/lib/clang/19/include /home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/include End of search list. /home/madam/bin/emsdk/upstream/bin/clang --version /home/madam/bin/emsdk/upstream/bin/wasm-ld -o main.wasm /tmp/emscripten_temp_lpd1_wvw/main_0.o -L/home/madam/bin/emsdk/upstream/emscripten/cache/sysroot/lib/wasm32-emscripten -lGL-getprocaddr -lal -lhtml5 -lstubs-debug -lnoexit -lc-debug -ldlmalloc -lcompiler_rt -lc++-noexcept -lc++abi-debug-noexcept -lsockets -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr /tmp/tmpzvrhcznelibemscripten_js_symbols.so --strip-debug --export=emscripten_stack_get_end --export=emscripten_stack_get_free --export=emscripten_stack_get_base --export=emscripten_stack_get_current --export=emscripten_stack_init --export=stackAlloc --export=stackSave --export=stackRestore --export=__get_temp_ret --export=__set_temp_ret --export=__wasm_call_ctors --export-if-defined=__start_em_asm --export-if-defined=__stop_em_asm --export-if-defined=__start_em_lib_deps --export-if-defined=__stop_em_lib_deps --export-if-defined=__start_em_js --export-if-defined=__stop_em_js --export-if-defined=main --export-if-defined=__main_argc_argv --export-if-defined=fflush --export-table -z stack-size=65536 --no-growable-memory --initial-heap=16777216 --no-entry --stack-first --table-base=1 /home/madam/bin/emsdk/upstream/bin/llvm-objcopy main.wasm main.wasm --remove-section=.debug* --remove-section=producers /home/madam/bin/emsdk/upstream/bin/wasm-emscripten-finalize --dyncalls-i64 --pass-arg=legalize-js-interface-exported-helpers main.wasm -o main.wasm --detect-features /home/madam/bin/emsdk/node/16.20.0_64bit/bin/node /home/madam/bin/emsdk/upstream/emscripten/src/compiler.mjs /tmp/tmpmv73rwfy.json /home/madam/bin/emsdk/node/16.20.0_64bit/bin/node /home/madam/bin/emsdk/upstream/emscripten/tools/preprocessor.mjs /tmp/emscripten_temp_lpd1_wvw/settings.js shell.html
I suppose its possible that JS Math.sin
is faster than the native sin
. However it seems unlikely that the it would be bigger than the cost of call out to JS for each call to std::sin
.
For a real test here you would need to write both the outer and inner loop in native code and then call out to JS for each sin
operation (i.e. make 500,000 JS calls). I would imagine that if you restruture your test like this you will get different results. You would mind trying that?
are you talking about using EM_ASM_DOUBLE for each sin call?
that takes cpp_js_time: 83.201s
it might be, that EM_ASM_DOUBLE is loosing extra time (recompiling the js every time? extra safety tests?). I'm also not saying, that that is the best approach. Only that the implementation of sin (and likely other trig functions) is slower in c++ than it should be.
the 10% that is lost by tiny-wasm-runtime in chrome in this test: https://twiddlingbits.dev/examples/dist/tests/index.html show that it's possible. i don't understand what tiny-wasm-runtime is doing exactly, but it's faster :)
I looks like you have shown that calling out to JS is much slower. EM_ASM_DOUBLE shouldn't be doing any runtime JS compilation of anything like that. I assume you are building with -O3
when doing these measurements?
I actually forgot that we do have JS implementations of these math functions if you want to compare performance. See: https://github.com/emscripten-core/emscripten/blob/a8b489dc20684cede4f9819cbc6f9e75ea6d50ef/src/library_math.js#L31.
So you could try replacing your EM_ASM call with simply emscripten_math_sin()
, to see what kind of results you get. As wouldn't be surprised of the call to JS has a measurable cost. If tiny wasm is something able to reduce this cost or be in some way faster than emscripten I would be very curious to know how/why.
It looks for me that JavaScript function call optimization does matter rather than difference between std::sin
and Math.sin
.
Here is disassembled wat of cpp part and JS part in the sample code.
cpp
loop ;; label = @1
f64.const 0x1p+0 (;=1;)
local.set 3
i32.const 0
local.set 0
loop ;; label = @2
local.get 3
local.get 0
f64.convert_i32_u ;; loop unrolled code, nice ;-)
call 21
f64.add
local.get 0
i32.const 1
i32.add
f64.convert_i32_u
call 21
f64.add
local.get 0
i32.const 2
i32.add
f64.convert_i32_u
call 21
f64.add
local.get 0
i32.const 3
i32.add
f64.convert_i32_u
call 21
f64.add
local.get 0
i32.const 4
i32.add
f64.convert_i32_u
call 21
f64.add
local.set 3
local.get 0
i32.const 5
i32.add
local.tee 0
i32.const 5000000
i32.ne
br_if 0 (;@2;)
end
local.get 1
i32.const 1
i32.add
local.tee 1
i32.const 50
i32.ne
br_if 0 (;@1;)
end
JS
loop ;; label = @1
local.get 3
i32.const 17908 ;; offset of string to be evaluated
i32.const 1610
i32.const 0
call 0
f64.add
local.set 3
local.get 0
i32.const 1
i32.add
local.tee 0
i32.const 50
i32.ne
br_if 0 (;@1;)
end
n=3 | n=10 | n=20 | n=40 | |
---|---|---|---|---|
cpp | 0.551s | 0.574s | 8.392s | 32.479s |
js | 0.123s | 0.439s | 0.716s | 1.329s |
When executing EM_ASM_DOUBLE
code, the same string will be passed into JavaScript side. thus JavaScript engine can memorize the result of EM_ASM_DOUBLE
code and just returns the cached result after first EM_ASM_DOUBLE
function call.
When moved cpp loop into a separated function, the cpp code performs with much better speed:
loop ;; label = @1
local.get 3
local.get 0
f64.convert_i32_u
call 21
f64.add
local.get 0
i32.const 1
i32.add
f64.convert_i32_u
call 21
f64.add
local.get 0
i32.const 2
i32.add
f64.convert_i32_u
call 21
f64.add
local.get 0
i32.const 3
i32.add
f64.convert_i32_u
call 21
f64.add
local.get 0
i32.const 4
i32.add
f64.convert_i32_u
call 21
f64.add
local.set 3
local.get 0
i32.const 5
i32.add
local.tee 0
i32.const 5000000
i32.ne
br_if 0 (;@1;)
end
;; outer iteration loop is eliminated!
cpp_sum: 2.60059
cpp_time: 0.413s
js_sum: 2.60059
js_time: 0.368s
__attribute__((noinline))
double cpp_iter() {
double sum = 1.0;
for (int t = 0; t < 5000000; ++t) {
sum += std::sin(double(t));
}
return sum;
}
thanks a ton for the analysis. i can confirm the caching as using an extra function speeds up the c++ version (in fact, on my machine it's way faster than js now, but that's probably a different problem).
i also tried to confirm, that without caching it's about the same speed. I did that by removing the outer loop and increasing the sample count:
0-50000000 cpp_time: 5.02s
0-50000000 js_time: 0.529s
0-50000000 cpp_js_time: 5.365s
so webassembly is still slower.
then, for a reason i don't remember, i tried to change the range of values:
0-2pi cpp_time: 0.443s
0-2pi js_time: 0.594s
0-2pi cpp_js_time: 5.687s
with clang++ 15 native (linux) and g++ 11 both cpp loops take about half a second. so now i'm puzzled why using large numbers in sin takes so long in webassembly.
i'm compiling with em++ -O3 ./main.cpp -o main.html
now.
this is the code (oh, and it didn't compile with emscripten_math_sin, i also didn't find it in the docs):
#include <chrono>
#include <iostream>
#include <cmath>
#ifdef __EMSCRIPTEN__
#include <emscripten.h>
#endif
double cpp_sum = 0;
double js_sum = 0;
double cpp_js_sum = 0;
int main() {
{ // warm up
double sum = 1;
for (int t = 0; t < 50000000; ++t) {
sum += std::sin(double(t));
}
cpp_sum = sum;
}
if (true) {
const auto cpp_begin = std::chrono::high_resolution_clock::now();
double sum = 1;
for (int t = 0; t < 50000000; ++t) {
sum += std::sin(double(t));
}
cpp_sum = sum;
const auto cpp_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_begin).count();
std::cout << "0-50000000 cpp_time: " << double(cpp_time) / 1000.0 << "s" << std::endl;
}
#ifdef __EMSCRIPTEN__
if (true) {
const auto js_begin = std::chrono::high_resolution_clock::now();
js_sum = EM_ASM_DOUBLE({
let sum = 1;
for (let i = 0; i < 50000000; i++) {
sum = sum + Math.sin(i);
}
return sum;
});
const auto js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - js_begin).count();
std::cout << "0-50000000 js_time: " << double(js_time) / 1000.0 << "s" << std::endl;
}
if (true) {
const auto cpp_js_begin = std::chrono::high_resolution_clock::now();
double sum = 1.0;
for (int t = 0; t < 50000000; ++t) {
sum += EM_ASM_DOUBLE({ return Math.sin($0); }, t);
}
cpp_js_sum = sum;
const auto cpp_js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_js_begin).count();
std::cout << "0-50000000 cpp_js_time: " << double(cpp_js_time) / 1000.0 << "s" << std::endl;
}
//if (true) {
//double cpp_js_sum = 0;
//const auto cpp_js_begin = std::chrono::high_resolution_clock::now();
//double sum = 1.0;
//for (int t = 0; t < 50000000; ++t) {
//sum += emscripten_math_sin(double(t));
//}
//cpp_js_sum = sum;
//const auto cpp_js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_js_begin).count();
//std::cout << "emscripten_math_sin_sum: " << cpp_js_sum << std::endl;
//std::cout << "emscripten_math_sin_time: " << double(cpp_js_time) / 1000.0 << "s" << std::endl;
//}
#endif
std::cout << std::endl;
std::cout << "js_sum: " << js_sum << std::endl;
std::cout << "cpp_sum: " << cpp_sum << std::endl;
std::cout << "cpp_js_sum: " << cpp_js_sum << std::endl;
std::cout << std::endl;
std::cout << std::endl;
if (true) {
const auto cpp_begin = std::chrono::high_resolution_clock::now();
double sum = 1;
for (int t = 1; t < 50000000; ++t) {
sum += std::sin(6.283185307179586 * double(t) / 50000000);
}
cpp_sum = sum;
const auto cpp_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_begin).count();
std::cout << "0-2pi cpp_time: " << double(cpp_time) / 1000.0 << "s" << std::endl;
}
#ifdef __EMSCRIPTEN__
if (true) {
const auto js_begin = std::chrono::high_resolution_clock::now();
js_sum = EM_ASM_DOUBLE({
let sum = 1;
for (let i = 1; i < 50000000; i++) {
sum = sum + Math.sin(6.283185307179586 * i / 50000000);
}
return sum;
});
const auto js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - js_begin).count();
std::cout << "0-2pi js_time: " << double(js_time) / 1000.0 << "s" << std::endl;
}
if (true) {
const auto cpp_js_begin = std::chrono::high_resolution_clock::now();
double sum = 1.0;
for (int t = 1; t < 50000000; ++t) {
sum += EM_ASM_DOUBLE({ return Math.sin(6.283185307179586 * $0 / 50000000); }, t);
}
cpp_js_sum = sum;
const auto cpp_js_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - cpp_js_begin).count();
std::cout << "0-2pi cpp_js_time: " << double(cpp_js_time) / 1000.0 << "s" << std::endl;
}
#endif
std::cout << std::endl;
std::cout << "js_sum: " << js_sum << std::endl;
std::cout << "cpp_sum: " << cpp_sum << std::endl;
std::cout << "cpp_js_sum: " << cpp_js_sum << std::endl;
std::cout << std::endl;
std::cout << std::endl;
return 0;
}