Segfault in lwt_echo_post
Might be related with #64, but I'm experiencing segfault in lwt_echo_post compiled to native binary.
Steps to reproduce:
- create a large file (mine was 7.2 GB, YMMV)
- launch lwt_echo_post.exe
- upload this large file
curl -H"Expect:" -XPOST -d @very_big_file -o/dev/null http://0.0.0.0:8080/
Observed behavior:
- curl loads file to memory, and starts uploading it
- first there is no download activity, process memory footprint goes up to 3,978g, upload goes at several hundred megabytes per second
- at some point download traffic starts being reported by curl, at ~10-15 MBps
- some time later lwt_echo_post.exe segfaults, redacted backtrace follows:
#0 camlFaraday__shift_buffers_1564 () at lib/faraday.ml:371
#1 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#2 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#3 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#4 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#5 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#6 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
<...>
#29907 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#29908 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#29909 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#29932 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#29933 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#29934 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
<...>
#174595 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174596 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174597 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174598 0x00005555557694f2 in camlHttpaf__Body__fun_1699 () at lib/faraday.ml:408
#174599 0x00005555557700ca in camlFaraday__shift_flushes_1569 () at lib/faraday.ml:402
#174600 0x00005555557377f6 in camlHttpaf_lwt_unix__fun_2310 () at lwt-unix/httpaf_lwt_unix.ml:168
#174601 0x0000555555756a8c in camlLwt__catch_24942 () at src/core/lwt.ml:2025
#174602 0x0000555555759345 in camlLwt__async_53201 () at src/core/lwt.ml:2463
#174603 0x00005555557690f1 in camlHttpaf__Body__do_execute_read_1452 () at lib/body.ml:115
#174604 0x000055555576a06b in camlHttpaf__Parse__fun_2248 () at lib/parse.ml:137
#174605 0x00005555557715f3 in camlAngstrom__Parser__succ$27_1437 () at lib/parser.ml:53
#174606 0x000055555576aaf9 in camlHttpaf__Parse__read_with_more_1808 () at lib/parse.ml:300
#174607 0x000055555576d252 in camlHttpaf__Server_connection__read_with_more_1824 () at lib/server_connection.ml:236
#174608 0x0000555555736c4b in camlHttpaf_lwt_unix__get_1168 () at lwt-unix/httpaf_lwt_unix.ml:66
#174609 0x00005555557374d5 in camlHttpaf_lwt_unix__fun_2241 () at lwt-unix/httpaf_lwt_unix.ml:133
#174610 0x0000555555755c4e in camlLwt__callback_13871 () at src/core/lwt.ml:1866
#174611 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174612 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174613 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174614 0x0000555555755c6f in camlLwt__callback_13871 () at src/core/lwt.ml:1880
#174615 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174616 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174617 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174618 0x0000555555756d0c in camlLwt__callback_24954 () at src/core/lwt.ml:2041
#174619 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174620 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174621 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174622 0x0000555555755c6f in camlLwt__callback_13871 () at src/core/lwt.ml:1880
#174623 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174624 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174625 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174626 0x0000555555754cbc in camlLwt__wakeup_general_4627 () at src/core/lwt.ml:1385
#174627 0x000055555575308d in camlLwt_sequence__loop_1061 () at src/core/lwt_sequence.ml:128
#174628 0x000055555575308d in camlLwt_sequence__loop_1061 () at src/core/lwt_sequence.ml:128
#174629 0x0000555555783881 in camlList__iter_1083 () at list.ml:100
#174630 0x0000555555739d24 in camlLwt_engine__fun_3023 () at src/unix/lwt_engine.ml:357
#174631 0x000055555573c34c in camlLwt_main__run_1134 () at src/unix/lwt_main.ml:33
#174632 0x00005555556d15b6 in camlLwt_echo_post__entry () at examples/lwt/lwt_echo_post.ml:32
#174633 0x00005555556ce5e9 in caml_program ()
#174634 0x00005555558060d0 in caml_start_program ()
#174635 0x00005555557eae15 in caml_startup_common (argv=0x7fffffffd728, pooling=<optimized out>, pooling@entry=0) at startup.c:156
#174636 0x00005555557eae7b in caml_startup_exn (argv=<optimized out>) at startup.c:161
#174637 caml_startup (argv=<optimized out>) at startup.c:166
#174638 0x00005555556cd85c in main (argc=<optimized out>, argv=<optimized out>) at main.c:44
Probably process ran out of stack. Is shift_buffers function expected to be subject to tailcall optimization, or the issue is due to excessive buffering?
Hm, last build was using some weird environment. I've ran dune clean to get rid of old artifacts and rebuild from scratch. Backtrace changed a bit:
Program received signal SIGSEGV, Segmentation fault.
0x00005555557f01d3 in caml_modify (fp=0x7ffff1d2a028, val=93825026271160) at memory.c:658
658 memory.c: No such file or directory.
(gdb) bt
#0 0x00005555557f01d3 in caml_modify (fp=0x7ffff1d2a028, val=93825026271160) at memory.c:658
#1 0x000055555576ec6d in camlFaraday__dequeue_exn_1186 () at lib/faraday.ml:102
#2 0x000055555576ffd0 in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:371
#3 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#4 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#5 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#6 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#7 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#8 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#9 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#10 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#11 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#12 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
<...>
#174595 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174596 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174597 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174598 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174599 0x000055555576ffee in camlFaraday__shift_buffers_1564 () at lib/faraday.ml:372
#174600 0x00005555557694f2 in camlHttpaf__Body__fun_1699 () at lib/faraday.ml:408
#174601 0x00005555557700ca in camlFaraday__shift_flushes_1569 () at lib/faraday.ml:402
#174602 0x00005555557377f6 in camlHttpaf_lwt_unix__fun_2310 () at lwt-unix/httpaf_lwt_unix.ml:168
#174603 0x0000555555756a8c in camlLwt__catch_24942 () at src/core/lwt.ml:2025
#174604 0x0000555555759345 in camlLwt__async_53201 () at src/core/lwt.ml:2463
#174605 0x00005555557690f1 in camlHttpaf__Body__do_execute_read_1452 () at lib/body.ml:115
#174606 0x000055555576a06b in camlHttpaf__Parse__fun_2248 () at lib/parse.ml:137
#174607 0x00005555557715f3 in camlAngstrom__Parser__succ$27_1437 () at lib/parser.ml:53
#174608 0x000055555576aaf9 in camlHttpaf__Parse__read_with_more_1808 () at lib/parse.ml:300
#174609 0x000055555576d252 in camlHttpaf__Server_connection__read_with_more_1824 () at lib/server_connection.ml:236
#174610 0x0000555555736c4b in camlHttpaf_lwt_unix__get_1168 () at lwt-unix/httpaf_lwt_unix.ml:66
#174611 0x00005555557374d5 in camlHttpaf_lwt_unix__fun_2241 () at lwt-unix/httpaf_lwt_unix.ml:133
#174612 0x0000555555755c4e in camlLwt__callback_13871 () at src/core/lwt.ml:1866
#174613 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174614 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174615 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174616 0x0000555555755c6f in camlLwt__callback_13871 () at src/core/lwt.ml:1880
#174617 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174618 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174619 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174620 0x0000555555756d0c in camlLwt__callback_24954 () at src/core/lwt.ml:2041
#174621 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174622 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174623 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174624 0x0000555555755c6f in camlLwt__callback_13871 () at src/core/lwt.ml:1880
#174625 0x0000555555754748 in camlLwt__iter_callback_list_4529 () at src/core/lwt.ml:1209
#174626 0x00005555557548b9 in camlLwt__run_in_resolution_loop_4573 () at src/core/lwt.ml:1275
#174627 0x0000555555754a67 in camlLwt__resolve_4591 () at src/core/lwt.ml:1311
#174628 0x0000555555754cbc in camlLwt__wakeup_general_4627 () at src/core/lwt.ml:1385
#174629 0x000055555575308d in camlLwt_sequence__loop_1061 () at src/core/lwt_sequence.ml:128
#174630 0x000055555575308d in camlLwt_sequence__loop_1061 () at src/core/lwt_sequence.ml:128
#174631 0x0000555555783881 in camlList__iter_1083 () at list.ml:100
#174632 0x0000555555739d24 in camlLwt_engine__fun_3023 () at src/unix/lwt_engine.ml:357
#174633 0x000055555573c34c in camlLwt_main__run_1134 () at src/unix/lwt_main.ml:33
#174634 0x00005555556d15b6 in camlLwt_echo_post__entry () at examples/lwt/lwt_echo_post.ml:32
#174635 0x00005555556ce5e9 in caml_program ()
#174636 0x00005555558060d0 in caml_start_program ()
#174637 0x00005555557eae15 in caml_startup_common (argv=0x7fffffffd748, pooling=<optimized out>, pooling@entry=0) at startup.c:156
#174638 0x00005555557eae7b in caml_startup_exn (argv=<optimized out>) at startup.c:161
#174639 caml_startup (argv=<optimized out>) at startup.c:166
#174640 0x00005555556cd85c in main (argc=<optimized out>, argv=<optimized out>) at main.c:44
Curl first uploads 3620M (out of 7.2G file), then upload stops and it proceeds to downloading 3620M back. Once downloaded server process receives segfault.
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 7240M 0 3620M 100 3620M 21.5M 21.5M 0:02:48 0:02:48 --:--:-- 0^C
It points to this part of the code of faraday.ml in my _opam directory:
369 let rec shift_buffers t written =
370 try
371 let { len; _ } as iovec = Buffers.dequeue_exn t.scheduled in
372 if len <= written then begin
373 shift_buffers t (written - len)
374 end else
375 Buffers.enqueue_front (IOVec.shift iovec written) t.scheduled
376 with Dequeue_empty ->
377 assert (written = 0);
378 if t.scheduled_pos = t.write_pos then begin
379 t.scheduled_pos <- 0;
380 t.write_pos <- 0
381 end
FWIW some top-level interaction:
# let rec shift_buffers t written =
try
let len = 42 in
if len <= written then begin
(shift_buffers[@tailcall]) t (written - len)
end else
Printf.printf "bla\n"
with _ ->
assert (written = 0)
;;
Warning 51: expected tailcall
# let rec shift_buffers t written =
(* try *)
let len = 42 in
if len <= written then begin
(shift_buffers[@tailcall]) t (written - len)
end else
Printf.printf "bla\n"
(* with _ ->
assert (written = 0) *);;
val shift_buffers : 'a -> int -> unit = <fun>
~> calling ourselves within the exception handler leads to stack allocation (of the exception handler AFAICT), you'll have to move the recursive self-call out of the exception handler (the tailcall annotation is pretty good at spotting this)
Cool! For some reason I was trying to add tailcall annotation to pinned faraday repo clone on my laptop and make dune stop on this warning, without any results. Should've tried with top level :)
Actually I've suggested to get rid of that exception altogether in faraday/51, but for other reasons.
(another option is to have the let rec (and call to it) inside the ... of try ... catch, installing the exception handler only once)
Just rebuilt with my fork of faraday and it does not segfault any more, although odd behavior with buffering of 3+GB in memory is still in place.
Excessive buffering was due to asynchronous write in example echo handler. Can be fixed like this:
--- a/examples/lib/httpaf_examples.ml
+++ b/examples/lib/httpaf_examples.ml
@@ -39,8 +39,9 @@ module Server = struct
let request_body = Reqd.request_body reqd in
let response_body = Reqd.respond_with_streaming reqd response in
let rec on_read buffer ~off ~len =
- Body.write_bigstring response_body buffer ~off ~len;
- Body.schedule_read request_body ~on_eof ~on_read;
+ Body.schedule_bigstring response_body buffer ~off ~len;
+ Body.flush response_body (fun () ->
+ Body.schedule_read request_body ~on_eof ~on_read);
and on_eof () =
Body.close_writer response_body
in