otp Compiler Performance Regression in SSA Sink Optimization

Compiler Performance Regression in SSA Sink Optimization

Open davisp opened this issue 3 years ago • 8 comments

Describe the bug

There's a performance regression in the Erlang compiler that started in Erlang 22 and exists as of master today. From what I can tell the issue is that the SSA sink optimization for get_tuple_element is maintaining unnecessary references to items on the stack. In some very specific circumstances this leads to some code taking significantly more time and RAM to execute.

Given that the behavior is still technically correct I'm not 100% certain whether I'd call this a bug or just a performance regression. The scenario is fairly difficult to generate on purpose but it can certainly have a sizable impact on performance. There were CouchDB users with real world reports of some operations that take less than a second on Erlang 20 failing to complete after chewing through tens of gigabytes of RAM with Erlang 23.

After narrowing down a minimal test case (included inline below) I believe the issue is in this difference between the generated Erlang Assembly without and with the SSA sink optimization:

@@ -357,19 +358,19 @@
   {label,37}.
     {test,is_tuple,{f,39},[{x,1}]}.
     {test,test_arity,{f,39},[{x,1},3]}.
-    {allocate,2,2}.
-    {get_tuple_element,{x,1},0,{x,2}}.
-    {get_tuple_element,{x,1},1,{y,1}}.
-    {get_tuple_element,{x,1},2,{y,0}}.
-    {move,{x,2},{x,1}}.
+    {allocate,1,2}.
+    {move,{x,1},{y,0}}.
+    {get_tuple_element,{x,1},0,{x,1}}.
     {line,[{location,"stack_test.erl",39}]}.

If I'm reading that correctly, the unoptimized version ends up storing two of the three tuple elements on the stack and then passes the third directly to the body recursive call. In the optimized version we end up storing a single reference to the entire tuple and passing the first element to the recursive call. My theory here is that this extra reference in the tuple on the stack is preventing garbage collection from clearing out many of the intermediate versions of that first tuple element. And if, as in our particular test case, this value grows large it can end up using significantly more RAM with the optimization.

I should mention that hitting this particular edge case is fairly difficult to engineer. I was unable to come up with a test case from scratch and ended up having to reduce an example that was found in the wild. Even just shortening the tuple to two elements prevents the sink optimization from keeping a reference to the tuple on the stack.

To Reproduce

I've included a reproducing module at the bottom of this issue. To see the bug in action you just need to compile with and without the SSA sink optimization like such:

$ erlc stack_test.erl && erl -noshell -eval 'io:format("~p~n", [stack_test:run()]), init:stop().'
{4693,6203.939636230469}

$ erlc +dssappt +no_ssa_opt_sink stack_test.erl && erl -noshell -eval 'io:format("~p~n", [stack_test:run()]), init:stop().'
{907,213.61334228515625}

On my ancient MacBook Pro the optimized version takes roughly 5s and 6GiB of RAM to execute. Disabling the optimization reduces the time to execute to roughly 1s and only uses 200 MiB of RAM.

Expected behavior

Use less RAM.

Affected versions

The issue was introduced in 6bee2ac7d11668888d93ec4f93730bcae3e5fa79 and still exists on master as of 3f45eead8cbcc7226d8a3cd8da9002eb7ef5515e.

Additional context


-module(stack_test).

-export([
    run/0
]).


run() ->
    run(7500, 0.0008).


run(Depth, BranchChance) ->
    rand:seed(exrop, {1647,841737,351137}),

    Tree = node(Depth, BranchChance),

    erlang:garbage_collect(),

    MPid = spawn_mem_sampler(self(), 500),

    T1 = erlang:monotonic_time(),
    visit(Tree, sets:new()),    
    T2 = erlang:monotonic_time(),

    Max = get_max_mem(MPid),
    unlink(MPid),
    exit(MPid, kill),
    
    {deltaT(T2, T1), Max}.


visit([], Seen) ->
    {sets:add_element(rand:uniform(), Seen), ignore1, ignore2};
visit(Children, Seen0) ->
    Seen1 = sets:add_element(rand:uniform(), Seen0),
    lists:foldl(fun(Child, Acc) ->
        {SeenAcc, Ignore1, Ignore2} = Acc,
        {NewSeenAcc, _, _} = visit(Child, SeenAcc),
        {NewSeenAcc, Ignore1, Ignore2}
    end, {Seen1, ignore1, ignore2}, Children).


node(0, _) ->
    [];
node(Depth, BranchChance) ->
    case rand:uniform() < BranchChance of
        true ->
            [
                node(Depth - 1, BranchChance),
                node(Depth - 1, BranchChance)
            ];
        false ->
            [node(Depth - 1, BranchChance)]
    end.


deltaT(T0, T1) ->
    erlang:convert_time_unit(T0 - T1, native, millisecond).


spawn_mem_sampler(Pid, DtMsec) ->
    spawn_link(fun() -> mem_sampler(Pid, DtMsec, mem_mb(Pid)) end).


mem_sampler(Pid, DtMsec, Max0) ->
    timer:sleep(DtMsec),
    Max = max(mem_mb(Pid), Max0),
    receive
        {get_mem, From} ->
            From ! {mem_max, Max}
    after 0 ->
            ok
    end,
    mem_sampler(Pid, DtMsec, Max).


mem_mb(Pid) ->
    {memory, Words} = erlang:process_info(Pid, memory),
    Bytes = Words * erlang:system_info(wordsize),
    Bytes / (1024 * 1024).


get_max_mem(Pid) ->
    Pid ! {get_mem, self()},
    receive {mem_max, Max} -> Max end.

This is the full diff of the Erlang Assembly generated without the optimization vs with the optimization (both generated with erlc 23.3.4.11):

--- stack_test.no-sink.S	2022-03-22 11:05:41.000000000 -0500
+++ stack_test.opt.S	2022-03-22 11:05:33.000000000 -0500
@@ -240,9 +240,9 @@
     {loop_rec,{f,20},{x,0}}.
     {test,is_tagged_tuple,{f,19},[{x,0},2,{atom,get_mem}]}.
     {test_heap,3,1}.
-    {get_tuple_element,{x,0},1,{x,0}}.
     remove_message.
     {put_tuple2,{x,1},{list,[{atom,mem_max},{y,0}]}}.
+    {get_tuple_element,{x,0},1,{x,0}}.
     {line,[{location,"stack_test.erl",71}]}.
     send.
     {jump,{f,21}}.
@@ -266,13 +266,14 @@
     {move,{atom,memory},{x,1}}.
     {line,[{location,"stack_test.erl",79}]}.
     {call_ext,2,{extfunc,erlang,process_info,2}}.
+    {move,{x,0},{y,0}}.
     {test,is_tagged_tuple,{f,24},[{x,0},2,{atom,memory}]}.
-    {get_tuple_element,{x,0},1,{y,0}}.
     {move,{atom,wordsize},{x,0}}.
     {line,[{location,"stack_test.erl",80}]}.
     {call_ext,1,{extfunc,erlang,system_info,1}}.
+    {get_tuple_element,{y,0},1,{x,1}}.
     {line,[{location,"stack_test.erl",80}]}.
-    {gc_bif,'*',{f,0},1,[{y,0},{x,0}],{x,0}}.
+    {gc_bif,'*',{f,0},2,[{x,1},{x,0}],{x,0}}.
     fclearerror.
     {line,[{location,"stack_test.erl",81}]}.
     {fconv,{x,0},{fr,0}}.
@@ -357,19 +358,19 @@
   {label,37}.
     {test,is_tuple,{f,39},[{x,1}]}.
     {test,test_arity,{f,39},[{x,1},3]}.
-    {allocate,2,2}.
-    {get_tuple_element,{x,1},0,{x,2}}.
-    {get_tuple_element,{x,1},1,{y,1}}.
-    {get_tuple_element,{x,1},2,{y,0}}.
-    {move,{x,2},{x,1}}.
+    {allocate,1,2}.
+    {move,{x,1},{y,0}}.
+    {get_tuple_element,{x,1},0,{x,1}}.
     {line,[{location,"stack_test.erl",39}]}.
     {call,2,{f,6}}.
     {test,is_tuple,{f,38},[{x,0}]}.
     {test,test_arity,{f,38},[{x,0},3]}.
     {test_heap,4,1}.
+    {get_tuple_element,{y,0},1,{x,1}}.
+    {get_tuple_element,{y,0},2,{x,2}}.
     {get_tuple_element,{x,0},0,{x,0}}.
-    {put_tuple2,{x,0},{list,[{x,0},{y,1},{y,0}]}}.
-    {deallocate,2}.
+    {put_tuple2,{x,0},{list,[{x,0},{x,1},{x,2}]}}.
+    {deallocate,1}.
     return.
   {label,38}.
     {line,[{location,"stack_test.erl",39}]}.

Mar 22 '22 16:03 davisp

otp otp copied to clipboard

Compiler Performance Regression in SSA Sink Optimization

otp
otp copied to clipboard