
precompile: compile inlinable methods that were inferred

Open topolarity opened this issue 1 year ago • 3 comments

The code generation policy here previously assumed that inlinable code almost never ends up being invoked, but that has the unfortunate side effect that making more code inline-eligible means fewer entry points into your code end up compiled.

Intuitively, I think users expect function calls that ran during precompilation to be compiled and available (regardless of the context they are called from), which requires us to precompile these methods.
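
A minimal sketch of that expectation (the package name and workload below are hypothetical, and whether the first dynamic call actually recompiles depends on the inlining-cost policy discussed here):

module MyPkg

# Small enough that inference will likely mark it inline-eligible.
halve(x::Float64) = x / 2

# Runs during package precompilation, so users naturally assume the
# resulting native code for halve(::Float64) ends up in the image.
halve(1.0)

end

# In a fresh session, a caller that reaches `halve` through dynamic dispatch
# cannot rely on inlining it, so without this change the call may still have
# to compile halve(::Float64) at runtime despite the workload above:
# map(MyPkg.halve, Any[1.0, 2.0])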

This change may not be viable due to the increase in code size, but I wanted to open it to see what the effects are.

topolarity avatar Oct 14 '24 12:10 topolarity

On my machine, this causes my sysimage to grow 150 MB → 173 MB

topolarity avatar Oct 14 '24 14:10 topolarity

Oof.

On my machine, this causes my sysimage to grow 150 MB → 173 MB

Could we still throw out some of the smallest functions? E.g. have an inlining cost threshold that allows medium-sized functions to be cached, but not tiny ones that mostly just forward things?
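
A rough sketch of what that could look like (the names and cutoff value here are hypothetical, not existing compiler API):

# Hypothetical caching policy: keep an extra threshold between "tiny" and
# "medium" inline-eligible methods instead of discarding all of them.
const CACHE_COST_THRESHOLD = 0x0100  # made-up cutoff, would need tuning

function should_cache_native_code(inlining_cost::UInt16)
    # UINT16_MAX means "not inlineable" and is always cached today; the new
    # case caches medium-cost methods while still dropping tiny forwarders.
    return inlining_cost == typemax(UInt16) || inlining_cost >= CACHE_COST_THRESHOLD
end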

vchuravy avatar Oct 14 '24 14:10 vchuravy

I thought it might be better to leverage the program's runtime behavior for this issue, so I tried the following approach:

diff --git a/base/compiler/typeinfer.jl b/base/compiler/typeinfer.jl
index 8b85f7c6f3..216d51426e 100644
--- a/base/compiler/typeinfer.jl
+++ b/base/compiler/typeinfer.jl
@@ -1036,6 +1036,14 @@ end
 _uncompressed_ir(codeinst::CodeInstance, s::String) =
     ccall(:jl_uncompress_ir, Ref{CodeInfo}, (Any, Any, Any), codeinst.def.def::Method, codeinst, s)
 
+function force_compile_inlineable(codeinst::CodeInstance)
+    inferred = @atomic :monotonic codeinst.inferred
+    if inferred isa MaybeCompressed && is_inlineable(inferred)
+        ccall(:jl_force_compile_codeinst, Cvoid, (Any,), codeinst)
+    end
+    nothing
+end
+
 # compute (and cache) an inferred AST and return type
 function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mode::UInt8)
     start_time = ccall(:jl_typeinf_timing_begin, UInt64, ())
@@ -1048,6 +1056,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod
                 return code
             end
             if ci_meets_requirement(code, source_mode)
+                force_compile_inlineable(code)
                 ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time)
                 return code
             end
@@ -1075,6 +1084,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod
                 return code
             end
             if ci_meets_requirement(code, source_mode)
+                force_compile_inlineable(code)
                 engine_reject(interp, ci)
                 ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time)
                 return code
@@ -1112,6 +1122,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod
         finish!(interp, frame; can_discard_trees) # redo finish! with the correct can_discard_trees parameter value
         @assert ci_meets_requirement(ci, source_mode)
     end
+    force_compile_inlineable(ci)
     return ci
 end
 
diff --git a/src/gf.c b/src/gf.c
index fc2e62ebff..29b1d3c36c 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -581,6 +581,10 @@ JL_DLLEXPORT void jl_update_codeinst(
     jl_atomic_store_relaxed(&codeinst->max_world, max_world); // since the edges shouldn't change after jl_fill_codeinst
 }
 
+JL_DLLEXPORT void jl_force_compile_codeinst(jl_code_instance_t *codeinst) {
+    jl_atomic_store_relaxed(&codeinst->precompile, 1);
+}
+
 JL_DLLEXPORT void jl_fill_codeinst(
         jl_code_instance_t *codeinst,
         jl_value_t *rettype, jl_value_t *exctype,
diff --git a/src/precompile_utils.c b/src/precompile_utils.c
index b361bbebc5..183e29d206 100644
--- a/src/precompile_utils.c
+++ b/src/precompile_utils.c
@@ -188,7 +188,7 @@ static int precompile_enq_specialization_(jl_method_instance_t *mi, void *closur
         else if (jl_atomic_load_relaxed(&codeinst->invoke) != jl_fptr_const_return) {
             jl_value_t *inferred = jl_atomic_load_relaxed(&codeinst->inferred);
             if (inferred && inferred != jl_nothing && jl_options.compile_enabled != JL_OPTIONS_COMPILE_ALL &&
-                (jl_options.trim == JL_TRIM_NO || jl_ir_inlining_cost(inferred) == UINT16_MAX)) {
+                ((jl_ir_inlining_cost(inferred) == UINT16_MAX) || jl_atomic_load_relaxed(&codeinst->precompile))) {
                 do_compile = 1;
             }
             else if (jl_atomic_load_relaxed(&codeinst->invoke) != NULL || jl_atomic_load_relaxed(&codeinst->precompile)) {

The basic idea is that, even if a code instance is inlineable, it might be more efficient to do_compile it if it is called via dynamic dispatch.
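
For a concrete picture of the pattern this targets (the function names below are made up), consider an inline-eligible method that is reached only through dynamic dispatch:

# `inner` is cheap enough to be inline-eligible.
inner(x::Int) = 2x

function outer(xs::Vector{Any})
    s = 0
    for x in xs
        # `x::Any` makes this a dynamic dispatch, which is resolved via
        # typeinf_ext; with the patch, the CodeInstance for inner(::Int)
        # gets its ->precompile flag set and is emitted into the image.
        s += inner(x)
    end
    return s
end

outer(Any[1, 2, 3])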

However, this approach is incomplete: if a code instance that was first compiled from a type-stable caller context is later called from a dynamic-dispatch context, we would need a mechanism to re-enqueue that code instance via precompile_enq_specialization_, overriding the cached ->precompile flag on cache retrieval.

aviatesk avatar Oct 16 '24 09:10 aviatesk

This PR was closed without being merged, so I will remove the backport labels.

If that is incorrect, please let me know.

DilumAluthge avatar Sep 09 '25 21:09 DilumAluthge