julia Experiment with a slightly adjusted pipeline

and add GC final lowering verification.

Jan 10 '24 18:01 gbaraldi

@nanosoldier runbenchmarks(ALL, vs=":master")

Jan 11 '24 16:01 gbaraldi

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

Jan 12 '24 00:01 nanosoldier

@nanosoldier runbenchmarks(!"scalar", vs=":master")

Jan 12 '24 17:01 gbaraldi

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

Jan 13 '24 00:01 nanosoldier

@nanosoldier runbenchmarks(!"scalar", vs=":master")

Jan 31 '24 14:01 gbaraldi

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

Jan 31 '24 21:01 nanosoldier

looks like you need to fix a couple tests:

Failed Tests (2):
2024-10-16 14:39:21 EDT	  Julia :: image-codegen.jl
2024-10-16 14:39:21 EDT	  Julia :: pipeline-prints.ll

also rerunning nanosoldier, since a lot of changes have happened since: @nanosoldier runbenchmarks(!"scalar", vs=":master")

Oct 16 '24 19:10 vtjnash

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

Oct 17 '24 02:10 nanosoldier

Looks overall pretty good, but there are a couple 10x regressions (look like vectorization failures). Is there an easy way from nanosoldier for us to test compile time to make sure it's comparable?

Oct 17 '24 03:10 oscardssmith

Isn't that what the inference benchmarks are for? Those look like no change to me.

Oct 17 '24 06:10 Zentrik

I took a big look at it. There are still a couple of regressions, but it seems to be a pretty clear overall win. If anyone wants to take a further look, the remaining ones are:

["union", "array", ("perf_countequals", "Int8")]
["array", "index", ("sumelt_boundscheck", "Base.ReinterpretArray{BaseBenchmarks.ArrayBenchmarks.PairVals{Int32}, 2, Int64, Matrix{Int64}, false}")] We are failing to elide a boundscheck
The simd conditional loop ones (they are very noisy, both per run and per machine)

The 16x regression is now gone with my latest commit

Oct 17 '24 15:10 gbaraldi

Do we want to run a pkgeval? I'm slightly worried about the fact that I had to modify passes.

Oct 21 '24 02:10 gbaraldi

@nanosoldier runtests(ALL, vs = ":master", configuration = (buildflags=["LLVM_ASSERTIONS=1", "FORCE_ASSERTIONS=1"],), vs_configuration = (buildflags = ["LLVM_ASSERTIONS=1", "FORCE_ASSERTIONS=1"],))

Oct 21 '24 07:10 vchuravy

The package evaluation job you requested has completed - possible new issues were detected. The full report is available.

Oct 21 '24 19:10 nanosoldier

@nanosoldier runbenchmarks(ALL, vs=":master")

Nov 18 '24 22:11 gbaraldi

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

Nov 19 '24 12:11 nanosoldier

["union", "array", ("map", "*", "Float64", "(false, true)")] seems to be quite a large regression (2x) on my Mac

Nov 19 '24 14:11 gbaraldi

nice to see that this improves the allocation elimination :)

Feb 06 '25 21:02 oscardssmith

I can confirm this PR on top of #57380 addresses most of #56145 (stores are vectorised, there's still an out-of-bounds section in the preamble, it shouldn't be a big deal performance-wise although it probably taints effects since it can throw):

julia> code_llvm((Memory{Float64},)) do v
           for idx in eachindex(v)
               v[idx] = 1.0
           end
       end

; Function Signature: var"#59"(Memory{Float64})
;  @ REPL[17]:2 within `#59`
define void @"julia_#59_3153"(ptr noundef nonnull align 8 dereferenceable(16) %"v::GenericMemory") local_unnamed_addr #0 {
top:
  %thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #11
  %tls_ppgcstack = getelementptr inbounds i8, ptr %thread_ptr, i64 -8
  %tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
; ┌ @ range.jl:917 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %"v::GenericMemory", align 8
      %0 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %0, label %L29, label %preloop.pseudo.exit

[...]

oob:                                              ; preds = %L11.postloop, %L11
  %value_phi3.lcssa = phi i64 [ %value_phi3.postloop, %L11.postloop ], [ %6, %L11 ]
;  @ REPL[17]:3 within `#59`
; ┌ @ genericmemory.jl:260 within `setindex!`
; │┌ @ genericmemory.jl:252 within `_setindex!`
    %ptls_field = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16
    %ptls_load = load ptr, ptr %ptls_field, align 8
    %"box::GenericMemoryRef" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load, i32 408, i32 32, i64 140577962397856) #8
    %"box::GenericMemoryRef.tag_addr" = getelementptr inbounds i64, ptr %"box::GenericMemoryRef", i64 -1
    store atomic i64 140577962397856, ptr %"box::GenericMemoryRef.tag_addr" unordered, align 8
    store ptr %memoryref_data, ptr %"box::GenericMemoryRef", align 8
    %.repack16 = getelementptr inbounds { ptr, ptr }, ptr %"box::GenericMemoryRef", i64 0, i32 1
    store ptr %"v::GenericMemory", ptr %.repack16, align 8
    call void @ijl_bounds_error_int(ptr nonnull %"box::GenericMemoryRef", i64 %value_phi3.lcssa)
    unreachable

load:                                             ; preds = %L11
    %memoryref_offset = shl i64 %value_phi3, 3
; ││ @ genericmemory.jl:253 within `_setindex!`
    %gep = getelementptr i8, ptr %invariant.gep, i64 %memoryref_offset
    store i64 4607182418800017408, ptr %gep, align 8
; └└
;  @ REPL[17]:4 within `#59`
; ┌ @ range.jl:921 within `iterate`
   %1 = add nuw nsw i64 %value_phi3, 1
; └
  %exitcond39.not = icmp eq i64 %value_phi3, %umax
  br i1 %exitcond39.not, label %main.exit.selector, label %L11

[...]

vector.body:                                      ; preds = %vector.body, %vector.ph
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %offset.idx = shl i64 %index, 3
    %12 = or disjoint i64 %offset.idx, 8
; ││ @ genericmemory.jl:253 within `_setindex!`
    %13 = getelementptr i8, ptr %invariant.gep, i64 %12
    %14 = getelementptr i64, ptr %13, i64 4
    %15 = getelementptr i64, ptr %13, i64 8
    %16 = getelementptr i64, ptr %13, i64 12
    store <4 x i64> <i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408>, ptr %13, align 8
    store <4 x i64> <i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408>, ptr %14, align 8
    store <4 x i64> <i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408>, ptr %15, align 8
    store <4 x i64> <i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408>, ptr %16, align 8
    %index.next = add nuw i64 %index, 16
    %17 = icmp eq i64 %index.next, %n.vec
    br i1 %17, label %L11, label %vector.body

[...]

It'd be nice to have a test to make sure this doesn't regress, once #57380 is merged.

Feb 17 '25 16:02 giordano

@nanosoldier runbenchmarks(!"scalar", vs=":master")

Feb 19 '25 19:02 gbaraldi

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

Feb 20 '25 06:02 nanosoldier