julia
julia copied to clipboard
Regression in performance of copying small vectors w/for loop
From my small copying loops benchmarking in #45487, I noticed that copying small vectors of 1-5 elements with a for loop is slower on the nightly build than in 1.10:
(Though, awesome that the larger array for loops are now using SIMD automatically!)
It seems like the extra overhead comes from this new line:
https://github.com/JuliaLang/julia/blob/8eaf83c036f0962fca5e2d6a1bbc8eb48e6e5444/base/essentials.jl#L891
As array size increases, the relative cost of this line of code decreases
not trying to minimize the issue, but just noting that if one finds themselves with many small vectors and a need for speed, probably a Tuple
or SVector
is a more appropriate type than Vector
I'm pretty sure that that was there in the arrayref
intrinsic previously. Is the regression that it no longer is proving that the indexing is inbounds?
not trying to minimize the issue, but just noting that if one finds themselves with many small vectors and a need for speed, probably a
Tuple
orSVector
is a more appropriate type thanVector
Sure, not the end of the world! :-)
I'm pretty sure that that was there in the
arrayref
intrinsic previously. Is the regression that it no longer is proving that the indexing is inbounds?
Perhaps the attribution isn't quite right, and it's some other overhead from the compiled code. The generated native code in nightly is 403 lines vs. 269 lines.
Non-SIMD loop (13 instructions):
.LBB0_9: # %L76
# =>This Loop Header: Depth=1
# Child Loop BB0_10 Depth 2
xor esi, esi
.p2align 4, 0x90
.LBB0_10: # %L93
# Parent Loop BB0_9 Depth=1
# => This Inner Loop Header: Depth=2
; │ @ REPL[9]:6 within `forloopcopy`
; │┌ @ essentials.jl:13 within `getindex`
cmp rcx, rsi
je .LBB0_11
# %bb.14: # %idxend
# in Loop: Header=BB0_10 Depth=2
; │└
; │┌ @ array.jl:1021 within `setindex!`
cmp rsi, rbx
jae .LBB0_16
# %bb.15: # %idxend33
# in Loop: Header=BB0_10 Depth=2
; │└
; │┌ @ essentials.jl:13 within `getindex`
mov rax, qword ptr [r13]
vmovsd xmm0, qword ptr [rax + 8*rsi] # xmm0 = mem[0],zero
; │└
; │┌ @ array.jl:1021 within `setindex!`
mov rax, qword ptr [rdi]
vmovsd qword ptr [rax + 8*rsi], xmm0
; │└
; │ @ REPL[9]:7 within `forloopcopy`
; │┌ @ range.jl:901 within `iterate`
; ││┌ @ promotion.jl:521 within `==`
inc rsi
cmp rdx, rsi
; │└└
jne .LBB0_10
# %bb.12: # %L108.loopexit
# in Loop: Header=BB0_9 Depth=1
SIMD loop (65 instructions):
.LBB0_16: # %L120
# =>This Loop Header: Depth=1
# Child Loop BB0_20 Depth 2
# Child Loop BB0_22 Depth 2
mov rbx, qword ptr [r15]
mov r11, qword ptr [r15 + 16]
mov rax, qword ptr [rbp - 64] # 8-byte Reload
mov r14, qword ptr [rax]
cmp r8, r11
mov rcx, r11
cmovb rcx, r8
cmp rcx, rdx
cmovae rcx, rdx
lea rax, [rcx + 1]
mov r13d, 1
cmp rax, 17
jb .LBB0_21
# %bb.17: # %vector.memcheck
# in Loop: Header=BB0_16 Depth=1
lea r10, [r14 + 8*rcx]
add r10, 8
cmp rbx, r10
jae .LBB0_19
# %bb.18: # %vector.memcheck
# in Loop: Header=BB0_16 Depth=1
lea rcx, [rbx + 8*rcx]
add rcx, 8
cmp r14, rcx
jb .LBB0_21
.LBB0_19: # %vector.ph
# in Loop: Header=BB0_16 Depth=1
mov r10d, eax
and r10d, 15
mov ecx, 16
cmove r10, rcx
mov rcx, rax
sub rcx, r10
neg r10
lea r13, [rax + r10]
inc r13
xor eax, eax
.p2align 4, 0x90
.LBB0_20: # %vector.body
# Parent Loop BB0_16 Depth=1
# => This Inner Loop Header: Depth=2
; ││ @ essentials.jl:892 within `getindex`
vmovups ymm0, ymmword ptr [r14 + 8*rax]
vmovups ymm1, ymmword ptr [r14 + 8*rax + 32]
vmovups ymm2, ymmword ptr [r14 + 8*rax + 64]
vmovups ymm3, ymmword ptr [r14 + 8*rax + 96]
; │└
; │┌ @ array.jl:976 within `setindex!`
vmovups ymmword ptr [rbx + 8*rax], ymm0
vmovups ymmword ptr [rbx + 8*rax + 32], ymm1
vmovups ymmword ptr [rbx + 8*rax + 64], ymm2
vmovups ymmword ptr [rbx + 8*rax + 96], ymm3
add rax, 16
cmp rcx, rax
jne .LBB0_20
.LBB0_21: # %scalar.ph
# in Loop: Header=BB0_16 Depth=1
; │└
; │┌ @ essentials.jl:891 within `getindex`
lea r10, [r11 + 1]
mov rax, rsi
sub rax, r13
sub r11, r13
inc r11
mov rcx, rdi
sub rcx, r13
lea rbx, [rbx + 8*r13]
add rbx, -8
lea r14, [r14 + 8*r13]
add r14, -8
xor r13d, r13d
.p2align 4, 0x90
.LBB0_22: # %L137
# Parent Loop BB0_16 Depth=1
# => This Inner Loop Header: Depth=2
cmp rcx, r13
je .LBB0_27
# %bb.23: # %L153
# in Loop: Header=BB0_22 Depth=2
; │└
; │┌ @ array.jl:975 within `setindex!`
; ││┌ @ int.jl:513 within `<`
cmp r11, r13
; ││└
je .LBB0_28
# %bb.24: # %L171
# in Loop: Header=BB0_22 Depth=2
; │└
; │┌ @ essentials.jl:892 within `getindex`
vmovsd xmm0, qword ptr [r14 + 8*r13] # xmm0 = mem[0],zero
; │└
; │┌ @ array.jl:976 within `setindex!`
vmovsd qword ptr [rbx + 8*r13], xmm0
; │└
; │ @ REPL[6]:7 within `forloopcopy`
; │┌ @ range.jl:902 within `iterate`
; ││┌ @ promotion.jl:635 within `==`
inc r13
cmp rax, r13
; │└└
jne .LBB0_22
# %bb.25: # %L186.loopexit
# in Loop: Header=BB0_16 Depth=1
It's possible that the slowdown is just the check to see if the vector is long enough to do the vector version.
Oh I see, yes - there's a SIMD loop and then a non-SIMD loop in the SIMD code from 1.11.
But the non-SIMD version in the SIMD code has an extra 12 instructions, attributed to the bounds check line (essentials.jl:891) that doesn't appear in the 1.10 non-SIMD-only code. So perhaps you're right that the issue is this bounds check was able to be elided in 1.10 but no longer is here?