Strange behavior in big table benchmark
(Currently this behaviour is only observed in the big-table benchmark.)
When I remove #[inline] from the Buffer::reserve_internal() method, the call to the itoap::write_u64 function becomes an indirect call (through a function address held in a register), which makes the program 30% slower.
This issue prevents me from removing #[inline] from the reserve_internal() method, which results in a slightly larger binary.
Full Assembly Output
<sailfish_research::BigTable as sailfish::TemplateOnce>::render_once_to_string:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 56
mov r12, rdx
mov r13, rsi
mov qword, ptr, [rsp, +, 48], rdi
mov rax, qword, ptr, [rcx]
movdqu xmm0, xmmword, ptr, [rcx, +, 8]
pshufd xmm0, xmm0, 78
mov qword, ptr, [rcx], 1
xorps xmm1, xmm1
mov qword, ptr, [rsp, +, 24], rcx
movups xmmword, ptr, [rcx, +, 8], xmm1
mov qword, ptr, [rsp], rax
movdqu xmmword, ptr, [rsp, +, 8], xmm0
mov rsi, qword, ptr, [rip, +, _ZN70_$LT$sailfish_research..BigTable$u20$as$u20$sailfish..TemplateOnce$GT$21render_once_to_string9SIZE_HINT17h2be0f08b3976f89cE]
mov rax, qword, ptr, [rsp, +, 8]
mov rcx, qword, ptr, [rsp, +, 16]
lea rdx, [rax, +, rsi]
cmp rdx, rcx
ja .LBB10_1
lea rdx, [rax, +, 8]
cmp rdx, rcx
ja .LBB10_4
.LBB10_6:
mov rcx, qword, ptr, [rsp]
movabs rdx, 738138905116767292
mov qword, ptr, [rcx, +, rax], rdx
mov rcx, qword, ptr, [rsp, +, 8]
lea rax, [rcx, +, 8]
mov qword, ptr, [rsp, +, 8], rax
add rcx, 12
cmp rcx, qword, ptr, [rsp, +, 16]
ja .LBB10_7
.LBB10_9:
mov rcx, qword, ptr, [rsp]
mov dword, ptr, [rcx, +, rax], 1047688252
mov rax, qword, ptr, [rsp, +, 8]
add rax, 4
mov qword, ptr, [rsp, +, 8], rax
shl r12, 3
lea rcx, [r12, +, 2*r12]
test rcx, rcx
je .LBB10_28
add rcx, r13
mov r12, qword, ptr, [rip, +, _ZN5itoap9write_u6417hc82e35bf780e3b54E@GOTPCREL]
mov qword, ptr, [rsp, +, 32], rcx
.LBB10_11:
lea rcx, [rax, +, 4]
cmp rcx, qword, ptr, [rsp, +, 16]
ja .LBB10_12
.LBB10_14:
mov rcx, qword, ptr, [rsp]
mov dword, ptr, [rcx, +, rax], 1046770748
mov rsi, qword, ptr, [rsp, +, 8]
add rsi, 4
mov qword, ptr, [rsp, +, 8], rsi
mov r14, qword, ptr, [r13, +, 16]
test r14, r14
je .LBB10_24
mov r15, qword, ptr, [r13]
shl r14, 3
xor ebp, ebp
.LBB10_16:
mov rbx, qword, ptr, [r15, +, rbp]
lea rax, [rsi, +, 20]
cmp rax, qword, ptr, [rsp, +, 16]
ja .LBB10_17
.LBB10_19:
add rsi, qword, ptr, [rsp]
mov rdi, rbx
call r12
mov rdx, qword, ptr, [rsp, +, 8]
lea rcx, [rdx, +, rax]
mov qword, ptr, [rsp, +, 8], rcx
add rax, rdx
add rax, 9
cmp rax, qword, ptr, [rsp, +, 16]
ja .LBB10_21
.LBB10_23:
mov rax, qword, ptr, [rsp]
movzx edx, byte, ptr, [rip, +, .Lanon.28d6b14a742276b51f6db9706e341907.9+8]
mov byte, ptr, [rax, +, rcx, +, 8], dl
mov rdx, qword, ptr, [rip, +, .Lanon.28d6b14a742276b51f6db9706e341907.9]
mov qword, ptr, [rax, +, rcx], rdx
mov rsi, qword, ptr, [rsp, +, 8]
add rsi, 9
mov qword, ptr, [rsp, +, 8], rsi
add rbp, 8
cmp r14, rbp
jne .LBB10_16
jmp .LBB10_24
.LBB10_17:
mov esi, 20
mov rdi, rsp
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rsi, qword, ptr, [rsp, +, 8]
jmp .LBB10_19
.LBB10_21:
mov esi, 9
mov rdi, rsp
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rcx, qword, ptr, [rsp, +, 8]
jmp .LBB10_23
.LBB10_24:
lea rax, [rsi, -, 4]
mov qword, ptr, [rsp, +, 8], rax
add rsi, 5
cmp rsi, qword, ptr, [rsp, +, 16]
ja .LBB10_25
.LBB10_27:
add r13, 24
mov rcx, qword, ptr, [rsp]
mov dl, byte, ptr, [rip, +, .Lanon.28d6b14a742276b51f6db9706e341907.10+8]
mov byte, ptr, [rcx, +, rax, +, 8], dl
mov rdx, qword, ptr, [rip, +, .Lanon.28d6b14a742276b51f6db9706e341907.10]
mov qword, ptr, [rcx, +, rax], rdx
mov rax, qword, ptr, [rsp, +, 8]
add rax, 9
mov qword, ptr, [rsp, +, 8], rax
mov rcx, qword, ptr, [rsp, +, 32]
cmp r13, rcx
jne .LBB10_11
jmp .LBB10_28
.LBB10_12:
mov esi, 4
mov rdi, rsp
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rax, qword, ptr, [rsp, +, 8]
jmp .LBB10_14
.LBB10_25:
mov esi, 9
mov rdi, rsp
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rax, qword, ptr, [rsp, +, 8]
jmp .LBB10_27
.LBB10_28:
lea rcx, [rax, -, 4]
mov qword, ptr, [rsp, +, 8], rcx
add rax, 5
cmp rax, qword, ptr, [rsp, +, 16]
ja .LBB10_29
.LBB10_31:
mov rax, qword, ptr, [rsp]
mov dl, byte, ptr, [rip, +, .Lanon.28d6b14a742276b51f6db9706e341907.11+8]
mov byte, ptr, [rax, +, rcx, +, 8], dl
mov rdx, qword, ptr, [rip, +, .Lanon.28d6b14a742276b51f6db9706e341907.11]
mov qword, ptr, [rax, +, rcx], rdx
mov rax, qword, ptr, [rsp, +, 8]
lea rcx, [rax, +, 9]
mov qword, ptr, [rsp, +, 8], rcx
shr rcx, 2
add rax, rcx
add rax, 9
mov rcx, qword, ptr, [rip, +, _ZN70_$LT$sailfish_research..BigTable$u20$as$u20$sailfish..TemplateOnce$GT$21render_once_to_string9SIZE_HINT17h2be0f08b3976f89cE]
cmp rcx, rax
jae .LBB10_33
mov qword, ptr, [rip, +, _ZN70_$LT$sailfish_research..BigTable$u20$as$u20$sailfish..TemplateOnce$GT$21render_once_to_string9SIZE_HINT17h2be0f08b3976f89cE], rax
.LBB10_33:
mov rbx, qword, ptr, [rsp]
movdqu xmm0, xmmword, ptr, [rsp, +, 8]
pshufd xmm0, xmm0, 78
mov rax, qword, ptr, [rsp, +, 24]
mov rdi, qword, ptr, [rax]
test rdi, rdi
je .LBB10_36
mov rsi, qword, ptr, [rax, +, 8]
test rsi, rsi
je .LBB10_36
mov edx, 1
movdqa xmmword, ptr, [rsp, +, 32], xmm0
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
movdqa xmm0, xmmword, ptr, [rsp, +, 32]
mov rax, qword, ptr, [rsp, +, 24]
.LBB10_36:
mov qword, ptr, [rax], rbx
movdqu xmmword, ptr, [rax, +, 8], xmm0
mov rax, qword, ptr, [rsp, +, 48]
mov qword, ptr, [rax], 0
add rsp, 56
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB10_1:
mov rdi, rsp
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rax, qword, ptr, [rsp, +, 8]
mov rcx, qword, ptr, [rsp, +, 16]
lea rdx, [rax, +, 8]
cmp rdx, rcx
jbe .LBB10_6
.LBB10_4:
mov rdi, rsp
mov esi, 8
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rax, qword, ptr, [rsp, +, 8]
jmp .LBB10_6
.LBB10_7:
mov rdi, rsp
mov esi, 4
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rax, qword, ptr, [rsp, +, 8]
jmp .LBB10_9
.LBB10_29:
mov rdi, rsp
mov esi, 9
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer6Buffer16reserve_internal17h3a4aa4ca459d2714E@GOTPCREL]
mov rcx, qword, ptr, [rsp, +, 8]
jmp .LBB10_31
.LBB10_39:
jmp .LBB10_40
.LBB10_38:
jmp .LBB10_40
.LBB10_37:
.LBB10_40:
mov rbx, rax
mov rdi, rsp
call core::ptr::drop_in_place
mov rdi, rbx
call _Unwind_Resume
ud2
@botika I appreciate your tips, though they have nothing to do with this issue.
Your experiments examine how the compiler can optimize a fixed buffer. However, this issue is about the indirect call generated by the compiler.
I found that the enhanced for-loop optimization can cause a significant performance drop (~40%) in the big-table benchmark. That leaves me even more confused.
Previous compiler output
{
__sf_rt::render_text!(__sf_buf, "<table>\n");
{
__sf_rt::render_text!(__sf_buf, "<tr>");
for row in table {
{
__sf_rt::render_text!(__sf_buf, "<td>");
for col in row {
__sf_rt::render_escaped!(__sf_buf, *col);
__sf_rt::render_text!(__sf_buf, "</td><td>");
}
unsafe {
__sf_buf._set_len(__sf_buf.len() - 4usize);
}
}
__sf_rt::render_text!(__sf_buf, "</tr><tr>");
}
unsafe {
__sf_buf._set_len(__sf_buf.len() - 4usize);
}
}
__sf_rt::render_text!(__sf_buf, "\n</table>");
}
perf/join-after-optimize branch (~40% slower)
{
__sf_rt::render_text!(__sf_buf, "<table>\n<tr><td>");
for row in table {
for col in row {
__sf_rt::render_escaped!(__sf_buf, *col);
__sf_rt::render_text!(__sf_buf, "</td><td>");
}
unsafe {
__sf_buf._set_len(__sf_buf.len() - 4usize);
}
__sf_rt::render_text!(__sf_buf, "</tr><tr><td>");
}
unsafe {
__sf_buf._set_len(__sf_buf.len() - 8usize);
}
__sf_rt::render_text!(__sf_buf, "\n</table>");
}
Full Assembly Output
all::big_table:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 120
mov qword, ptr, [rsp, +, 88], rdi
mov r12d, 8
mov edi, 2400
mov esi, 8
call qword, ptr, [rip, +, __rust_alloc@GOTPCREL]
test rax, rax
je .LBB20_1
mov qword, ptr, [rsp, +, 40], rax
vmovaps xmm0, xmmword, ptr, [rip, +, .LCPI20_0]
vmovups xmmword, ptr, [rsp, +, 48], xmm0
mov r13d, 100
mov ebp, 4
xor r15d, r15d
jmp .LBB20_49
.LBB20_52:
mov rcx, qword, ptr, [rsp, +, 40]
mov r14, qword, ptr, [rsp, +, 32]
.LBB20_53:
lea rax, [r15, +, 2*r15]
mov rdx, qword, ptr, [rsp, +, 16]
mov qword, ptr, [rcx, +, 8*rax, +, 16], rdx
vmovaps xmm0, xmmword, ptr, [rsp]
vmovups xmmword, ptr, [rcx, +, 8*rax], xmm0
mov r15, r14
cmp r14d, 100
je .LBB20_3
.LBB20_49:
mov edi, 800
mov esi, 8
call qword, ptr, [rip, +, __rust_alloc@GOTPCREL]
test rax, rax
je .LBB20_50
mov rdi, rax
lea rax, [r15, +, 1]
mov qword, ptr, [rsp, +, 32], rax
mov qword, ptr, [rsp, +, 64], rdi
vmovaps xmm0, xmmword, ptr, [rip, +, .LCPI20_0]
vmovups xmmword, ptr, [rsp, +, 72], xmm0
mov esi, 100
xor r14d, r14d
jmp .LBB20_71
.LBB20_75:
mov rdx, rcx
mov rcx, rbx
call qword, ptr, [rip, +, __rust_realloc@GOTPCREL]
.LBB20_79:
mov rcx, rax
.LBB20_80:
test rcx, rcx
je .LBB20_84
mov qword, ptr, [rsp, +, 64], rcx
shr rbx, 3
mov rdi, rcx
mov rsi, rbx
.LBB20_82:
mov qword, ptr, [rdi, +, 8*r14], r14
lea rax, [r14, +, 1]
mov r14, rax
cmp rax, 100
je .LBB20_51
.LBB20_71:
cmp r14, rsi
jne .LBB20_82
lea rax, [rsi, +, 1]
lea rcx, [rsi, +, rsi]
cmp rcx, rax
cmova rax, rcx
cmp rax, 4
cmovbe rax, rbp
xor ecx, ecx
mul r12
mov rbx, rax
seto al
setno cl
shl rcx, 3
test rsi, rsi
je .LBB20_76
test al, al
jne .LBB20_83
shl rsi, 3
test rsi, rsi
jne .LBB20_75
jmp .LBB20_77
.LBB20_76:
test al, al
jne .LBB20_83
.LBB20_77:
test rbx, rbx
je .LBB20_80
mov rdi, rbx
mov rsi, rcx
call qword, ptr, [rip, +, __rust_alloc@GOTPCREL]
jmp .LBB20_79
.LBB20_51:
mov qword, ptr, [rsp, +, 80], 100
mov qword, ptr, [rsp, +, 72], rsi
mov rax, qword, ptr, [rsp, +, 64]
mov qword, ptr, [rsp], rax
mov rax, qword, ptr, [rsp, +, 72]
mov qword, ptr, [rsp, +, 8], rax
mov qword, ptr, [rsp, +, 16], 100
cmp r15, r13
jne .LBB20_52
mov rax, r13
inc rax
mov r14, qword, ptr, [rsp, +, 32]
je .LBB20_55
mov rcx, r13
add rcx, r13
cmp rcx, rax
cmova rax, rcx
cmp rax, 4
cmovbe rax, rbp
xor ecx, ecx
mov edx, 24
mul rdx
mov rbx, rax
seto al
setno cl
shl rcx, 3
test r13, r13
je .LBB20_63
test al, al
jne .LBB20_56
mov rdi, qword, ptr, [rsp, +, 40]
test rdi, rdi
je .LBB20_61
shl r13, 3
lea rsi, [2*r13]
add rsi, r13
test rsi, rsi
je .LBB20_61
mov rdx, rcx
mov rcx, rbx
call qword, ptr, [rip, +, __rust_realloc@GOTPCREL]
jmp .LBB20_67
.LBB20_63:
test al, al
jne .LBB20_64
.LBB20_61:
test rbx, rbx
je .LBB20_68
mov rdi, rbx
mov rsi, rcx
call qword, ptr, [rip, +, __rust_alloc@GOTPCREL]
.LBB20_67:
mov rcx, rax
.LBB20_68:
test rcx, rcx
je .LBB20_84
mov qword, ptr, [rsp, +, 40], rcx
mov rdx, rbx
movabs rax, -6148914691236517205
mulx r13, r13, rax
shr r13, 4
jmp .LBB20_53
.LBB20_3:
mov qword, ptr, [rsp, +, 56], r14
mov qword, ptr, [rsp, +, 48], r13
call qword, ptr, [rip, +, _ZN3std4time7Instant3now17hb624d5f02fff610dE@GOTPCREL]
mov qword, ptr, [rsp, +, 64], rax
mov qword, ptr, [rsp, +, 72], rdx
mov rax, qword, ptr, [rsp, +, 88]
mov rax, qword, ptr, [rax]
mov qword, ptr, [rsp, +, 104], rax
test rax, rax
je .LBB20_85
xor ebp, ebp
.LBB20_6:
mov qword, ptr, [rsp], 1
lea rax, [rsp, +, 8]
vxorps xmm0, xmm0, xmm0
vmovups xmmword, ptr, [rax], xmm0
mov r15, qword, ptr, [rsp, +, 40]
mov r12, qword, ptr, [rip, +, _ZN56_$LT$all..BigTable$u20$as$u20$sailfish..TemplateOnce$GT$11render_once9SIZE_HINT17h7471bb5c60182121E]
mov rcx, qword, ptr, [rsp, +, 8]
mov rsi, qword, ptr, [rsp, +, 16]
mov rdi, rsi
sub rdi, rcx
cmp rdi, r12
jae .LBB20_7
lea rax, [rsi, +, rsi]
add r12, rsi
cmp rax, r12
cmova r12, rax
mov rdi, qword, ptr, [rsp]
mov rdx, r12
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer12safe_realloc17hefa6669a8d0a53bfE@GOTPCREL]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 16], r12
mov rcx, qword, ptr, [rsp, +, 8]
mov rdi, r12
sub rdi, rcx
cmp rdi, 16
jb .LBB20_16
.LBB20_15:
mov rax, qword, ptr, [rsp]
jmp .LBB20_18
.LBB20_7:
mov r12, rsi
cmp rdi, 16
jae .LBB20_15
.LBB20_16:
lea rax, [r12, +, r12]
lea rbx, [r12, +, 16]
cmp rax, rbx
cmova rbx, rax
mov rdi, qword, ptr, [rsp]
mov rsi, r12
mov rdx, rbx
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer12safe_realloc17hefa6669a8d0a53bfE@GOTPCREL]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 16], rbx
mov rcx, qword, ptr, [rsp, +, 8]
mov r12, rbx
.LBB20_18:
mov qword, ptr, [rsp, +, 96], rbp
vmovups xmm0, xmmword, ptr, [rip, +, .L__unnamed_2]
vmovups xmmword, ptr, [rax, +, rcx], xmm0
add rcx, 16
mov qword, ptr, [rsp, +, 8], rcx
test r14, r14
je .LBB20_19
lea rdx, [r14, +, 2*r14]
mov rsi, r15
lea rdx, [r15, +, 8*rdx]
mov qword, ptr, [rsp, +, 112], rdx
jmp .LBB20_23
.LBB20_41:
mov r15, r12
mov rsi, qword, ptr, [rsp, +, 32]
.LBB20_44:
add rsi, 24
mov rdx, qword, ptr, [rip, +, .L__unnamed_3+5]
mov qword, ptr, [rax, +, rcx, +, 5], rdx
mov rdx, qword, ptr, [rip, +, .L__unnamed_3]
mov qword, ptr, [rax, +, rcx], rdx
add rcx, 13
mov qword, ptr, [rsp, +, 8], rcx
cmp rsi, qword, ptr, [rsp, +, 112]
je .LBB20_20
.LBB20_23:
mov r13, qword, ptr, [rsi, +, 16]
test r13, r13
mov qword, ptr, [rsp, +, 32], rsi
je .LBB20_40
mov rbp, qword, ptr, [rsi]
shl r13, 3
xor r14d, r14d
jmp .LBB20_25
.LBB20_30:
mov rax, qword, ptr, [rsp]
.LBB20_47:
movzx edx, byte, ptr, [rip, +, .L__unnamed_4+8]
mov byte, ptr, [rax, +, rcx, +, 8], dl
mov rdx, qword, ptr, [rip, +, .L__unnamed_4]
mov qword, ptr, [rax, +, rcx], rdx
add rcx, 9
mov qword, ptr, [rsp, +, 8], rcx
add r14, 8
cmp r13, r14
je .LBB20_40
.LBB20_25:
mov r15, qword, ptr, [rbp, +, r14]
lea rdx, [rcx, +, 20]
cmp rdx, r12
jbe .LBB20_28
lea rcx, [r12, +, r12]
lea rbx, [r12, +, 20]
cmp rcx, rbx
cmova rbx, rcx
mov rdi, rax
mov rsi, r12
mov rdx, rbx
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer12safe_realloc17hefa6669a8d0a53bfE@GOTPCREL]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 16], rbx
mov rcx, qword, ptr, [rsp, +, 8]
.LBB20_28:
add rax, rcx
mov rdi, r15
mov rsi, rax
call qword, ptr, [rip, +, _ZN5itoap9write_u6417hb594a3d8c0958d6cE@GOTPCREL]
mov rcx, rax
add rcx, qword, ptr, [rsp, +, 8]
mov qword, ptr, [rsp, +, 8], rcx
mov r12, qword, ptr, [rsp, +, 16]
mov rax, r12
sub rax, rcx
cmp rax, 9
jae .LBB20_30
lea rax, [r12, +, r12]
lea rbx, [r12, +, 9]
cmp rax, rbx
cmova rbx, rax
mov rdi, qword, ptr, [rsp]
mov rsi, r12
mov rdx, rbx
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer12safe_realloc17hefa6669a8d0a53bfE@GOTPCREL]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 16], rbx
mov rcx, qword, ptr, [rsp, +, 8]
mov r12, rbx
jmp .LBB20_47
.LBB20_40:
add rcx, -4
mov qword, ptr, [rsp, +, 8], rcx
mov rdx, r12
sub rdx, rcx
cmp rdx, 12
ja .LBB20_41
lea rcx, [r12, +, r12]
lea r15, [r12, +, 13]
cmp rcx, r15
cmova r15, rcx
mov rdi, rax
mov rsi, r12
mov rdx, r15
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer12safe_realloc17hefa6669a8d0a53bfE@GOTPCREL]
mov rsi, qword, ptr, [rsp, +, 32]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 16], r15
mov rcx, qword, ptr, [rsp, +, 8]
mov r12, r15
jmp .LBB20_44
.LBB20_19:
mov r15, r12
.LBB20_20:
add rcx, -8
mov qword, ptr, [rsp, +, 8], rcx
mov rdx, r15
sub rdx, rcx
cmp rdx, 8
ja .LBB20_21
lea rcx, [r15, +, r15]
lea rbx, [r15, +, 9]
cmp rcx, rbx
cmova rbx, rcx
mov rdi, rax
mov rsi, r15
mov rdx, rbx
call qword, ptr, [rip, +, _ZN8sailfish7runtime6buffer12safe_realloc17hefa6669a8d0a53bfE@GOTPCREL]
mov rbp, qword, ptr, [rsp, +, 96]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 16], rbx
mov rcx, qword, ptr, [rsp, +, 8]
jmp .LBB20_33
.LBB20_21:
mov rbp, qword, ptr, [rsp, +, 96]
.LBB20_33:
mov dl, byte, ptr, [rip, +, .L__unnamed_5+8]
mov byte, ptr, [rax, +, rcx, +, 8], dl
mov rdx, qword, ptr, [rip, +, .L__unnamed_5]
mov qword, ptr, [rax, +, rcx], rdx
lea rax, [rcx, +, 9]
mov qword, ptr, [rsp, +, 8], rax
shr rax, 3
add rax, rcx
add rax, 84
mov rcx, qword, ptr, [rip, +, _ZN56_$LT$all..BigTable$u20$as$u20$sailfish..TemplateOnce$GT$11render_once9SIZE_HINT17h7471bb5c60182121E]
cmp rcx, rax
jae .LBB20_35
mov qword, ptr, [rip, +, _ZN56_$LT$all..BigTable$u20$as$u20$sailfish..TemplateOnce$GT$11render_once9SIZE_HINT17h7471bb5c60182121E], rax
.LBB20_35:
inc rbp
mov rax, qword, ptr, [rsp]
vpermilps xmm0, xmmword, ptr, [rsp, +, 8], 78
mov qword, ptr, [rsp], rax
vmovups xmmword, ptr, [rsp, +, 8], xmm0
mov rsi, qword, ptr, [rsp, +, 8]
mov rax, qword, ptr, [rsp, +, 16]
mov rdi, qword, ptr, [rsp]
test rdi, rdi
je .LBB20_38
test rsi, rsi
je .LBB20_38
mov edx, 1
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB20_38:
cmp rbp, qword, ptr, [rsp, +, 104]
je .LBB20_85
mov r14, qword, ptr, [rsp, +, 56]
jmp .LBB20_6
.LBB20_85:
lea rdi, [rsp, +, 64]
call qword, ptr, [rip, +, _ZN3std4time7Instant7elapsed17h0b88ba2cb63ee684E@GOTPCREL]
mov rcx, qword, ptr, [rsp, +, 88]
mov qword, ptr, [rcx, +, 8], rax
mov dword, ptr, [rcx, +, 16], edx
mov r14, qword, ptr, [rsp, +, 40]
mov rax, qword, ptr, [rsp, +, 56]
test rax, rax
je .LBB20_93
lea rax, [rax, +, 2*rax]
lea rbp, [r14, +, 8*rax]
mov r15, qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
mov rbx, r14
jmp .LBB20_88
.LBB20_92:
add rbx, 24
cmp rbx, rbp
je .LBB20_93
.LBB20_88:
mov rsi, qword, ptr, [rbx, +, 8]
test rsi, rsi
je .LBB20_92
mov rdi, qword, ptr, [rbx]
test rdi, rdi
je .LBB20_92
shl rsi, 3
test rsi, rsi
je .LBB20_92
mov edx, 8
call r15
jmp .LBB20_92
.LBB20_93:
mov rax, qword, ptr, [rsp, +, 48]
test rax, rax
je .LBB20_96
shl rax, 3
lea rsi, [rax, +, 2*rax]
test rsi, rsi
je .LBB20_96
mov edx, 8
mov rdi, r14
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB20_96:
add rsp, 120
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB20_84:
mov esi, 8
mov rdi, rbx
call qword, ptr, [rip, +, _ZN5alloc5alloc18handle_alloc_error17hb12bf6f6b6f1927aE@GOTPCREL]
ud2
.LBB20_83:
mov qword, ptr, [rsp, +, 56], r15
mov qword, ptr, [rsp, +, 48], r13
mov qword, ptr, [rsp, +, 80], rsi
mov qword, ptr, [rsp, +, 72], rsi
call qword, ptr, [rip, +, _ZN5alloc7raw_vec17capacity_overflow17h35f3dcdf1e879e98E@GOTPCREL]
.LBB20_57:
ud2
.LBB20_50:
mov edi, 800
mov esi, 8
call qword, ptr, [rip, +, _ZN5alloc5alloc18handle_alloc_error17hb12bf6f6b6f1927aE@GOTPCREL]
ud2
.LBB20_55:
mov r13, -1
jmp .LBB20_56
.LBB20_64:
xor r13d, r13d
.LBB20_56:
mov qword, ptr, [rsp, +, 56], r13
mov qword, ptr, [rsp, +, 48], r13
call qword, ptr, [rip, +, _ZN5alloc7raw_vec17capacity_overflow17h35f3dcdf1e879e98E@GOTPCREL]
jmp .LBB20_57
.LBB20_1:
mov edi, 2400
mov esi, 8
call qword, ptr, [rip, +, _ZN5alloc5alloc18handle_alloc_error17hb12bf6f6b6f1927aE@GOTPCREL]
ud2
.LBB20_100:
mov rbx, rax
mov rdi, rsp
call core::ptr::drop_in_place
jmp .LBB20_99
.LBB20_97:
mov rbx, rax
lea rdi, [rsp, +, 64]
call core::ptr::drop_in_place
jmp .LBB20_99
.LBB20_10:
jmp .LBB20_11
.LBB20_9:
jmp .LBB20_11
.LBB20_8:
.LBB20_11:
mov rbx, rax
mov rdi, rsp
call core::ptr::drop_in_place
.LBB20_99:
lea rdi, [rsp, +, 40]
call core::ptr::drop_in_place
mov rdi, rbx
call _Unwind_Resume
ud2
.LBB20_98:
mov rbx, rax
jmp .LBB20_99
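`.LBB10_16` is where the loop starts. `r12` is set to `qword ptr [rip + _ZN5itoap9write_u6417hc82e35bf780e3b54E@GOTPCREL]` before the loop, so that each of the n iterations can `call r12` directly instead of repeating the `rip + ...` address computation and the memory load.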
@botika It isn't that simple. Here are the objdump results for the master and perf/join-after-optimize branches.
master branch
00000000000167f0 <<all::BigTable as sailfish::TemplateOnce>::render_once>:
167f0: 55 push rbp
167f1: 41 57 push r15
167f3: 41 56 push r14
167f5: 41 55 push r13
167f7: 41 54 push r12
167f9: 53 push rbx
167fa: 48 83 ec 38 sub rsp,0x38
167fe: 48 c7 44 24 08 01 00 mov QWORD PTR [rsp+0x8],0x1
16805: 00 00
16807: c5 f8 57 c0 vxorps xmm0,xmm0,xmm0
1680b: c5 f8 11 44 24 10 vmovups XMMWORD PTR [rsp+0x10],xmm0
16811: 49 89 d7 mov r15,rdx
16814: 48 89 f5 mov rbp,rsi
16817: 49 89 fe mov r14,rdi
1681a: 4c 8b 25 87 0e 04 00 mov r12,QWORD PTR [rip+0x40e87] # 576a8 <<all::BigTable as sailfish::TemplateOnce>::render_once::SIZE_HINT>
16821: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16826: 48 8b 74 24 18 mov rsi,QWORD PTR [rsp+0x18]
1682b: 48 89 f0 mov rax,rsi
1682e: 48 29 c8 sub rax,rcx
16831: 4c 39 e0 cmp rax,r12
16834: 73 3e jae 16874 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x84>
16836: 48 8d 04 36 lea rax,[rsi+rsi*1]
1683a: 49 01 f4 add r12,rsi
1683d: 4c 39 e0 cmp rax,r12
16840: 4c 0f 47 e0 cmova r12,rax
16844: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
16849: 4c 89 e2 mov rdx,r12
1684c: ff 15 ae f6 03 00 call QWORD PTR [rip+0x3f6ae] # 55f00 <_DYNAMIC+0x240>
16852: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16857: 4c 89 64 24 18 mov QWORD PTR [rsp+0x18],r12
1685c: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16861: 4c 89 e0 mov rax,r12
16864: 48 29 c8 sub rax,rcx
16867: 48 83 f8 08 cmp rax,0x8
1686b: 72 10 jb 1687d <<all::BigTable as sailfish::TemplateOnce>::render_once+0x8d>
1686d: 48 8b 44 24 08 mov rax,QWORD PTR [rsp+0x8]
16872: eb 39 jmp 168ad <<all::BigTable as sailfish::TemplateOnce>::render_once+0xbd>
16874: 49 89 f4 mov r12,rsi
16877: 48 83 f8 08 cmp rax,0x8
1687b: 73 f0 jae 1686d <<all::BigTable as sailfish::TemplateOnce>::render_once+0x7d>
1687d: 4b 8d 04 24 lea rax,[r12+r12*1]
16881: 49 8d 5c 24 08 lea rbx,[r12+0x8]
16886: 48 39 d8 cmp rax,rbx
16889: 48 0f 47 d8 cmova rbx,rax
1688d: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
16892: 4c 89 e6 mov rsi,r12
16895: 48 89 da mov rdx,rbx
16898: ff 15 62 f6 03 00 call QWORD PTR [rip+0x3f662] # 55f00 <_DYNAMIC+0x240>
1689e: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
168a3: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
168a8: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
168ad: 48 ba 3c 74 61 62 6c movabs rdx,0xa3e656c6261743c
168b4: 65 3e 0a
168b7: 48 89 14 08 mov QWORD PTR [rax+rcx*1],rdx
168bb: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
168c0: 48 8b 74 24 18 mov rsi,QWORD PTR [rsp+0x18]
168c5: 48 83 c1 08 add rcx,0x8
168c9: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
168ce: 48 89 f0 mov rax,rsi
168d1: 48 29 c8 sub rax,rcx
168d4: 48 83 f8 04 cmp rax,0x4
168d8: 73 2e jae 16908 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x118>
168da: 48 8d 04 36 lea rax,[rsi+rsi*1]
168de: 48 8d 5e 04 lea rbx,[rsi+0x4]
168e2: 48 39 d8 cmp rax,rbx
168e5: 48 0f 47 d8 cmova rbx,rax
168e9: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
168ee: 48 89 da mov rdx,rbx
168f1: ff 15 09 f6 03 00 call QWORD PTR [rip+0x3f609] # 55f00 <_DYNAMIC+0x240>
168f7: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
168fc: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
16901: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16906: eb 05 jmp 1690d <<all::BigTable as sailfish::TemplateOnce>::render_once+0x11d>
16908: 48 8b 44 24 08 mov rax,QWORD PTR [rsp+0x8]
1690d: 4c 89 74 24 20 mov QWORD PTR [rsp+0x20],r14
16912: c7 04 08 3c 74 72 3e mov DWORD PTR [rax+rcx*1],0x3e72743c
16919: 48 8b 44 24 10 mov rax,QWORD PTR [rsp+0x10]
1691e: 48 8b 74 24 18 mov rsi,QWORD PTR [rsp+0x18]
16923: 48 83 c0 04 add rax,0x4
16927: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
1692c: 49 c1 e7 03 shl r15,0x3
16930: 4b 8d 0c 7f lea rcx,[r15+r15*2]
16934: 48 85 c9 test rcx,rcx
16937: 0f 84 20 02 00 00 je 16b5d <<all::BigTable as sailfish::TemplateOnce>::render_once+0x36d>
1693d: 48 01 e9 add rcx,rbp
16940: 48 89 4c 24 28 mov QWORD PTR [rsp+0x28],rcx
16945: e9 40 00 00 00 jmp 1698a <<all::BigTable as sailfish::TemplateOnce>::render_once+0x19a>
1694a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
16950: 48 8b 4c 24 08 mov rcx,QWORD PTR [rsp+0x8]
16955: 48 8b 6c 24 30 mov rbp,QWORD PTR [rsp+0x30]
1695a: 48 8b 7c 24 28 mov rdi,QWORD PTR [rsp+0x28]
1695f: 48 83 c5 18 add rbp,0x18
16963: 8a 15 5a 1d ff ff mov dl,BYTE PTR [rip+0xffffffffffff1d5a] # 86c3 <GCC_except_table28+0x3af>
16969: 88 54 01 08 mov BYTE PTR [rcx+rax*1+0x8],dl
1696d: 48 8b 15 47 1d ff ff mov rdx,QWORD PTR [rip+0xffffffffffff1d47] # 86bb <GCC_except_table28+0x3a7>
16974: 48 89 14 01 mov QWORD PTR [rcx+rax*1],rdx
16978: 48 83 c0 09 add rax,0x9
1697c: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
16981: 48 39 fd cmp rbp,rdi
16984: 0f 84 d3 01 00 00 je 16b5d <<all::BigTable as sailfish::TemplateOnce>::render_once+0x36d>
1698a: 48 89 f1 mov rcx,rsi
1698d: 48 29 c1 sub rcx,rax
16990: 48 83 f9 04 cmp rcx,0x4
16994: 73 3a jae 169d0 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x1e0>
16996: 48 8d 04 36 lea rax,[rsi+rsi*1]
1699a: 48 8d 5e 04 lea rbx,[rsi+0x4]
1699e: 48 39 d8 cmp rax,rbx
169a1: 48 0f 47 d8 cmova rbx,rax
169a5: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
169aa: 48 89 da mov rdx,rbx
169ad: ff 15 4d f5 03 00 call QWORD PTR [rip+0x3f54d] # 55f00 <_DYNAMIC+0x240>
169b3: 48 89 c1 mov rcx,rax
169b6: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
169bb: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
169c0: 48 8b 44 24 10 mov rax,QWORD PTR [rsp+0x10]
169c5: e9 0b 00 00 00 jmp 169d5 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x1e5>
169ca: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
169d0: 48 8b 4c 24 08 mov rcx,QWORD PTR [rsp+0x8]
169d5: c7 04 01 3c 74 64 3e mov DWORD PTR [rcx+rax*1],0x3e64743c
169dc: 48 8b 44 24 10 mov rax,QWORD PTR [rsp+0x10]
169e1: 48 8b 74 24 18 mov rsi,QWORD PTR [rsp+0x18]
169e6: 48 83 c0 04 add rax,0x4
169ea: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
169ef: 4c 8b 65 10 mov r12,QWORD PTR [rbp+0x10]
169f3: 4d 85 e4 test r12,r12
169f6: 48 89 6c 24 30 mov QWORD PTR [rsp+0x30],rbp
169fb: 0f 84 ff 00 00 00 je 16b00 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x310>
16a01: 4c 8b 6d 00 mov r13,QWORD PTR [rbp+0x0]
16a05: 49 c1 e4 03 shl r12,0x3
16a09: 45 31 f6 xor r14d,r14d
16a0c: 48 8b 2d ed f4 03 00 mov rbp,QWORD PTR [rip+0x3f4ed] # 55f00 <_DYNAMIC+0x240>
16a13: e9 39 00 00 00 jmp 16a51 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x261>
16a18: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
16a1f: 00
16a20: 48 8b 4c 24 08 mov rcx,QWORD PTR [rsp+0x8]
16a25: 0f b6 15 8e 1c ff ff movzx edx,BYTE PTR [rip+0xffffffffffff1c8e] # 86ba <GCC_except_table28+0x3a6>
16a2c: 88 54 01 08 mov BYTE PTR [rcx+rax*1+0x8],dl
16a30: 48 8b 15 7b 1c ff ff mov rdx,QWORD PTR [rip+0xffffffffffff1c7b] # 86b2 <GCC_except_table28+0x39e>
16a37: 48 89 14 01 mov QWORD PTR [rcx+rax*1],rdx
16a3b: 48 83 c0 09 add rax,0x9
16a3f: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
16a44: 49 83 c6 08 add r14,0x8
16a48: 4d 39 f4 cmp r12,r14
16a4b: 0f 84 b6 00 00 00 je 16b07 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x317>
16a51: 4f 8b 7c 35 00 mov r15,QWORD PTR [r13+r14*1+0x0]
16a56: 48 8d 48 14 lea rcx,[rax+0x14]
16a5a: 48 39 f1 cmp rcx,rsi
16a5d: 76 31 jbe 16a90 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x2a0>
16a5f: 48 8d 04 36 lea rax,[rsi+rsi*1]
16a63: 48 8d 5e 14 lea rbx,[rsi+0x14]
16a67: 48 39 d8 cmp rax,rbx
16a6a: 48 0f 47 d8 cmova rbx,rax
16a6e: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
16a73: 48 89 da mov rdx,rbx
16a76: ff d5 call rbp
16a78: 48 89 c6 mov rsi,rax
16a7b: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16a80: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
16a85: 48 8b 44 24 10 mov rax,QWORD PTR [rsp+0x10]
16a8a: e9 06 00 00 00 jmp 16a95 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x2a5>
16a8f: 90 nop
16a90: 48 8b 74 24 08 mov rsi,QWORD PTR [rsp+0x8]
16a95: 48 01 c6 add rsi,rax
16a98: 4c 89 ff mov rdi,r15
16a9b: ff 15 67 f4 03 00 call QWORD PTR [rip+0x3f467] # 55f08 <_DYNAMIC+0x248>
16aa1: 48 03 44 24 10 add rax,QWORD PTR [rsp+0x10]
16aa6: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
16aab: 48 8b 74 24 18 mov rsi,QWORD PTR [rsp+0x18]
16ab0: 48 89 f1 mov rcx,rsi
16ab3: 48 29 c1 sub rcx,rax
16ab6: 48 83 f9 09 cmp rcx,0x9
16aba: 0f 83 60 ff ff ff jae 16a20 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x230>
16ac0: 48 8d 04 36 lea rax,[rsi+rsi*1]
16ac4: 48 8d 5e 09 lea rbx,[rsi+0x9]
16ac8: 48 39 d8 cmp rax,rbx
16acb: 48 0f 47 d8 cmova rbx,rax
16acf: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
16ad4: 48 89 da mov rdx,rbx
16ad7: ff d5 call rbp
16ad9: 48 89 c1 mov rcx,rax
16adc: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16ae1: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
16ae6: 48 8b 44 24 10 mov rax,QWORD PTR [rsp+0x10]
16aeb: 48 89 de mov rsi,rbx
16aee: e9 32 ff ff ff jmp 16a25 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x235>
16af3: 66 66 66 66 2e 0f 1f data16 data16 data16 nop WORD PTR cs:[rax+rax*1+0x0]
16afa: 84 00 00 00 00 00
16b00: 48 8b 2d f9 f3 03 00 mov rbp,QWORD PTR [rip+0x3f3f9] # 55f00 <_DYNAMIC+0x240>
16b07: 48 83 c0 fc add rax,0xfffffffffffffffc
16b0b: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
16b10: 48 89 f1 mov rcx,rsi
16b13: 48 29 c1 sub rcx,rax
16b16: 48 83 f9 09 cmp rcx,0x9
16b1a: 0f 83 30 fe ff ff jae 16950 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x160>
16b20: 48 8d 04 36 lea rax,[rsi+rsi*1]
16b24: 48 8d 5e 09 lea rbx,[rsi+0x9]
16b28: 48 39 d8 cmp rax,rbx
16b2b: 48 0f 47 d8 cmova rbx,rax
16b2f: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
16b34: 48 89 da mov rdx,rbx
16b37: ff d5 call rbp
16b39: 48 8b 6c 24 30 mov rbp,QWORD PTR [rsp+0x30]
16b3e: 48 8b 7c 24 28 mov rdi,QWORD PTR [rsp+0x28]
16b43: 48 89 c1 mov rcx,rax
16b46: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16b4b: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
16b50: 48 8b 44 24 10 mov rax,QWORD PTR [rsp+0x10]
16b55: 48 89 de mov rsi,rbx
16b58: e9 02 fe ff ff jmp 1695f <<all::BigTable as sailfish::TemplateOnce>::render_once+0x16f>
16b5d: 48 83 c0 fc add rax,0xfffffffffffffffc
16b61: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
16b66: 48 89 f1 mov rcx,rsi
16b69: 48 29 c1 sub rcx,rax
16b6c: 48 83 f9 09 cmp rcx,0x9
16b70: 73 36 jae 16ba8 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x3b8>
16b72: 48 8d 04 36 lea rax,[rsi+rsi*1]
16b76: 48 8d 5e 09 lea rbx,[rsi+0x9]
16b7a: 48 39 d8 cmp rax,rbx
16b7d: 48 0f 47 d8 cmova rbx,rax
16b81: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
16b86: 48 89 da mov rdx,rbx
16b89: ff 15 71 f3 03 00 call QWORD PTR [rip+0x3f371] # 55f00 <_DYNAMIC+0x240>
16b8f: 48 8b 74 24 20 mov rsi,QWORD PTR [rsp+0x20]
16b94: 48 89 c1 mov rcx,rax
16b97: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16b9c: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
16ba1: 48 8b 44 24 10 mov rax,QWORD PTR [rsp+0x10]
16ba6: eb 0a jmp 16bb2 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x3c2>
16ba8: 48 8b 4c 24 08 mov rcx,QWORD PTR [rsp+0x8]
16bad: 48 8b 74 24 20 mov rsi,QWORD PTR [rsp+0x20]
16bb2: 8a 15 14 1b ff ff mov dl,BYTE PTR [rip+0xffffffffffff1b14] # 86cc <GCC_except_table28+0x3b8>
16bb8: 88 54 01 08 mov BYTE PTR [rcx+rax*1+0x8],dl
16bbc: 48 8b 15 01 1b ff ff mov rdx,QWORD PTR [rip+0xffffffffffff1b01] # 86c4 <GCC_except_table28+0x3b0>
16bc3: 48 89 14 01 mov QWORD PTR [rcx+rax*1],rdx
16bc7: 48 8d 48 09 lea rcx,[rax+0x9]
16bcb: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
16bd0: 48 c1 e9 03 shr rcx,0x3
16bd4: 48 01 c8 add rax,rcx
16bd7: 48 83 c0 54 add rax,0x54
16bdb: 48 8b 0d c6 0a 04 00 mov rcx,QWORD PTR [rip+0x40ac6] # 576a8 <<all::BigTable as sailfish::TemplateOnce>::render_once::SIZE_HINT>
16be2: 48 39 c1 cmp rcx,rax
16be5: 73 07 jae 16bee <<all::BigTable as sailfish::TemplateOnce>::render_once+0x3fe>
16be7: 48 89 05 ba 0a 04 00 mov QWORD PTR [rip+0x40aba],rax # 576a8 <<all::BigTable as sailfish::TemplateOnce>::render_once::SIZE_HINT>
16bee: 48 8b 44 24 08 mov rax,QWORD PTR [rsp+0x8]
16bf3: c4 e3 79 04 44 24 10 vpermilps xmm0,XMMWORD PTR [rsp+0x10],0x4e
16bfa: 4e
16bfb: 48 89 46 08 mov QWORD PTR [rsi+0x8],rax
16bff: c5 f8 11 46 10 vmovups XMMWORD PTR [rsi+0x10],xmm0
16c04: 48 c7 06 00 00 00 00 mov QWORD PTR [rsi],0x0
16c0b: 48 89 f0 mov rax,rsi
16c0e: 48 83 c4 38 add rsp,0x38
16c12: 5b pop rbx
16c13: 41 5c pop r12
16c15: 41 5d pop r13
16c17: 41 5e pop r14
16c19: 41 5f pop r15
16c1b: 5d pop rbp
16c1c: c3 ret
16c1d: eb 02 jmp 16c21 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x431>
16c1f: eb 00 jmp 16c21 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x431>
16c21: 48 89 c3 mov rbx,rax
16c24: 48 8d 7c 24 08 lea rdi,[rsp+0x8]
16c29: e8 72 f5 ff ff call 161a0 <core::ptr::drop_in_place>
16c2e: 48 89 df mov rdi,rbx
16c31: e8 1a b7 03 00 call 52350 <_Unwind_Resume@plt>
16c36: 0f 0b ud2
16c38: cc int3
16c39: cc int3
16c3a: cc int3
16c3b: cc int3
16c3c: cc int3
16c3d: cc int3
16c3e: cc int3
16c3f: cc int3
perf/join-after-optimize branch
# objdump listing (Intel syntax) of <all::BigTable as sailfish::TemplateOnce>::render_once.
# Stack slots as used below (names are reviewer shorthand inferred from usage -- confirm against the Rust source):
#   [rsp+0x08]  buffer data pointer (passed in rdi to, and overwritten by the result of, the call through 55da0)
#   [rsp+0x10]  number of bytes written so far ("len")
#   [rsp+0x18]  buffer capacity ("cap")
#   [rsp+0x20]  saved pointer to the current outer-loop element
#   [rsp+0x28]  incoming rdi; used at the end as the address the result is stored to
#   [rsp+0x30]  outer-loop end pointer
# Every growth path computes new_cap = max(2*cap, cap + needed) and calls indirectly through the
# slot at 55da0 with rdi=ptr, rsi=cap, rdx=new_cap; presumably a realloc-style routine -- TODO confirm.
00000000000167d0 <<all::BigTable as sailfish::TemplateOnce>::render_once>:
# Prologue: save rbp, r15, r14, r13, r12, rbx and reserve 0x38 bytes of locals.
167d0: 55 push rbp
167d1: 41 57 push r15
167d3: 41 56 push r14
167d5: 41 55 push r13
167d7: 41 54 push r12
167d9: 53 push rbx
167da: 48 83 ec 38 sub rsp,0x38
# Initialise the local buffer: ptr = 1, then 16 zero bytes covering len and cap.
167de: 48 c7 44 24 08 01 00 mov QWORD PTR [rsp+0x8],0x1
167e5: 00 00
167e7: c5 f8 57 c0 vxorps xmm0,xmm0,xmm0
167eb: c5 f8 11 44 24 10 vmovups XMMWORD PTR [rsp+0x10],xmm0
# Stash the three incoming argument registers: rdx -> r12, rsi -> rbp, rdi -> [rsp+0x28].
167f1: 49 89 d4 mov r12,rdx
167f4: 48 89 f5 mov rbp,rsi
167f7: 48 89 7c 24 28 mov QWORD PTR [rsp+0x28],rdi
# If cap - len < SIZE_HINT, grow the buffer to max(2*cap, cap + SIZE_HINT).
167fc: 4c 8b 2d 45 0d 04 00 mov r13,QWORD PTR [rip+0x40d45] # 57548 <<all::BigTable as sailfish::TemplateOnce>::render_once::SIZE_HINT>
16803: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16808: 48 8b 74 24 18 mov rsi,QWORD PTR [rsp+0x18]
1680d: 48 89 f0 mov rax,rsi
16810: 48 29 c8 sub rax,rcx
16813: 4c 39 e8 cmp rax,r13
16816: 73 3e jae 16856 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x86>
16818: 48 8d 04 36 lea rax,[rsi+rsi*1]
1681c: 49 01 f5 add r13,rsi
1681f: 4c 39 e8 cmp rax,r13
16822: 4c 0f 47 e8 cmova r13,rax
16826: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
1682b: 4c 89 ea mov rdx,r13
1682e: ff 15 6c f5 03 00 call QWORD PTR [rip+0x3f56c] # 55da0 <_DYNAMIC+0x240>
# Record the returned pointer and the new capacity, then check that 16 bytes are free.
16834: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16839: 4c 89 6c 24 18 mov QWORD PTR [rsp+0x18],r13
1683e: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16843: 4c 89 e8 mov rax,r13
16846: 48 29 c8 sub rax,rcx
16849: 48 83 f8 10 cmp rax,0x10
1684d: 72 10 jb 1685f <<all::BigTable as sailfish::TemplateOnce>::render_once+0x8f>
1684f: 48 8b 44 24 08 mov rax,QWORD PTR [rsp+0x8]
16854: eb 3d jmp 16893 <<all::BigTable as sailfish::TemplateOnce>::render_once+0xc3>
# No growth was needed for SIZE_HINT: r13 = cap, and rax still holds cap - len for the 16-byte check.
16856: 49 89 f5 mov r13,rsi
16859: 48 83 f8 10 cmp rax,0x10
1685d: 73 f0 jae 1684f <<all::BigTable as sailfish::TemplateOnce>::render_once+0x7f>
# Fewer than 16 bytes free: grow to max(2*cap, cap + 0x10).
1685f: 4c 89 e8 mov rax,r13
16862: 4c 01 e8 add rax,r13
16865: 49 8d 5d 10 lea rbx,[r13+0x10]
16869: 48 39 d8 cmp rax,rbx
1686c: 48 0f 47 d8 cmova rbx,rax
16870: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
16875: 4c 89 ee mov rsi,r13
16878: 48 89 da mov rdx,rbx
1687b: ff 15 1f f5 03 00 call QWORD PTR [rip+0x3f51f] # 55da0 <_DYNAMIC+0x240>
16881: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16886: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
1688b: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16890: 49 89 dd mov r13,rbx
# Copy a 16-byte constant (loaded from 84a0) to ptr + len and advance len by 16.
# From here on rax = ptr, rcx = len, r13 = cap.
16893: c5 f8 10 05 05 1c ff vmovups xmm0,XMMWORD PTR [rip+0xffffffffffff1c05] # 84a0 <GCC_except_table28+0x190>
1689a: ff
1689b: c5 f8 11 04 08 vmovups XMMWORD PTR [rax+rcx*1],xmm0
168a0: 48 83 c1 10 add rcx,0x10
168a4: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
# rdx = 24 * (incoming rdx). If that is zero, skip the outer loop;
# otherwise rbp + rdx becomes the outer-loop end pointer (outer elements are stepped by 0x18 bytes).
168a9: 49 c1 e4 03 shl r12,0x3
168ad: 4b 8d 14 64 lea rdx,[r12+r12*2]
168b1: 48 85 d2 test rdx,rdx
168b4: 0f 84 ab 01 00 00 je 16a65 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x295>
168ba: 48 01 ea add rdx,rbp
168bd: 48 89 54 24 30 mov QWORD PTR [rsp+0x30],rdx
168c2: e9 43 00 00 00 jmp 1690a <<all::BigTable as sailfish::TemplateOnce>::render_once+0x13a>
168c7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
168ce: 00 00
# Outer-loop tail: reload the element pointer and step it by 0x18, then write 13 bytes with two
# overlapping 8-byte stores (offsets 0 and 5) and advance len by 13. Exit when the end pointer is reached.
168d0: 4d 89 ec mov r12,r13
168d3: 48 8b 6c 24 20 mov rbp,QWORD PTR [rsp+0x20]
168d8: 48 83 c5 18 add rbp,0x18
168dc: 48 8b 15 cd 1d ff ff mov rdx,QWORD PTR [rip+0xffffffffffff1dcd] # 86b0 <GCC_except_table28+0x3a0>
168e3: 48 89 54 08 05 mov QWORD PTR [rax+rcx*1+0x5],rdx
168e8: 48 8b 15 bc 1d ff ff mov rdx,QWORD PTR [rip+0xffffffffffff1dbc] # 86ab <GCC_except_table28+0x39b>
168ef: 48 89 14 08 mov QWORD PTR [rax+rcx*1],rdx
168f3: 48 83 c1 0d add rcx,0xd
168f7: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
168fc: 48 8b 54 24 30 mov rdx,QWORD PTR [rsp+0x30]
16901: 48 39 d5 cmp rbp,rdx
16904: 0f 84 5e 01 00 00 je 16a68 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x298>
# Outer-loop entry: r15 = qword at element+0x10 (inner item count); an empty inner sequence skips to 16a10.
# Otherwise rbp = qword at element+0 (inner data pointer), r15 = count * 8 (byte length), r14 = byte offset 0.
1690a: 4c 8b 7d 10 mov r15,QWORD PTR [rbp+0x10]
1690e: 4d 85 ff test r15,r15
16911: 48 89 6c 24 20 mov QWORD PTR [rsp+0x20],rbp
16916: 0f 84 f4 00 00 00 je 16a10 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x240>
1691c: 48 8b 6d 00 mov rbp,QWORD PTR [rbp+0x0]
16920: 49 c1 e7 03 shl r15,0x3
16924: 45 31 f6 xor r14d,r14d
16927: e9 35 00 00 00 jmp 16961 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x191>
1692c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
# Inner-loop tail: write 9 bytes (one qword plus one byte), advance len by 9, step r14 by 8,
# and leave the inner loop once r14 reaches r15.
16930: 48 8b 44 24 08 mov rax,QWORD PTR [rsp+0x8]
16935: 0f b6 15 6e 1d ff ff movzx edx,BYTE PTR [rip+0xffffffffffff1d6e] # 86aa <GCC_except_table28+0x39a>
1693c: 88 54 08 08 mov BYTE PTR [rax+rcx*1+0x8],dl
16940: 48 8b 15 5b 1d ff ff mov rdx,QWORD PTR [rip+0xffffffffffff1d5b] # 86a2 <GCC_except_table28+0x392>
16947: 48 89 14 08 mov QWORD PTR [rax+rcx*1],rdx
1694b: 48 83 c1 09 add rcx,0x9
1694f: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
16954: 49 83 c6 08 add r14,0x8
16958: 4d 39 f7 cmp r15,r14
1695b: 0f 84 af 00 00 00 je 16a10 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x240>
# Inner-loop body: r12 = next 8-byte item. Ensure len + 0x14 <= cap, growing to max(2*cap, cap + 0x14) if not.
16961: 4e 8b 64 35 00 mov r12,QWORD PTR [rbp+r14*1+0x0]
16966: 48 8d 51 14 lea rdx,[rcx+0x14]
1696a: 4c 39 ea cmp rdx,r13
1696d: 76 2f jbe 1699e <<all::BigTable as sailfish::TemplateOnce>::render_once+0x1ce>
1696f: 4c 89 e9 mov rcx,r13
16972: 4c 01 e9 add rcx,r13
16975: 49 8d 5d 14 lea rbx,[r13+0x14]
16979: 48 39 d9 cmp rcx,rbx
1697c: 48 0f 47 d9 cmova rbx,rcx
16980: 48 89 c7 mov rdi,rax
16983: 4c 89 ee mov rsi,r13
16986: 48 89 da mov rdx,rbx
16989: ff 15 11 f4 03 00 call QWORD PTR [rip+0x3f411] # 55da0 <_DYNAMIC+0x240>
1698f: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16994: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
16999: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
# Indirect call through the slot at 55da8 with rdi = item value and rsi = ptr + len; the value
# returned in rax is added to len. The issue text identifies this call (169a7) as write_u64.
1699e: 48 01 c8 add rax,rcx
169a1: 4c 89 e7 mov rdi,r12
169a4: 48 89 c6 mov rsi,rax
169a7: ff 15 fb f3 03 00 call QWORD PTR [rip+0x3f3fb] # 55da8 <_DYNAMIC+0x248>
169ad: 48 89 c1 mov rcx,rax
169b0: 48 03 4c 24 10 add rcx,QWORD PTR [rsp+0x10]
169b5: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
# Reload cap; if at least 9 bytes are free go write them, otherwise grow to max(2*cap, cap + 9) first.
169ba: 4c 8b 6c 24 18 mov r13,QWORD PTR [rsp+0x18]
169bf: 4c 89 e8 mov rax,r13
169c2: 48 29 c8 sub rax,rcx
169c5: 48 83 f8 09 cmp rax,0x9
169c9: 0f 83 61 ff ff ff jae 16930 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x160>
169cf: 4c 89 e8 mov rax,r13
169d2: 4c 01 e8 add rax,r13
169d5: 49 8d 5d 09 lea rbx,[r13+0x9]
169d9: 48 39 d8 cmp rax,rbx
169dc: 48 0f 47 d8 cmova rbx,rax
169e0: 48 8b 7c 24 08 mov rdi,QWORD PTR [rsp+0x8]
169e5: 4c 89 ee mov rsi,r13
169e8: 48 89 da mov rdx,rbx
169eb: ff 15 af f3 03 00 call QWORD PTR [rip+0x3f3af] # 55da0 <_DYNAMIC+0x240>
169f1: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
169f6: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
169fb: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16a00: 49 89 dd mov r13,rbx
16a03: e9 2d ff ff ff jmp 16935 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x165>
16a08: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
16a0f: 00
# After the inner loop: len -= 4 (0xfffffffffffffffc is -4); the reason four bytes are dropped is not
# visible in this listing. Then require more than 12 free bytes, else grow to max(2*cap, cap + 0xd).
16a10: 48 83 c1 fc add rcx,0xfffffffffffffffc
16a14: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
16a19: 4c 89 ea mov rdx,r13
16a1c: 48 29 ca sub rdx,rcx
16a1f: 48 83 fa 0c cmp rdx,0xc
16a23: 0f 87 a7 fe ff ff ja 168d0 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x100>
16a29: 4c 89 e9 mov rcx,r13
16a2c: 4c 01 e9 add rcx,r13
16a2f: 4d 8d 65 0d lea r12,[r13+0xd]
16a33: 4c 39 e1 cmp rcx,r12
16a36: 4c 0f 47 e1 cmova r12,rcx
16a3a: 48 89 c7 mov rdi,rax
16a3d: 4c 89 ee mov rsi,r13
16a40: 4c 89 e2 mov rdx,r12
16a43: ff 15 57 f3 03 00 call QWORD PTR [rip+0x3f357] # 55da0 <_DYNAMIC+0x240>
16a49: 48 8b 6c 24 20 mov rbp,QWORD PTR [rsp+0x20]
16a4e: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16a53: 4c 89 64 24 18 mov QWORD PTR [rsp+0x18],r12
16a58: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
16a5d: 4d 89 e5 mov r13,r12
16a60: e9 73 fe ff ff jmp 168d8 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x108>
# After the outer loop (r12 = cap): len -= 8, then require more than 8 free bytes,
# else grow to max(2*cap, cap + 9).
16a65: 4d 89 ec mov r12,r13
16a68: 48 83 c1 f8 add rcx,0xfffffffffffffff8
16a6c: 48 89 4c 24 10 mov QWORD PTR [rsp+0x10],rcx
16a71: 4c 89 e2 mov rdx,r12
16a74: 48 29 ca sub rdx,rcx
16a77: 48 83 fa 08 cmp rdx,0x8
16a7b: 77 2e ja 16aab <<all::BigTable as sailfish::TemplateOnce>::render_once+0x2db>
16a7d: 4b 8d 0c 24 lea rcx,[r12+r12*1]
16a81: 49 8d 5c 24 09 lea rbx,[r12+0x9]
16a86: 48 39 d9 cmp rcx,rbx
16a89: 48 0f 47 d9 cmova rbx,rcx
16a8d: 48 89 c7 mov rdi,rax
16a90: 4c 89 e6 mov rsi,r12
16a93: 48 89 da mov rdx,rbx
16a96: ff 15 04 f3 03 00 call QWORD PTR [rip+0x3f304] # 55da0 <_DYNAMIC+0x240>
16a9c: 48 89 44 24 08 mov QWORD PTR [rsp+0x8],rax
16aa1: 48 89 5c 24 18 mov QWORD PTR [rsp+0x18],rbx
16aa6: 48 8b 4c 24 10 mov rcx,QWORD PTR [rsp+0x10]
# Write the final 9 bytes (one qword plus one byte) and set len = old_len + 9.
16aab: 8a 15 0f 1c ff ff mov dl,BYTE PTR [rip+0xffffffffffff1c0f] # 86c0 <GCC_except_table28+0x3b0>
16ab1: 88 54 08 08 mov BYTE PTR [rax+rcx*1+0x8],dl
16ab5: 48 8b 15 fc 1b ff ff mov rdx,QWORD PTR [rip+0xffffffffffff1bfc] # 86b8 <GCC_except_table28+0x3a8>
16abc: 48 89 14 08 mov QWORD PTR [rax+rcx*1],rdx
16ac0: 48 8d 41 09 lea rax,[rcx+0x9]
16ac4: 48 89 44 24 10 mov QWORD PTR [rsp+0x10],rax
# Candidate hint = (len >> 3) + old_len + 0x54; store it to SIZE_HINT only if it exceeds the current value.
16ac9: 48 c1 e8 03 shr rax,0x3
16acd: 48 01 c8 add rax,rcx
16ad0: 48 83 c0 54 add rax,0x54
16ad4: 48 8b 0d 6d 0a 04 00 mov rcx,QWORD PTR [rip+0x40a6d] # 57548 <<all::BigTable as sailfish::TemplateOnce>::render_once::SIZE_HINT>
16adb: 48 39 c1 cmp rcx,rax
16ade: 73 07 jae 16ae7 <<all::BigTable as sailfish::TemplateOnce>::render_once+0x317>
16ae0: 48 89 05 61 0a 04 00 mov QWORD PTR [rip+0x40a61],rax # 57548 <<all::BigTable as sailfish::TemplateOnce>::render_once::SIZE_HINT>
# Store the result through the pointer saved from rdi: [out+0x8] = ptr, [out+0x10..0x20] = the
# (len, cap) pair with its 64-bit halves swapped (vpermilps imm 0x4e), [out] = 0, leaving rax = out.
16ae7: 48 8b 4c 24 08 mov rcx,QWORD PTR [rsp+0x8]
16aec: c4 e3 79 04 44 24 10 vpermilps xmm0,XMMWORD PTR [rsp+0x10],0x4e
16af3: 4e
16af4: 48 8b 44 24 28 mov rax,QWORD PTR [rsp+0x28]
16af9: 48 89 48 08 mov QWORD PTR [rax+0x8],rcx
16afd: c5 f8 11 40 10 vmovups XMMWORD PTR [rax+0x10],xmm0
16b02: 48 c7 00 00 00 00 00 mov QWORD PTR [rax],0x0
# Epilogue: release the locals and restore the saved registers in reverse push order.
16b09: 48 83 c4 38 add rsp,0x38
16b0d: 5b pop rbx
16b0e: 41 5c pop r12
16b10: 41 5d pop r13
16b12: 41 5e pop r14
16b14: 41 5f pop r15
16b16: 5d pop rbp
16b17: c3 ret
# Code after the ret is not reached by fall-through; presumably unwind landing pads -- TODO confirm.
# Both entries jump to 16b1c, which keeps the incoming rax in rbx, calls drop_in_place on the
# buffer at rsp+0x8, and then passes the saved value to _Unwind_Resume.
16b18: eb 02 jmp 16b1c <<all::BigTable as sailfish::TemplateOnce>::render_once+0x34c>
16b1a: eb 00 jmp 16b1c <<all::BigTable as sailfish::TemplateOnce>::render_once+0x34c>
16b1c: 48 89 c3 mov rbx,rax
16b1f: 48 8d 7c 24 08 lea rdi,[rsp+0x8]
16b24: e8 57 f6 ff ff call 16180 <core::ptr::drop_in_place>
16b29: 48 89 df mov rdi,rbx
16b2c: e8 bf b6 03 00 call 521f0 <_Unwind_Resume@plt>
16b31: 0f 0b ud2
# int3 filler bytes following the function body (they run up to address 16b40).
16b33: cc int3
16b34: cc int3
16b35: cc int3
16b36: cc int3
16b37: cc int3
16b38: cc int3
16b39: cc int3
16b3a: cc int3
16b3b: cc int3
16b3c: cc int3
16b3d: cc int3
16b3e: cc int3
16b3f: cc int3
I see that both branches call the write_u64 function indirectly, through a pointer loaded from a rip-relative address (at 16a9b and 169a7 respectively).
Note: no significant difference in Teams benchmark
Yes, that is so easy, ask in Rust or in llvm.
Why is write_u64 put in r12 and called through this register?
Raise the benchmarks that the world sees, if so democracy works.
There is some concern about the licence of the big table benchmark. We need another example.
https://choosealicense.com/no-permission/
I do not understand what you mean that is not compatible with the License. Can't use the big table with MIT or APACHE License? I did not understand you.
On the other hand, I have made a new lexer for you that can tokenize any block language (handlebars, jinja, ..). I am now with the AST. And with that the i18n at compile time. After that goes the html5 parser and webpack. And once here the DOM is ready to build anything.
Yarte will do the same as this and it is faster and I think it is impossible to do it faster. It has a compilation-time evaluator v_eval, meta-programming, compilation-time errors, derive, proc_macro, nothing has a cost and it is extensible to infinity.
We could now have the hot module replace implemented instead of reinventing the wheel. And it would be from the team, neither yours nor mine. I don't understand it. Nothing feel the veneer. I've already done everything I could. goodbye! :smile_cat:
The big table benchmark is distributed under no license. That's why I removed the benchmark from this repository.
https://github.com/djc/template-benchmarks-rs
https://pythonhosted.org/wheezy.template/examples.html Yes of course ... the benchmarks too, everything, the ifs, the else minus the design patterns that don't seem to do them. Good grief if Euler raised his head.