LocalAllocator: add alloc_hinted
In the current implementation, alloc assumes small allocations are more
likely, which is probably true for most cases, and puts small allocations
on the fast path. However, in some scenarios, the allocator's users may know
an approximate allocation size at compile time. For example, they may be
allocating a certain type of object that has a typical size, even though
the actual size at runtime may vary; or they may know the least size of
some objects.
By introducing a method alloc_hinted, we can let users hint the
allocator. It accepts a compile-time size hint, based on which a hinted
fast path is chosen, instead of always favoring small allocations.
Currently, we use the size hint to adjust the fast path only in
LocalAllocator. In the future, we can use the hint to do more optimizations,
such as adding an alloc_range_hinted method for backends, passing the hint
from LocalAllocator to them, and adjusting their fast path to either pass
the allocation to their parent or allocate on their own.
#276 addresses a similar issue, but it is more focused on small allocations, and it is based on snmalloc1.
I wonder if you could get the compiler to do all the work with just a couple of functions:
/**
 * Hinted allocation entry point.
 *
 * Callers that know a likely allocation size at compile time pass it as
 * `size_hint`; when the runtime `size` matches, the call is dispatched to
 * `alloc` with a compile-time-constant argument, so the compiler can fully
 * specialize and inline that branch. Mis-hinted sizes fall through to a
 * deliberately out-of-line slow path, preventing the compiler from merging
 * the two branches (which would defeat the hint).
 *
 * Note: the inline-forcing specifiers in SNMALLOC_FAST_PATH must follow the
 * template-parameter list to be well-formed, and SNMALLOC_LIKELY is used for
 * consistency with the rest of the codebase (bare likely() is not portable).
 */
template <size_t size_hint = 1>
SNMALLOC_FAST_PATH void* alloc_hinted(size_t size)
{
  if (SNMALLOC_LIKELY(size == size_hint))
  {
    // Constant-argument call: the hinted branch gets heavily inlined.
    return alloc(size_hint);
  }
  // Never-inlined wrapper keeps the mis-hinted case off the fast path.
  return alloc_hinted_slow(size);
}
// Slow path for alloc_hinted: a deliberately non-inlined wrapper around
// alloc. SNMALLOC_SLOW_PATH prohibits inlining, so the compiler cannot see
// that both branches of alloc_hinted ultimately do the same thing and merge
// them — which would discard the inlining hint.
SNMALLOC_SLOW_PATH
void* alloc_hinted_slow(size_t size)
{
return alloc(size);
}
The aim would be to prevent the compiler knowing that the two branches do the same thing, and get it to heavily inline the correctly hinted branch.
I also wonder if you already are linking against a custom API, why not just call the templated alloc function that takes a static size?
I am intrigued by the idea, and it seems pretty light-weight, but I think you can do this without introducing small_alloc_slow and without renaming alloc_not_small? Do you have benchmark results you can share?
@nwf It's about inlining. SNMALLOC_FAST_PATH will always inline, and SNMALLOC_SLOW_PATH will always prohibit inlining, despite the likely and unlikely hints given to the compiler. If we don't introduce fast path for alloc_not_small and slow path for alloc_small, the inlining behavior will be wrong.
An example:
#include <snmalloc.h>
void* test_alloc(size_t size) {
return snmalloc::ThreadAlloc::get().alloc_hinted<0x400000>(size);
}
If we do not introduce the two paths and just use small_alloc and alloc_not_small:
// Compile-time dispatch on the hint: `if constexpr` selects which runtime
// branch carries the LIKELY annotation, based on whether size_hint falls
// within the small-sizeclass range. Both arms perform the same runtime test;
// only the branch-prediction hint differs.
if constexpr (
(size_hint - 1) <= (sizeclass_to_size(NUM_SMALL_SIZECLASSES - 1) - 1))
{
// Perform the - 1 on size, so that zero wraps around and ends up on
// slow path.
if (SNMALLOC_LIKELY(
(size - 1) <= (sizeclass_to_size(NUM_SMALL_SIZECLASSES - 1) - 1)))
{
return capptr_reveal(small_alloc<zero_mem>(size));
}
return capptr_reveal(alloc_not_small<zero_mem>(size));
}
else
{
// Hint says the allocation is large, so the small case is marked UNLIKELY.
// (Same `size - 1` wrap-around trick handles size == 0.)
if (SNMALLOC_UNLIKELY(
(size - 1) <= (sizeclass_to_size(NUM_SMALL_SIZECLASSES - 1) - 1)))
{
return capptr_reveal(small_alloc<zero_mem>(size));
}
return capptr_reveal(alloc_not_small<zero_mem>(size));
}
The result inlining behavior will be:
"?test_alloc@@YAPEAX_K@Z": # @"?test_alloc@@YAPEAX_K@Z"
.seh_proc "?test_alloc@@YAPEAX_K@Z"
# %bb.0:
subq $56, %rsp
.seh_stackalloc 56
.seh_endprologue
leaq -1(%rcx), %rax
cmpq $57343, %rax # imm = 0xDFFF
jbe .LBB0_1
# %bb.10:
movq %rcx, %r8
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
leaq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %rcx
leaq 48(%rsp), %rdx
callq "??$alloc_not_small@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z"
.LBB0_11:
movq 48(%rsp), %rax
addq $56, %rsp
retq
.LBB0_1:
shrq $4, %rax
leaq "?sizeclass_lookup@?1??size_to_sizeclass@snmalloc@@YA_K_K@Z@4USizeClassLookup@?1??12@YA_K0@Z@B"(%rip), %rcx
movzbl (%rax,%rcx), %r10d
movl _tls_index(%rip), %ecx
movq %gs:88, %rdx
movq (%rdx,%rcx,8), %rdx
movq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rdx,%r10,8), %rcx
testq %rcx, %rcx
je .LBB0_5
# %bb.2:
movq (%rcx), %rdx
prefetcht0 (%rdx)
movl _tls_index(%rip), %r8d
movq %gs:88, %rax
movq (%rax,%r8,8), %rax
movq %rdx, "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax,%r10,8)
cmpb $64, %r10b
jae .LBB0_12
# %bb.3:
shlq $5, %r10
leaq "?sizeclass_metadata@snmalloc@@3USizeClassTable@1@B"(%rip), %rax
movq 2056(%r10,%rax), %rdx
andq %rcx, %rdx
movq 2072(%r10,%rax), %rax
imulq %rax, %rdx
cmpq %rax, %rdx
jae .LBB0_13
# %bb.4:
movq %rcx, 48(%rsp)
jmp .LBB0_11
.LBB0_5:
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
movq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32+6584(%rax), %rcx
testq %rcx, %rcx
je .LBB0_9
# %bb.6:
leaq (%rdx,%r10,8), %r9
addq $"?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32, %r9
movq 1088(%rcx), %rax
cmpq 1024(%rcx), %rax
jne .LBB0_8
# %bb.7:
leaq 48(%rsp), %rdx
movq %r10, %r8
callq "??$small_alloc@$0A@@?$CoreAllocator@VGlobals@snmalloc@@@snmalloc@@QEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_KAEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@1@@Z"
jmp .LBB0_11
.LBB0_12:
leaq "??_C@_0BO@NKCBJDPK@assert?5fail?3?5?$HL?$HN?5in?5?$HL?$HN?5on?5?$HL?$HN?5?6?$AA@"(%rip), %rcx
leaq "??_C@_08HJHHMEJH@sc?5?$DM?5TAG?$AA@"(%rip), %rdx
leaq "??_C@_0CL@KBKOAEII@src?1backend?1?4?4?1mem?1?4?4?1mem?1sizecl@"(%rip), %r8
leaq "??_C@_02JKLACGAP@74?$AA@"(%rip), %r9
callq "??$report_fatal_error@$0EAA@PEBDPEBDPEBDPEBD@snmalloc@@YAXPEBD000@Z"
.LBB0_13:
leaq "??_C@_0BO@NKCBJDPK@assert?5fail?3?5?$HL?$HN?5in?5?$HL?$HN?5on?5?$HL?$HN?5?6?$AA@"(%rip), %rcx
leaq "??_C@_0EP@JODPFPFD@is_start_of_object?$CI?5sizeclass_t?3@"(%rip), %rdx
leaq "??_C@_0CA@HEKPKBLI@src?1backend?1?4?4?1mem?1localcache?4h?$AA@"(%rip), %r8
leaq "??_C@_02CLJDCEPA@19?$AA@"(%rip), %r9
callq "??$report_fatal_error@$0EAA@PEBDPEBDPEBDPEBD@snmalloc@@YAXPEBD000@Z"
.LBB0_9:
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
leaq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %rcx
leaq 48(%rsp), %rdx
movq %rcx, %r8
movq %r10, %r9
callq "??$lazy_init@V<lambda_2>@?0???R1?0???$small_alloc@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@3@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@3@@Z@_K@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?A?<decltype-auto>@@V<lambda_2>@?0???R3?0???$small_alloc@$0A@@01@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@1@@Z@0@Z"
jmp .LBB0_11
.LBB0_8:
movq %r9, 40(%rsp)
movq %r10, 32(%rsp)
leaq 48(%rsp), %rdx
movq %rcx, %r9
callq "??$handle_message_queue_inner@V<lambda_1>@?0???R<lambda_2>@?0???$small_alloc@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@4@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@4@@Z@PEAV?$CoreAllocator@VGlobals@snmalloc@@@4@_KPEAV784@@?$CoreAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?A?<decltype-auto>@@V<lambda_1>@?0???R<lambda_2>@?0???$small_alloc@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@1@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@1@@Z@PEAV01@01@Z"
jmp .LBB0_11
.seh_endproc
The small_alloc is inlined, and alloc_not_small is not, which is not correct.
By introducing small_alloc_slow and alloc_not_small_fast, the inlining will be correct:
"?test_alloc@@YAPEAX_K@Z": # @"?test_alloc@@YAPEAX_K@Z"
.seh_proc "?test_alloc@@YAPEAX_K@Z"
# %bb.0:
subq $88, %rsp
.seh_stackalloc 88
.seh_endprologue
movq %rcx, %r8
decq %rcx
cmpq $57343, %rcx # imm = 0xDFFF
jbe .LBB0_13
# %bb.1:
movq %r8, 64(%rsp)
testq %r8, %r8
je .LBB0_8
# %bb.2:
movl _tls_index(%rip), %eax
movq %gs:88, %rdx
movq (%rdx,%rax,8), %rax
movq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32+6584(%rax), %rax
testq %rax, %rax
je .LBB0_14
# %bb.3:
movq 1088(%rax), %rdx
cmpq 1024(%rax), %rdx
jne .LBB0_15
# %bb.4:
lzcntq %rcx, %r10
je .LBB0_19
# %bb.5:
leaq 1024(%rax), %r9
movl %r10d, %edx
negb %dl
movl $1, %ecx
shlxq %rdx, %rcx, %r8
addq $1280, %rax # imm = 0x500
movq %r10, 32(%rsp)
leaq 72(%rsp), %rcx
movq %rax, %rdx
callq "?alloc_chunk@?$BackendAllocator@VPALWindows@snmalloc@@$0A@VMetaEntry@2@@snmalloc@@SA?AU?$pair@V?$CapPtr@XU?$bound@$00$00$00@capptr@snmalloc@@@snmalloc@@PEAVMetaslab@2@@std@@AEAULocalState@12@_KPEAURemoteAllocator@2@Vsizeclass_t@2@@Z"
movq 80(%rsp), %rax
testq %rax, %rax
je .LBB0_7
# %bb.6:
leaq 16(%rax), %rcx
movq %rcx, 24(%rax)
movb $1, 43(%rax)
movw $1, 40(%rax)
.LBB0_7:
movq 72(%rsp), %rax
jmp .LBB0_10
.LBB0_8:
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
movq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %rax
testq %rax, %rax
je .LBB0_16
# %bb.9:
movq (%rax), %rcx
prefetcht0 (%rcx)
movl _tls_index(%rip), %r8d
movq %gs:88, %rdx
movq (%rdx,%r8,8), %rdx
movq %rcx, "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rdx)
testb $15, %al
jne .LBB0_20
.LBB0_10:
movq %rax, 56(%rsp)
.LBB0_11:
movq 56(%rsp), %rax
.LBB0_12:
addq $88, %rsp
retq
.LBB0_13:
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
leaq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %rcx
leaq 72(%rsp), %rdx
callq "??$small_alloc_slow@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z"
movq 72(%rsp), %rax
jmp .LBB0_12
.LBB0_14:
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
leaq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %rcx
leaq 56(%rsp), %rdx
leaq 64(%rsp), %r8
callq "??$lazy_init@V<lambda_1>@?0???$alloc_not_small_fast@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@3@_K@Z@$$V@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?A?<decltype-auto>@@V<lambda_1>@?0???$alloc_not_small_fast@$0A@@01@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z@@Z"
jmp .LBB0_11
.LBB0_15:
leaq 56(%rsp), %rdx
leaq 64(%rsp), %r8
movq %rax, %rcx
movq %rax, %r9
callq "??$handle_message_queue_inner@V<lambda_1>@?0???$alloc_not_small_fast@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@3@_K@Z@PEAV?$CoreAllocator@VGlobals@snmalloc@@@3@@?$CoreAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?A?<decltype-auto>@@V<lambda_1>@?0???$alloc_not_small_fast@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@1@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z@PEAV01@@Z"
jmp .LBB0_11
.LBB0_16:
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
movq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32+6584(%rax), %rcx
testq %rcx, %rcx
je .LBB0_21
# %bb.17:
movq 1088(%rcx), %rax
cmpq 1024(%rcx), %rax
jne .LBB0_22
# %bb.18:
movl _tls_index(%rip), %eax
movq %gs:88, %rdx
movq (%rdx,%rax,8), %rax
leaq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %r9
leaq 56(%rsp), %rdx
xorl %r8d, %r8d
callq "??$small_alloc@$0A@@?$CoreAllocator@VGlobals@snmalloc@@@snmalloc@@QEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_KAEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@1@@Z"
jmp .LBB0_11
.LBB0_19:
leaq "??_C@_0BO@NKCBJDPK@assert?5fail?3?5?$HL?$HN?5in?5?$HL?$HN?5on?5?$HL?$HN?5?6?$AA@"(%rip), %rcx
leaq "??_C@_0CL@JCLLPKLM@sizeof?$CIT?$CJ?5?$CK?58?5?$DO?5static_cast?$DMsize@"(%rip), %rdx
leaq "??_C@_0CA@HFJJCPJC@src?1backend?1?4?4?1mem?1?4?4?1ds?1bits?4h?$AA@"(%rip), %r8
leaq "??_C@_02KKMAPKNE@46?$AA@"(%rip), %r9
callq "??$report_fatal_error@$0EAA@PEBDPEBDPEBDPEBD@snmalloc@@YAXPEBD000@Z"
.LBB0_20:
leaq "??_C@_0BO@NKCBJDPK@assert?5fail?3?5?$HL?$HN?5in?5?$HL?$HN?5on?5?$HL?$HN?5?6?$AA@"(%rip), %rcx
leaq "??_C@_0EP@JODPFPFD@is_start_of_object?$CI?5sizeclass_t?3@"(%rip), %rdx
leaq "??_C@_0CA@HEKPKBLI@src?1backend?1?4?4?1mem?1localcache?4h?$AA@"(%rip), %r8
leaq "??_C@_02CLJDCEPA@19?$AA@"(%rip), %r9
callq "??$report_fatal_error@$0EAA@PEBDPEBDPEBDPEBD@snmalloc@@YAXPEBD000@Z"
.LBB0_21:
movl _tls_index(%rip), %eax
movq %gs:88, %rcx
movq (%rcx,%rax,8), %rax
leaq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %rcx
leaq 56(%rsp), %rdx
movq %rcx, %r8
xorl %r9d, %r9d
callq "??$lazy_init@V<lambda_2>@?0???R1?0???$small_alloc@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@3@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@3@@Z@_K@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?A?<decltype-auto>@@V<lambda_2>@?0???R3?0???$small_alloc@$0A@@01@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@1@@Z@0@Z"
jmp .LBB0_11
.LBB0_22:
movl _tls_index(%rip), %eax
movq %gs:88, %rdx
movq (%rdx,%rax,8), %rax
leaq "?alloc@?1??get@ThreadAlloc@snmalloc@@SAAEAV?$LocalAllocator@VGlobals@snmalloc@@@3@XZ@4V43@A"@SECREL32(%rax), %rax
movq %rax, 40(%rsp)
movq $0, 32(%rsp)
leaq 56(%rsp), %rdx
movq %rcx, %r9
callq "??$handle_message_queue_inner@V<lambda_1>@?0???R<lambda_2>@?0???$small_alloc@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@4@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@4@@Z@PEAV?$CoreAllocator@VGlobals@snmalloc@@@4@_KPEAV784@@?$CoreAllocator@VGlobals@snmalloc@@@snmalloc@@AEAA?A?<decltype-auto>@@V<lambda_1>@?0???R<lambda_2>@?0???$small_alloc@$0A@@?$LocalAllocator@VGlobals@snmalloc@@@1@AEAA?AV?$CapPtr@XU?$bound@$0A@$0A@$00@capptr@snmalloc@@@1@_K@Z@QEBA?A?<auto>@@0PEAV?$Iter@U?$bound@$0A@$0A@$00@capptr@snmalloc@@U?$bound@$0A@$0A@$0A@@23@@freelist@1@@Z@PEAV01@01@Z"
jmp .LBB0_11
.seh_endproc
FWIW, you may be able to get some additional performance gain on the dealloc path by hinting how you expect the remoteness to resolve
Yes, adding a hint to dealloc is also doable, and I would like to try it.
I wonder if you could get the compiler to do all the work with just a couple of functions
@mjp41 It is still about inlining. A likely hint alone cannot change the compiler's inlining behavior.