bitvec Large generated code for a simple operation

Large generated code for a simple operation

Open mstange opened this issue 3 years ago • 1 comment

trafficstars

In this example, the generated code calls into BitSliceIndex::index and into BitField::load_le:

use bitvec::prelude::*;

fn extract_24(val: u32) -> u32 {
    let bv: BitArray<u32, Msb0> = BitArray::new(val);
    bv[8..32].load_le()
}

Both called functions are very large. I would have expected the calls to be optimized out. Unless I'm mistaken, the outcome should be equivalent to val & 0xffffff.

This is on macOS arm64 with Rust 1.58.0 stable.

Full code is here: https://github.com/mstange/bitvec-test

Jan 16 '22 01:01 mstange

So from a theoretical perspective: I know. It's an area of perpetual improvement. The primary culprit is the bitvec::domain module, which is responsible for segmenting an arbitrary &BitSlice into partial and whole elements. In particular, Domain::new is an ugly and large match stack that is guaranteed to be extremely hot and in need of improvement or outright skipping.

However, your sample code is marked #[inline(never)] … don't do that? If I comment out just that attribute and then disassemble the result,

$ cargo +nightly install cargo-asm --vers 0.1.16
$ cargo +nightly asm bitvec_test::main --build-type release --no-color | bat -lasm
/ load_value

I get at lines 105 and 106

call    qword, ptr, [rip, +, _ZN47_$LT$u32$u20$as$u20$bitvec..store..BitStore$GT$10load_value17h8fbc73463f745022E@GOTPCREL]
and     eax, 16777215

which is "read an integer out of memory" followed by "mask it with 0x00FF_FFFF". It's weird that the .load_value() call isn't inlined, but that's a different thing to chase. Anyway, full dump:

bitvec_test::main:
 push    rbp
 push    r15
 push    r14
 push    r13
 push    r12
 push    rbx
 sub     rsp, 168
 lea     rbx, [rsp, +, 128]
 mov     rdi, rbx
 call    qword, ptr, [rip, +, _ZN3std3env4args17h2fa6ae5095eceb16E@GOTPCREL]
 mov     qword, ptr, [rsp, +, 160], 1
 mov     rdi, rsp
 mov     rsi, rbx
 call    qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$9size_hint17h2d86283ab7b22d14E@GOTPCREL]
 mov     rax, qword, ptr, [rsp]
 mov     rsi, qword, ptr, [rsp, +, 8]
 mov     rcx, qword, ptr, [rsp, +, 16]
 mov     r12, qword, ptr, [rsp, +, 160]
 xor     edi, edi
 mov     rdx, rax
 sub     rdx, r12
 cmovb   rdx, rdi
 sub     rcx, r12
 cmovb   rcx, rdi
 mov     qword, ptr, [rsp, +, 48], rsi
 mov     qword, ptr, [rsp, +, 56], rcx
 mov     qword, ptr, [rsp, +, 88], rdx
 mov     qword, ptr, [rsp, +, 80], 1
 cmp     rsi, 1
 jne     .LBB11_41
 cmp     rcx, rdx
 jne     .LBB11_41
 cmp     rax, r12
 jbe     .LBB11_42
 test    r12, r12
 je      .LBB11_17
 mov     qword, ptr, [rsp, +, 160], 0
 add     r12, -1
 je      .LBB11_13
 xor     r13d, r13d
 lea     r15, [rsp, +, 128]
 mov     rbp, qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
 jmp     .LBB11_8
.LBB11_7:
 cmp     r12, r13
 je      .LBB11_13
.LBB11_8:
 mov     rbx, r14
 mov     rdi, rsp
 mov     rsi, r15
 call    rbp
 mov     rdi, qword, ptr, [rsp]
 mov     r14, qword, ptr, [rsp, +, 8]
 test    rdi, rdi
 cmove   rdi, r13
 cmovne  rbx, r14
 je      .LBB11_17
 add     r13, 1
 test    rbx, rbx
 je      .LBB11_7
 test    rdi, rdi
 je      .LBB11_7
 mov     edx, 1
 mov     rsi, rbx
 call    qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
 jmp     .LBB11_7
.LBB11_13:
 mov     rdi, rsp
 lea     rsi, [rsp, +, 128]
 call    qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
 mov     rdi, qword, ptr, [rsp]
 test    rdi, rdi
 je      .LBB11_17
 mov     rsi, qword, ptr, [rsp, +, 8]
 test    rsi, rsi
 je      .LBB11_17
 mov     edx, 1
 call    qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_17:
 mov     rdi, rsp
 lea     rsi, [rsp, +, 128]
 call    qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
 cmp     qword, ptr, [rsp], 0
 je      .LBB11_46
 mov     rsi, qword, ptr, [rsp, +, 16]
 mov     qword, ptr, [rsp, +, 96], rsi
 movups  xmm0, xmmword, ptr, [rsp]
 movaps  xmmword, ptr, [rsp, +, 80], xmm0
 mov     rbx, qword, ptr, [rsp, +, 80]
 cmp     rsi, 2
 jb      .LBB11_21
 movzx   eax, word, ptr, [rbx]
 cmp     eax, 30768
 je      .LBB11_38
.LBB11_21:
 mov     rdi, rbx
 call    qword, ptr, [rip, +, _ZN4core3num60_$LT$impl$u20$core..str..traits..FromStr$u20$for$u20$u32$GT$8from_str17hb0615c3f2cf106b0E@GOTPCREL]
 test    al, 1
 jne     .LBB11_47
.LBB11_23:
 shr     rax, 32
 mov     dword, ptr, [rsp], eax
 mov     rdi, rsp
 call    qword, ptr, [rip, +, _ZN47_$LT$u32$u20$as$u20$bitvec..store..BitStore$GT$10load_value17h8fbc73463f745022E@GOTPCREL]
 and     eax, 16777215
 mov     dword, ptr, [rsp, +, 112], eax
 lea     rax, [rsp, +, 112]
 mov     qword, ptr, [rsp, +, 48], rax
 mov     rax, qword, ptr, [rip, +, _ZN4core3fmt3num3imp52_$LT$impl$u20$core..fmt..Display$u20$for$u20$u32$GT$3fmt17hc9b8e4322a951e30E@GOTPCREL]
 mov     qword, ptr, [rsp, +, 56], rax
 lea     rax, [rip, +, .L__unnamed_7]
 mov     qword, ptr, [rsp], rax
 mov     qword, ptr, [rsp, +, 8], 2
 mov     qword, ptr, [rsp, +, 16], 0
 lea     rax, [rsp, +, 48]
 mov     qword, ptr, [rsp, +, 32], rax
 mov     qword, ptr, [rsp, +, 40], 1
 mov     rdi, rsp
 call    qword, ptr, [rip, +, _ZN3std2io5stdio7_eprint17h4d077c3ca706ec3fE@GOTPCREL]
 mov     rsi, qword, ptr, [rsp, +, 88]
 test    rsi, rsi
 je      .LBB11_28
 test    rbx, rbx
 je      .LBB11_28
 mov     edx, 1
 mov     rdi, rbx
 call    qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_28:
 mov     rbx, qword, ptr, [rsp, +, 144]
 mov     rbp, qword, ptr, [rsp, +, 152]
 cmp     rbp, rbx
 je      .LBB11_34
 mov     r14, qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
 jmp     .LBB11_31
.LBB11_30:
 add     rbx, 24
 cmp     rbx, rbp
 je      .LBB11_34
.LBB11_31:
 mov     rsi, qword, ptr, [rbx, +, 8]
 test    rsi, rsi
 je      .LBB11_30
 mov     rdi, qword, ptr, [rbx]
 test    rdi, rdi
 je      .LBB11_30
 mov     edx, 1
 call    r14
 jmp     .LBB11_30
.LBB11_34:
 mov     rax, qword, ptr, [rsp, +, 136]
 test    rax, rax
 je      .LBB11_37
 shl     rax, 3
 lea     rsi, [rax, +, 2*rax]
 test    rsi, rsi
 je      .LBB11_37
 mov     rdi, qword, ptr, [rsp, +, 128]
 mov     edx, 8
 call    qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_37:
 add     rsp, 168
 pop     rbx
 pop     r12
 pop     r13
 pop     r14
 pop     r15
 pop     rbp
 ret
.LBB11_38:
 lea     rdi, [rbx, +, 2]
 add     rsi, -2
 mov     edx, 16
 call    qword, ptr, [rip, +, _ZN4core3num21_$LT$impl$u20$u32$GT$14from_str_radix17h029ea4c7fafc0898E@GOTPCREL]
 test    al, 1
 je      .LBB11_23
 mov     byte, ptr, [rsp], ah
 lea     rdi, [rip, +, .L__unnamed_8]
 lea     rcx, [rip, +, .L__unnamed_9]
 lea     r8, [rip, +, .L__unnamed_10]
 mov     rdx, rsp
 mov     esi, 43
 call    qword, ptr, [rip, +, _ZN4core6result13unwrap_failed17hb53671404b9e33c2E@GOTPCREL]
 jmp     .LBB11_57
.LBB11_41:
 mov     qword, ptr, [rsp], 0
 lea     rdi, [rsp, +, 48]
 lea     rsi, [rsp, +, 80]
 mov     rdx, rsp
 call    core::panicking::assert_failed
 jmp     .LBB11_57
.LBB11_42:
 lea     rdi, [rsp, +, 80]
 call    qword, ptr, [rip, +, _ZN3std3env4args17h2fa6ae5095eceb16E@GOTPCREL]
 mov     rdi, rsp
 lea     rsi, [rsp, +, 80]
 call    qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
 cmp     qword, ptr, [rsp], 0
 jne     .LBB11_48
 lea     rdi, [rip, +, .L__unnamed_11]
 lea     rdx, [rip, +, .L__unnamed_12]
 mov     esi, 43
 call    qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
 jmp     .LBB11_57
.LBB11_46:
 lea     rdi, [rip, +, .L__unnamed_11]
 lea     rdx, [rip, +, .L__unnamed_13]
 mov     esi, 43
 call    qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
 jmp     .LBB11_57
.LBB11_47:
 mov     byte, ptr, [rsp], ah
 lea     rdi, [rip, +, .L__unnamed_8]
 lea     rcx, [rip, +, .L__unnamed_9]
 lea     r8, [rip, +, .L__unnamed_14]
 mov     rdx, rsp
 mov     esi, 43
 call    qword, ptr, [rip, +, _ZN4core6result13unwrap_failed17hb53671404b9e33c2E@GOTPCREL]
 jmp     .LBB11_57
.LBB11_48:
 mov     rax, qword, ptr, [rsp, +, 16]
 mov     qword, ptr, [rsp, +, 64], rax
 movups  xmm0, xmmword, ptr, [rsp]
 movaps  xmmword, ptr, [rsp, +, 48], xmm0
 lea     rax, [rsp, +, 48]
 mov     qword, ptr, [rsp, +, 112], rax
 lea     rax, [rip, +, _ZN60_$LT$alloc..string..String$u20$as$u20$core..fmt..Display$GT$3fmt17h3158fd30d2269b29E]
 mov     qword, ptr, [rsp, +, 120], rax
 lea     rax, [rip, +, .L__unnamed_15]
 mov     qword, ptr, [rsp], rax
 mov     qword, ptr, [rsp, +, 8], 2
 mov     qword, ptr, [rsp, +, 16], 0
 lea     rax, [rsp, +, 112]
 mov     qword, ptr, [rsp, +, 32], rax
 mov     qword, ptr, [rsp, +, 40], 1
 mov     rdi, rsp
 call    qword, ptr, [rip, +, _ZN3std2io5stdio7_eprint17h4d077c3ca706ec3fE@GOTPCREL]
 mov     rsi, qword, ptr, [rsp, +, 56]
 test    rsi, rsi
 je      .LBB11_52
 mov     rdi, qword, ptr, [rsp, +, 48]
 test    rdi, rdi
 je      .LBB11_52
 mov     edx, 1
 call    qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_52:
 mov     rbx, qword, ptr, [rsp, +, 96]
 mov     rbp, qword, ptr, [rsp, +, 104]
 cmp     rbp, rbx
 jne     .LBB11_58
.LBB11_53:
 mov     rax, qword, ptr, [rsp, +, 88]
 test    rax, rax
 je      .LBB11_56
 shl     rax, 3
 lea     rsi, [rax, +, 2*rax]
 test    rsi, rsi
 je      .LBB11_56
 mov     rdi, qword, ptr, [rsp, +, 80]
 mov     edx, 8
 call    qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_56:
 mov     edi, 1
 call    qword, ptr, [rip, +, _ZN3std7process4exit17h39bde275b62f86b5E@GOTPCREL]
.LBB11_57:
 ud2
.LBB11_58:
 mov     r14, qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
 jmp     .LBB11_60
.LBB11_59:
 add     rbx, 24
 cmp     rbx, rbp
 je      .LBB11_53
.LBB11_60:
 mov     rsi, qword, ptr, [rbx, +, 8]
 test    rsi, rsi
 je      .LBB11_59
 mov     rdi, qword, ptr, [rbx]
 test    rdi, rdi
 je      .LBB11_59
 mov     edx, 1
 call    r14
 jmp     .LBB11_59
.LBB11_63:
 mov     rbx, rax
 lea     rdi, [rsp, +, 48]
 call    core::ptr::drop_in_place<alloc::string::String>
 jmp     .LBB11_65
.LBB11_64:
 mov     rbx, rax
.LBB11_65:
 lea     rdi, [rsp, +, 80]
 call    core::ptr::drop_in_place<std::env::Args>
 jmp     .LBB11_70
.LBB11_66:
 mov     rbx, rax
 lea     rdi, [rsp, +, 80]
 call    core::ptr::drop_in_place<alloc::string::String>
 jmp     .LBB11_70
.LBB11_67:
 jmp     .LBB11_69
.LBB11_68:
.LBB11_69:
 mov     rbx, rax
.LBB11_70:
 lea     rdi, [rsp, +, 128]
 call    core::ptr::drop_in_place<std::env::Args>
 mov     rdi, rbx
 call    _Unwind_Resume
 ud2

The naïve codegen for &BitSlice -> Domain -> integer is awful, and it is going to remain so with runtime-only indices. I am always looking to improve it. After all, this (a debug build of <BitSlice<u32, Msb0> as BitField>::load_le)

<bitvec::slice::BitSlice<T,bitvec::order::Msb0> as bitvec::field::BitField>::load_le (/home/myrrlyn/.cargo/registry/src/github.com-1ecc6299db9ec823/bitvec-1.0.0/src/field.rs:234):
 sub     rsp, 520
 mov     qword, ptr, [rsp, +, 168], rdi
 mov     qword, ptr, [rsp, +, 176], rsi
 mov     qword, ptr, [rsp, +, 448], rdi
 mov     qword, ptr, [rsp, +, 456], rsi
 call    bitvec::slice::api::<impl bitvec::slice::BitSlice<T,O>>::len
 mov     qword, ptr, [rsp, +, 184], rax
 mov     rdx, qword, ptr, [rsp, +, 184]
 lea     rdi, [rip, +, .L__unnamed_55]
 mov     esi, 4
 call    bitvec::field::check
 mov     rdx, qword, ptr, [rsp, +, 176]
 mov     rsi, qword, ptr, [rsp, +, 168]
 lea     rdi, [rsp, +, 200]
 call    bitvec::slice::BitSlice<T,O>::domain
 mov     rax, qword, ptr, [rsp, +, 200]
 test    rax, rax
 je      .LBB325_5
 jmp     .LBB325_43
.LBB325_43:
 jmp     .LBB325_6
 ud2
.LBB325_5:
 mov     rax, qword, ptr, [rsp, +, 208]
 mov     qword, ptr, [rsp, +, 256], rax
 mov     rax, qword, ptr, [rsp, +, 216]
 mov     qword, ptr, [rsp, +, 264], rax
 call    bitvec::mem::bits_of
 mov     qword, ptr, [rsp, +, 160], rax
 jmp     .LBB325_37
.LBB325_6:
 mov     rax, qword, ptr, [rsp, +, 208]
 mov     qword, ptr, [rsp, +, 296], rax
 mov     rax, qword, ptr, [rsp, +, 216]
 mov     qword, ptr, [rsp, +, 304], rax
 mov     rcx, qword, ptr, [rsp, +, 224]
 mov     qword, ptr, [rsp, +, 144], rcx
 mov     rax, qword, ptr, [rsp, +, 232]
 mov     qword, ptr, [rsp, +, 152], rax
 mov     qword, ptr, [rsp, +, 464], rcx
 mov     qword, ptr, [rsp, +, 472], rax
 mov     rax, qword, ptr, [rsp, +, 240]
 mov     qword, ptr, [rsp, +, 312], rax
 mov     rax, qword, ptr, [rsp, +, 248]
 mov     qword, ptr, [rsp, +, 320], rax
 mov     dword, ptr, [rsp, +, 332], 0
 mov     eax, 1
 xor     ecx, ecx
 cmp     qword, ptr, [rsp, +, 312], 0
 cmove   rax, rcx
 cmp     rax, 1
 jne     .LBB325_8
 mov     rax, qword, ptr, [rsp, +, 312]
 mov     qword, ptr, [rsp, +, 336], rax
 mov     rax, qword, ptr, [rsp, +, 320]
 mov     qword, ptr, [rsp, +, 344], rax
 call    bitvec::mem::bits_of
 mov     qword, ptr, [rsp, +, 136], rax
 jmp     .LBB325_9
.LBB325_8:
 mov     rsi, qword, ptr, [rsp, +, 152]
 mov     rdi, qword, ptr, [rsp, +, 144]
 call    core::slice::<impl [T]>::iter
 mov     qword, ptr, [rsp, +, 120], rax
 mov     qword, ptr, [rsp, +, 128], rdx
 jmp     .LBB325_15
.LBB325_9:
 mov     rax, qword, ptr, [rsp, +, 136]
 mov     byte, ptr, [rsp, +, 118], al
 lea     rdi, [rsp, +, 336]
 call    bitvec::domain::PartialElement<M,T,O>::tail
 mov     byte, ptr, [rsp, +, 119], al
 mov     al, byte, ptr, [rsp, +, 119]
 movzx   edi, al
 call    bitvec::index::BitEnd<R>::into_inner
 mov     byte, ptr, [rsp, +, 117], al
 mov     cl, byte, ptr, [rsp, +, 117]
 mov     al, byte, ptr, [rsp, +, 118]
 sub     al, cl
 mov     byte, ptr, [rsp, +, 116], al
 setb    al
 test    al, 1
 jne     .LBB325_13
 mov     al, byte, ptr, [rsp, +, 116]
 mov     byte, ptr, [rsp, +, 483], al
 movups  xmm0, xmmword, ptr, [rsp, +, 336]
 movaps  xmmword, ptr, [rsp, +, 352], xmm0
 mov     rdi, qword, ptr, [rsp, +, 352]
 mov     rsi, qword, ptr, [rsp, +, 360]
 movzx   edx, al
 call    bitvec::field::get
 mov     dword, ptr, [rsp, +, 112], eax
 jmp     .LBB325_14
.LBB325_13:
 lea     rdi, [rip, +, str.0]
 lea     rdx, [rip, +, .L__unnamed_56]
 mov     rax, qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
 mov     esi, 33
 call    rax
 ud2
.LBB325_14:
 mov     eax, dword, ptr, [rsp, +, 112]
 mov     dword, ptr, [rsp, +, 332], eax
 jmp     .LBB325_8
.LBB325_15:
 mov     rsi, qword, ptr, [rsp, +, 128]
 mov     rdi, qword, ptr, [rsp, +, 120]
 call    core::iter::traits::iterator::Iterator::rev
 mov     qword, ptr, [rsp, +, 96], rax
 mov     qword, ptr, [rsp, +, 104], rdx
 mov     rsi, qword, ptr, [rsp, +, 104]
 mov     rdi, qword, ptr, [rsp, +, 96]
 call    core::iter::traits::iterator::Iterator::map
 mov     qword, ptr, [rsp, +, 80], rax
 mov     qword, ptr, [rsp, +, 88], rdx
 mov     rsi, qword, ptr, [rsp, +, 88]
 mov     rdi, qword, ptr, [rsp, +, 80]
 call    <I as core::iter::traits::collect::IntoIterator>::into_iter
 mov     qword, ptr, [rsp, +, 64], rax
 mov     qword, ptr, [rsp, +, 72], rdx
 mov     rax, qword, ptr, [rsp, +, 72]
 mov     rcx, qword, ptr, [rsp, +, 64]
 mov     qword, ptr, [rsp, +, 376], rcx
 mov     qword, ptr, [rsp, +, 384], rax
.LBB325_19:
 lea     rdi, [rsp, +, 376]
 call    <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
 mov     dword, ptr, [rsp, +, 396], edx
 mov     dword, ptr, [rsp, +, 392], eax
 mov     eax, dword, ptr, [rsp, +, 392]
 test    rax, rax
 je      .LBB325_22
 jmp     .LBB325_44
.LBB325_44:
 jmp     .LBB325_23
 ud2
.LBB325_22:
 mov     eax, 1
 xor     ecx, ecx
 cmp     qword, ptr, [rsp, +, 296], 0
 cmove   rax, rcx
 cmp     rax, 1
 je      .LBB325_27
 jmp     .LBB325_28
.LBB325_23:
 mov     eax, dword, ptr, [rsp, +, 396]
 mov     dword, ptr, [rsp, +, 52], eax
 mov     dword, ptr, [rsp, +, 484], eax
 call    bitvec::mem::bits_of
 mov     qword, ptr, [rsp, +, 56], rax
 mov     rsi, qword, ptr, [rsp, +, 56]
 lea     rdi, [rsp, +, 332]
 call    bitvec::field::maybe_shift_left
 mov     edi, dword, ptr, [rsp, +, 52]
 call    bitvec::field::resize
 mov     dword, ptr, [rsp, +, 48], eax
 mov     esi, dword, ptr, [rsp, +, 48]
 lea     rdi, [rsp, +, 332]
 call    <u32 as core::ops::bit::BitOrAssign>::bitor_assign
 jmp     .LBB325_19
.LBB325_27:
 mov     rax, qword, ptr, [rsp, +, 296]
 mov     qword, ptr, [rsp, +, 400], rax
 mov     rax, qword, ptr, [rsp, +, 304]
 mov     qword, ptr, [rsp, +, 408], rax
 call    bitvec::mem::bits_of
 mov     qword, ptr, [rsp, +, 40], rax
 jmp     .LBB325_29
.LBB325_28:
 mov     eax, dword, ptr, [rsp, +, 332]
 mov     dword, ptr, [rsp, +, 196], eax
 jmp     .LBB325_36
.LBB325_29:
 lea     rdi, [rsp, +, 400]
 call    bitvec::domain::PartialElement<M,T,O>::head
 mov     byte, ptr, [rsp, +, 39], al
 mov     al, byte, ptr, [rsp, +, 39]
 movzx   edi, al
 call    bitvec::index::BitIdx<R>::into_inner
 mov     byte, ptr, [rsp, +, 38], al
 mov     rax, qword, ptr, [rsp, +, 40]
 mov     cl, byte, ptr, [rsp, +, 38]
 movzx   ecx, cl
 sub     rax, rcx
 mov     qword, ptr, [rsp, +, 24], rax
 setb    al
 test    al, 1
 jne     .LBB325_33
 mov     rsi, qword, ptr, [rsp, +, 24]
 mov     qword, ptr, [rsp, +, 488], rsi
 lea     rdi, [rsp, +, 332]
 call    bitvec::field::maybe_shift_left
 jmp     .LBB325_34
.LBB325_33:
 lea     rdi, [rip, +, str.0]
 lea     rdx, [rip, +, .L__unnamed_57]
 mov     rax, qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
 mov     esi, 33
 call    rax
 ud2
.LBB325_34:
 movups  xmm0, xmmword, ptr, [rsp, +, 400]
 movaps  xmmword, ptr, [rsp, +, 416], xmm0
 mov     rdi, qword, ptr, [rsp, +, 416]
 mov     rsi, qword, ptr, [rsp, +, 424]
 xor     edx, edx
 call    bitvec::field::get
 mov     dword, ptr, [rsp, +, 20], eax
 mov     esi, dword, ptr, [rsp, +, 20]
 lea     rdi, [rsp, +, 332]
 call    <u32 as core::ops::bit::BitOrAssign>::bitor_assign
 jmp     .LBB325_28
.LBB325_36:
 lea     rax, [rsp, +, 184]
 mov     qword, ptr, [rsp, +, 440], rax
 mov     eax, dword, ptr, [rsp, +, 196]
 mov     rdi, qword, ptr, [rsp, +, 440]
 mov     dword, ptr, [rsp, +, 508], eax
 mov     qword, ptr, [rsp, +, 512], rdi
 mov     dword, ptr, [rsp, +, 504], eax
 mov     esi, dword, ptr, [rsp, +, 504]
 call    <bitvec::slice::BitSlice<T,bitvec::order::Msb0> as bitvec::field::BitField>::load_le::{{closure}}
 mov     dword, ptr, [rsp, +, 16], eax
 jmp     .LBB325_42
.LBB325_37:
 mov     rax, qword, ptr, [rsp, +, 160]
 mov     byte, ptr, [rsp, +, 14], al
 lea     rdi, [rsp, +, 256]
 call    bitvec::domain::PartialElement<M,T,O>::tail
 mov     byte, ptr, [rsp, +, 15], al
 mov     al, byte, ptr, [rsp, +, 15]
 movzx   edi, al
 call    bitvec::index::BitEnd<R>::into_inner
 mov     byte, ptr, [rsp, +, 13], al
 mov     cl, byte, ptr, [rsp, +, 13]
 mov     al, byte, ptr, [rsp, +, 14]
 sub     al, cl
 mov     byte, ptr, [rsp, +, 12], al
 setb    al
 test    al, 1
 jne     .LBB325_41
 mov     al, byte, ptr, [rsp, +, 12]
 mov     byte, ptr, [rsp, +, 503], al
 movups  xmm0, xmmword, ptr, [rsp, +, 256]
 movaps  xmmword, ptr, [rsp, +, 272], xmm0
 mov     rdi, qword, ptr, [rsp, +, 272]
 mov     rsi, qword, ptr, [rsp, +, 280]
 movzx   edx, al
 call    bitvec::field::get
 mov     dword, ptr, [rsp, +, 196], eax
 jmp     .LBB325_36
.LBB325_41:
 lea     rdi, [rip, +, str.0]
 lea     rdx, [rip, +, .L__unnamed_58]
 mov     rax, qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
 mov     esi, 33
 call    rax
 ud2
.LBB325_42:
 mov     eax, dword, ptr, [rsp, +, 16]
 add     rsp, 520
 ret

is atrocious. Unfortunately, it's also necessary in order to operate correctly on arbitrary regions. On the other hand, it is ripe for const propagation.

And LLVM is really good at const propagation.

So if you're at all able to use compile-time indices and demand inlining, absolutely do so.

I'll also admit I absolutely do not know enough about the #[inline] attribute to feel fully comfortable putting it back on everything. I think that between increasing hints for cross-crate inlining, turning up optimization levels, and using compile-time static indices, we can eliminate most of bitvec's codegen. Unfortunately, it is very obvious that as these levers become inaccessible to us (especially when indices are only known at runtime), the pessimal case is not great. And without going in and manually templating each type combination case (certainly possible, but I'd really rather not?), this is just the kind of code I have to write in order to function correctly.

So, tl;dr: I know this is a problem. I don't think I can do a whole lot about it, but I'm always looking for avenues to try out, and it's certainly possible to game the system to get rid of these costs in many cases.

Feb 23 '22 04:02 myrrlyn

bitvec bitvec copied to clipboard

Large generated code for a simple operation

bitvec
bitvec copied to clipboard