bitvec
bitvec copied to clipboard
Large generated code for a simple operation
In this example, the generated code calls into BitSliceIndex::index and into BitField::load_le:
use bitvec::prelude::*;
/// Reads bits 8 through 31 (in `Msb0` order) of `val` as one little-endian
/// field and returns them as a `u32`.
///
/// The result is expected to equal `val & 0x00FF_FFFF`.
fn extract_24(val: u32) -> u32 {
    // Wrap the integer so its bits can be addressed by index.
    let bits = BitArray::<u32, Msb0>::new(val);
    // Skip the first 8 bits in Msb0 order and keep the remaining 24.
    let field = &bits[8..32];
    field.load_le::<u32>()
}
Both called functions are very large. I would have expected the calls to be optimized out.
Unless I'm mistaken, the outcome should be equivalent to val & 0xffffff.
This is on macOS arm64 with Rust 1.58.0 stable.
Full code is here: https://github.com/mstange/bitvec-test
So from a theoretical perspective: I know. It's an area of perpetual improvement. The primary culprit is the bitvec::domain module, which is responsible for segmenting an arbitrary &BitSlice into partial and whole elements. In particular, Domain::new is an ugly and large match stack that is guaranteed to be extremely hot and in need of improvement or outright skipping.
However: your sample code is marked #[inline(never)]. Don't do that? If I comment out just that attribute and then disassemble the result with
$ cargo +nightly install cargo-asm --vers 0.1.16
$ cargo +nightly asm bitvec_test::main --build-type release --no-color | bat -lasm
/ load_value
I get, at lines 105 and 106 of the output:
call qword, ptr, [rip, +, _ZN47_$LT$u32$u20$as$u20$bitvec..store..BitStore$GT$10load_value17h8fbc73463f745022E@GOTPCREL]
and eax, 16777215
which is "read an integer out of memory" followed by "mask it with 0x00FF_FFFF". It's weird that the .load_value() call isn't inlined, but that's a different thing to chase. Anyway, full dump:
bitvec_test::main:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 168
lea rbx, [rsp, +, 128]
mov rdi, rbx
call qword, ptr, [rip, +, _ZN3std3env4args17h2fa6ae5095eceb16E@GOTPCREL]
mov qword, ptr, [rsp, +, 160], 1
mov rdi, rsp
mov rsi, rbx
call qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$9size_hint17h2d86283ab7b22d14E@GOTPCREL]
mov rax, qword, ptr, [rsp]
mov rsi, qword, ptr, [rsp, +, 8]
mov rcx, qword, ptr, [rsp, +, 16]
mov r12, qword, ptr, [rsp, +, 160]
xor edi, edi
mov rdx, rax
sub rdx, r12
cmovb rdx, rdi
sub rcx, r12
cmovb rcx, rdi
mov qword, ptr, [rsp, +, 48], rsi
mov qword, ptr, [rsp, +, 56], rcx
mov qword, ptr, [rsp, +, 88], rdx
mov qword, ptr, [rsp, +, 80], 1
cmp rsi, 1
jne .LBB11_41
cmp rcx, rdx
jne .LBB11_41
cmp rax, r12
jbe .LBB11_42
test r12, r12
je .LBB11_17
mov qword, ptr, [rsp, +, 160], 0
add r12, -1
je .LBB11_13
xor r13d, r13d
lea r15, [rsp, +, 128]
mov rbp, qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
jmp .LBB11_8
.LBB11_7:
cmp r12, r13
je .LBB11_13
.LBB11_8:
mov rbx, r14
mov rdi, rsp
mov rsi, r15
call rbp
mov rdi, qword, ptr, [rsp]
mov r14, qword, ptr, [rsp, +, 8]
test rdi, rdi
cmove rdi, r13
cmovne rbx, r14
je .LBB11_17
add r13, 1
test rbx, rbx
je .LBB11_7
test rdi, rdi
je .LBB11_7
mov edx, 1
mov rsi, rbx
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
jmp .LBB11_7
.LBB11_13:
mov rdi, rsp
lea rsi, [rsp, +, 128]
call qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
mov rdi, qword, ptr, [rsp]
test rdi, rdi
je .LBB11_17
mov rsi, qword, ptr, [rsp, +, 8]
test rsi, rsi
je .LBB11_17
mov edx, 1
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_17:
mov rdi, rsp
lea rsi, [rsp, +, 128]
call qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
cmp qword, ptr, [rsp], 0
je .LBB11_46
mov rsi, qword, ptr, [rsp, +, 16]
mov qword, ptr, [rsp, +, 96], rsi
movups xmm0, xmmword, ptr, [rsp]
movaps xmmword, ptr, [rsp, +, 80], xmm0
mov rbx, qword, ptr, [rsp, +, 80]
cmp rsi, 2
jb .LBB11_21
movzx eax, word, ptr, [rbx]
cmp eax, 30768
je .LBB11_38
.LBB11_21:
mov rdi, rbx
call qword, ptr, [rip, +, _ZN4core3num60_$LT$impl$u20$core..str..traits..FromStr$u20$for$u20$u32$GT$8from_str17hb0615c3f2cf106b0E@GOTPCREL]
test al, 1
jne .LBB11_47
.LBB11_23:
shr rax, 32
mov dword, ptr, [rsp], eax
mov rdi, rsp
call qword, ptr, [rip, +, _ZN47_$LT$u32$u20$as$u20$bitvec..store..BitStore$GT$10load_value17h8fbc73463f745022E@GOTPCREL]
and eax, 16777215
mov dword, ptr, [rsp, +, 112], eax
lea rax, [rsp, +, 112]
mov qword, ptr, [rsp, +, 48], rax
mov rax, qword, ptr, [rip, +, _ZN4core3fmt3num3imp52_$LT$impl$u20$core..fmt..Display$u20$for$u20$u32$GT$3fmt17hc9b8e4322a951e30E@GOTPCREL]
mov qword, ptr, [rsp, +, 56], rax
lea rax, [rip, +, .L__unnamed_7]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 8], 2
mov qword, ptr, [rsp, +, 16], 0
lea rax, [rsp, +, 48]
mov qword, ptr, [rsp, +, 32], rax
mov qword, ptr, [rsp, +, 40], 1
mov rdi, rsp
call qword, ptr, [rip, +, _ZN3std2io5stdio7_eprint17h4d077c3ca706ec3fE@GOTPCREL]
mov rsi, qword, ptr, [rsp, +, 88]
test rsi, rsi
je .LBB11_28
test rbx, rbx
je .LBB11_28
mov edx, 1
mov rdi, rbx
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_28:
mov rbx, qword, ptr, [rsp, +, 144]
mov rbp, qword, ptr, [rsp, +, 152]
cmp rbp, rbx
je .LBB11_34
mov r14, qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
jmp .LBB11_31
.LBB11_30:
add rbx, 24
cmp rbx, rbp
je .LBB11_34
.LBB11_31:
mov rsi, qword, ptr, [rbx, +, 8]
test rsi, rsi
je .LBB11_30
mov rdi, qword, ptr, [rbx]
test rdi, rdi
je .LBB11_30
mov edx, 1
call r14
jmp .LBB11_30
.LBB11_34:
mov rax, qword, ptr, [rsp, +, 136]
test rax, rax
je .LBB11_37
shl rax, 3
lea rsi, [rax, +, 2*rax]
test rsi, rsi
je .LBB11_37
mov rdi, qword, ptr, [rsp, +, 128]
mov edx, 8
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_37:
add rsp, 168
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB11_38:
lea rdi, [rbx, +, 2]
add rsi, -2
mov edx, 16
call qword, ptr, [rip, +, _ZN4core3num21_$LT$impl$u20$u32$GT$14from_str_radix17h029ea4c7fafc0898E@GOTPCREL]
test al, 1
je .LBB11_23
mov byte, ptr, [rsp], ah
lea rdi, [rip, +, .L__unnamed_8]
lea rcx, [rip, +, .L__unnamed_9]
lea r8, [rip, +, .L__unnamed_10]
mov rdx, rsp
mov esi, 43
call qword, ptr, [rip, +, _ZN4core6result13unwrap_failed17hb53671404b9e33c2E@GOTPCREL]
jmp .LBB11_57
.LBB11_41:
mov qword, ptr, [rsp], 0
lea rdi, [rsp, +, 48]
lea rsi, [rsp, +, 80]
mov rdx, rsp
call core::panicking::assert_failed
jmp .LBB11_57
.LBB11_42:
lea rdi, [rsp, +, 80]
call qword, ptr, [rip, +, _ZN3std3env4args17h2fa6ae5095eceb16E@GOTPCREL]
mov rdi, rsp
lea rsi, [rsp, +, 80]
call qword, ptr, [rip, +, _ZN73_$LT$std..env..Args$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h87bf078216e1165cE@GOTPCREL]
cmp qword, ptr, [rsp], 0
jne .LBB11_48
lea rdi, [rip, +, .L__unnamed_11]
lea rdx, [rip, +, .L__unnamed_12]
mov esi, 43
call qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
jmp .LBB11_57
.LBB11_46:
lea rdi, [rip, +, .L__unnamed_11]
lea rdx, [rip, +, .L__unnamed_13]
mov esi, 43
call qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
jmp .LBB11_57
.LBB11_47:
mov byte, ptr, [rsp], ah
lea rdi, [rip, +, .L__unnamed_8]
lea rcx, [rip, +, .L__unnamed_9]
lea r8, [rip, +, .L__unnamed_14]
mov rdx, rsp
mov esi, 43
call qword, ptr, [rip, +, _ZN4core6result13unwrap_failed17hb53671404b9e33c2E@GOTPCREL]
jmp .LBB11_57
.LBB11_48:
mov rax, qword, ptr, [rsp, +, 16]
mov qword, ptr, [rsp, +, 64], rax
movups xmm0, xmmword, ptr, [rsp]
movaps xmmword, ptr, [rsp, +, 48], xmm0
lea rax, [rsp, +, 48]
mov qword, ptr, [rsp, +, 112], rax
lea rax, [rip, +, _ZN60_$LT$alloc..string..String$u20$as$u20$core..fmt..Display$GT$3fmt17h3158fd30d2269b29E]
mov qword, ptr, [rsp, +, 120], rax
lea rax, [rip, +, .L__unnamed_15]
mov qword, ptr, [rsp], rax
mov qword, ptr, [rsp, +, 8], 2
mov qword, ptr, [rsp, +, 16], 0
lea rax, [rsp, +, 112]
mov qword, ptr, [rsp, +, 32], rax
mov qword, ptr, [rsp, +, 40], 1
mov rdi, rsp
call qword, ptr, [rip, +, _ZN3std2io5stdio7_eprint17h4d077c3ca706ec3fE@GOTPCREL]
mov rsi, qword, ptr, [rsp, +, 56]
test rsi, rsi
je .LBB11_52
mov rdi, qword, ptr, [rsp, +, 48]
test rdi, rdi
je .LBB11_52
mov edx, 1
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_52:
mov rbx, qword, ptr, [rsp, +, 96]
mov rbp, qword, ptr, [rsp, +, 104]
cmp rbp, rbx
jne .LBB11_58
.LBB11_53:
mov rax, qword, ptr, [rsp, +, 88]
test rax, rax
je .LBB11_56
shl rax, 3
lea rsi, [rax, +, 2*rax]
test rsi, rsi
je .LBB11_56
mov rdi, qword, ptr, [rsp, +, 80]
mov edx, 8
call qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
.LBB11_56:
mov edi, 1
call qword, ptr, [rip, +, _ZN3std7process4exit17h39bde275b62f86b5E@GOTPCREL]
.LBB11_57:
ud2
.LBB11_58:
mov r14, qword, ptr, [rip, +, __rust_dealloc@GOTPCREL]
jmp .LBB11_60
.LBB11_59:
add rbx, 24
cmp rbx, rbp
je .LBB11_53
.LBB11_60:
mov rsi, qword, ptr, [rbx, +, 8]
test rsi, rsi
je .LBB11_59
mov rdi, qword, ptr, [rbx]
test rdi, rdi
je .LBB11_59
mov edx, 1
call r14
jmp .LBB11_59
.LBB11_63:
mov rbx, rax
lea rdi, [rsp, +, 48]
call core::ptr::drop_in_place<alloc::string::String>
jmp .LBB11_65
.LBB11_64:
mov rbx, rax
.LBB11_65:
lea rdi, [rsp, +, 80]
call core::ptr::drop_in_place<std::env::Args>
jmp .LBB11_70
.LBB11_66:
mov rbx, rax
lea rdi, [rsp, +, 80]
call core::ptr::drop_in_place<alloc::string::String>
jmp .LBB11_70
.LBB11_67:
jmp .LBB11_69
.LBB11_68:
.LBB11_69:
mov rbx, rax
.LBB11_70:
lea rdi, [rsp, +, 128]
call core::ptr::drop_in_place<std::env::Args>
mov rdi, rbx
call _Unwind_Resume
ud2
The naïve codegen for &BitSlice -> Domain -> integer is awful, and it is going to remain so with runtime-only indices. I am always looking to improve it, because this (a debug build of <BitSlice<u32, Msb0> as BitField>::load_le)
<bitvec::slice::BitSlice<T,bitvec::order::Msb0> as bitvec::field::BitField>::load_le (/home/myrrlyn/.cargo/registry/src/github.com-1ecc6299db9ec823/bitvec-1.0.0/src/field.rs:234):
sub rsp, 520
mov qword, ptr, [rsp, +, 168], rdi
mov qword, ptr, [rsp, +, 176], rsi
mov qword, ptr, [rsp, +, 448], rdi
mov qword, ptr, [rsp, +, 456], rsi
call bitvec::slice::api::<impl bitvec::slice::BitSlice<T,O>>::len
mov qword, ptr, [rsp, +, 184], rax
mov rdx, qword, ptr, [rsp, +, 184]
lea rdi, [rip, +, .L__unnamed_55]
mov esi, 4
call bitvec::field::check
mov rdx, qword, ptr, [rsp, +, 176]
mov rsi, qword, ptr, [rsp, +, 168]
lea rdi, [rsp, +, 200]
call bitvec::slice::BitSlice<T,O>::domain
mov rax, qword, ptr, [rsp, +, 200]
test rax, rax
je .LBB325_5
jmp .LBB325_43
.LBB325_43:
jmp .LBB325_6
ud2
.LBB325_5:
mov rax, qword, ptr, [rsp, +, 208]
mov qword, ptr, [rsp, +, 256], rax
mov rax, qword, ptr, [rsp, +, 216]
mov qword, ptr, [rsp, +, 264], rax
call bitvec::mem::bits_of
mov qword, ptr, [rsp, +, 160], rax
jmp .LBB325_37
.LBB325_6:
mov rax, qword, ptr, [rsp, +, 208]
mov qword, ptr, [rsp, +, 296], rax
mov rax, qword, ptr, [rsp, +, 216]
mov qword, ptr, [rsp, +, 304], rax
mov rcx, qword, ptr, [rsp, +, 224]
mov qword, ptr, [rsp, +, 144], rcx
mov rax, qword, ptr, [rsp, +, 232]
mov qword, ptr, [rsp, +, 152], rax
mov qword, ptr, [rsp, +, 464], rcx
mov qword, ptr, [rsp, +, 472], rax
mov rax, qword, ptr, [rsp, +, 240]
mov qword, ptr, [rsp, +, 312], rax
mov rax, qword, ptr, [rsp, +, 248]
mov qword, ptr, [rsp, +, 320], rax
mov dword, ptr, [rsp, +, 332], 0
mov eax, 1
xor ecx, ecx
cmp qword, ptr, [rsp, +, 312], 0
cmove rax, rcx
cmp rax, 1
jne .LBB325_8
mov rax, qword, ptr, [rsp, +, 312]
mov qword, ptr, [rsp, +, 336], rax
mov rax, qword, ptr, [rsp, +, 320]
mov qword, ptr, [rsp, +, 344], rax
call bitvec::mem::bits_of
mov qword, ptr, [rsp, +, 136], rax
jmp .LBB325_9
.LBB325_8:
mov rsi, qword, ptr, [rsp, +, 152]
mov rdi, qword, ptr, [rsp, +, 144]
call core::slice::<impl [T]>::iter
mov qword, ptr, [rsp, +, 120], rax
mov qword, ptr, [rsp, +, 128], rdx
jmp .LBB325_15
.LBB325_9:
mov rax, qword, ptr, [rsp, +, 136]
mov byte, ptr, [rsp, +, 118], al
lea rdi, [rsp, +, 336]
call bitvec::domain::PartialElement<M,T,O>::tail
mov byte, ptr, [rsp, +, 119], al
mov al, byte, ptr, [rsp, +, 119]
movzx edi, al
call bitvec::index::BitEnd<R>::into_inner
mov byte, ptr, [rsp, +, 117], al
mov cl, byte, ptr, [rsp, +, 117]
mov al, byte, ptr, [rsp, +, 118]
sub al, cl
mov byte, ptr, [rsp, +, 116], al
setb al
test al, 1
jne .LBB325_13
mov al, byte, ptr, [rsp, +, 116]
mov byte, ptr, [rsp, +, 483], al
movups xmm0, xmmword, ptr, [rsp, +, 336]
movaps xmmword, ptr, [rsp, +, 352], xmm0
mov rdi, qword, ptr, [rsp, +, 352]
mov rsi, qword, ptr, [rsp, +, 360]
movzx edx, al
call bitvec::field::get
mov dword, ptr, [rsp, +, 112], eax
jmp .LBB325_14
.LBB325_13:
lea rdi, [rip, +, str.0]
lea rdx, [rip, +, .L__unnamed_56]
mov rax, qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
mov esi, 33
call rax
ud2
.LBB325_14:
mov eax, dword, ptr, [rsp, +, 112]
mov dword, ptr, [rsp, +, 332], eax
jmp .LBB325_8
.LBB325_15:
mov rsi, qword, ptr, [rsp, +, 128]
mov rdi, qword, ptr, [rsp, +, 120]
call core::iter::traits::iterator::Iterator::rev
mov qword, ptr, [rsp, +, 96], rax
mov qword, ptr, [rsp, +, 104], rdx
mov rsi, qword, ptr, [rsp, +, 104]
mov rdi, qword, ptr, [rsp, +, 96]
call core::iter::traits::iterator::Iterator::map
mov qword, ptr, [rsp, +, 80], rax
mov qword, ptr, [rsp, +, 88], rdx
mov rsi, qword, ptr, [rsp, +, 88]
mov rdi, qword, ptr, [rsp, +, 80]
call <I as core::iter::traits::collect::IntoIterator>::into_iter
mov qword, ptr, [rsp, +, 64], rax
mov qword, ptr, [rsp, +, 72], rdx
mov rax, qword, ptr, [rsp, +, 72]
mov rcx, qword, ptr, [rsp, +, 64]
mov qword, ptr, [rsp, +, 376], rcx
mov qword, ptr, [rsp, +, 384], rax
.LBB325_19:
lea rdi, [rsp, +, 376]
call <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
mov dword, ptr, [rsp, +, 396], edx
mov dword, ptr, [rsp, +, 392], eax
mov eax, dword, ptr, [rsp, +, 392]
test rax, rax
je .LBB325_22
jmp .LBB325_44
.LBB325_44:
jmp .LBB325_23
ud2
.LBB325_22:
mov eax, 1
xor ecx, ecx
cmp qword, ptr, [rsp, +, 296], 0
cmove rax, rcx
cmp rax, 1
je .LBB325_27
jmp .LBB325_28
.LBB325_23:
mov eax, dword, ptr, [rsp, +, 396]
mov dword, ptr, [rsp, +, 52], eax
mov dword, ptr, [rsp, +, 484], eax
call bitvec::mem::bits_of
mov qword, ptr, [rsp, +, 56], rax
mov rsi, qword, ptr, [rsp, +, 56]
lea rdi, [rsp, +, 332]
call bitvec::field::maybe_shift_left
mov edi, dword, ptr, [rsp, +, 52]
call bitvec::field::resize
mov dword, ptr, [rsp, +, 48], eax
mov esi, dword, ptr, [rsp, +, 48]
lea rdi, [rsp, +, 332]
call <u32 as core::ops::bit::BitOrAssign>::bitor_assign
jmp .LBB325_19
.LBB325_27:
mov rax, qword, ptr, [rsp, +, 296]
mov qword, ptr, [rsp, +, 400], rax
mov rax, qword, ptr, [rsp, +, 304]
mov qword, ptr, [rsp, +, 408], rax
call bitvec::mem::bits_of
mov qword, ptr, [rsp, +, 40], rax
jmp .LBB325_29
.LBB325_28:
mov eax, dword, ptr, [rsp, +, 332]
mov dword, ptr, [rsp, +, 196], eax
jmp .LBB325_36
.LBB325_29:
lea rdi, [rsp, +, 400]
call bitvec::domain::PartialElement<M,T,O>::head
mov byte, ptr, [rsp, +, 39], al
mov al, byte, ptr, [rsp, +, 39]
movzx edi, al
call bitvec::index::BitIdx<R>::into_inner
mov byte, ptr, [rsp, +, 38], al
mov rax, qword, ptr, [rsp, +, 40]
mov cl, byte, ptr, [rsp, +, 38]
movzx ecx, cl
sub rax, rcx
mov qword, ptr, [rsp, +, 24], rax
setb al
test al, 1
jne .LBB325_33
mov rsi, qword, ptr, [rsp, +, 24]
mov qword, ptr, [rsp, +, 488], rsi
lea rdi, [rsp, +, 332]
call bitvec::field::maybe_shift_left
jmp .LBB325_34
.LBB325_33:
lea rdi, [rip, +, str.0]
lea rdx, [rip, +, .L__unnamed_57]
mov rax, qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
mov esi, 33
call rax
ud2
.LBB325_34:
movups xmm0, xmmword, ptr, [rsp, +, 400]
movaps xmmword, ptr, [rsp, +, 416], xmm0
mov rdi, qword, ptr, [rsp, +, 416]
mov rsi, qword, ptr, [rsp, +, 424]
xor edx, edx
call bitvec::field::get
mov dword, ptr, [rsp, +, 20], eax
mov esi, dword, ptr, [rsp, +, 20]
lea rdi, [rsp, +, 332]
call <u32 as core::ops::bit::BitOrAssign>::bitor_assign
jmp .LBB325_28
.LBB325_36:
lea rax, [rsp, +, 184]
mov qword, ptr, [rsp, +, 440], rax
mov eax, dword, ptr, [rsp, +, 196]
mov rdi, qword, ptr, [rsp, +, 440]
mov dword, ptr, [rsp, +, 508], eax
mov qword, ptr, [rsp, +, 512], rdi
mov dword, ptr, [rsp, +, 504], eax
mov esi, dword, ptr, [rsp, +, 504]
call <bitvec::slice::BitSlice<T,bitvec::order::Msb0> as bitvec::field::BitField>::load_le::{{closure}}
mov dword, ptr, [rsp, +, 16], eax
jmp .LBB325_42
.LBB325_37:
mov rax, qword, ptr, [rsp, +, 160]
mov byte, ptr, [rsp, +, 14], al
lea rdi, [rsp, +, 256]
call bitvec::domain::PartialElement<M,T,O>::tail
mov byte, ptr, [rsp, +, 15], al
mov al, byte, ptr, [rsp, +, 15]
movzx edi, al
call bitvec::index::BitEnd<R>::into_inner
mov byte, ptr, [rsp, +, 13], al
mov cl, byte, ptr, [rsp, +, 13]
mov al, byte, ptr, [rsp, +, 14]
sub al, cl
mov byte, ptr, [rsp, +, 12], al
setb al
test al, 1
jne .LBB325_41
mov al, byte, ptr, [rsp, +, 12]
mov byte, ptr, [rsp, +, 503], al
movups xmm0, xmmword, ptr, [rsp, +, 256]
movaps xmmword, ptr, [rsp, +, 272], xmm0
mov rdi, qword, ptr, [rsp, +, 272]
mov rsi, qword, ptr, [rsp, +, 280]
movzx edx, al
call bitvec::field::get
mov dword, ptr, [rsp, +, 196], eax
jmp .LBB325_36
.LBB325_41:
lea rdi, [rip, +, str.0]
lea rdx, [rip, +, .L__unnamed_58]
mov rax, qword, ptr, [rip, +, _ZN4core9panicking5panic17h86fc01e270142a61E@GOTPCREL]
mov esi, 33
call rax
ud2
.LBB325_42:
mov eax, dword, ptr, [rsp, +, 16]
add rsp, 520
ret
is atrocious. Unfortunately, it's also necessary in order to operate correctly on arbitrary regions. On the other hand, it is ripe for const propagation.
And LLVM is really good at const propagation.
So if you're at all able to use compile-time indices and demand inlining, absolutely do so.
I'll also admit I absolutely do not know enough about the #[inline] attribute to feel fully comfortable putting it back onto everything. I think that between increasing hints for cross-crate inlining, turning up optimization levels, and using compile-time static indices, we can eliminate a lot of bitvec's codegen. Unfortunately, it is very obvious that as these levers become inaccessible to us (especially when indices are only known at runtime), the pessimal case is not great. And without going in and manually templating each type-combination case (certainly possible, but I'd really rather not), this is just the kind of code I have to write in order to function correctly.
So, tl;dr: I know this is a problem. I don't think I can do a whole lot about it, but I'm always looking for avenues to try out, and it's certainly possible to game the system to get rid of these costs in many cases.