rust
rust copied to clipboard
Consistently use the highest bit of vector masks when converting to i1 vectors
This improves the codegen for vector `select`, `gather`, `scatter` and boolean reduction intrinsics and fixes rust-lang/portable-simd#316.
Examples of generated code seem to indicate that LLVM considers the high bit as the relevant one when masks are passed as parameters. Truncating a mask to i1 keeps only the lowest bit, so LLVM inserts a left shift to move that lowest bit into the high position for the following masked operation. With an additional right shift before truncating, the highest bit (the sign bit) is the one that is converted to i1, and the generated assembly no longer contains an additional shift.
All examples compiled with AVX2 target feature using `RUSTFLAGS="-Ctarget-cpu=skylake"`.
Select:
pub fn select(m: mask64x4, a: i64x4, b: i64x4) -> i64x4 {
m.select(a, b)
}
Before:
portable_simd_test::select:
mov rax, rdi
vmovdqa ymm0, ymmword ptr [rsi]
vmovapd ymm1, ymmword ptr [rcx]
vpsllq ymm0, ymm0, 63
vblendvpd ymm0, ymm1, ymmword ptr [rdx], ymm0
vmovapd ymmword ptr [rdi], ymm0
vzeroupper
ret
After:
portable_simd_test::select:
mov rax, rdi
vmovapd ymm0, ymmword ptr [rsi]
vmovapd ymm1, ymmword ptr [rcx]
vblendvpd ymm0, ymm1, ymmword ptr [rdx], ymm0
vmovapd ymmword ptr [rdi], ymm0
vzeroupper
ret
Reduce all:
pub unsafe fn mask_all(m: mask8x32) -> bool {
m.all()
}
Before:
portable_simd_test::mask_all:
vmovdqa ymm0, ymmword ptr [rdi]
vpsllw ymm0, ymm0, 7
vpmovmskb eax, ymm0
cmp eax, -1
sete al
vzeroupper
ret
After:
portable_simd_test::mask_all:
vmovdqa ymm0, ymmword ptr [rdi]
vpmovmskb eax, ymm0
cmp eax, -1
sete al
vzeroupper
ret
On a bigger scale, this also improves the performance of masked aggregation kernels in an experimental branch of the apache/arrow-rs project where those kernels are migrated from `packed_simd` to `core_simd`.
Opening as a draft for now because so far this is only tested for x86_64 targets, and because I'm a bit unsure of how to add or extend unit tests for these intrinsics.
r? @davidtwco
(rustbot has picked a reviewer for you, use r? to override)
Does this affect codegen for non-x86 platforms?
You may want to study the other simd intrinsic tests and also the assembly or codegen tests.
Does this affect codegen for non-x86 platforms?
Seems to have a very similar effect for target `aarch64-unknown-linux-gnu`. Example with simple `select` (using 128 bit registers now):
pub fn select(m: mask32x4, a: i32x4, b: i32x4) -> i32x4 {
m.select(a, b)
}
Before
portable_simd_test::select:
ldr q1, [x0]
ldr q0, [x1]
ldr q2, [x2]
shl v1.4s, v1.4s, #31
cmlt v1.4s, v1.4s, #0
bif v0.16b, v2.16b, v1.16b
str q0, [x8]
ret
After
portable_simd_test::select:
ldr q0, [x0]
ldr q1, [x1]
ldr q2, [x2]
cmlt v0.4s, v0.4s, #0
bsl v0.16b, v1.16b, v2.16b
str q0, [x8]
ret
Interestingly, there is a big effect on the `all` reduction in the following example:
pub unsafe fn mask_all(m: mask8x16) -> bool {
m.all()
}
Before
portable_simd_test::mask_all:
.cfi_startproc
sub sp, sp, #16
.cfi_def_cfa_offset 16
ldr q0, [x0]
mov w8, #65535
umov w9, v0.b[1]
umov w11, v0.b[2]
umov w10, v0.b[0]
umov w12, v0.b[3]
umov w13, v0.b[4]
umov w14, v0.b[5]
and w9, w9, #0x1
and w11, w11, #0x1
and w10, w10, #0x1
and w12, w12, #0x1
and w13, w13, #0x1
and w14, w14, #0x1
bfi w10, w9, #1, #1
umov w9, v0.b[6]
bfi w10, w11, #2, #1
umov w11, v0.b[7]
bfi w10, w12, #3, #1
umov w12, v0.b[8]
bfi w10, w13, #4, #1
umov w13, v0.b[9]
and w9, w9, #0x1
bfi w10, w14, #5, #1
umov w14, v0.b[10]
and w11, w11, #0x1
orr w9, w10, w9, lsl #6
umov w10, v0.b[11]
and w12, w12, #0x1
orr w9, w9, w11, lsl #7
umov w11, v0.b[12]
and w13, w13, #0x1
orr w9, w9, w12, lsl #8
umov w12, v0.b[13]
and w14, w14, #0x1
orr w9, w9, w13, lsl #9
umov w13, v0.b[14]
and w10, w10, #0x1
orr w9, w9, w14, lsl #10
and w11, w11, #0x1
orr w9, w9, w10, lsl #11
and w10, w12, #0x1
umov w12, v0.b[15]
orr w9, w9, w11, lsl #12
and w11, w13, #0x1
orr w9, w9, w10, lsl #13
orr w9, w9, w11, lsl #14
orr w9, w9, w12, lsl #15
bics wzr, w8, w9
cset w0, eq
add sp, sp, #16
.cfi_def_cfa_offset 0
ret
After
portable_simd_test::mask_all:
movi v0.2d, #0xffffffffffffffff
ldr q1, [x0]
cmgt v0.16b, v1.16b, v0.16b
umaxv b0, v0.16b
fmov w8, s0
mvn w8, w8
and w0, w8, #0x1
ret
But this does not seem to lead to an improvement for the `is_hex` function from rust-lang/portable-simd#303.
I haven't checked any other target platforms yet. Thanks for the pointer to the tests, I'll have a look at those.
Cool! That's weird, but also good to know.
That `is_hex` function from rust-lang/portable-simd#303 is so close to getting vectorized though.
pub fn is_hex_mask(chunk: &[u8; 16]) -> bool {
let x = u8x16::from_array(*chunk);
let m1 = x.simd_gt(splat(b'0' - 1));
let m2 = x.simd_lt(splat(b'9' + 1));
let m3 = x.simd_gt(splat(b'a' - 1));
let m4 = x.simd_lt(splat(b'f' + 1));
let m = (m1 & m2) | (m3 & m4);
m.all()
}
With the changes from this PR, changing that last line to either one of the following alternatives gets vectorized:
(m.to_int().simd_ne(splat(0))).all()
!((!m).any())
And maybe the weirdest one
fn mask_all(m: mask8x16) -> bool {
m.all()
}
mask_all(m)
All these seem to help llvm in knowing that the bit pattern is all ones/zeros.
Maybe this is less of a problem on x86 since LLVM knows which of the mask instructions only use the high bit, so the pattern of "shift to lowest, trunc to i1, i1 vector op" allows it to directly use the high bit. On aarch64, by contrast, the operations require a full mask, but with the shift this mask can be created with a simpler vector comparison.
r? wg-llvm
:umbrella: The latest upstream changes (presumably #106573) made this pull request unmergeable. Please resolve the merge conflicts.
That was an interesting rebase, with the tests having moved in the meantime. Sorry for taking so long coming back to this PR.
Nagisa said they'll try to get to this soon. In the meantime, my experience is that these kinds of PRs are very prone to having something break during the cross-platform builds, so let's
@bors try
:hourglass: Trying commit 9dd93913af8897728bead361c713ce87b961c748 with merge 8e6cd1d9198b904379afff0d143d7d037b710b61...
:sunny: Try build successful - checks-actions
Build commit: 8e6cd1d9198b904379afff0d143d7d037b710b61 (8e6cd1d9198b904379afff0d143d7d037b710b61)
Oh hm that doesn't run the full CI (anymore?) (did it ever?), I should probably just finally start up an aarch64 machine and test this on that then.
Since we already have a try build, it wouldn’t hurt to have a @rust-timer queue
Awaiting bors try build completion.
@rustbot label: +S-waiting-on-perf
Switching to waiting on author as it seems there has been a review. @jhorstmann Feel free to request a review with `@rustbot ready`, thanks!
@rustbot author
r? rust-lang/compiler
The job x86_64-gnu-llvm-14
failed! Check out the build log: (web) (plain)
Click to see the possible cause of the failure (guessed by this bot)
..........................................iiii..ii.iii...................
failures:
---- [assembly] tests/assembly/simd-intrinsic-mask-reduce.rs#aarch64 stdout ----
error in revision `aarch64`: verification with 'FileCheck' failed
status: exit status: 1
command: "/usr/lib/llvm-14/bin/FileCheck" "--input-file" "/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s" "/checkout/tests/assembly/simd-intrinsic-mask-reduce.rs" "--allow-unused-prefixes" "--check-prefixes" "CHECK,NONMSVC,aarch64" "--dump-input-context" "100"
stdout: none
--- stderr -------------------------------
/checkout/tests/assembly/simd-intrinsic-mask-reduce.rs:35:19: error: aarch64-NEXT: expected string not found in input
// aarch64-NEXT: umaxv
^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:13:6: note: scanning from here
cmgt v0.16b, v1.16b, v0.16b
^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:14:2: note: possible intended match here
umov w8, v0.b[1]
^
/checkout/tests/assembly/simd-intrinsic-mask-reduce.rs:45:19: error: aarch64-NEXT: expected string not found in input
// aarch64-NEXT: umaxv
^
^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:77:6: note: scanning from here
cmlt v0.16b, v0.16b, #0
^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:78:2: note: possible intended match here
umov w8, v0.b[1]
Input file: /checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s
Check file: /checkout/tests/assembly/simd-intrinsic-mask-reduce.rs
-dump-input=help explains the following input dump.
Input was:
<<<<<<
1: .text
1: .text
2: .file "simd_intrinsic_mask_reduce.ced9cd37d82699d5-cgu.0"
3: .section .text.mask_reduce_all,"ax",@progbits
4: .globl mask_reduce_all
5: .p2align 2
6: .type mask_reduce_all,@function
8: .cfi_startproc
9: sub sp, sp, #16
9: sub sp, sp, #16
10: .cfi_def_cfa_offset 16
11: movi v0.2d, #0xffffffffffffffff
12: ldr q1, [x0]
13: cmgt v0.16b, v1.16b, v0.16b
next:35'0 X~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
14: umov w8, v0.b[1]
next:35'0 ~~~~~~~~~~~~~~~~~~
next:35'1 ? possible intended match
15: umov w10, v0.b[2]
next:35'0 ~~~~~~~~~~~~~~~~~~~
16: umov w9, v0.b[0]
next:35'0 ~~~~~~~~~~~~~~~~~~
17: umov w11, v0.b[3]
next:35'0 ~~~~~~~~~~~~~~~~~~~
18: umov w12, v0.b[4]
next:35'0 ~~~~~~~~~~~~~~~~~~~
19: umov w13, v0.b[5]
next:35'0 ~~~~~~~~~~~~~~~~~~~
20: and w8, w8, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~
21: and w10, w10, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
22: and w9, w9, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~
23: and w11, w11, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
24: and w12, w12, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
25: and w13, w13, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
26: bfi w9, w8, #1, #1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
27: umov w8, v0.b[6]
next:35'0 ~~~~~~~~~~~~~~~~~~
28: bfi w9, w10, #2, #1
next:35'0 ~~~~~~~~~~~~~~~~~~~~~
29: umov w10, v0.b[7]
next:35'0 ~~~~~~~~~~~~~~~~~~~
30: bfi w9, w11, #3, #1
next:35'0 ~~~~~~~~~~~~~~~~~~~~~
31: umov w11, v0.b[8]
next:35'0 ~~~~~~~~~~~~~~~~~~~
32: bfi w9, w12, #4, #1
next:35'0 ~~~~~~~~~~~~~~~~~~~~~
33: umov w12, v0.b[9]
next:35'0 ~~~~~~~~~~~~~~~~~~~
34: and w8, w8, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~
35: bfi w9, w13, #5, #1
next:35'0 ~~~~~~~~~~~~~~~~~~~~~
36: umov w13, v0.b[10]
next:35'0 ~~~~~~~~~~~~~~~~~~~~
37: and w10, w10, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
38: orr w8, w9, w8, lsl #6
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~
39: umov w9, v0.b[11]
next:35'0 ~~~~~~~~~~~~~~~~~~~
40: and w11, w11, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
41: orr w8, w8, w10, lsl #7
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
42: umov w10, v0.b[12]
next:35'0 ~~~~~~~~~~~~~~~~~~~~
43: and w12, w12, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
44: orr w8, w8, w11, lsl #8
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
45: umov w11, v0.b[13]
next:35'0 ~~~~~~~~~~~~~~~~~~~~
46: and w13, w13, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
47: orr w8, w8, w12, lsl #9
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
48: umov w12, v0.b[14]
next:35'0 ~~~~~~~~~~~~~~~~~~~~
49: and w9, w9, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~
50: orr w8, w8, w13, lsl #10
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
51: and w10, w10, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
52: orr w8, w8, w9, lsl #11
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
53: and w9, w11, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~
54: umov w11, v0.b[15]
next:35'0 ~~~~~~~~~~~~~~~~~~~~
55: orr w8, w8, w10, lsl #12
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
56: and w10, w12, #0x1
next:35'0 ~~~~~~~~~~~~~~~~~~~~
57: orr w8, w8, w9, lsl #13
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
58: orr w8, w8, w10, lsl #14
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
59: orr w8, w8, w11, lsl #15
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
60: tst w8, #0xffff
next:35'0 ~~~~~~~~~~~~~~~~~
61: cset w0, eq
next:35'0 ~~~~~~~~~~~~~
62: add sp, sp, #16
next:35'0 ~~~~~~~~~~~~~~~~~
63: ret
next:35'0 ~~~~~
64: .Lfunc_end0:
next:35'0 ~~~~~~~~~~~~~
65: .size mask_reduce_all, .Lfunc_end0-mask_reduce_all
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
66: .cfi_endproc
next:35'0 ~~~~~~~~~~~~~~
67:
next:35'0 ~
68: .section .text.mask_reduce_any,"ax",@progbits
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69: .globl mask_reduce_any
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~
70: .p2align 2
next:35'0 ~~~~~~~~~~~~
71: .type mask_reduce_any,@function
next:35'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
72: mask_reduce_any:
next:35'0 ~~~~~~~~~~~~~~~~
73: .cfi_startproc
74: sub sp, sp, #16
75: .cfi_def_cfa_offset 16
76: ldr q0, [x0]
77: cmlt v0.16b, v0.16b, #0
next:45'0 X~~~~~~~~~~~~~~~~~~~ error: no match found
78: umov w8, v0.b[1]
next:45'0 ~~~~~~~~~~~~~~~~~~
next:45'1 ? possible intended match
79: umov w10, v0.b[2]
next:45'0 ~~~~~~~~~~~~~~~~~~~
80: umov w9, v0.b[0]
next:45'0 ~~~~~~~~~~~~~~~~~~
81: umov w11, v0.b[3]
next:45'0 ~~~~~~~~~~~~~~~~~~~
82: umov w12, v0.b[4]
next:45'0 ~~~~~~~~~~~~~~~~~~~
83: umov w13, v0.b[5]
next:45'0 ~~~~~~~~~~~~~~~~~~~
84: and w8, w8, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~
85: and w10, w10, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
86: and w9, w9, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~
87: and w11, w11, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
88: and w12, w12, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
89: and w13, w13, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
90: bfi w9, w8, #1, #1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
91: umov w8, v0.b[6]
next:45'0 ~~~~~~~~~~~~~~~~~~
92: bfi w9, w10, #2, #1
next:45'0 ~~~~~~~~~~~~~~~~~~~~~
93: umov w10, v0.b[7]
next:45'0 ~~~~~~~~~~~~~~~~~~~
94: bfi w9, w11, #3, #1
next:45'0 ~~~~~~~~~~~~~~~~~~~~~
95: umov w11, v0.b[8]
next:45'0 ~~~~~~~~~~~~~~~~~~~
96: bfi w9, w12, #4, #1
next:45'0 ~~~~~~~~~~~~~~~~~~~~~
97: umov w12, v0.b[9]
next:45'0 ~~~~~~~~~~~~~~~~~~~
98: and w8, w8, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~
99: bfi w9, w13, #5, #1
next:45'0 ~~~~~~~~~~~~~~~~~~~~~
100: umov w13, v0.b[10]
next:45'0 ~~~~~~~~~~~~~~~~~~~~
101: and w10, w10, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
102: orr w8, w9, w8, lsl #6
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~
103: umov w9, v0.b[11]
next:45'0 ~~~~~~~~~~~~~~~~~~~
104: and w11, w11, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
105: orr w8, w8, w10, lsl #7
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
106: umov w10, v0.b[12]
next:45'0 ~~~~~~~~~~~~~~~~~~~~
107: and w12, w12, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
108: orr w8, w8, w11, lsl #8
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
109: umov w11, v0.b[13]
next:45'0 ~~~~~~~~~~~~~~~~~~~~
110: and w13, w13, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
111: orr w8, w8, w12, lsl #9
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
112: umov w12, v0.b[14]
next:45'0 ~~~~~~~~~~~~~~~~~~~~
113: and w9, w9, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~
114: orr w8, w8, w13, lsl #10
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
115: and w10, w10, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
116: orr w8, w8, w9, lsl #11
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
117: and w9, w11, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~
118: umov w11, v0.b[15]
next:45'0 ~~~~~~~~~~~~~~~~~~~~
119: orr w8, w8, w10, lsl #12
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
120: and w10, w12, #0x1
next:45'0 ~~~~~~~~~~~~~~~~~~~~
121: orr w8, w8, w9, lsl #13
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~
122: orr w8, w8, w10, lsl #14
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
123: orr w8, w8, w11, lsl #15
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~
124: tst w8, #0xffff
next:45'0 ~~~~~~~~~~~~~~~~~
125: cset w0, ne
next:45'0 ~~~~~~~~~~~~~
126: add sp, sp, #16
next:45'0 ~~~~~~~~~~~~~~~~~
127: ret
next:45'0 ~~~~~
128: .Lfunc_end1:
next:45'0 ~~~~~~~~~~~~~
129: .size mask_reduce_any, .Lfunc_end1-mask_reduce_any
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
130: .cfi_endproc
next:45'0 ~~~~~~~~~~~~~~
131:
next:45'0 ~
132: .section ".note.GNU-stack","",@progbits
next:45'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------------------
failures:
[assembly] tests/assembly/simd-intrinsic-mask-reduce.rs#aarch64
test result: FAILED. 131 passed; 1 failed; 29 ignored; 0 measured; 0 filtered out; finished in 530.67ms
Some tests failed in compiletest suite=assembly mode=assembly host=x86_64-unknown-linux-gnu target=x86_64-unknown-linux-gnu
Sorry, I'm not able to review this. r? compiler
r? compiler
r? compiler
Perhaps we need a T-codegen.
Let's nominate it for the compiler meeting to find a reviewer.
r? @wesleywiser
I will put this on my TODO list to review
:umbrella: The latest upstream changes (presumably #114148) made this pull request unmergeable. Please resolve the merge conflicts.
The job x86_64-gnu-llvm-15
failed! Check out the build log: (web) (plain)
Click to see the possible cause of the failure (guessed by this bot)
##[group]Run git config --global core.autocrlf false
git config --global core.autocrlf false
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
lfs: false
submodules: false
set-safe-directory: true
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/setup-environment.sh
src/ci/scripts/setup-environment.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/should-skip-this.sh
src/ci/scripts/should-skip-this.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/verify-channel.sh
src/ci/scripts/verify-channel.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/collect-cpu-stats.sh
src/ci/scripts/collect-cpu-stats.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/install-sccache.sh
src/ci/scripts/install-sccache.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/select-xcode.sh
src/ci/scripts/select-xcode.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-clang.sh
src/ci/scripts/install-clang.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-wix.sh
src/ci/scripts/install-wix.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/disable-git-crlf-conversion.sh
src/ci/scripts/disable-git-crlf-conversion.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/install-msys2.sh
src/ci/scripts/install-msys2.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-mingw.sh
src/ci/scripts/install-mingw.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-ninja.sh
src/ci/scripts/install-ninja.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/enable-docker-ipv6.sh
src/ci/scripts/enable-docker-ipv6.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/disable-git-crlf-conversion.sh
src/ci/scripts/disable-git-crlf-conversion.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/verify-line-endings.sh
src/ci/scripts/verify-line-endings.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
CACHE_DOMAIN: ci-caches.rust-lang.org
IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/verify-backported-commits.sh
src/ci/scripts/verify-backported-commits.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/verify-stable-version-number.sh
src/ci/scripts/verify-stable-version-number.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/run-build-from-ci.sh
src/ci/scripts/run-build-from-ci.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
PR_CI_JOB: 1
CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
SCCACHE_BUCKET: rust-lang-ci-sccache2
TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
failures:
---- [codegen] tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs stdout ----
error: verification with 'FileCheck' failed
status: exit status: 1
command: "/usr/lib/llvm-15/bin/FileCheck" "--input-file" "/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll" "/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs" "--allow-unused-prefixes" "--check-prefixes" "CHECK,NONMSVC" "--dump-input-context" "100"
--- stderr -------------------------------
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:32:12: error: CHECK: expected string not found in input
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:32:12: error: CHECK: expected string not found in input
// CHECK: [[A:%[0-9]+]] = lshr <4 x i8> %{{m|1}}, <i8 7, i8 7, i8 7, i8 7>
Build completed unsuccessfully in 0:11:05
Build completed unsuccessfully in 0:11:05
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:7:23: note: scanning from here
define void @select_m8(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 4 dereferenceable(4) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 {
^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:12:2: note: possible intended match here
%3 = lshr <4 x i8> %0, <i8 7, i8 7, i8 7, i8 7>
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:41:12: error: CHECK: expected string not found in input
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:41:12: error: CHECK: expected string not found in input
// CHECK: [[A:%[0-9]+]] = lshr <4 x i32> %{{m|1}}, <i32 31, i32 31, i32 31, i32 31>
^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:20:24: note: scanning from here
define void @select_m32(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 16 dereferenceable(16) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 {
^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:25:2: note: possible intended match here
%3 = lshr <4 x i32> %0, <i32 31, i32 31, i32 31, i32 31>
Input file: /checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll
Check file: /checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs
-dump-input=help explains the following input dump.
Input was:
<<<<<<
<<<<<<
1: ; ModuleID = 'simd_intrinsic_generic_select.1439860ecd8c1a90-cgu.0'
2: source_filename = "simd_intrinsic_generic_select.1439860ecd8c1a90-cgu.0"
3: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
4: target triple = "x86_64-unknown-linux-gnu"
5:
6: ; Function Attrs: nonlazybind uwtable
7: define void @select_m8(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 4 dereferenceable(4) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 {
check:32'0 X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
8: start:
check:32'0 ~~~~~~~
9: %0 = load <4 x i8>, ptr %m, align 4
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10: %1 = load <4 x float>, ptr %a, align 16
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11: %2 = load <4 x float>, ptr %b, align 16
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12: %3 = lshr <4 x i8> %0, <i8 7, i8 7, i8 7, i8 7>
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check:32'1 ? possible intended match
13: %4 = trunc <4 x i8> %3 to <4 x i1>
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
14: %5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %2
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15: store <4 x float> %5, ptr %_0, align 16
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16: ret void
check:32'0 ~~~~~~~~~~
17: }
check:32'0 ~~
18:
check:32'0 ~
19: ; Function Attrs: nonlazybind uwtable
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20: define void @select_m32(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 16 dereferenceable(16) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 {
check:32'0 ~~~~~~~~~~~~~~~~~~~~~~~
check:41'0 X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
21: start:
check:41'0 ~~~~~~~
22: %0 = load <4 x i32>, ptr %m, align 16
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
23: %1 = load <4 x float>, ptr %a, align 16
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24: %2 = load <4 x float>, ptr %b, align 16
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25: %3 = lshr <4 x i32> %0, <i32 31, i32 31, i32 31, i32 31>
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check:41'1 ? possible intended match
26: %4 = trunc <4 x i32> %3 to <4 x i1>
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27: %5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %2
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
28: store <4 x float> %5, ptr %_0, align 16
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29: ret void
check:41'0 ~~~~~~~~~~
30: }
check:41'0 ~~
31:
check:41'0 ~
32: ; Function Attrs: nonlazybind uwtable
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
33: define void @select_bitmask(ptr noalias nocapture noundef sret(<8 x float>) align 32 dereferenceable(32) %_0, i8 noundef %m, ptr noalias nocapture noundef align 32 dereferenceable(32) %a, ptr noalias nocapture noundef align 32 dereferenceable(32) %b) unnamed_addr #0 {
check:41'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
34: start:
35: %0 = load <8 x float>, ptr %a, align 32
36: %1 = load <8 x float>, ptr %b, align 32
37: %2 = bitcast i8 %m to <8 x i1>
38: %3 = select <8 x i1> %2, <8 x float> %0, <8 x float> %1
39: store <8 x float> %3, ptr %_0, align 32
40: ret void
41: }
42:
43: attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
44:
45: !llvm.module.flags = !{!0, !1}
46: !llvm.ident = !{!2}
47:
48: !0 = !{i32 7, !"PIC Level", i32 2}
49: !1 = !{i32 2, !"RtLibUseGOT", i32 1}
50: !2 = !{!"rustc version 1.73.0-nightly (f9a7e8786 2023-07-30)"}
------------------------------------------
:umbrella: The latest upstream changes (presumably #105545) made this pull request unmergeable. Please resolve the merge conflicts.
:umbrella: The latest upstream changes (presumably #117444) made this pull request unmergeable. Please resolve the merge conflicts.
@jhorstmann any updates on this? thanks
@Dylan-DPC I updated the PR and also adjusted the new masked load/store intrinsics to use the same logic. I also added another assembly test for masked load that shows there are no unneeded shift instructions in the output. The current output is
load_f64x4:
vpsllq ymm0, ymmword ptr [rdi], 63
vpmovq2m k1, ymm0
vmovupd ymm0 {k1} {z}, ymmword ptr [rsi]
vmovapd ymmword ptr [rdx], ymm0
vzeroupper
ret
(https://rust.godbolt.org/z/ThTz6E39s)