rust icon indicating copy to clipboard operation
rust copied to clipboard

Consistently use the highest bit of vector masks when converting to i1 vectors

Open jhorstmann opened this issue 1 year ago • 34 comments

This improves the codegen for vector select, gather, scatter and boolean reduction intrinsics and fixes rust-lang/portable-simd#316.

Examples of generated code seem to indicate that LLVM considers the high bit as relevant when masks are passed as parameters. When a mask is truncated to i1, the lowest bit is kept, so LLVM inserts a left shift to move that lowest bit into the high position for the following masked operation. With an additional right shift before truncating, the highest bit (the sign bit) is the one converted to i1, and the generated assembly no longer contains an additional shift.

All examples compiled with AVX2 target feature using RUSTFLAGS="-Ctarget-cpu=skylake".

Select:

pub fn select(m: mask64x4, a: i64x4, b: i64x4) -> i64x4 {
    m.select(a, b)
}

Before:

portable_simd_test::select:
	mov rax, rdi
	vmovdqa ymm0, ymmword ptr [rsi]
	vmovapd ymm1, ymmword ptr [rcx]
	vpsllq ymm0, ymm0, 63
	vblendvpd ymm0, ymm1, ymmword ptr [rdx], ymm0
	vmovapd ymmword ptr [rdi], ymm0
	vzeroupper
	ret

After:

portable_simd_test::select:
	mov rax, rdi
	vmovapd ymm0, ymmword ptr [rsi]
	vmovapd ymm1, ymmword ptr [rcx]
	vblendvpd ymm0, ymm1, ymmword ptr [rdx], ymm0
	vmovapd ymmword ptr [rdi], ymm0
	vzeroupper
	ret

Reduce all:

pub unsafe fn mask_all(m: mask8x32) -> bool {
    m.all()
}

Before:

portable_simd_test::mask_all:
	vmovdqa ymm0, ymmword ptr [rdi]
	vpsllw ymm0, ymm0, 7
	vpmovmskb eax, ymm0
	cmp eax, -1
	sete al
	vzeroupper
	ret

After:

portable_simd_test::mask_all:
	vmovdqa ymm0, ymmword ptr [rdi]
	vpmovmskb eax, ymm0
	cmp eax, -1
	sete al
	vzeroupper
	ret

On a bigger scale, this also improves the performance of masked aggregation kernels in an experimental branch of the apache/arrow-rs project where those kernels are migrated from packed_simd to core_simd.

Opening as a draft for now because so far this is only tested for x86_64 targets, and because I'm a bit unsure of how to add or extend unit tests for these intrinsics.

jhorstmann avatar Nov 21 '22 21:11 jhorstmann

r? @davidtwco

(rustbot has picked a reviewer for you, use r? to override)

rustbot avatar Nov 21 '22 21:11 rustbot

Does this affect codegen for non-x86 platforms?

workingjubilee avatar Nov 22 '22 10:11 workingjubilee

You may want to study the other simd intrinsic tests and also the assembly or codegen tests.

workingjubilee avatar Nov 22 '22 10:11 workingjubilee

> Does this affect codegen for non-x86 platforms?

Seems to have a very similar effect for target aarch64-unknown-linux-gnu. Example with simple select (using 128 bit registers now):

pub fn select(m: mask32x4, a: i32x4, b: i32x4) -> i32x4 {
    m.select(a, b)
}

Before

portable_simd_test::select:
	ldr q1, [x0]
	ldr q0, [x1]
	ldr q2, [x2]
	shl v1.4s, v1.4s, #31
	cmlt v1.4s, v1.4s, #0
	bif v0.16b, v2.16b, v1.16b
	str q0, [x8]
	ret

After

portable_simd_test::select:
	ldr q0, [x0]
	ldr q1, [x1]
	ldr q2, [x2]
	cmlt v0.4s, v0.4s, #0
	bsl v0.16b, v1.16b, v2.16b
	str q0, [x8]
	ret

Interestingly, there is a big effect on the `all` reduction in the following example:

pub unsafe fn mask_all(m: mask8x16) -> bool {
    m.all()
}

Before

portable_simd_test::mask_all:
	.cfi_startproc
	sub sp, sp, #16
	.cfi_def_cfa_offset 16

	ldr q0, [x0]

	mov w8, #65535

	umov w9, v0.b[1]
	umov w11, v0.b[2]
	umov w10, v0.b[0]
	umov w12, v0.b[3]
	umov w13, v0.b[4]
	umov w14, v0.b[5]
	and w9, w9, #0x1
	and w11, w11, #0x1
	and w10, w10, #0x1
	and w12, w12, #0x1
	and w13, w13, #0x1
	and w14, w14, #0x1
	bfi w10, w9, #1, #1
	umov w9, v0.b[6]
	bfi w10, w11, #2, #1
	umov w11, v0.b[7]
	bfi w10, w12, #3, #1
	umov w12, v0.b[8]
	bfi w10, w13, #4, #1
	umov w13, v0.b[9]
	and w9, w9, #0x1
	bfi w10, w14, #5, #1
	umov w14, v0.b[10]
	and w11, w11, #0x1
	orr w9, w10, w9, lsl #6
	umov w10, v0.b[11]
	and w12, w12, #0x1
	orr w9, w9, w11, lsl #7
	umov w11, v0.b[12]
	and w13, w13, #0x1
	orr w9, w9, w12, lsl #8
	umov w12, v0.b[13]
	and w14, w14, #0x1
	orr w9, w9, w13, lsl #9
	umov w13, v0.b[14]
	and w10, w10, #0x1
	orr w9, w9, w14, lsl #10
	and w11, w11, #0x1
	orr w9, w9, w10, lsl #11
	and w10, w12, #0x1
	umov w12, v0.b[15]
	orr w9, w9, w11, lsl #12
	and w11, w13, #0x1
	orr w9, w9, w10, lsl #13
	orr w9, w9, w11, lsl #14
	orr w9, w9, w12, lsl #15
	bics wzr, w8, w9
	cset w0, eq

	add sp, sp, #16
	.cfi_def_cfa_offset 0
	ret

After

portable_simd_test::mask_all:
	movi v0.2d, #0xffffffffffffffff
	ldr q1, [x0]
	cmgt v0.16b, v1.16b, v0.16b
	umaxv b0, v0.16b
	fmov w8, s0
	mvn w8, w8
	and w0, w8, #0x1
	ret

But this does not seem to lead to an improvement for the is_hex function from rust-lang/portable-simd#303.

I haven't checked any other target platforms yet. Thanks for the pointer to the tests, I'll have a look at those.

jhorstmann avatar Nov 22 '22 20:11 jhorstmann

Cool! That's weird, but also good to know.

workingjubilee avatar Nov 22 '22 21:11 workingjubilee

That is_hex function from rust-lang/portable-simd#303 is so close to getting vectorized though.

pub fn is_hex_mask(chunk: &[u8; 16]) -> bool {
    let x = u8x16::from_array(*chunk);
    let m1 = x.simd_gt(splat(b'0' - 1));
    let m2 = x.simd_lt(splat(b'9' + 1));
    let m3 = x.simd_gt(splat(b'a' - 1));
    let m4 = x.simd_lt(splat(b'f' + 1));
    let m = (m1 & m2) | (m3 & m4);
    m.all()
}

With the changes from this PR, changing that last line to either one of the following alternatives gets the function vectorized:

    (m.to_int().simd_ne(splat(0))).all()
    !((!m).any())

And maybe the weirdest one

    fn mask_all(m: mask8x16) -> bool {
        m.all()
    }

    mask_all(m)

All these seem to help llvm in knowing that the bit pattern is all ones/zeros.

Maybe this is less of a problem on x86 since LLVM knows which of the mask instructions only use the high bit, so the pattern of "shift to lowest, trunc to i1, i1 vector op" allows it to directly use the high bit. On aarch64, by contrast, the operations require a full mask, but with the shift this mask can be created with a simpler vector comparison.

jhorstmann avatar Nov 23 '22 22:11 jhorstmann

r? wg-llvm

davidtwco avatar Jan 07 '23 14:01 davidtwco

:umbrella: The latest upstream changes (presumably #106573) made this pull request unmergeable. Please resolve the merge conflicts.

bors avatar Jan 08 '23 02:01 bors

That was an interesting rebase, with the tests having moved in the meantime. Sorry for taking so long coming back to this PR.

jhorstmann avatar Feb 28 '23 20:02 jhorstmann

Nagisa said they'll try to get to this soon. In the meantime, my experience is that these kinds of PRs are very prone to having something break during the cross-platform builds, so let's

@bors try

workingjubilee avatar Mar 23 '23 19:03 workingjubilee

:hourglass: Trying commit 9dd93913af8897728bead361c713ce87b961c748 with merge 8e6cd1d9198b904379afff0d143d7d037b710b61...

bors avatar Mar 23 '23 19:03 bors

:sunny: Try build successful - checks-actions Build commit: 8e6cd1d9198b904379afff0d143d7d037b710b61 (8e6cd1d9198b904379afff0d143d7d037b710b61)

bors avatar Mar 23 '23 20:03 bors

Oh hm that doesn't run the full CI (anymore?) (did it ever?), I should probably just finally start up an aarch64 machine and test this on that then.

workingjubilee avatar Mar 23 '23 21:03 workingjubilee

Since we already have a try build, it wouldn’t hurt to have a @rust-timer queue

nagisa avatar Mar 29 '23 19:03 nagisa

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

rust-timer avatar Mar 29 '23 19:03 rust-timer

Switching to waiting on author as it seems there has been a review. @jhorstmann Feel free to request a review with @rustbot ready, thanks!

@rustbot author

apiraino avatar May 03 '23 16:05 apiraino

r? rust-lang/compiler

nagisa avatar Jun 29 '23 22:06 nagisa

The job x86_64-gnu-llvm-14 failed! Check out the build log: (web) (plain)

Click to see the possible cause of the failure (guessed by this bot)
..........................................iiii..ii.iii...................

failures:

---- [assembly] tests/assembly/simd-intrinsic-mask-reduce.rs#aarch64 stdout ----

error in revision `aarch64`: verification with 'FileCheck' failed
status: exit status: 1
command: "/usr/lib/llvm-14/bin/FileCheck" "--input-file" "/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s" "/checkout/tests/assembly/simd-intrinsic-mask-reduce.rs" "--allow-unused-prefixes" "--check-prefixes" "CHECK,NONMSVC,aarch64" "--dump-input-context" "100"
stdout: none
--- stderr -------------------------------
/checkout/tests/assembly/simd-intrinsic-mask-reduce.rs:35:19: error: aarch64-NEXT: expected string not found in input
 // aarch64-NEXT: umaxv
                  ^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:13:6: note: scanning from here
 cmgt v0.16b, v1.16b, v0.16b
     ^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:14:2: note: possible intended match here
 umov w8, v0.b[1]
 ^
/checkout/tests/assembly/simd-intrinsic-mask-reduce.rs:45:19: error: aarch64-NEXT: expected string not found in input
 // aarch64-NEXT: umaxv
                  ^
                  ^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:77:6: note: scanning from here
 cmlt v0.16b, v0.16b, #0
     ^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s:78:2: note: possible intended match here
 umov w8, v0.b[1]


Input file: /checkout/obj/build/x86_64-unknown-linux-gnu/test/assembly/simd-intrinsic-mask-reduce.aarch64/simd-intrinsic-mask-reduce.s
Check file: /checkout/tests/assembly/simd-intrinsic-mask-reduce.rs

-dump-input=help explains the following input dump.
Input was:
<<<<<<
           1:  .text 
           1:  .text 
           2:  .file "simd_intrinsic_mask_reduce.ced9cd37d82699d5-cgu.0" 
           3:  .section .text.mask_reduce_all,"ax",@progbits 
           4:  .globl mask_reduce_all 
           5:  .p2align 2 
           6:  .type mask_reduce_all,@function 
           8:  .cfi_startproc 
           9:  sub sp, sp, #16 
           9:  sub sp, sp, #16 
          10:  .cfi_def_cfa_offset 16 
          11:  movi v0.2d, #0xffffffffffffffff 
          12:  ldr q1, [x0] 
          13:  cmgt v0.16b, v1.16b, v0.16b 
next:35'0          X~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
          14:  umov w8, v0.b[1] 
next:35'0     ~~~~~~~~~~~~~~~~~~
next:35'1      ?                 possible intended match
          15:  umov w10, v0.b[2] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          16:  umov w9, v0.b[0] 
next:35'0     ~~~~~~~~~~~~~~~~~~
          17:  umov w11, v0.b[3] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          18:  umov w12, v0.b[4] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          19:  umov w13, v0.b[5] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          20:  and w8, w8, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~
          21:  and w10, w10, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          22:  and w9, w9, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~
          23:  and w11, w11, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          24:  and w12, w12, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          25:  and w13, w13, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          26:  bfi w9, w8, #1, #1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          27:  umov w8, v0.b[6] 
next:35'0     ~~~~~~~~~~~~~~~~~~
          28:  bfi w9, w10, #2, #1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~
          29:  umov w10, v0.b[7] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          30:  bfi w9, w11, #3, #1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~
          31:  umov w11, v0.b[8] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          32:  bfi w9, w12, #4, #1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~
          33:  umov w12, v0.b[9] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          34:  and w8, w8, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~
          35:  bfi w9, w13, #5, #1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~
          36:  umov w13, v0.b[10] 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          37:  and w10, w10, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          38:  orr w8, w9, w8, lsl #6 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~
          39:  umov w9, v0.b[11] 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          40:  and w11, w11, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          41:  orr w8, w8, w10, lsl #7 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
          42:  umov w10, v0.b[12] 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          43:  and w12, w12, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          44:  orr w8, w8, w11, lsl #8 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
          45:  umov w11, v0.b[13] 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          46:  and w13, w13, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          47:  orr w8, w8, w12, lsl #9 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
          48:  umov w12, v0.b[14] 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          49:  and w9, w9, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~
          50:  orr w8, w8, w13, lsl #10 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
          51:  and w10, w10, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          52:  orr w8, w8, w9, lsl #11 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
          53:  and w9, w11, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~
          54:  umov w11, v0.b[15] 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          55:  orr w8, w8, w10, lsl #12 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
          56:  and w10, w12, #0x1 
next:35'0     ~~~~~~~~~~~~~~~~~~~~
          57:  orr w8, w8, w9, lsl #13 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
          58:  orr w8, w8, w10, lsl #14 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
          59:  orr w8, w8, w11, lsl #15 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
          60:  tst w8, #0xffff 
next:35'0     ~~~~~~~~~~~~~~~~~
          61:  cset w0, eq 
next:35'0     ~~~~~~~~~~~~~
          62:  add sp, sp, #16 
next:35'0     ~~~~~~~~~~~~~~~~~
          63:  ret 
next:35'0     ~~~~~
          64: .Lfunc_end0: 
next:35'0     ~~~~~~~~~~~~~
          65:  .size mask_reduce_all, .Lfunc_end0-mask_reduce_all 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          66:  .cfi_endproc 
next:35'0     ~~~~~~~~~~~~~~
          67:  
next:35'0     ~
          68:  .section .text.mask_reduce_any,"ax",@progbits 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          69:  .globl mask_reduce_any 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~
          70:  .p2align 2 
next:35'0     ~~~~~~~~~~~~
          71:  .type mask_reduce_any,@function 
next:35'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          72: mask_reduce_any: 
next:35'0     ~~~~~~~~~~~~~~~~
          73:  .cfi_startproc 
          74:  sub sp, sp, #16 
          75:  .cfi_def_cfa_offset 16 
          76:  ldr q0, [x0] 
          77:  cmlt v0.16b, v0.16b, #0 
next:45'0          X~~~~~~~~~~~~~~~~~~~ error: no match found
          78:  umov w8, v0.b[1] 
next:45'0     ~~~~~~~~~~~~~~~~~~
next:45'1      ?                 possible intended match
          79:  umov w10, v0.b[2] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
          80:  umov w9, v0.b[0] 
next:45'0     ~~~~~~~~~~~~~~~~~~
          81:  umov w11, v0.b[3] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
          82:  umov w12, v0.b[4] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
          83:  umov w13, v0.b[5] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
          84:  and w8, w8, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~
          85:  and w10, w10, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
          86:  and w9, w9, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~
          87:  and w11, w11, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
          88:  and w12, w12, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
          89:  and w13, w13, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
          90:  bfi w9, w8, #1, #1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
          91:  umov w8, v0.b[6] 
next:45'0     ~~~~~~~~~~~~~~~~~~
          92:  bfi w9, w10, #2, #1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~
          93:  umov w10, v0.b[7] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
          94:  bfi w9, w11, #3, #1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~
          95:  umov w11, v0.b[8] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
          96:  bfi w9, w12, #4, #1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~
          97:  umov w12, v0.b[9] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
          98:  and w8, w8, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~
          99:  bfi w9, w13, #5, #1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~
         100:  umov w13, v0.b[10] 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         101:  and w10, w10, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         102:  orr w8, w9, w8, lsl #6 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~
         103:  umov w9, v0.b[11] 
next:45'0     ~~~~~~~~~~~~~~~~~~~
         104:  and w11, w11, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         105:  orr w8, w8, w10, lsl #7 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
         106:  umov w10, v0.b[12] 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         107:  and w12, w12, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         108:  orr w8, w8, w11, lsl #8 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
         109:  umov w11, v0.b[13] 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         110:  and w13, w13, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         111:  orr w8, w8, w12, lsl #9 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
         112:  umov w12, v0.b[14] 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         113:  and w9, w9, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~
         114:  orr w8, w8, w13, lsl #10 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
         115:  and w10, w10, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         116:  orr w8, w8, w9, lsl #11 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
         117:  and w9, w11, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~
         118:  umov w11, v0.b[15] 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         119:  orr w8, w8, w10, lsl #12 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
         120:  and w10, w12, #0x1 
next:45'0     ~~~~~~~~~~~~~~~~~~~~
         121:  orr w8, w8, w9, lsl #13 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~
         122:  orr w8, w8, w10, lsl #14 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
         123:  orr w8, w8, w11, lsl #15 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
         124:  tst w8, #0xffff 
next:45'0     ~~~~~~~~~~~~~~~~~
         125:  cset w0, ne 
next:45'0     ~~~~~~~~~~~~~
         126:  add sp, sp, #16 
next:45'0     ~~~~~~~~~~~~~~~~~
         127:  ret 
next:45'0     ~~~~~
         128: .Lfunc_end1: 
next:45'0     ~~~~~~~~~~~~~
         129:  .size mask_reduce_any, .Lfunc_end1-mask_reduce_any 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         130:  .cfi_endproc 
next:45'0     ~~~~~~~~~~~~~~
         131:  
next:45'0     ~
         132:  .section ".note.GNU-stack","",@progbits 
next:45'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------------------




failures:
    [assembly] tests/assembly/simd-intrinsic-mask-reduce.rs#aarch64
test result: FAILED. 131 passed; 1 failed; 29 ignored; 0 measured; 0 filtered out; finished in 530.67ms

Some tests failed in compiletest suite=assembly mode=assembly host=x86_64-unknown-linux-gnu target=x86_64-unknown-linux-gnu

rust-log-analyzer avatar Jun 30 '23 10:06 rust-log-analyzer

Sorry, I'm not able to review this. r? compiler

cjgillot avatar Jul 04 '23 19:07 cjgillot

r? compiler

b-naber avatar Jul 05 '23 20:07 b-naber

r? compiler

fee1-dead avatar Jul 05 '23 23:07 fee1-dead

Perhaps we need a T-codegen.

workingjubilee avatar Jul 05 '23 23:07 workingjubilee

Let's nominate it for the compiler meeting to find a reviewer.

Nilstrieb avatar Jul 06 '23 04:07 Nilstrieb

r? @wesleywiser

I will put this on my TODO list to review

wesleywiser avatar Jul 06 '23 14:07 wesleywiser

:umbrella: The latest upstream changes (presumably #114148) made this pull request unmergeable. Please resolve the merge conflicts.

bors avatar Jul 29 '23 16:07 bors

The job x86_64-gnu-llvm-15 failed! Check out the build log: (web) (plain)

Click to see the possible cause of the failure (guessed by this bot)
##[group]Run git config --global core.autocrlf false
git config --global core.autocrlf false
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
  lfs: false
  submodules: false
  set-safe-directory: true
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/setup-environment.sh
src/ci/scripts/setup-environment.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/should-skip-this.sh
src/ci/scripts/should-skip-this.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/verify-channel.sh
src/ci/scripts/verify-channel.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/collect-cpu-stats.sh
src/ci/scripts/collect-cpu-stats.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/install-sccache.sh
src/ci/scripts/install-sccache.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/select-xcode.sh
src/ci/scripts/select-xcode.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-clang.sh
src/ci/scripts/install-clang.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-wix.sh
src/ci/scripts/install-wix.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/disable-git-crlf-conversion.sh
src/ci/scripts/disable-git-crlf-conversion.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/install-msys2.sh
src/ci/scripts/install-msys2.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-mingw.sh
src/ci/scripts/install-mingw.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/install-ninja.sh
src/ci/scripts/install-ninja.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/enable-docker-ipv6.sh
src/ci/scripts/enable-docker-ipv6.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/disable-git-crlf-conversion.sh
src/ci/scripts/disable-git-crlf-conversion.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/verify-line-endings.sh
src/ci/scripts/verify-line-endings.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
  CACHE_DOMAIN: ci-caches.rust-lang.org
  IMAGE: x86_64-gnu-llvm-15
##[endgroup]
##[group]Run src/ci/scripts/verify-backported-commits.sh
src/ci/scripts/verify-backported-commits.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/verify-stable-version-number.sh
src/ci/scripts/verify-stable-version-number.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
##[group]Run src/ci/scripts/run-build-from-ci.sh
src/ci/scripts/run-build-from-ci.sh
shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
env:
  PR_CI_JOB: 1
  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
  HEAD_SHA: f1f03f5dedf38630d7a1a9838d238146ec7c6054
  SCCACHE_BUCKET: rust-lang-ci-sccache2
  TOOLSTATE_REPO: https://github.com/rust-lang-nursery/rust-toolstate
---
failures:

---- [codegen] tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs stdout ----

error: verification with 'FileCheck' failed
status: exit status: 1
command: "/usr/lib/llvm-15/bin/FileCheck" "--input-file" "/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll" "/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs" "--allow-unused-prefixes" "--check-prefixes" "CHECK,NONMSVC" "--dump-input-context" "100"
--- stderr -------------------------------
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:32:12: error: CHECK: expected string not found in input
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:32:12: error: CHECK: expected string not found in input
 // CHECK: [[A:%[0-9]+]] = lshr <4 x i8> %{{m|1}}, <i8 7, i8 7, i8 7, i8 7>
Build completed unsuccessfully in 0:11:05
Build completed unsuccessfully in 0:11:05
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:7:23: note: scanning from here
define void @select_m8(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 4 dereferenceable(4) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 {
                      ^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:12:2: note: possible intended match here
 %3 = lshr <4 x i8> %0, <i8 7, i8 7, i8 7, i8 7>
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:41:12: error: CHECK: expected string not found in input
/checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs:41:12: error: CHECK: expected string not found in input
 // CHECK: [[A:%[0-9]+]] = lshr <4 x i32> %{{m|1}}, <i32 31, i32 31, i32 31, i32 31>
           ^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:20:24: note: scanning from here
define void @select_m32(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 16 dereferenceable(16) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 {
                       ^
/checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll:25:2: note: possible intended match here
 %3 = lshr <4 x i32> %0, <i32 31, i32 31, i32 31, i32 31>

Input file: /checkout/obj/build/x86_64-unknown-linux-gnu/test/codegen/simd-intrinsic/simd-intrinsic-generic-select/simd-intrinsic-generic-select.ll
Check file: /checkout/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs


-dump-input=help explains the following input dump.
Input was:
<<<<<<
<<<<<<
            1: ; ModuleID = 'simd_intrinsic_generic_select.1439860ecd8c1a90-cgu.0' 
            2: source_filename = "simd_intrinsic_generic_select.1439860ecd8c1a90-cgu.0" 
            3: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" 
            4: target triple = "x86_64-unknown-linux-gnu" 
            5:  
            6: ; Function Attrs: nonlazybind uwtable 
            7: define void @select_m8(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 4 dereferenceable(4) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 { 
check:32'0                           X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
            8: start: 
check:32'0     ~~~~~~~
            9:  %0 = load <4 x i8>, ptr %m, align 4 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           10:  %1 = load <4 x float>, ptr %a, align 16 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           11:  %2 = load <4 x float>, ptr %b, align 16 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           12:  %3 = lshr <4 x i8> %0, <i8 7, i8 7, i8 7, i8 7> 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check:32'1      ?                                                possible intended match
           13:  %4 = trunc <4 x i8> %3 to <4 x i1> 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           14:  %5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %2 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           15:  store <4 x float> %5, ptr %_0, align 16 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           16:  ret void 
check:32'0     ~~~~~~~~~~
           17: } 
check:32'0     ~~
           18:  
check:32'0     ~
           19: ; Function Attrs: nonlazybind uwtable 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           20: define void @select_m32(ptr noalias nocapture noundef sret(<4 x float>) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 16 dereferenceable(16) %m, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b) unnamed_addr #0 { 
check:32'0     ~~~~~~~~~~~~~~~~~~~~~~~
check:41'0                            X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
           21: start: 
check:41'0     ~~~~~~~
           22:  %0 = load <4 x i32>, ptr %m, align 16 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           23:  %1 = load <4 x float>, ptr %a, align 16 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           24:  %2 = load <4 x float>, ptr %b, align 16 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           25:  %3 = lshr <4 x i32> %0, <i32 31, i32 31, i32 31, i32 31> 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check:41'1      ?                                                         possible intended match
           26:  %4 = trunc <4 x i32> %3 to <4 x i1> 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           27:  %5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %2 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           28:  store <4 x float> %5, ptr %_0, align 16 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           29:  ret void 
check:41'0     ~~~~~~~~~~
           30: } 
check:41'0     ~~
           31:  
check:41'0     ~
           32: ; Function Attrs: nonlazybind uwtable 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           33: define void @select_bitmask(ptr noalias nocapture noundef sret(<8 x float>) align 32 dereferenceable(32) %_0, i8 noundef %m, ptr noalias nocapture noundef align 32 dereferenceable(32) %a, ptr noalias nocapture noundef align 32 dereferenceable(32) %b) unnamed_addr #0 { 
check:41'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~
           34: start: 
           35:  %0 = load <8 x float>, ptr %a, align 32 
           36:  %1 = load <8 x float>, ptr %b, align 32 
           37:  %2 = bitcast i8 %m to <8 x i1> 
           38:  %3 = select <8 x i1> %2, <8 x float> %0, <8 x float> %1 
           39:  store <8 x float> %3, ptr %_0, align 32 
           40:  ret void 
           41: } 
           42:  
           43: attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" } 
           44:  
           45: !llvm.module.flags = !{!0, !1} 
           46: !llvm.ident = !{!2} 
           47:  
           48: !0 = !{i32 7, !"PIC Level", i32 2} 
           49: !1 = !{i32 2, !"RtLibUseGOT", i32 1} 
           50: !2 = !{!"rustc version 1.73.0-nightly (f9a7e8786 2023-07-30)"} 
------------------------------------------



rust-log-analyzer avatar Jul 30 '23 15:07 rust-log-analyzer

:umbrella: The latest upstream changes (presumably #105545) made this pull request unmergeable. Please resolve the merge conflicts.

bors avatar Aug 01 '23 21:08 bors

:umbrella: The latest upstream changes (presumably #117444) made this pull request unmergeable. Please resolve the merge conflicts.

bors avatar Oct 31 '23 15:10 bors

@jhorstmann any updates on this? thanks

Dylan-DPC avatar Feb 05 '24 13:02 Dylan-DPC

@Dylan-DPC I updated the PR and also adjusted the new masked load/store intrinsics to use the same logic. I also added another assembly test for masked load, which shows that there are no unneeded shift instructions in the output. For comparison, the current output (without this change) still contains the extra shift:


load_f64x4:
        vpsllq  ymm0, ymmword ptr [rdi], 63
        vpmovq2m        k1, ymm0
        vmovupd ymm0 {k1} {z}, ymmword ptr [rsi]
        vmovapd ymmword ptr [rdx], ymm0
        vzeroupper
        ret

(https://rust.godbolt.org/z/ThTz6E39s)

jhorstmann avatar Mar 01 '24 19:03 jhorstmann