[X86] lowerBuildVectorToBitOp - handle cases where either side folds to vector load
lowerBuildVectorToBitOp is currently limited to cases where every build_vector element is the same shift/bitlogic operation and all of the RHS operands are constant - the idea being to only vectorize the operation if doing so does not increase the number of GPR->FPU moves.
This patch relaxes the constant requirement to also allow cases where all of the LHS operands or all of the RHS operands can be folded to a single vector load, which similarly avoids extra GPR->FPU traffic - typically these appear late, from consecutive loads from the stack or similar.
Fixes #163788
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes
lowerBuildVectorToBitOp is currently limited to cases where every build_vector element is the same shift/bitlogic operation and all of the RHS operands are constant - the idea being to only vectorize the operation if doing so does not increase the number of GPR->FPU moves.
This patch relaxes the constant requirement to also allow cases where all of the LHS operands or all of the RHS operands can be folded to a single vector load, which similarly avoids extra GPR->FPU traffic - typically these appear late, from consecutive loads from the stack or similar.
Fixes #163788
Full diff: https://github.com/llvm/llvm-project/pull/172467.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+40-19)
- (modified) llvm/test/CodeGen/X86/setcc-wide-types.ll (+39-67)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4ac51408719bb..1e17c67beb8da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8864,6 +8864,7 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
+ unsigned ElemSize = VT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Check that all elements have the same opcode.
@@ -8873,7 +8874,7 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
if (Opcode != Op->getOperand(i).getOpcode())
return SDValue();
- // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+ // TODO: We may be able to add support for other Ops (e.g. ADD/SUB).
bool IsShift = false;
switch (Opcode) {
default:
@@ -8895,34 +8896,54 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
break;
}
+ // Collect elements.
+ bool RHSAllConst = true;
SmallVector<SDValue, 4> LHSElts, RHSElts;
for (SDValue Elt : Op->ops()) {
SDValue LHS = Elt.getOperand(0);
SDValue RHS = Elt.getOperand(1);
+ RHSAllConst &= isa<ConstantSDNode>(RHS);
+ LHSElts.push_back(LHS);
+ RHSElts.push_back(RHS);
+ }
+ // Canonicalize shift amounts.
+ if (IsShift) {
// We expect the canonicalized RHS operand to be the constant.
- if (!isa<ConstantSDNode>(RHS))
+ // TODO: Permit non-constant XOP/AVX2 cases?
+ if (!RHSAllConst)
return SDValue();
// Extend shift amounts.
- if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
- if (!IsShift)
- return SDValue();
- RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
- }
-
- LHSElts.push_back(LHS);
- RHSElts.push_back(RHS);
+ for (SDValue &Op1 : RHSElts)
+ if (Op1.getValueSizeInBits() != ElemSize)
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, VT.getScalarType());
+
+ // Limit to shifts by uniform immediates.
+ // TODO: Only accept vXi8/vXi64 special cases?
+ // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
+ if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
+ return SDValue();
}
-
- // Limit to shifts by uniform immediates.
- // TODO: Only accept vXi8/vXi64 special cases?
- // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
- if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
- return SDValue();
-
- SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
- SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
+ assert(all_of(llvm::concat<SDValue>(LHSElts, RHSElts),
+ [ElemSize](SDValue V) {
+ return V.getValueSizeInBits() == ElemSize;
+ }) &&
+ "Element size mismatch");
+
+ // To avoid an increase in GPR->FPU instructions, LHS/RHS must be foldable as
+ // a load or RHS must be constant.
+ SDValue LHS = EltsFromConsecutiveLoads(VT, LHSElts, DL, DAG, Subtarget,
+ /*IsAfterLegalize=*/true);
+ SDValue RHS = EltsFromConsecutiveLoads(VT, RHSElts, DL, DAG, Subtarget,
+ /*IsAfterLegalize=*/true);
+ if (!LHS && !RHS && !RHSAllConst)
+ return SDValue();
+
+ if (!LHS)
+ LHS = DAG.getBuildVector(VT, DL, LHSElts);
+ if (!RHS)
+ RHS = DAG.getBuildVector(VT, DL, RHSElts);
SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
if (!IsShift)
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index d27b032058bc7..1e53dc01ed168 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -327,66 +327,50 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
define i1 @ne_v4i256(<4 x i256> %a0) {
; SSE2-LABEL: ne_v4i256:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: movq %r10, %xmm0
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq %rdx, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r9
; SSE2-NEXT: movq %r9, %xmm0
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r8
; SSE2-NEXT: movq %r8, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: por {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: movq %rsi, %xmm0
-; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movq %rdi, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT: movmskps %xmm0, %eax
+; SSE2-NEXT: movq %rdi, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: por {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: movq %rdx, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: por {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: ne_v4i256:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; SSE41-NEXT: movq %r10, %xmm0
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT: movq %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movq %rcx, %xmm0
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movq %rdx, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r9
; SSE41-NEXT: movq %r9, %xmm0
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r8
; SSE41-NEXT: movq %r8, %xmm1
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rsi
+; SSE41-NEXT: por {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT: movq %rsi, %xmm0
-; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdi
-; SSE41-NEXT: movq %rdi, %xmm3
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE41-NEXT: por %xmm1, %xmm3
-; SSE41-NEXT: por %xmm2, %xmm3
-; SSE41-NEXT: ptest %xmm3, %xmm3
+; SSE41-NEXT: movq %rdi, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE41-NEXT: por {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: movq %rdx, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: por {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
@@ -448,31 +432,19 @@ define i1 @ne_v4i256(<4 x i256> %a0) {
;
; AVX512-LABEL: ne_v4i256:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovq %r10, %xmm0
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: vmovq %rdx, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT: vmovq %r9, %xmm1
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: vmovq %r8, %xmm2
+; AVX512-NEXT: vmovq %rsi, %xmm1
+; AVX512-NEXT: vmovq %rdi, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vmovq %r9, %xmm1
+; AVX512-NEXT: vmovq %r8, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: vmovq %rsi, %xmm2
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: vmovq %rdi, %xmm3
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vporq {{[0-9]+}}(%rsp), %zmm0, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: sete %al
LLVM Buildbot has detected a new failure on builder openmp-s390x-linux running on systemz-1 while building llvm at step 6 "test-openmp".
Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/19110
Here is the relevant piece of the build log for reference:
Step 6 (test-openmp) failure: test (failure)
******************** TEST 'libomp :: tasking/issue-94260-2.c' FAILED ********************
Exit Code: -11
Command Output (stdout):
--
# RUN: at line 1
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic && /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11
--
********************