
[RISCV] Extend selectSHXADDOp to handle an additional pattern

Open asb opened this pull request 1 month ago • 1 comment

Transform (and (shl x, c1), c2) -> (shl (and x, c2 >> c1), c1) as long as the shifted mask fits into a 12-bit immediate. This allows a shxadd to be selected when it wasn't before.

This was found in SPEC, though I will note that the dynamic execution count improvements are not particularly large (the best we get is -0.04% on 531.deepsjeng_r). The diffstat on the generated assembly for an RVA22 SPEC build is 2086 insertions, 2345 deletions.

Although the impact is small on these benchmarks, I'm choosing to propose this because it does represent a case where shxadd can 'obviously' be selected without anything particularly heroic.
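
As a quick illustration (not part of the patch), here is a minimal standalone C++ sketch of the bit identity the rewrite relies on. The shift amount and mask (2 and 4092) match the new sh2add_masked test; the input value is arbitrary.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x12345678; // arbitrary input value
  // Original form, as in the sh2add_masked test: shift by 2, then mask with 4092.
  uint64_t masked_after_shift = (x << 2) & 4092;
  // Rewritten form: pre-mask with 4092 >> 2 == 1023 (which fits an ANDI
  // immediate), then shift. The low 2 bits of (x << 2) are already zero,
  // so both forms compute the same value.
  uint64_t masked_before_shift = (x & (4092 >> 2)) << 2;
  assert(masked_after_shift == masked_before_shift);
  return 0;
}
```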

asb · Dec 17 '25 12:12

@llvm/pr-subscribers-backend-risc-v

Author: Alex Bradbury (asb)

Changes

Transform (and (shl x, c1), c2) -> (shl (and x, c2 >> c1), c1) as long as the shifted mask fits into a 12-bit immediate. This allows a shxadd to be selected when it wasn't before.

This was found in SPEC, though I will note that the dynamic execution count improvements are not particularly large (the best we get is -0.04% on 531.deepsjeng_r). The diffstat on the generated assembly for an RVA22 SPEC build is 2086 insertions, 2345 deletions.

Although the impact is small on these benchmarks, I'm choosing to propose this because it does represent a case where shxadd can 'obviously' be selected without anything particularly heroic.
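
For reference, a minimal standalone sketch of the 12-bit immediate constraint that gates the transform. The fitsSImm12 helper below is hypothetical and only mirrors the isInt<12> check used in the patch; the mask/shift constants come from the new tests.

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors (but does not reuse) llvm::isInt<12>: ANDI takes a sign-extended
// 12-bit immediate, i.e. values in [-2048, 2047].
static bool fitsSImm12(int64_t v) { return v >= -2048 && v <= 2047; }

int main() {
  // sh2add_masked: mask 4092, shift 2 -> pre-shift mask 1023 fits, so the
  // transform fires and ANDI + sh2add can be selected.
  std::printf("4092 >> 2 = %lld, fits: %d\n", (long long)(4092 >> 2),
              fitsSImm12(4092 >> 2));
  // sh1add_large_mask: mask 4096, shift 1 -> pre-shift mask 2048 is out of
  // range, so the transform is skipped (matching the negative test).
  std::printf("4096 >> 1 = %lld, fits: %d\n", (long long)(4096 >> 1),
              fitsSImm12(4096 >> 1));
  return 0;
}
```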


Full diff: https://github.com/llvm/llvm-project/pull/172641.diff

3 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp (+18)
  • (modified) llvm/test/CodeGen/RISCV/rv32zba.ll (+96)
  • (modified) llvm/test/CodeGen/RISCV/rv64zba.ll (+96)
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index b6b716be35c3e..0154c93cc65d1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3729,6 +3729,24 @@ bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt,
   if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
     SDValue N0 = N.getOperand(0);
 
+    // Transform (and (shl x, c1), c2) -> (shl (and x, c2 >> c1), c1) as long
+    // as the shifted mask fits into a 12-bit immediate.
+    if (N0.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N0.getOperand(1)) &&
+        N0.getConstantOperandVal(1) == ShAmt) {
+      uint64_t Mask = N.getConstantOperandVal(1);
+      uint64_t PreShiftMask = Mask >> ShAmt;
+
+      if (isInt<12>(PreShiftMask)) {
+        SDLoc DL(N);
+        EVT VT = N.getValueType();
+        Val = SDValue(CurDAG->getMachineNode(
+                          RISCV::ANDI, DL, VT, N0.getOperand(0),
+                          CurDAG->getTargetConstant(PreShiftMask, DL, VT)),
+                      0);
+        return true;
+      }
+    }
+
     if (bool LeftShift = N0.getOpcode() == ISD::SHL;
         (LeftShift || N0.getOpcode() == ISD::SRL) &&
         isa<ConstantSDNode>(N0.getOperand(1))) {
diff --git a/llvm/test/CodeGen/RISCV/rv32zba.ll b/llvm/test/CodeGen/RISCV/rv32zba.ll
index ea9d117f2e2e3..53aa970f85eb5 100644
--- a/llvm/test/CodeGen/RISCV/rv32zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zba.ll
@@ -1300,3 +1300,99 @@ define ptr @shl_add_knownbits(ptr %p, i32 %i) {
   %r = getelementptr i8, ptr %p, i32 %shr
   ret ptr %r
 }
+
+; The shxadd_masked tests cover:
+;    (add (and (shl x, c1), c2), y)
+; -> (shXadd (and x, c2 >> c1), y)
+; i.e. shift left and then mask, so that shxadd can be selected.
+
+define i32 @sh1add_masked(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sh1add_masked:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srli a0, a0, 23
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBA-LABEL: sh1add_masked:
+; RV32ZBA:       # %bb.0:
+; RV32ZBA-NEXT:    zext.b a0, a0
+; RV32ZBA-NEXT:    sh1add a0, a0, a1
+; RV32ZBA-NEXT:    ret
+;
+; RV32XANDESPERF-LABEL: sh1add_masked:
+; RV32XANDESPERF:       # %bb.0:
+; RV32XANDESPERF-NEXT:    zext.b a0, a0
+; RV32XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV32XANDESPERF-NEXT:    ret
+  %shl = shl i32 %a, 1
+  %and = and i32 %shl, 510
+  %add = add i32 %and, %b
+  ret i32 %add
+}
+
+define i32 @sh2add_masked(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sh2add_masked:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a0, a0, 22
+; RV32I-NEXT:    srli a0, a0, 20
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBA-LABEL: sh2add_masked:
+; RV32ZBA:       # %bb.0:
+; RV32ZBA-NEXT:    andi a0, a0, 1023
+; RV32ZBA-NEXT:    sh2add a0, a0, a1
+; RV32ZBA-NEXT:    ret
+;
+; RV32XANDESPERF-LABEL: sh2add_masked:
+; RV32XANDESPERF:       # %bb.0:
+; RV32XANDESPERF-NEXT:    andi a0, a0, 1023
+; RV32XANDESPERF-NEXT:    nds.lea.w a0, a1, a0
+; RV32XANDESPERF-NEXT:    ret
+  %shl = shl i32 %a, 2
+  %and = and i32 %shl, 4092
+  %add = add i32 %and, %b
+  ret i32 %add
+}
+
+define i32 @sh3add_masked(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sh3add_masked:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 768
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBA-LABEL: sh3add_masked:
+; RV32ZBA:       # %bb.0:
+; RV32ZBA-NEXT:    andi a0, a0, 768
+; RV32ZBA-NEXT:    sh3add a0, a0, a1
+; RV32ZBA-NEXT:    ret
+;
+; RV32XANDESPERF-LABEL: sh3add_masked:
+; RV32XANDESPERF:       # %bb.0:
+; RV32XANDESPERF-NEXT:    andi a0, a0, 768
+; RV32XANDESPERF-NEXT:    nds.lea.d a0, a1, a0
+; RV32XANDESPERF-NEXT:    ret
+  %shl = shl i32 %a, 3
+  %and = and i32 %shl, 6144
+  %add = add i32 %and, %b
+  ret i32 %add
+}
+
+; This should not trigger the optimisation as the shifted mask would not fit
+; in an immediate.
+define i32 @sh1add_large_mask(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: sh1add_large_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %shl = shl i32 %a, 1
+  %and = and i32 %shl, 4096
+  %add = add i32 %and, %b
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index fb26b8b16a290..6c6d63c26875b 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -5087,3 +5087,99 @@ define i64 @exactashr1mul36(i64 %a) {
   %d = mul i64 %c, 36
   ret i64 %d
 }
+
+; The shxadd_masked tests cover:
+;    (add (and (shl x, c1), c2), y)
+; -> (shXadd (and x, c2 >> c1), y)
+; i.e. shift left and then mask, so that shxadd can be selected.
+
+define i64 @sh1add_masked(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: sh1add_masked:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srli a0, a0, 55
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: sh1add_masked:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    zext.b a0, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: sh1add_masked:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    zext.b a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %shl = shl i64 %a, 1
+  %and = and i64 %shl, 510
+  %add = add i64 %and, %b
+  ret i64 %add
+}
+
+define i64 @sh2add_masked(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: sh2add_masked:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 54
+; RV64I-NEXT:    srli a0, a0, 52
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: sh2add_masked:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    andi a0, a0, 1023
+; RV64ZBA-NEXT:    sh2add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: sh2add_masked:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    andi a0, a0, 1023
+; RV64XANDESPERF-NEXT:    nds.lea.w a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %shl = shl i64 %a, 2
+  %and = and i64 %shl, 4092
+  %add = add i64 %and, %b
+  ret i64 %add
+}
+
+define i64 @sh3add_masked(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: sh3add_masked:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 768
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: sh3add_masked:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    andi a0, a0, 768
+; RV64ZBA-NEXT:    sh3add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: sh3add_masked:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    andi a0, a0, 768
+; RV64XANDESPERF-NEXT:    nds.lea.d a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %shl = shl i64 %a, 3
+  %and = and i64 %shl, 6144
+  %add = add i64 %and, %b
+  ret i64 %add
+}
+
+; This should not trigger the optimisation as the shifted mask would not fit
+; in an immediate.
+define i64 @sh1add_large_mask(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: sh1add_large_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %shl = shl i64 %a, 1
+  %and = and i64 %shl, 4096
+  %add = add i64 %and, %b
+  ret i64 %add
+}

llvmbot · Dec 17 '25 12:12