llvm-project icon indicating copy to clipboard operation
llvm-project copied to clipboard

[X86][SimplifyCFG] Support hoisting load/store with conditional faulting

Open KanRobert opened this issue 1 year ago • 28 comments

This is the SimplifyCFG part of https://github.com/llvm/llvm-project/pull/95515

KanRobert avatar Jun 27 '24 09:06 KanRobert

@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-loongarch @llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-transforms

Author: Shengchen Kan (KanRobert)

Changes

Patch is 26.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96878.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Utils/SimplifyCFG.cpp (+216-5)
  • (added) llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll (+460)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c52c4dc0b8a51..558fafd5a2652 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -131,6 +131,12 @@ static cl::opt<bool> HoistCondStores(
     "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
     cl::desc("Hoist conditional stores if an unconditional store precedes"));
 
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+    "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+    cl::init(true),
+    cl::desc("Hoist loads/stores if the target supports "
+             "conditional faulting"));
+
 static cl::opt<bool> MergeCondStores(
     "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
     cl::desc("Hoist conditional stores even if an unconditional store does not "
@@ -275,6 +281,7 @@ class SimplifyCFGOpt {
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
       SmallVectorImpl<Instruction *> &OtherSuccTIs);
+  bool hoistLoadStoreWithCondFaultingFromSuccessors(BasicBlock *BB);
   bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
   bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
                                   BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2960,6 +2967,199 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
   return HaveRewritablePHIs;
 }
 
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the branching block.
+///
+/// We are looking for code like the following:
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     store i32 1, ptr %q, align 4
+///     ...
+///   TrueBB:
+///     %0 = load i32, ptr %b, align 4
+///     store i32 %0, ptr %p, align 4
+///     ...
+/// \endcode
+///
+/// We are going to transform this into:
+///
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     %0 = cload i32, ptr %b, %cond
+///     cstore i32 %0, ptr %p, %cond
+///     cstore i32 1, ptr %q, ~%cond
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     ...
+///   TrueBB:
+///     ...
+/// \endcode
+///
+/// where cload/cstore are represented by intrinsics such as
+/// llvm.masked.load/store, e.g.
+///
+/// \code
+///   %vcond = bitcast i1 %cond to <1 x i1>
+///   %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+///                         (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+///   %0 = bitcast <1 x i32> %v0 to i32
+///   call void @llvm.masked.store.v1i32.p0
+///                         (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+///   %cond.not = xor i1 %cond, true
+///   %vcond.not = bitcast i1 %cond.not to <1 x i1>
+///   call void @llvm.masked.store.v1i32.p0
+///              (<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hoisted.
+///
+/// Note that this transform should be run
+/// * before SpeculativelyExecuteBB so that the latter has more opportunities.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+///   are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+    BasicBlock *BB) {
+  if (!HoistLoadsStoresWithCondFaulting ||
+      !TTI.hasConditionalLoadStoreForType())
+    return false;
+
+  auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  BasicBlock *IfTrueBB = BI->getSuccessor(0);
+  BasicBlock *IfFalseBB = BI->getSuccessor(1);
+
+  // If either of the blocks has its address taken, then we can't do this fold,
+  // because the code we'd hoist would no longer run when we jump into the block
+  // by its address.
+  for (auto *Succ : {IfTrueBB, IfFalseBB})
+    if (Succ->hasAddressTaken())
+      return false;
+
+  // Do not use isa<AllocaInst>(getUnderlyingObject(I.getOperand(0))) to avoid
+  // having to check that all intermediate operands dominate the branch.
+  auto IsLoadFromAlloca = [](const Instruction &I) {
+    return isa<LoadInst>(I) && isa<AllocaInst>((I.getOperand(0)));
+  };
+
+  // Loads/stores collected for hoisting.
+  SmallSetVector<Instruction *, 4> HoistedInsts;
+  // Do not hoist a load/store if
+  // 1. target does not have corresponding conditional faulting load/store.
+  // 2. it's volatile or atomic.
+  // 3. there is a load/store that can not be hoisted in the same bb.
+  // 4. there is a non-load/store that's not safe to speculatively execute
+  //    in the same bb.
+  // 5. any operand of it does not dominate the branch.
+  // 6. it's a store and a memory read is skipped.
+  auto HoistInstsInBB = [&](BasicBlock *BB) {
+    bool SkipMemoryRead = false;
+    // A cheaper stand-in for a dominance query. An operand is treated as
+    // dominating the BranchInst if it is to be hoisted or is not defined in
+    // the same bb as the instruction. This assumes the outer BB is the only
+    // predecessor of that bb (BranchInst defines no value).
+    // NOTE(review): the callers visible in this patch also invoke the
+    // transform when one successor has other predecessors -- confirm.
+    auto OpsDominatesBranch = [&](Instruction &I) {
+      return llvm::all_of(I.operands(), [&](Value *Op) {
+        if (auto *J = dyn_cast<Instruction>(Op)) {
+          if (HoistedInsts.contains(J))
+            return true;
+          if (J->getParent() == I.getParent())
+            return false;
+        }
+        return true;
+      });
+    };
+    for (auto &I : *BB) {
+      auto *LI = dyn_cast<LoadInst>(&I);
+      auto *SI = dyn_cast<StoreInst>(&I);
+      if (LI || SI) {
+        bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
+        if (!IsSimple || !OpsDominatesBranch(I))
+          return false;
+        auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
+        // A load directly from an alloca is treated as always safe to hoist.
+        if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
+          return false;
+        // Conservative aliasing check: no store after a skipped memory read.
+        if (SI && SkipMemoryRead)
+          return false;
+        HoistedInsts.insert(&I);
+      } else if (!I.isTerminator() && !isSafeToSpeculativelyExecute(&I))
+        return false;
+      else if (I.mayReadFromMemory())
+        SkipMemoryRead = true;
+    }
+    return true;
+  };
+
+  if (!HoistInstsInBB(IfTrueBB) || !HoistInstsInBB(IfFalseBB) ||
+      HoistedInsts.empty())
+    return false;
+
+  // Put newly added instructions before the BranchInst.
+  IRBuilder<> Builder(BI);
+  auto &Context = BB->getContext();
+  auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+  auto *Cond = BI->getOperand(0);
+  auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
+  Value *VCondNot = nullptr;
+  for (auto *I : HoistedInsts) {
+    // A load from an alloca only needs to be moved; it is not masked.
+    if (IsLoadFromAlloca(*I)) {
+      I->moveBefore(BI);
+      continue;
+    }
+
+    bool InvertCond = I->getParent() == IfFalseBB;
+    // Construct the inverted condition lazily, on first use.
+    if (InvertCond && !VCondNot)
+      VCondNot = Builder.CreateBitCast(
+          Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+
+    auto *Mask = InvertCond ? VCondNot : VCond;
+    auto *Op0 = I->getOperand(0);
+    if (auto *LI = dyn_cast<LoadInst>(I)) {
+      // Load: replace it with a <1 x Ty> masked load cast back to Ty.
+      auto *Ty = I->getType();
+      // NOTE: For now we assume conditional faulting load/store is supported
+      // for scalars only when creating new instructions, but it's easy to
+      // extend it to vector types in the future.
+      assert(!Ty->isVectorTy() && "not implemented");
+      auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+                                          LI->getAlign(), Mask);
+      auto *S0 = Builder.CreateBitCast(V0, Ty);
+      V0->copyMetadata(*I);
+      I->replaceAllUsesWith(S0);
+    } else {
+      // Store: cast the stored value to <1 x Ty> and emit a masked store.
+      assert(!Op0->getType()->isVectorTy() && "not implemented");
+      auto *StoredVal =
+          Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+      auto *VStore = Builder.CreateMaskedStore(
+          StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+      VStore->copyMetadata(*I);
+    }
+  }
+
+  // Erase the hoisted instructions in reverse order to avoid use-without-def
+  // errors.
+  std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [&](auto I) {
+    if (!IsLoadFromAlloca(*I))
+      I->eraseFromParent();
+  });
+
+  return true;
+}
+
 /// Speculate a conditional basic block flattening the CFG.
 ///
 /// Note that this is a very risky transform currently. Speculating
@@ -7420,31 +7620,42 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     return requestResimplify();
 
   // We have a conditional branch to two blocks that are only reachable
-  // from BI.  We know that the condbr dominates the two blocks, so see if
-  // there is any identical code in the "then" and "else" blocks.  If so, we
-  // can hoist it up to the branching block.
+  // from BI.  We know that the condbr dominates the two blocks, so see
+  //
+  // * if there is any identical code in the "then" and "else" blocks.
+  // * if there is any different load/store in the "then" and "else" blocks.
+  //
+  // If so, we can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
       if (HoistCommon && hoistCommonCodeFromSuccessors(
                              BI->getParent(), !Options.HoistCommonInsts))
         return requestResimplify();
+      if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+        return requestResimplify();
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
       Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
       if (Succ0TI->getNumSuccessors() == 1 &&
-          Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+          Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) {
+        if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+          return requestResimplify();
         if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
           return requestResimplify();
+      }
     }
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
     // execute Successor #1 if it branches to Successor #0.
     Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
     if (Succ1TI->getNumSuccessors() == 1 &&
-        Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+        Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) {
+      if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+        return requestResimplify();
       if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
         return requestResimplify();
+    }
   }
 
   // If this is a branch on something for which we know the constant value in
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
new file mode 100644
index 0000000000000..2fd0055cf05f9
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -0,0 +1,460 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+;; The redundant bitcast/insertelement will be optimized out by the instcombine pass.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @basic(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i64 1, ptr %p, align 8, !dbg !8
+  store i16 2, ptr %q, align 8, !dbg !8
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4,  !dbg !9
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; simplifycfg is run before sroa. alloca here is not optimized away yet.
+define void @alloca(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @alloca(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[Q_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
+; CHECK-NEXT:    store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
+; CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP2]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p.addr = alloca ptr
+  %q.addr = alloca ptr
+  %a.addr = alloca i32
+  store ptr %p, ptr %p.addr
+  store ptr %q, ptr %q.addr
+  store i32 %a, ptr %a.addr
+  %0 = load i32, ptr %a.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+  %1 = load ptr, ptr %q.addr
+  %2 = load i32, ptr %1
+  %3 = load ptr, ptr %p.addr
+  store i32 %2, ptr %3
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; successor 1 branches to successor 0.
+define void @succ1to0(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @succ1to0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tobool = icmp ne i32 %a, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+  ret void
+
+if.then:
+  %0 = load i32, ptr %q
+  store i32 %0, ptr %p
+  br label %if.end
+}
+
+;; successor 0 branches to successor 1.
+define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @succ0to1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i32 1, ptr %q
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b
+  store i32 %0, ptr %p
+  br label %if.false
+
+if.end:
+  ret void
+}
+
+;; load after store can be hoisted.
+define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_after_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[ZEXT]], [[TMP4]]
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[COND]], i64 [[ADD]], i64 0
+; CHECK-NEXT:    ret i64 [[COMMON_RET_OP]]
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.end
+
+if.true:
+  store i32 1, ptr %b
+  %0 = load i16, ptr %p
+  %1 = load i64, ptr %q
+  %zext = zext i16 %0 to i64
+  %add = add i64 %zext, %1
+  ret i64 %add
+
+if.end:
+  ret i64 0
+}
+
+define i32 @load_skip_speculatable_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_speculatable_memory_read(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
+; CHECK-NEXT:    [[PHI:%.*]] = select i1 [[COND]], i32 0, i32 [[READ]]
+; CHECK-NEXT:    ret i32 [[PHI]]
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  %read = call i32 @read_memory_only()
+  %0 = load i32, ptr %q
+  br label %if.end
+
+if.true:
+  %1 = load i32, ptr %b
+  store i32 %1, ptr %p
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [%read, %if.false], [0, %if.true]
+  ret i32 %phi
+}
+
+; i8 is not supported by conditional faulting
+define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_supported_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i8 1, ptr [[Q:%.*]], align 1
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+ent...
[truncated]

llvmbot avatar Jun 27 '24 09:06 llvmbot

How is the performance of the hoisting load/store being benchmarked? I don't know the specific implementation in microarchitecture, but using these conditional instructions to replace a very easy-to-predict branch may negatively contribute to performance. Is there any microarchitecture simulator or real chip implemented so we can benchmark the performance?

cyyself avatar Jun 27 '24 15:06 cyyself

How is the performance of the hoisting load/store being benchmarked? I don't know the specific implementation in microarchitecture, but using these conditional instructions to replace a very easy-to-predict branch may negatively contribute to performance. Is there any microarchitecture simulator or real chip implemented so we can benchmark the performance?

I can share the code for checking predictability

  // If the branch is non-unpredictable, and is predicted to *not* branch to
  // the `then` block, then avoid speculating it.
  if (!BI->getMetadata(LLVMContext::MD_unpredictable)) {
    uint64_t TWeight, FWeight;
    if (extractBranchWeights(*BI, TWeight, FWeight) &&
        (TWeight + FWeight) != 0) {
      uint64_t EndWeight = Invert ? TWeight : FWeight;
      BranchProbability BIEndProb =
          BranchProbability::getBranchProbability(EndWeight, TWeight + FWeight);
      BranchProbability Likely = TTI.getPredictableBranchThreshold();
      if (BIEndProb >= Likely)
        return false;
    }
  }

in SpeculativelyExecuteBB with the newly added transform.

We have an internal cycle-accurate performance simulator. The real chip is not public yet. You know, even if I have data, I can't make any comments on the performance of future HW.

KanRobert avatar Jun 28 '24 01:06 KanRobert

I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)

Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of print-after-all.

If I add -start-before=simplifycfg to the flags, it will crash. (not related to this PR)

bash$ cat 1.c
void f() {}

bash$ clang -O2 1.c -S -emit-llvm

bash$ llc --start-before=simplifycfg <1.ll

Assertion `!NodePtr->isKnownSentinel()' failed

KanRobert avatar Jun 28 '24 06:06 KanRobert

I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)

Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of print-after-all.

I think a llc -O2 should be ok. I just believe we need a complete test case to demonstrate the final changes from this patch.

If I add -start-before=simplifycfg to the flags, it will crash. (not related this PR)

bash$ cat 1.c
void f() {}

bash$ clang -O2 1.c -S -emit-llvm

bash$ llc --start-before=simplifycfg <1.ll

Assertion `!NodePtr->isKnownSentinel()' failed

I guess --start-before can only be used on MIR passes.

dianqk avatar Jun 28 '24 07:06 dianqk

I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)

Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of print-after-all.

I think a llc -O2 should be ok. I just believe we need a complete test case to demonstrate the final changes from this patch.

If I add -start-before=simplifycfg to the flags, it will crash. (not related this PR)

bash$ cat 1.c
void f() {}

bash$ clang -O2 1.c -S -emit-llvm

bash$ llc --start-before=simplifycfg <1.ll

Assertion `!NodePtr->isKnownSentinel()' failed

I guess --start-before can only be used on MIR passes.

-O2 does not work. In fact, the default opt level for llc is O2. I can add a test with flags opt -O2 -S to check that the transform is not cancelled during the middle-end opts. In the backend, we already have a test to check that masked.load/store can be selected. Does that sound good?

KanRobert avatar Jun 28 '24 07:06 KanRobert

I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)

Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of print-after-all.

I think a llc -O2 should be ok. I just believe we need a complete test case to demonstrate the final changes from this patch.

If I add -start-before=simplifycfg to the flags, it will crash. (not related this PR)

bash$ cat 1.c
void f() {}

bash$ clang -O2 1.c -S -emit-llvm

bash$ llc --start-before=simplifycfg <1.ll

Assertion `!NodePtr->isKnownSentinel()' failed

I guess --start-before can only be used on MIR passes.

-O2 does not work. In fact, the default opt level for llc is O2. I can add a test with flags opt -O2 -S to check the transform is not cancelled during the middle-end opts. In backend ,we already have a test to check masked.load/store can be selected. Does it sound good?

Done. Added test masked-load-store-legal.ll

KanRobert avatar Jun 28 '24 07:06 KanRobert

@KanRobert I think this patch is far away from being ready for review. Please make sure that this patch doesn't cause crashes or miscompilation on the following benchmarks:

  • [ ] Clang stage2 build
  • [ ] SPEC 2006/2017
  • [ ] llvm-test-suite
  • [x] https://github.com/dtcxzyw/llvm-opt-benchmark
  • [x] fuzzers (passed 1M csmith tests)

dtcxzyw avatar Jun 29 '24 08:06 dtcxzyw

Crash reproducer:

; bin/opt -passes=simplifycfg -mattr=+cf reduced.ll -S
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

define i64 @vm_exec_core(i1 %0) {
  br i1 %0, label %2, label %common.ret

common.ret:                                       ; preds = %2, %1
  ret i64 0

2:                                                ; preds = %1
  br label %common.ret
}
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.      Program arguments: bin/opt -passes=simplifycfg -mattr=+cf reduced.ll
1.      Running pass "function(simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>)" on module "reduced.ll"
2.      Running pass "simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>" on function "vm_exec_core"
 #0 0x000076ec91614410 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x214410)
 #1 0x000076ec9161141f llvm::sys::RunSignalHandlers() (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x21141f)
 #2 0x000076ec91611575 SignalHandler(int) Signals.cpp:0:0
 #3 0x000076ec91042520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
 #4 0x000076ec8c0b7873 (anonymous namespace)::SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(llvm::BranchInst*, llvm::BasicBlock*) SimplifyCFG.cpp:0:0
 #5 0x000076ec8c0bcdea llvm::simplifyCFG(llvm::BasicBlock*, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&, llvm::ArrayRef<llvm::WeakVH>) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMTransformUtils.so.19.0git+0x2bcdea)
 #6 0x000076ec8cba8a43 iterativelySimplifyCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0
 #7 0x000076ec8cba96ba simplifyFunctionCFGImpl(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0
 #8 0x000076ec8cbaa4f5 simplifyFunctionCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0
 #9 0x000076ec8cbaa6b0 llvm::SimplifyCFGPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMScalarOpts.so.19.0git+0x3aa6b0)
#10 0x000076ec8e0c79e6 llvm::detail::PassModel<llvm::Function, llvm::SimplifyCFGPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMPasses.so.19.0git+0xc79e6)
#11 0x000076ec8b14ba1b llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34ba1b)
#12 0x000076ec906d5096 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd5096)
#13 0x000076ec8b14a7e7 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34a7e7)
#14 0x000076ec906d59b6 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd59b6)
#15 0x000076ec8b148d0b llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x348d0b)
#16 0x000076ec91a3b6b4 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::PassPlugin>, llvm::ArrayRef<std::function<void (llvm::PassBuilder&)>>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x2d6b4)
#17 0x000076ec91a4878a optMain (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x3a78a)
#18 0x000076ec91029d90 __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:58:16
#19 0x000076ec91029e40 call_init ./csu/../csu/libc-start.c:128:20
#20 0x000076ec91029e40 __libc_start_main ./csu/../csu/libc-start.c:379:5
#21 0x0000612682610095 _start (bin/opt+0x1095)
Segmentation fault (core dumped)

I am not able to reproduce this with the latest version of the PR. It was probably fixed by 12ea4a1eb3ee64d4aed10978f9b50cfc4d14007d

KanRobert avatar Jul 01 '24 03:07 KanRobert

@KanRobert I think this patch is far away from being ready for review. Please make sure that this patch doesn't cause crashes or miscompilation on the following benchmarks:

  • [ ] Clang stage2 build
  • [ ] SPEC 2006/2017
  • [ ] llvm-test-suite
  • [ ] https://github.com/dtcxzyw/llvm-opt-benchmark
  • [ ] fuzzers (e.g., csmith)

I'm not familiar with the other benchmarks, but I will validate SPEC2017 + llvm-test-suite on Intel SDE with this PR and post the results on this page.

KanRobert avatar Jul 01 '24 03:07 KanRobert

@KanRobert I think this patch is far away from being ready for review. Please make sure that this patch doesn't cause crashes or miscompilation on the following benchmarks:

  • [ ] Clang stage2 build
  • [ ] SPEC 2006/2017
  • [ ] llvm-test-suite
  • [ ] https://github.com/dtcxzyw/llvm-opt-benchmark
  • [ ] fuzzers (e.g., csmith)

Not familiar with other benchmarks. But I will validate SPEC2017 + llvm-test-suite on intel SDE with this PR, and comment the result on this page.

Great! As we have a similar prototype on RISCV, I can validate this patch with fuzzers :)

dtcxzyw avatar Jul 01 '24 03:07 dtcxzyw

Crash reproducer:

; bin/opt -passes=simplifycfg -mattr=+cf reduced.ll -S
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

define i64 @vm_exec_core(i1 %0) {
  br i1 %0, label %2, label %common.ret

common.ret:                                       ; preds = %2, %1
  ret i64 0

2:                                                ; preds = %1
  br label %common.ret
}
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.      Program arguments: bin/opt -passes=simplifycfg -mattr=+cf reduced.ll
1.      Running pass "function(simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>)" on module "reduced.ll"
2.      Running pass "simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>" on function "vm_exec_core"
 #0 0x000076ec91614410 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x214410)
 #1 0x000076ec9161141f llvm::sys::RunSignalHandlers() (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x21141f)
 #2 0x000076ec91611575 SignalHandler(int) Signals.cpp:0:0
 #3 0x000076ec91042520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
 #4 0x000076ec8c0b7873 (anonymous namespace)::SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(llvm::BranchInst*, llvm::BasicBlock*) SimplifyCFG.cpp:0:0
 #5 0x000076ec8c0bcdea llvm::simplifyCFG(llvm::BasicBlock*, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&, llvm::ArrayRef<llvm::WeakVH>) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMTransformUtils.so.19.0git+0x2bcdea)
 #6 0x000076ec8cba8a43 iterativelySimplifyCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0
 #7 0x000076ec8cba96ba simplifyFunctionCFGImpl(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0
 #8 0x000076ec8cbaa4f5 simplifyFunctionCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0
 #9 0x000076ec8cbaa6b0 llvm::SimplifyCFGPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMScalarOpts.so.19.0git+0x3aa6b0)
#10 0x000076ec8e0c79e6 llvm::detail::PassModel<llvm::Function, llvm::SimplifyCFGPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMPasses.so.19.0git+0xc79e6)
#11 0x000076ec8b14ba1b llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34ba1b)
#12 0x000076ec906d5096 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd5096)
#13 0x000076ec8b14a7e7 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34a7e7)
#14 0x000076ec906d59b6 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd59b6)
#15 0x000076ec8b148d0b llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x348d0b)
#16 0x000076ec91a3b6b4 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::PassPlugin>, llvm::ArrayRef<std::function<void (llvm::PassBuilder&)>>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x2d6b4)
#17 0x000076ec91a4878a optMain (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x3a78a)
#18 0x000076ec91029d90 __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:58:16
#19 0x000076ec91029e40 call_init ./csu/../csu/libc-start.c:128:20
#20 0x000076ec91029e40 __libc_start_main ./csu/../csu/libc-start.c:379:5
#21 0x0000612682610095 _start (bin/opt+0x1095)
Segmentation fault (core dumped)

Not able to reproduce with latest PR. Probably fixed by 12ea4a1

Can you check this reproducer again (at the top of 8598bcb9934dca16ea16d87304e00defc85d986c)?

dtcxzyw avatar Jul 01 '24 04:07 dtcxzyw

opt -passes=simplifycfg -mattr=+cf reduced.ll -S

@dtcxzyw Rebased, still no error

bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S
; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

define i64 @vm_exec_core(i1 %0) #0 {
common.ret:
  ret i64 0
}

attributes #0 = { "target-features"="+cf" }

KanRobert avatar Jul 01 '24 05:07 KanRobert

opt -passes=simplifycfg -mattr=+cf reduced.ll -S

@dtcxzyw Rebased, still no error

bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S
; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

define i64 @vm_exec_core(i1 %0) #0 {
common.ret:
  ret i64 0
}

attributes #0 = { "target-features"="+cf" }

I also cannot reproduce this with a clang+asan build. Maybe it was caused by a bug in gcc 11.4.

dtcxzyw avatar Jul 01 '24 06:07 dtcxzyw

opt -passes=simplifycfg -mattr=+cf reduced.ll -S

@dtcxzyw Rebased, still no error

bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S
; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

define i64 @vm_exec_core(i1 %0) #0 {
common.ret:
  ret i64 0
}

attributes #0 = { "target-features"="+cf" }

I also cannot reproduce this with clang+asan build. Maybe it was caused by a bug in gcc 11.4.

Reproduced with gcc 12.3

cmake llvm-project/llvm -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -G Ninja \
        -DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 \
        -DLLVM_TARGETS_TO_BUILD="X86;RISCV;AArch64" -DLLVM_PARALLEL_LINK_JOBS=4 -DLLVM_ENABLE_ASSERTIONS=ON 

dtcxzyw avatar Jul 01 '24 15:07 dtcxzyw

opt -passes=simplifycfg -mattr=+cf reduced.ll -S

@dtcxzyw Rebased, still no error

bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S
; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

define i64 @vm_exec_core(i1 %0) #0 {
common.ret:
  ret i64 0
}

attributes #0 = { "target-features"="+cf" }

I also cannot reproduce this with clang+asan build. Maybe it was caused by a bug in gcc 11.4.

Reproduced with gcc 12.3

cmake llvm-project/llvm -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -G Ninja \
        -DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 \
        -DLLVM_TARGETS_TO_BUILD="X86;RISCV;AArch64" -DLLVM_PARALLEL_LINK_JOBS=4 -DLLVM_ENABLE_ASSERTIONS=ON 

Reproduced and fixed by https://github.com/llvm/llvm-project/pull/96878/commits/c0545a4289e0c50e49f73e164b02ed5271102f88. This is probably the known initializer_list issue: https://clang.llvm.org/docs/analyzer/developer-docs/InitializerLists.html

KanRobert avatar Jul 02 '24 01:07 KanRobert

@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via SimplifyCFGOptions?

dtcxzyw avatar Jul 03 '24 16:07 dtcxzyw

@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via SimplifyCFGOptions?

That would also help with the issue that masked load/store intrinsics are not well supported as traditional load/stores throughout the middle end.

goldsteinn avatar Jul 03 '24 17:07 goldsteinn

@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via SimplifyCFGOptions?

Do you mean something like FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));?

If so, where in the pipeline do you suggest doing this? @dtcxzyw

KanRobert avatar Jul 04 '24 05:07 KanRobert

@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via SimplifyCFGOptions?

Do you mean sth like FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));?

Yeah.

If so, where in the pipeline do you suggest doing this? @dtcxzyw

See https://github.com/dtcxzyw/llvm-project/commit/3f407c4d30eb877653c07f00bdcbe3e04742f762.

dtcxzyw avatar Jul 04 '24 07:07 dtcxzyw

I wonder whether it is feasible to allow masked.load/masked.store to be used with scalars. The need to cast between <1 x %T> and %T everywhere is quite awkward and doesn't represent the underlying instructions well.

nikic avatar Jul 04 '24 07:07 nikic

I wonder whether it is feasible to allow masked.load/masked.store to be used with scalars. The need to cast between <1 x %T> and %T everywhere is quite awkward and doesn't represent the underlying instructions well.

+1

dtcxzyw avatar Jul 04 '24 07:07 dtcxzyw

I wonder whether it is feasible to allow masked.load/masked.store to be used with scalars. The need to cast between <1 x %T> and %T everywhere is quite awkward and doesn't represent the underlying instructions well.

In theory, hoistLoadStoreWithCondFaultingFromSuccessors, hoistCommonCodeFromSuccessors, and SpeculativelyExecuteBB should run in the same pass, because the first can expose more optimization opportunities for the latter two. This is the main reason why I want to do it in SimplifyCFG rather than after CodeGen. SimplifyCFGOptions provides a way to run it late in the pipeline so that some mid-end optimizations don't have to handle it well. We have several possible solutions:

  1. run these three transforms of SimplifyCFG right before CodeGenPrepare if the target supports conditional faulting, to avoid the awkwardness?
  2. extend LLVM IR to support scalar versions of masked.load/store.
  3. enhance the support for masked loads/stores for 1x %T throughout the middle-end passes after SimplifyCFG

Or any better idea? @nikic @dtcxzyw

KanRobert avatar Jul 04 '24 08:07 KanRobert

@KanRobert Can you file a separate PR to support conditional load/store of ptrs with +cf? Then I can share another miscompilation reproducer :)

@e = dso_local global ptr null, align 8
@f = dso_local global ptr @e, align 8
@g = dso_local global ptr @f, align 8
@h = dso_local global ptr @g, align 8
@d = dso_local global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
@c = dso_local global i64 0, align 8

; Function Attrs: nounwind uwtable
define dso_local void @i() #0 {
entry:
  %p = alloca i32, align 4
  call void @llvm.lifetime.start.p0(i64 4, ptr %p) #3
  store i8 0, ptr @d, align 1
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i8, ptr @d, align 1
  %conv = sext i8 %0 to i32
  %cmp = icmp slt i32 %conv, 2
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %1 = load ptr, ptr @h, align 8
  %2 = load ptr, ptr %1, align 8
  %3 = load ptr, ptr %2, align 8
  %4 = load ptr, ptr %3, align 8
  %5 = load ptr, ptr @f, align 8
  store ptr %4, ptr %5, align 8
  %6 = load ptr, ptr @h, align 8
  %7 = load ptr, ptr %6, align 8
  %8 = load ptr, ptr %7, align 8
  store ptr %4, ptr %8, align 8
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %9 = load i8, ptr @d, align 1
  %inc = add i8 %9, 1
  store i8 %inc, ptr @d, align 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  store i32 0, ptr %p, align 4
  br label %for.cond2

for.cond2:                                        ; preds = %for.body3, %for.end
  %10 = load i32, ptr %p, align 4
  %tobool = icmp ne i32 %10, 0
  br i1 %tobool, label %for.body3, label %for.end4

for.body3:                                        ; preds = %for.cond2
  br label %for.cond2

for.end4:                                         ; preds = %for.cond2
  call void @llvm.lifetime.end.p0(i64 4, ptr %p)
  ret void
}

; Function Attrs: nounwind uwtable
define dso_local signext i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, ptr %retval, align 4
  call void @i()
  %0 = load i64, ptr @c, align 8
  %conv = trunc i64 %0 to i32
  %call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv)
  ret i32 0
}

declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...)

dtcxzyw avatar Jul 04 '24 18:07 dtcxzyw

@KanRobert Can you file a separate PR to support conditional load/store of ptrs with +cf? Then I can share another miscompilation reproducer :)

@e = dso_local global ptr null, align 8
@f = dso_local global ptr @e, align 8
@g = dso_local global ptr @f, align 8
@h = dso_local global ptr @g, align 8
@d = dso_local global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
@c = dso_local global i64 0, align 8

; Function Attrs: nounwind uwtable
define dso_local void @i() #0 {
entry:
  %p = alloca i32, align 4
  call void @llvm.lifetime.start.p0(i64 4, ptr %p) #3
  store i8 0, ptr @d, align 1
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i8, ptr @d, align 1
  %conv = sext i8 %0 to i32
  %cmp = icmp slt i32 %conv, 2
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %1 = load ptr, ptr @h, align 8
  %2 = load ptr, ptr %1, align 8
  %3 = load ptr, ptr %2, align 8
  %4 = load ptr, ptr %3, align 8
  %5 = load ptr, ptr @f, align 8
  store ptr %4, ptr %5, align 8
  %6 = load ptr, ptr @h, align 8
  %7 = load ptr, ptr %6, align 8
  %8 = load ptr, ptr %7, align 8
  store ptr %4, ptr %8, align 8
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %9 = load i8, ptr @d, align 1
  %inc = add i8 %9, 1
  store i8 %inc, ptr @d, align 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  store i32 0, ptr %p, align 4
  br label %for.cond2

for.cond2:                                        ; preds = %for.body3, %for.end
  %10 = load i32, ptr %p, align 4
  %tobool = icmp ne i32 %10, 0
  br i1 %tobool, label %for.body3, label %for.end4

for.body3:                                        ; preds = %for.cond2
  br label %for.cond2

for.end4:                                         ; preds = %for.cond2
  call void @llvm.lifetime.end.p0(i64 4, ptr %p)
  ret void
}

; Function Attrs: nounwind uwtable
define dso_local signext i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, ptr %retval, align 4
  call void @i()
  %0 = load i64, ptr @c, align 8
  %conv = trunc i64 %0 to i32
  %call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv)
  ret i32 0
}

declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...)

So you're working on exposing the potential issues for masked.load/store in other middle-end passes? And do you prefer the 3rd solution?

KanRobert avatar Jul 05 '24 01:07 KanRobert

So you're working on exposing the potential issues for mask.load/store in other middle-end passes?

No. It seems to be a bug in SimplifyCFG.

And prefer the 3rd solution?

I prefer the second one.

dtcxzyw avatar Jul 05 '24 04:07 dtcxzyw

So you're working on exposing the potential issues for mask.load/store in other middle-end passes?

No. It seems a bug in SimplifyCFG.

And prefer the 3rd solution?

I prefer the second one.

Is it a bug in this transform or in other transforms?

KanRobert avatar Jul 05 '24 04:07 KanRobert

So you're working on exposing the potential issues for mask.load/store in other middle-end passes?

No. It seems a bug in SimplifyCFG.

And prefer the 3rd solution?

I prefer the second one.

Is it a bug in this transform or in other transforms?

After simplifycfg, the program exits with SIGSEGV. I cannot tell whether it is a bug in this patch because +cf doesn't support loading/storing pointers :)

*** IR Dump After InferFunctionAttrsPass on [module] ***
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

@e = dso_local global ptr null, align 8
@f = dso_local global ptr @e, align 8
@g = dso_local global ptr @f, align 8
@h = dso_local global ptr @g, align 8
@d = dso_local global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
@c = dso_local global i64 0, align 8

; Function Attrs: nounwind uwtable
define dso_local void @i() #0 {
entry:
  %p = alloca i32, align 4
  call void @llvm.lifetime.start.p0(i64 4, ptr %p) #3
  store i8 0, ptr @d, align 1, !tbaa !9
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i8, ptr @d, align 1, !tbaa !9
  %conv = sext i8 %0 to i32
  %cmp = icmp slt i32 %conv, 2
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %1 = load ptr, ptr @h, align 8, !tbaa !12
  %2 = load ptr, ptr %1, align 8, !tbaa !12
  %3 = load ptr, ptr %2, align 8, !tbaa !12
  %4 = load ptr, ptr %3, align 8, !tbaa !12
  %5 = load ptr, ptr @f, align 8, !tbaa !12
  store ptr %4, ptr %5, align 8, !tbaa !12
  %6 = load ptr, ptr @h, align 8, !tbaa !12
  %7 = load ptr, ptr %6, align 8, !tbaa !12
  %8 = load ptr, ptr %7, align 8, !tbaa !12
  store ptr %4, ptr %8, align 8, !tbaa !12
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %9 = load i8, ptr @d, align 1, !tbaa !9
  %inc = add i8 %9, 1
  store i8 %inc, ptr @d, align 1, !tbaa !9
  br label %for.cond, !llvm.loop !14

for.end:                                          ; preds = %for.cond
  store i32 0, ptr %p, align 4, !tbaa !16
  br label %for.cond2

for.cond2:                                        ; preds = %for.body3, %for.end
  %10 = load i32, ptr %p, align 4, !tbaa !16
  %tobool = icmp ne i32 %10, 0
  br i1 %tobool, label %for.body3, label %for.end4

for.body3:                                        ; preds = %for.cond2
  br label %for.cond2, !llvm.loop !18

for.end4:                                         ; preds = %for.cond2
  call void @llvm.lifetime.end.p0(i64 4, ptr %p) #3
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1

; Function Attrs: nounwind uwtable
define dso_local signext i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, ptr %retval, align 4
  call void @i()
  %0 = load i64, ptr @c, align 8, !tbaa !19
  %conv = trunc i64 %0 to i32
  %call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...) #2

attributes #0 = { nounwind uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nofree nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicldst1p0_zicond1p0_zicsr2p0_zifencei2p0_zmmul1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 8}
!8 = !{!"clang version 19.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C/C++ TBAA"}
!12 = !{!13, !13, i64 0}
!13 = !{!"any pointer", !10, i64 0}
!14 = distinct !{!14, !15}
!15 = !{!"llvm.loop.mustprogress"}
!16 = !{!17, !17, i64 0}
!17 = !{!"int", !10, i64 0}
!18 = distinct !{!18, !15}
!19 = !{!20, !20, i64 0}
!20 = !{!"long", !10, i64 0}
*** IR Dump After CoroEarlyPass on [module] omitted because no change ***
*** IR Dump After EntryExitInstrumenterPass on i omitted because no change ***
BISECT: running pass (5) LowerExpectIntrinsicPass on i
*** IR Dump After LowerExpectIntrinsicPass on i omitted because no change ***
BISECT: running pass (6) SimplifyCFGPass on i
*** IR Dump After SimplifyCFGPass on i ***
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

@e = dso_local global ptr null, align 8
@f = dso_local global ptr @e, align 8
@g = dso_local global ptr @f, align 8
@h = dso_local global ptr @g, align 8
@d = dso_local global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
@c = dso_local global i64 0, align 8

; Function Attrs: nounwind uwtable
define dso_local void @i() #0 {
entry:
  %p = alloca i32, align 4
  call void @llvm.lifetime.start.p0(i64 4, ptr %p) #5
  store i8 0, ptr @d, align 1, !tbaa !9
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i8, ptr @d, align 1, !tbaa !9
  %conv = sext i8 %0 to i32
  %cmp = icmp slt i32 %conv, 2
  %1 = bitcast i1 %cmp to <1 x i1>
  %2 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr @h, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %3 = bitcast <1 x ptr> %2 to ptr
  %4 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %3, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %5 = bitcast <1 x ptr> %4 to ptr
  %6 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %5, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %7 = bitcast <1 x ptr> %6 to ptr
  %8 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %7, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %9 = bitcast <1 x ptr> %8 to ptr
  %10 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr @f, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %11 = bitcast <1 x ptr> %10 to ptr
  %12 = bitcast ptr %9 to <1 x ptr>
  call void @llvm.masked.store.v1p0.p0(<1 x ptr> %12, ptr %11, i32 8, <1 x i1> %1), !tbaa !12
  %13 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr @h, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %14 = bitcast <1 x ptr> %13 to ptr
  %15 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %14, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %16 = bitcast <1 x ptr> %15 to ptr
  %17 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %16, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
  %18 = bitcast <1 x ptr> %17 to ptr
  %19 = bitcast ptr %9 to <1 x ptr>
  call void @llvm.masked.store.v1p0.p0(<1 x ptr> %19, ptr %18, i32 8, <1 x i1> %1), !tbaa !12
  %20 = xor i1 %cmp, true
  %21 = bitcast i1 %20 to <1 x i1>
  call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr %p, i32 4, <1 x i1> %21), !tbaa !14
  br i1 %cmp, label %for.inc, label %for.cond2

for.inc:                                          ; preds = %for.cond
  %22 = load i8, ptr @d, align 1, !tbaa !9
  %inc = add i8 %22, 1
  store i8 %inc, ptr @d, align 1, !tbaa !9
  br label %for.cond, !llvm.loop !16

for.cond2:                                        ; preds = %for.cond2, %for.cond
  %23 = load i32, ptr %p, align 4, !tbaa !14
  %tobool = icmp ne i32 %23, 0
  br i1 %tobool, label %for.cond2, label %for.end4, !llvm.loop !18

for.end4:                                         ; preds = %for.cond2
  call void @llvm.lifetime.end.p0(i64 4, ptr %p) #5
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1

; Function Attrs: nounwind uwtable
define dso_local signext i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, ptr %retval, align 4
  call void @i()
  %0 = load i64, ptr @c, align 8, !tbaa !19
  %conv = trunc i64 %0 to i32
  %call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...) #2

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <1 x ptr> @llvm.masked.load.v1p0.p0(ptr nocapture, i32 immarg, <1 x i1>, <1 x ptr>) #3

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v1p0.p0(<1 x ptr>, ptr nocapture, i32 immarg, <1 x i1>) #4

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr nocapture, i32 immarg, <1 x i1>) #4

attributes #0 = { nounwind uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nofree nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
attributes #5 = { nounwind }

!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicldst1p0_zicond1p0_zicsr2p0_zifencei2p0_zmmul1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 8}
!8 = !{!"clang version 19.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C/C++ TBAA"}
!12 = !{!13, !13, i64 0}
!13 = !{!"any pointer", !10, i64 0}
!14 = !{!15, !15, i64 0}
!15 = !{!"int", !10, i64 0}
!16 = distinct !{!16, !17}
!17 = !{!"llvm.loop.mustprogress"}
!18 = distinct !{!18, !17}
!19 = !{!20, !20, i64 0}
!20 = !{!"long", !10, i64 0}

dtcxzyw avatar Jul 05 '24 04:07 dtcxzyw

  1. Run these three SimplifyCFG transforms right before CodeGenPrepare if the target supports conditional faulting, to avoid the awkwardness?
  2. Extend LLVM IR to support scalar versions of masked.load/store.
  3. Enhance support for masked loads/stores of `<1 x %T>` throughout the middle-end passes that run after SimplifyCFG.

dtcxzyw: I prefer the second one.

@nikic @DianQK Vote?

KanRobert avatar Jul 08 '24 03:07 KanRobert

Can you file a separate PR to support conditional load/store of ptrs with +cf?

@dtcxzyw In progress.

KanRobert avatar Jul 08 '24 03:07 KanRobert