[X86][SimplifyCFG] Support hoisting load/store with conditional faulting
This is simplifycfg part of https://github.com/llvm/llvm-project/pull/95515
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-loongarch @llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-llvm-transforms
Author: Shengchen Kan (KanRobert)
Changes
Patch is 26.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96878.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Utils/SimplifyCFG.cpp (+216-5)
- (added) llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll (+460)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c52c4dc0b8a51..558fafd5a2652 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -131,6 +131,12 @@ static cl::opt<bool> HoistCondStores(
"simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores if an unconditional store precedes"));
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+ "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+ cl::init(true),
+ cl::desc("Hoist loads/stores if the target supports "
+ "conditional faulting"));
+
static cl::opt<bool> MergeCondStores(
"simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores even if an unconditional store does not "
@@ -275,6 +281,7 @@ class SimplifyCFGOpt {
bool hoistSuccIdenticalTerminatorToSwitchOrIf(
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs);
+ bool hoistLoadStoreWithCondFaultingFromSuccessors(BasicBlock *BB);
bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2960,6 +2967,199 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the block.
+///
+/// We are looking for code like the following:
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// store i32 1, ptr %q, align 4
+/// ...
+/// TrueBB:
+/// %0 = load i32, ptr %b, align 4
+/// store i32 %0, ptr %p, align 4
+/// ...
+/// \endcode
+///
+/// We are going to transform this into:
+///
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// %0 = cload i32, ptr %b, %cond
+/// cstore i32 %0, ptr %p, %cond
+/// cstore i32 1, ptr %q, ~%cond
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// ...
+/// TrueBB:
+/// ...
+/// \endcode
+///
+/// where cload/cstore is represented by intrinsic like llvm.masked.load/store,
+/// e.g.
+///
+/// \code
+/// %vcond = bitcast i1 %cond to <1 x i1>
+/// %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+/// (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+/// %0 = bitcast <1 x i32> %v0 to i32
+/// call void @llvm.masked.store.v1i32.p0
+///   (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+/// %cond.not = xor i1 %cond, true
+/// %vcond.not = bitcast i1 %cond.not to <1 x i1>
+/// call void @llvm.masked.store.v1i32.p0
+/// (<1 x i32> <i32 1>, ptr %q, i32 4, <1x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hoisted.
+///
+/// Note that this transform should be run
+/// * before SpeculativelyExecuteBB so that the latter can have more chance.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+/// are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+ BasicBlock *BB) {
+ if (!HoistLoadsStoresWithCondFaulting ||
+ !TTI.hasConditionalLoadStoreForType())
+ return false;
+
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *IfTrueBB = BI->getSuccessor(0);
+ BasicBlock *IfFalseBB = BI->getSuccessor(1);
+
+ // If either of the blocks has its address taken, then we can't do this fold,
+ // because the code we'd hoist would no longer run when we jump into the block
+ // by its address.
+ for (auto *Succ : {IfTrueBB, IfFalseBB})
+ if (Succ->hasAddressTaken())
+ return false;
+
+ // Do not use isa<AllocaInst>(getUnderlyingObject(I.getOperand(0))) to avoid
+ // checking that all intermediate operands dominate the branch.
+ auto IsLoadFromAlloca = [](const Instruction &I) {
+ return isa<LoadInst>(I) && isa<AllocaInst>((I.getOperand(0)));
+ };
+
+ // Collect hoisted loads/stores.
+ SmallSetVector<Instruction *, 4> HoistedInsts;
+ // Do not hoist a load/store if
+ // 1. target does not have corresponding conditional faulting load/store.
+ // 2. it's volatile or atomic.
+ // 3. there is a load/store that can not be hoisted in the same bb.
+ // 4. there is a non-load/store that's not safe to speculatively execute
+ // in the same bb.
+ // 5. any operand of it does not dominate the branch.
+ // 6. it's a store and a memory read is skipped.
+ auto HoistInstsInBB = [&](BasicBlock *BB) {
+ bool SkipMemoryRead = false;
+ // A more efficient way to check domination. An operand dominates the
+ // BranchInst if
+ // 1. it's not defined in the same bb as the instruction.
+ // 2. it's to be hoisted.
+ //
+ // b/c BB is only predecessor and BranchInst does not define any value.
+ auto OpsDominatesBranch = [&](Instruction &I) {
+ return llvm::all_of(I.operands(), [&](Value *Op) {
+ if (auto *J = dyn_cast<Instruction>(Op)) {
+ if (HoistedInsts.contains(J))
+ return true;
+ if (J->getParent() == I.getParent())
+ return false;
+ }
+ return true;
+ });
+ };
+ for (auto &I : *BB) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (LI || SI) {
+ bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
+ if (!IsSimple || !OpsDominatesBranch(I))
+ return false;
+ auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
+ // a load from alloca is always safe.
+ if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
+ return false;
+ // Conservative aliasing check.
+ if (SI && SkipMemoryRead)
+ return false;
+ HoistedInsts.insert(&I);
+ } else if (!I.isTerminator() && !isSafeToSpeculativelyExecute(&I))
+ return false;
+ else if (I.mayReadFromMemory())
+ SkipMemoryRead = true;
+ }
+ return true;
+ };
+
+ if (!HoistInstsInBB(IfTrueBB) || !HoistInstsInBB(IfFalseBB) ||
+ HoistedInsts.empty())
+ return false;
+
+ // Put newly added instructions before the BranchInst.
+ IRBuilder<> Builder(BI);
+ auto &Context = BB->getContext();
+ auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+ auto *Cond = BI->getOperand(0);
+ auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
+ Value *VCondNot = nullptr;
+ for (auto *I : HoistedInsts) {
+ // Only need to move the position for load from alloca.
+ if (IsLoadFromAlloca(*I)) {
+ I->moveBefore(BI);
+ continue;
+ }
+
+ bool InvertCond = I->getParent() == IfFalseBB;
+ // Construct the inverted condition if needed.
+ if (InvertCond && !VCondNot)
+ VCondNot = Builder.CreateBitCast(
+ Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+
+ auto *Mask = InvertCond ? VCondNot : VCond;
+ auto *Op0 = I->getOperand(0);
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ // Load
+ auto *Ty = I->getType();
+ // NOTE: Now we assume conditional faulting load/store is supported for
+ // scalar only when creating new instructions, but it's easy to extend it
+ // for vector types in the future.
+ assert(!Ty->isVectorTy() && "not implemented");
+ auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+ LI->getAlign(), Mask);
+ auto *S0 = Builder.CreateBitCast(V0, Ty);
+ V0->copyMetadata(*I);
+ I->replaceAllUsesWith(S0);
+ } else {
+ // Store
+ assert(!Op0->getType()->isVectorTy() && "not implemented");
+ auto *StoredVal =
+ Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+ auto *VStore = Builder.CreateMaskedStore(
+ StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+ VStore->copyMetadata(*I);
+ }
+ }
+
+ // Erase the hoisted instructions in reverse order to avoid use-without-define
+ // errors.
+ std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [&](auto I) {
+ if (!IsLoadFromAlloca(*I))
+ I->eraseFromParent();
+ });
+
+ return true;
+}
+
/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
@@ -7420,31 +7620,42 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
return requestResimplify();
// We have a conditional branch to two blocks that are only reachable
- // from BI. We know that the condbr dominates the two blocks, so see if
- // there is any identical code in the "then" and "else" blocks. If so, we
- // can hoist it up to the branching block.
+ // from BI. We know that the condbr dominates the two blocks, so see
+ //
+ // * if there is any identical code in the "then" and "else" blocks.
+ // * if there is any different load/store in the "then" and "else" blocks.
+ //
+ // If so, we can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && hoistCommonCodeFromSuccessors(
BI->getParent(), !Options.HoistCommonInsts))
return requestResimplify();
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
if (Succ0TI->getNumSuccessors() == 1 &&
- Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+ Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
return requestResimplify();
+ }
}
} else if (BI->getSuccessor(1)->getSinglePredecessor()) {
// If Successor #0 has multiple preds, we may be able to conditionally
// execute Successor #1 if it branches to Successor #0.
Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
if (Succ1TI->getNumSuccessors() == 1 &&
- Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+ Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
return requestResimplify();
+ }
}
// If this is a branch on something for which we know the constant value in
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
new file mode 100644
index 0000000000000..2fd0055cf05f9
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -0,0 +1,460 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+;; The redundant bitcast/insertelement will be optimized out in the instcombine pass.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @basic(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i64 1, ptr %p, align 8, !dbg !8
+ store i16 2, ptr %q, align 8, !dbg !8
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4, !dbg !9
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; simplifycfg is run before sroa. alloca here is not optimized away yet.
+define void @alloca(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @alloca(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[Q_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
+; CHECK-NEXT: store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP2]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %p.addr = alloca ptr
+ %q.addr = alloca ptr
+ %a.addr = alloca i32
+ store ptr %p, ptr %p.addr
+ store ptr %q, ptr %q.addr
+ store i32 %a, ptr %a.addr
+ %0 = load i32, ptr %a.addr
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ %1 = load ptr, ptr %q.addr
+ %2 = load i32, ptr %1
+ %3 = load ptr, ptr %p.addr
+ store i32 %2, ptr %3
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; successor 1 branches to successor 0.
+define void @succ1to0(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @succ1to0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %tobool = icmp ne i32 %a, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ ret void
+
+if.then:
+ %0 = load i32, ptr %q
+ store i32 %0, ptr %p
+ br label %if.end
+}
+
+;; successor 0 branches to successor 1.
+define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @succ0to1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b
+ store i32 %0, ptr %p
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+;; load after store can be hoisted.
+define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_after_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT]], [[TMP4]]
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = select i1 [[COND]], i64 [[ADD]], i64 0
+; CHECK-NEXT: ret i64 [[COMMON_RET_OP]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.end
+
+if.true:
+ store i32 1, ptr %b
+ %0 = load i16, ptr %p
+ %1 = load i64, ptr %q
+ %zext = zext i16 %0 to i64
+ %add = add i64 %zext, %1
+ ret i64 %add
+
+if.end:
+ ret i64 0
+}
+
+define i32 @load_skip_speculatable_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_speculatable_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT: [[READ:%.*]] = call i32 @read_memory_only()
+; CHECK-NEXT: [[PHI:%.*]] = select i1 [[COND]], i32 0, i32 [[READ]]
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %read = call i32 @read_memory_only()
+ %0 = load i32, ptr %q
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b
+ store i32 %1, ptr %p
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [%read, %if.false], [0, %if.true]
+ ret i32 %phi
+}
+
+; i8 is not supported by conditional faulting
+define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_supported_type(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i8 1, ptr [[Q:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[P:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+ent...
[truncated]
How is the performance of the hoisting load/store being benchmarked? I don't know the specific implementation in microarchitecture, but using these conditional instructions to replace a very easy-to-predict branch may negatively contribute to performance. Is there any microarchitecture simulator or real chip implemented so we can benchmark the performance?
How is the performance of the hoisting load/store being benchmarked? I don't know the specific implementation in microarchitecture, but using these conditional instructions to replace a very easy-to-predict branch may negatively contribute to performance. Is there any microarchitecture simulator or real chip implemented so we can benchmark the performance?
I can share code for check predictability
// If the branch is non-unpredictable, and is predicted to *not* branch to
// the `then` block, then avoid speculating it.
if (!BI->getMetadata(LLVMContext::MD_unpredictable)) {
uint64_t TWeight, FWeight;
if (extractBranchWeights(*BI, TWeight, FWeight) &&
(TWeight + FWeight) != 0) {
uint64_t EndWeight = Invert ? TWeight : FWeight;
BranchProbability BIEndProb =
BranchProbability::getBranchProbability(EndWeight, TWeight + FWeight);
BranchProbability Likely = TTI.getPredictableBranchThreshold();
if (BIEndProb >= Likely)
return false;
}
}
in SpeculativelyExecuteBB with the newly added transform.
We have an internal cycle-accurate performance simulator. The real chip is not public yet. You know, even if I have data, I can't make any comments on the performance of future HW.
I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)
Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of print-after-all.
If I add -start-before=simplifycfg to the flags, it will crash. (not related this PR)
bash$ cat 1.c
void f() {}
bash$ clang -O2 1.c -S -emit-llvm
bash$ llc --start-before=simplifycfg <1.ll
Assertion `!NodePtr->isKnownSentinel()' failed
I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)
Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of
print-after-all.
I think a llc -O2 should be ok. I just believe we need a complete test case to demonstrate the final changes from this patch.
If I add
-start-before=simplifycfgto the flags, it will crash. (not related this PR)bash$ cat 1.c void f() {} bash$ clang -O2 1.c -S -emit-llvm bash$ llc --start-before=simplifycfg <1.ll Assertion `!NodePtr->isKnownSentinel()' failed
I guess --start-before can only be used on MIR passes.
I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)
Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of
print-after-all.I think a
llc -O2should be ok. I just believe we need a complete test case to demonstrate the final changes from this patch.If I add
-start-before=simplifycfgto the flags, it will crash. (not related this PR)bash$ cat 1.c void f() {} bash$ clang -O2 1.c -S -emit-llvm bash$ llc --start-before=simplifycfg <1.ll Assertion `!NodePtr->isKnownSentinel()' failedI guess
--start-beforecan only be used on MIR passes.
-O2 does not work. In fact, the default opt level for llc is O2. I can add a test with flags opt -O2 -S to check that the transform is not cancelled during the middle-end opts. In the backend, we already have a test to check that masked.load/store can be selected. Does that sound good?
I think you need at least one backend test case to show this change? (llc also runs SimplifyCFG)
Hmm, how to let llc run simplifycfg? @DianQK I can't find simplifycfg in the log of
print-after-all.I think a
llc -O2should be ok. I just believe we need a complete test case to demonstrate the final changes from this patch.If I add
-start-before=simplifycfgto the flags, it will crash. (not related this PR)bash$ cat 1.c void f() {} bash$ clang -O2 1.c -S -emit-llvm bash$ llc --start-before=simplifycfg <1.ll Assertion `!NodePtr->isKnownSentinel()' failedI guess
--start-beforecan only be used on MIR passes.-O2 does not work. In fact, the default opt level for llc is
O2. I can add a test with flagsopt -O2 -Sto check the transform is not cancelled during the middle-end opts. In backend ,we already have a test to checkmasked.load/storecan be selected. Does it sound good?
Done. Added test masked-load-store-legal.ll
@KanRobert I think this patch is far away from being ready for review. Please make sure that this patch doesn't cause crashes or miscompilation on the following benchmarks:
- [ ] Clang stage2 build
- [ ] SPEC 2006/2017
- [ ] llvm-test-suite
- [x] https://github.com/dtcxzyw/llvm-opt-benchmark
- [x] fuzzers (passed 1M csmith tests)
Crash reproducer:
; bin/opt -passes=simplifycfg -mattr=+cf reduced.ll -S target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" define i64 @vm_exec_core(i1 %0) { br i1 %0, label %2, label %common.ret common.ret: ; preds = %2, %1 ret i64 0 2: ; preds = %1 br label %common.ret }PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: 0. Program arguments: bin/opt -passes=simplifycfg -mattr=+cf reduced.ll 1. Running pass "function(simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>)" on module "reduced.ll" 2. Running pass "simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>" on function "vm_exec_core" #0 0x000076ec91614410 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x214410) #1 0x000076ec9161141f llvm::sys::RunSignalHandlers() (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x21141f) #2 0x000076ec91611575 SignalHandler(int) Signals.cpp:0:0 #3 0x000076ec91042520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520) #4 0x000076ec8c0b7873 (anonymous namespace)::SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(llvm::BranchInst*, llvm::BasicBlock*) SimplifyCFG.cpp:0:0 #5 0x000076ec8c0bcdea llvm::simplifyCFG(llvm::BasicBlock*, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&, llvm::ArrayRef<llvm::WeakVH>) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMTransformUtils.so.19.0git+0x2bcdea) #6 0x000076ec8cba8a43 
iterativelySimplifyCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0 #7 0x000076ec8cba96ba simplifyFunctionCFGImpl(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0 #8 0x000076ec8cbaa4f5 simplifyFunctionCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0 #9 0x000076ec8cbaa6b0 llvm::SimplifyCFGPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMScalarOpts.so.19.0git+0x3aa6b0) #10 0x000076ec8e0c79e6 llvm::detail::PassModel<llvm::Function, llvm::SimplifyCFGPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMPasses.so.19.0git+0xc79e6) #11 0x000076ec8b14ba1b llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34ba1b) #12 0x000076ec906d5096 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd5096) #13 0x000076ec8b14a7e7 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34a7e7) #14 0x000076ec906d59b6 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, 
llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd59b6) #15 0x000076ec8b148d0b llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x348d0b) #16 0x000076ec91a3b6b4 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::PassPlugin>, llvm::ArrayRef<std::function<void (llvm::PassBuilder&)>>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x2d6b4) #17 0x000076ec91a4878a optMain (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x3a78a) #18 0x000076ec91029d90 __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:58:16 #19 0x000076ec91029e40 call_init ./csu/../csu/libc-start.c:128:20 #20 0x000076ec91029e40 __libc_start_main ./csu/../csu/libc-start.c:379:5 #21 0x0000612682610095 _start (bin/opt+0x1095) Segmentation fault (core dumped)
Not able to reproduce with latest PR. Probably fixed by 12ea4a1eb3ee64d4aed10978f9b50cfc4d14007d
@KanRobert I think this patch is far away from being ready for review. Please make sure that this patch doesn't cause crashes or miscompilation on the following benchmarks:
- [ ] Clang stage2 build
- [ ] SPEC 2006/2017
- [ ] llvm-test-suite
- [ ] https://github.com/dtcxzyw/llvm-opt-benchmark
- [ ] fuzzers (e.g., csmith)
Not familiar with other benchmarks. But I will validate SPEC2017 + llvm-test-suite on intel SDE with this PR, and comment the result on this page.
@KanRobert I think this patch is far away from being ready for review. Please make sure that this patch doesn't cause crashes or miscompilation on the following benchmarks:
- [ ] Clang stage2 build
- [ ] SPEC 2006/2017
- [ ] llvm-test-suite
- [ ] https://github.com/dtcxzyw/llvm-opt-benchmark
- [ ] fuzzers (e.g., csmith)
Not familiar with other benchmarks. But I will validate SPEC2017 + llvm-test-suite on intel SDE with this PR, and comment the result on this page.
Great! As we have a similar prototype on RISCV, I can validate this patch with fuzzers :)
Crash reproducer:
; bin/opt -passes=simplifycfg -mattr=+cf reduced.ll -S target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" define i64 @vm_exec_core(i1 %0) { br i1 %0, label %2, label %common.ret common.ret: ; preds = %2, %1 ret i64 0 2: ; preds = %1 br label %common.ret }PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: 0. Program arguments: bin/opt -passes=simplifycfg -mattr=+cf reduced.ll 1. Running pass "function(simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>)" on module "reduced.ll" 2. Running pass "simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>" on function "vm_exec_core" #0 0x000076ec91614410 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x214410) #1 0x000076ec9161141f llvm::sys::RunSignalHandlers() (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.19.0git+0x21141f) #2 0x000076ec91611575 SignalHandler(int) Signals.cpp:0:0 #3 0x000076ec91042520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520) #4 0x000076ec8c0b7873 (anonymous namespace)::SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(llvm::BranchInst*, llvm::BasicBlock*) SimplifyCFG.cpp:0:0 #5 0x000076ec8c0bcdea llvm::simplifyCFG(llvm::BasicBlock*, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&, llvm::ArrayRef<llvm::WeakVH>) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMTransformUtils.so.19.0git+0x2bcdea) #6 0x000076ec8cba8a43 
iterativelySimplifyCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DomTreeUpdater*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0 #7 0x000076ec8cba96ba simplifyFunctionCFGImpl(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0 #8 0x000076ec8cbaa4f5 simplifyFunctionCFG(llvm::Function&, llvm::TargetTransformInfo const&, llvm::DominatorTree*, llvm::SimplifyCFGOptions const&) SimplifyCFGPass.cpp:0:0 #9 0x000076ec8cbaa6b0 llvm::SimplifyCFGPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMScalarOpts.so.19.0git+0x3aa6b0) #10 0x000076ec8e0c79e6 llvm::detail::PassModel<llvm::Function, llvm::SimplifyCFGPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMPasses.so.19.0git+0xc79e6) #11 0x000076ec8b14ba1b llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34ba1b) #12 0x000076ec906d5096 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd5096) #13 0x000076ec8b14a7e7 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x34a7e7) #14 0x000076ec906d59b6 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, 
llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.19.0git+0xd59b6) #15 0x000076ec8b148d0b llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.19.0git+0x348d0b) #16 0x000076ec91a3b6b4 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::PassPlugin>, llvm::ArrayRef<std::function<void (llvm::PassBuilder&)>>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x2d6b4) #17 0x000076ec91a4878a optMain (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.19.0git+0x3a78a) #18 0x000076ec91029d90 __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:58:16 #19 0x000076ec91029e40 call_init ./csu/../csu/libc-start.c:128:20 #20 0x000076ec91029e40 __libc_start_main ./csu/../csu/libc-start.c:379:5 #21 0x0000612682610095 _start (bin/opt+0x1095) Segmentation fault (core dumped)Not able to reproduce with latest PR. Probably fixed by 12ea4a1
Can you check this reproducer again (at the top of 8598bcb9934dca16ea16d87304e00defc85d986c)?
opt -passes=simplifycfg -mattr=+cf reduced.ll -S
@dtcxzyw Rebased, still no error
bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S
; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
define i64 @vm_exec_core(i1 %0) #0 {
common.ret:
ret i64 0
}
attributes #0 = { "target-features"="+cf" }
opt -passes=simplifycfg -mattr=+cf reduced.ll -S
@dtcxzyw Rebased, still no error
bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S ; ModuleID = 'reduced.ll' source_filename = "reduced.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" define i64 @vm_exec_core(i1 %0) #0 { common.ret: ret i64 0 } attributes #0 = { "target-features"="+cf" }
I also cannot reproduce this with clang+asan build. Maybe it was caused by a bug in gcc 11.4.
opt -passes=simplifycfg -mattr=+cf reduced.ll -S
@dtcxzyw Rebased, still no error
bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S ; ModuleID = 'reduced.ll' source_filename = "reduced.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" define i64 @vm_exec_core(i1 %0) #0 { common.ret: ret i64 0 } attributes #0 = { "target-features"="+cf" }I also cannot reproduce this with clang+asan build. Maybe it was caused by a bug in gcc 11.4.
Reproduced with gcc 12.3
cmake llvm-project/llvm -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -G Ninja \
-DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 \
-DLLVM_TARGETS_TO_BUILD="X86;RISCV;AArch64" -DLLVM_PARALLEL_LINK_JOBS=4 -DLLVM_ENABLE_ASSERTIONS=ON
opt -passes=simplifycfg -mattr=+cf reduced.ll -S
@dtcxzyw Rebased, still no error
bash$ opt -passes=simplifycfg -mattr=+cf reduced.ll -S ; ModuleID = 'reduced.ll' source_filename = "reduced.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" define i64 @vm_exec_core(i1 %0) #0 { common.ret: ret i64 0 } attributes #0 = { "target-features"="+cf" }I also cannot reproduce this with clang+asan build. Maybe it was caused by a bug in gcc 11.4.
Reproduced with gcc 12.3
cmake llvm-project/llvm -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -G Ninja \ -DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 \ -DLLVM_TARGETS_TO_BUILD="X86;RISCV;AArch64" -DLLVM_PARALLEL_LINK_JOBS=4 -DLLVM_ENABLE_ASSERTIONS=ON
Reproduced and fixed by https://github.com/llvm/llvm-project/pull/96878/commits/c0545a4289e0c50e49f73e164b02ed5271102f88, probably the known issue of initializer_list https://clang.llvm.org/docs/analyzer/developer-docs/InitializerLists.html
@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via SimplifyCFGOptions?
@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via
SimplifyCFGOptions?
That would also help with the issue that masked load/store intrinsics are not well supported as traditional load/stores throughout the middle end.
@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via
SimplifyCFGOptions?
Do you mean something like `FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));`?
If so, where in the pipeline do you suggest doing this? @dtcxzyw
@KanRobert As there are some potential miscompilations/crashes caused by other passes after SimplifyCFGPass, should we limit this optimization to only run in the late pipeline via
SimplifyCFGOptions? Do you mean something like
`FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));`?
Yeah.
If so, where in the pipeline do you suggest doing this? @dtcxzyw
See https://github.com/dtcxzyw/llvm-project/commit/3f407c4d30eb877653c07f00bdcbe3e04742f762.
I wonder whether it is feasible to allow masked.load/masked.store to be used with scalars. The need to cast between <1 x %T> and %T everywhere is quite awkward and doesn't represent the underlying instructions well.
I wonder whether it is feasible to allow masked.load/masked.store to be used with scalars. The need to cast between
`<1 x %T>` and `%T` everywhere is quite awkward and doesn't represent the underlying instructions well.
+1
I wonder whether it is feasible to allow masked.load/masked.store to be used with scalars. The need to cast between
`<1 x %T>` and `%T` everywhere is quite awkward and doesn't represent the underlying instructions well.
In theory, we should let hoistLoadStoreWithCondFaultingFromSuccessors, hoistCommonCodeFromSuccessors, and SpeculativelyExecuteBB appear in one pass, because the former can bring more optimization opportunities to the latter two. This is the main reason why I want to do it in SimplifyCFG, not after CodeGen. SimplifyCFGOptions provides an opportunity to run it in a late pipeline so that some mid-end optimizations don't have to handle it well. Possibly, we have several solutions:
- run these three transforms of `SimplifyCFG` right before `CodeGenPrepare` if the target supports conditional faulting, to avoid the awkwardness?
- extend LLVM IR to support scalar versioned `masked.load`/`masked.store`.
- enhance the support for masked loads/stores for `<1 x %T>` throughout the middle-end passes after `SimplifyCFG`
Or any better idea? @nikic @dtcxzyw
@KanRobert Can you file a separate PR to support conditional load/store of ptrs with +cf? Then I can share another miscompilation reproducer :)
@e = dso_local global ptr null, align 8
@f = dso_local global ptr @e, align 8
@g = dso_local global ptr @f, align 8
@h = dso_local global ptr @g, align 8
@d = dso_local global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
@c = dso_local global i64 0, align 8
; Function Attrs: nounwind uwtable
define dso_local void @i() #0 {
entry:
%p = alloca i32, align 4
call void @llvm.lifetime.start.p0(i64 4, ptr %p) #3
store i8 0, ptr @d, align 1
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i8, ptr @d, align 1
%conv = sext i8 %0 to i32
%cmp = icmp slt i32 %conv, 2
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load ptr, ptr @h, align 8
%2 = load ptr, ptr %1, align 8
%3 = load ptr, ptr %2, align 8
%4 = load ptr, ptr %3, align 8
%5 = load ptr, ptr @f, align 8
store ptr %4, ptr %5, align 8
%6 = load ptr, ptr @h, align 8
%7 = load ptr, ptr %6, align 8
%8 = load ptr, ptr %7, align 8
store ptr %4, ptr %8, align 8
br label %for.inc
for.inc: ; preds = %for.body
%9 = load i8, ptr @d, align 1
%inc = add i8 %9, 1
store i8 %inc, ptr @d, align 1
br label %for.cond
for.end: ; preds = %for.cond
store i32 0, ptr %p, align 4
br label %for.cond2
for.cond2: ; preds = %for.body3, %for.end
%10 = load i32, ptr %p, align 4
%tobool = icmp ne i32 %10, 0
br i1 %tobool, label %for.body3, label %for.end4
for.body3: ; preds = %for.cond2
br label %for.cond2
for.end4: ; preds = %for.cond2
call void @llvm.lifetime.end.p0(i64 4, ptr %p)
ret void
}
; Function Attrs: nounwind uwtable
define dso_local signext i32 @main() #0 {
entry:
%retval = alloca i32, align 4
store i32 0, ptr %retval, align 4
call void @i()
%0 = load i64, ptr @c, align 8
%conv = trunc i64 %0 to i32
%call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv)
ret i32 0
}
declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...)
@KanRobert Can you file a separate PR to support conditional load/store of ptrs with +cf? Then I can share another miscompilation reproducer :)
@e = dso_local global ptr null, align 8 @f = dso_local global ptr @e, align 8 @g = dso_local global ptr @f, align 8 @h = dso_local global ptr @g, align 8 @d = dso_local global i8 0, align 1 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 @c = dso_local global i64 0, align 8 ; Function Attrs: nounwind uwtable define dso_local void @i() #0 { entry: %p = alloca i32, align 4 call void @llvm.lifetime.start.p0(i64 4, ptr %p) #3 store i8 0, ptr @d, align 1 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i8, ptr @d, align 1 %conv = sext i8 %0 to i32 %cmp = icmp slt i32 %conv, 2 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %1 = load ptr, ptr @h, align 8 %2 = load ptr, ptr %1, align 8 %3 = load ptr, ptr %2, align 8 %4 = load ptr, ptr %3, align 8 %5 = load ptr, ptr @f, align 8 store ptr %4, ptr %5, align 8 %6 = load ptr, ptr @h, align 8 %7 = load ptr, ptr %6, align 8 %8 = load ptr, ptr %7, align 8 store ptr %4, ptr %8, align 8 br label %for.inc for.inc: ; preds = %for.body %9 = load i8, ptr @d, align 1 %inc = add i8 %9, 1 store i8 %inc, ptr @d, align 1 br label %for.cond for.end: ; preds = %for.cond store i32 0, ptr %p, align 4 br label %for.cond2 for.cond2: ; preds = %for.body3, %for.end %10 = load i32, ptr %p, align 4 %tobool = icmp ne i32 %10, 0 br i1 %tobool, label %for.body3, label %for.end4 for.body3: ; preds = %for.cond2 br label %for.cond2 for.end4: ; preds = %for.cond2 call void @llvm.lifetime.end.p0(i64 4, ptr %p) ret void } ; Function Attrs: nounwind uwtable define dso_local signext i32 @main() #0 { entry: %retval = alloca i32, align 4 store i32 0, ptr %retval, align 4 call void @i() %0 = load i64, ptr @c, align 8 %conv = trunc i64 %0 to i32 %call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv) ret i32 0 } declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...)
So you're working on exposing the potential issues for mask.load/store in other middle-end passes? And prefer the 3rd solution?
So you're working on exposing the potential issues for mask.load/store in other middle-end passes?
No. It seems a bug in SimplifyCFG.
And prefer the 3rd solution?
I prefer the second one.
So you're working on exposing the potential issues for mask.load/store in other middle-end passes?
No. It seems a bug in SimplifyCFG.
And prefer the 3rd solution?
I prefer the second one.
Is it a bug in this transform or in other transforms?
So you're working on exposing the potential issues for mask.load/store in other middle-end passes?
No. It seems a bug in SimplifyCFG.
And prefer the 3rd solution?
I prefer the second one.
Is it a bug in this transform or in other transforms?
After simplifycfg, the program exits with SIGSEGV. I cannot tell whether it is a bug in this patch because +cf doesn't support loads/stores of pointers :)
*** IR Dump After InferFunctionAttrsPass on [module] ***
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
@e = dso_local global ptr null, align 8
@f = dso_local global ptr @e, align 8
@g = dso_local global ptr @f, align 8
@h = dso_local global ptr @g, align 8
@d = dso_local global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
@c = dso_local global i64 0, align 8
; Function Attrs: nounwind uwtable
define dso_local void @i() #0 {
entry:
%p = alloca i32, align 4
call void @llvm.lifetime.start.p0(i64 4, ptr %p) #3
store i8 0, ptr @d, align 1, !tbaa !9
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i8, ptr @d, align 1, !tbaa !9
%conv = sext i8 %0 to i32
%cmp = icmp slt i32 %conv, 2
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load ptr, ptr @h, align 8, !tbaa !12
%2 = load ptr, ptr %1, align 8, !tbaa !12
%3 = load ptr, ptr %2, align 8, !tbaa !12
%4 = load ptr, ptr %3, align 8, !tbaa !12
%5 = load ptr, ptr @f, align 8, !tbaa !12
store ptr %4, ptr %5, align 8, !tbaa !12
%6 = load ptr, ptr @h, align 8, !tbaa !12
%7 = load ptr, ptr %6, align 8, !tbaa !12
%8 = load ptr, ptr %7, align 8, !tbaa !12
store ptr %4, ptr %8, align 8, !tbaa !12
br label %for.inc
for.inc: ; preds = %for.body
%9 = load i8, ptr @d, align 1, !tbaa !9
%inc = add i8 %9, 1
store i8 %inc, ptr @d, align 1, !tbaa !9
br label %for.cond, !llvm.loop !14
for.end: ; preds = %for.cond
store i32 0, ptr %p, align 4, !tbaa !16
br label %for.cond2
for.cond2: ; preds = %for.body3, %for.end
%10 = load i32, ptr %p, align 4, !tbaa !16
%tobool = icmp ne i32 %10, 0
br i1 %tobool, label %for.body3, label %for.end4
for.body3: ; preds = %for.cond2
br label %for.cond2, !llvm.loop !18
for.end4: ; preds = %for.cond2
call void @llvm.lifetime.end.p0(i64 4, ptr %p) #3
ret void
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: nounwind uwtable
define dso_local signext i32 @main() #0 {
entry:
%retval = alloca i32, align 4
store i32 0, ptr %retval, align 4
call void @i()
%0 = load i64, ptr @c, align 8, !tbaa !19
%conv = trunc i64 %0 to i32
%call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv)
ret i32 0
}
; Function Attrs: nofree nounwind
declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...) #2
attributes #0 = { nounwind uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nofree nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicldst1p0_zicond1p0_zicsr2p0_zifencei2p0_zmmul1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 8}
!8 = !{!"clang version 19.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C/C++ TBAA"}
!12 = !{!13, !13, i64 0}
!13 = !{!"any pointer", !10, i64 0}
!14 = distinct !{!14, !15}
!15 = !{!"llvm.loop.mustprogress"}
!16 = !{!17, !17, i64 0}
!17 = !{!"int", !10, i64 0}
!18 = distinct !{!18, !15}
!19 = !{!20, !20, i64 0}
!20 = !{!"long", !10, i64 0}
*** IR Dump After CoroEarlyPass on [module] omitted because no change ***
*** IR Dump After EntryExitInstrumenterPass on i omitted because no change ***
BISECT: running pass (5) LowerExpectIntrinsicPass on i
*** IR Dump After LowerExpectIntrinsicPass on i omitted because no change ***
BISECT: running pass (6) SimplifyCFGPass on i
*** IR Dump After SimplifyCFGPass on i ***
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
@e = dso_local global ptr null, align 8
@f = dso_local global ptr @e, align 8
@g = dso_local global ptr @f, align 8
@h = dso_local global ptr @g, align 8
@d = dso_local global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
@c = dso_local global i64 0, align 8
; Function Attrs: nounwind uwtable
define dso_local void @i() #0 {
entry:
%p = alloca i32, align 4
call void @llvm.lifetime.start.p0(i64 4, ptr %p) #5
store i8 0, ptr @d, align 1, !tbaa !9
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i8, ptr @d, align 1, !tbaa !9
%conv = sext i8 %0 to i32
%cmp = icmp slt i32 %conv, 2
%1 = bitcast i1 %cmp to <1 x i1>
%2 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr @h, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%3 = bitcast <1 x ptr> %2 to ptr
%4 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %3, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%5 = bitcast <1 x ptr> %4 to ptr
%6 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %5, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%7 = bitcast <1 x ptr> %6 to ptr
%8 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %7, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%9 = bitcast <1 x ptr> %8 to ptr
%10 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr @f, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%11 = bitcast <1 x ptr> %10 to ptr
%12 = bitcast ptr %9 to <1 x ptr>
call void @llvm.masked.store.v1p0.p0(<1 x ptr> %12, ptr %11, i32 8, <1 x i1> %1), !tbaa !12
%13 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr @h, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%14 = bitcast <1 x ptr> %13 to ptr
%15 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %14, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%16 = bitcast <1 x ptr> %15 to ptr
%17 = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %16, i32 8, <1 x i1> %1, <1 x ptr> poison), !tbaa !12
%18 = bitcast <1 x ptr> %17 to ptr
%19 = bitcast ptr %9 to <1 x ptr>
call void @llvm.masked.store.v1p0.p0(<1 x ptr> %19, ptr %18, i32 8, <1 x i1> %1), !tbaa !12
%20 = xor i1 %cmp, true
%21 = bitcast i1 %20 to <1 x i1>
call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr %p, i32 4, <1 x i1> %21), !tbaa !14
br i1 %cmp, label %for.inc, label %for.cond2
for.inc: ; preds = %for.cond
%22 = load i8, ptr @d, align 1, !tbaa !9
%inc = add i8 %22, 1
store i8 %inc, ptr @d, align 1, !tbaa !9
br label %for.cond, !llvm.loop !16
for.cond2: ; preds = %for.cond2, %for.cond
%23 = load i32, ptr %p, align 4, !tbaa !14
%tobool = icmp ne i32 %23, 0
br i1 %tobool, label %for.cond2, label %for.end4, !llvm.loop !18
for.end4: ; preds = %for.cond2
call void @llvm.lifetime.end.p0(i64 4, ptr %p) #5
ret void
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: nounwind uwtable
define dso_local signext i32 @main() #0 {
entry:
%retval = alloca i32, align 4
store i32 0, ptr %retval, align 4
call void @i()
%0 = load i64, ptr @c, align 8, !tbaa !19
%conv = trunc i64 %0 to i32
%call = call signext i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef signext %conv)
ret i32 0
}
; Function Attrs: nofree nounwind
declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...) #2
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <1 x ptr> @llvm.masked.load.v1p0.p0(ptr nocapture, i32 immarg, <1 x i1>, <1 x ptr>) #3
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v1p0.p0(<1 x ptr>, ptr nocapture, i32 immarg, <1 x i1>) #4
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr nocapture, i32 immarg, <1 x i1>) #4
attributes #0 = { nounwind uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nofree nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+zicldst,+zicond,+zicsr,+zifencei,+zmmul,-b,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zalasr,-experimental-zfbfmin,-experimental-zicfilp,-experimental-zicfiss,-experimental-ztso,-experimental-zvfbfmin,-experimental-zvfbfwma,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b" }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
attributes #5 = { nounwind }
!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicldst1p0_zicond1p0_zicsr2p0_zifencei2p0_zmmul1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 8}
!8 = !{!"clang version 19.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C/C++ TBAA"}
!12 = !{!13, !13, i64 0}
!13 = !{!"any pointer", !10, i64 0}
!14 = !{!15, !15, i64 0}
!15 = !{!"int", !10, i64 0}
!16 = distinct !{!16, !17}
!17 = !{!"llvm.loop.mustprogress"}
!18 = distinct !{!18, !17}
!19 = !{!20, !20, i64 0}
!20 = !{!"long", !10, i64 0}
- run these three transforms of SimplifyCFG right before CodeGenPrepare if the target supports conditional faulting, to avoid the awkwardness?
- extend LLVM IR to support scalar versioned masked.load/store.
- enhance the support for masked loads/stores for <1 x %T> throughout the middle-end passes after SimplifyCFG
dtcxzyw: I prefer the second one.
@nikic @DianQK Vote?
Can you file a separate PR to support conditional load/store of ptrs with +cf?
@dtcxzyw In progress.