#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif

using namespace llvm;
using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115 "Controls which SLP graphs should be vectorized.");
119 cl::desc(
"Run the SLP vectorization passes"));
123 cl::desc(
"Enable vectorization for wider vector utilization"));
127 cl::desc(
"Only vectorize if you gain more than this "
132 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
137 cl::desc(
"Attempt to vectorize horizontal reductions"));
142 "Attempt to vectorize horizontal reductions feeding into a store"));
146 cl::desc(
"Attempt to vectorize for this register size in bits"));
150 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
158 cl::desc(
"Limit the size of the SLP scheduling region per block"));
162 cl::desc(
"Attempt to vectorize for this register size in bits"));
166 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
170 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
176 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
185 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
189 cl::desc(
"The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
194 cl::desc(
"The maximum stride, considered to be profitable."));
198 cl::desc(
"Display the SLP trees with Graphviz"));
202 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}
/// \returns the type of the value \p V, looking through stores, compares and
/// insertelement instructions.
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();

/// \returns the number of elements for Ty.
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);

/// Returns the number of elements of the given type \p Ty, not greater than
/// \p Sz, which forms a type that splits into whole vector types during
/// legalization.
  if (NumParts == 0 || NumParts >= Sz)
    return Sz;
  return (Sz / RegVF) * RegVF;
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
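// Worked example (hypothetical values): with VecTyNumElements == 2, the
// scalar-level mask [1, 0, PoisonMaskElem] expands to the vector-level mask
// [2, 3, 0, 1, Poison, Poison]: scalar element I maps to the contiguous pair
// {Mask[I] * 2, Mask[I] * 2 + 1}, which shufflevector can consume directly.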
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
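// Worked example (hypothetical masks): concatenating two shufflevectors with
// masks [0, 1] and [1, 0] over <2 x ...> sources yields [0, 1, 3, 2]; the
// second mask is offset by AccumulateLength == 2 before being appended.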
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// \returns the number of elements in the \p Part-th chunk when \p Size
/// elements are split into chunks of \p PartNumElems.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;
  int Index = Offset;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
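// Worked example (hypothetical aggregate): in {[2 x i32], [2 x i32]}, the
// insertvalue index path {1, 0} flattens to Index = (0 * 2 + 1) * 2 + 0 = 2,
// i.e. the third scalar slot of the linearized aggregate.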
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
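// Illustrative example: for V = <poison, 42, undef, poison> and an empty use
// mask, isUndefVector(V) reports elements {0, 2, 3} as undef (undef subsumes
// poison), while isUndefVector</*IsPoisonOnly=*/true>(V) reports only {0, 3}.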
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// extracts from (possibly two) fixed vectors, and returns the shuffle kind.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most two different vector
    // operands in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector,
  // otherwise we have a permutation of two vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the index of an extractelement or extractvalue instruction.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  /// Checks if the current state is valid, i.e. has a non-null MainOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};
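// Illustrative example (hypothetical bundle): for {add, sub, add, sub} the
// state has MainOp = the first add, AltOp = the first sub, and isAltShuffle()
// is true. Codegen for such a node emits one vector add, one vector sub, and
// a shufflevector blending lanes {0, 5, 2, 7} of the two results.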
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and same
/// predicate as \p BaseCI, "as is" or with its operands and predicate swapped.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the opcode (and the alternate opcode, if any) that the
/// whole list could be vectorized with.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if consider swapped predicates
    // compatible only 2, consider swappable predicates as compatible opcodes,
    // not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (Value *V : iterator_range(It, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    // TODO: do some smart analysis of the CallInsts to exclude divide-like
    // intrinsics/functions only.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltOp = I;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = MainOp->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  return InstructionsState(MainOp, AltOp);
}
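// Illustrative results: getSameOpcode({add, sub, add, sub}, TLI) returns a
// valid alternate-opcode state (MainOp add, AltOp sub), whereas a bundle
// mixing poison lanes with udiv is rejected above, because speculating a
// division for a poison lane could introduce undefined behavior.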
/// \returns true if the \p UserInst will use the scalar \p Scalar and not the
/// vectorized value.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI,
                                    const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
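// Worked example (hypothetical masks): composing Mask = [3, 2, 1, 0] with
// SubMask = [1, 3, PoisonMaskElem, 0] yields NewMask[I] = Mask[SubMask[I]],
// i.e. [2, 0, Poison, 3].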
/// Order may have elements assigned special value (size) which is out of
/// bounds. Such indices only appear on places which correspond to undef
/// values (see canReuseExtract for details) and used in order to avoid undef
/// values have effect on operands ordering. This procedure assigns real
/// indices to such "masked" elements.
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

/// \returns a bitset for selecting opcodes: false for Opcode0 lanes and true
/// for Opcode1 lanes.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
                                      unsigned Opcode1) {
  Type *ScalarTy = VL[0]->getType();
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  return OpcodeMask;
}

/// Computes the inverse permutation of the given \p Indices.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
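// Worked example: Indices = [2, 0, 1] inverts to Mask = [1, 2, 0], since the
// loop stores Mask[Indices[I]] = I for each position I.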
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}

/// Checks if the provided value does not require scheduling: it is not an
/// instruction, or all of its operands are non-instructions, phi nodes, or
/// instructions from a different block.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling because all of
/// its users are phi nodes or are placed in different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified array of instructions does not require scheduling.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if widened type of \p Ty elements with size \p Sz represents
/// full vector type, i.e. adding extra elements would result in extra parts
/// upon type legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (!isValidElementType(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}

/// Returns the number of parts the vector type \p VecTy will be split into at
/// the codegen phase; returns 1 if the type is scalarized or does not use
/// whole registers.
static unsigned
getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
                 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
    return 1;
  unsigned Sz = getNumElements(VecTy);
  if (NumParts >= Sz || Sz % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
    return 1;
  return NumParts;
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;

public:
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {

  /// Checks whether the root node is used inside the graph by other nodes.
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the type/is-signed info for the root node in the graph without
  /// casting.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(IntegerType::get(F->getContext(),
                                             It->second.first),
                            It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns reduction type after minbitwidth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }
  /// Checks if the specified order is an identity order (possibly with
  /// "masked" elements equal to the order size).
  static bool isIdentityOrder(ArrayRef<unsigned> Order) {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }

  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
                         MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;

  template <typename T>

    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }

    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
      // A broadcast of a load can be cheaper on some targets.
      if (isa<LoadInst>(V1)) {
        auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
          // Check if V1 and V2 are used only by the two provided users (or
          // are vectorized already).
          auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
              return U == U1 || U == U2 || R.isVectorized(U);
            });
          };
          return AllUsersVectorized(V1) && AllUsersVectorized(V2);
        };
        if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                        ElementCount::getFixed(NumLanes)) &&
            ((int)V1->getNumUses() == NumLanes ||
             AllUsersAreInternal(V1, V2)))
          return LookAheadHeuristics::ScoreSplatLoads;
      }
      auto CheckSameEntryOrFail = [&]() {
        if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
          SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
          if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
              !TEs2.empty() &&
              any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable to extract.
        if (isa<UndefValue>(V2))
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2, m_ExtractElt(m_Value(EV2), m_ConstantInt(Ex2Idx))) &&
            EV2 == EV1) {
          int Idx1 = Ex1Idx->getZExtValue();
          int Idx2 = Ex2Idx->getZExtValue();
          int Dist = Idx2 - Idx1;
          // The distance is too large - still may be profitable to use
          // shuffles.
          if (std::abs(Dist) == 0)
            return LookAheadHeuristics::ScoreSplat;
          if (std::abs(Dist) > NumLanes / 2)
            return LookAheadHeuristics::ScoreSameOpcode;
          return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                            : LookAheadHeuristics::ScoreReversedExtracts;
        }
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return isa<PoisonValue>(V) ||
                     cast<Instruction>(V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(V2))
        return LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cummulative score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel, or if V1 and V2 are not instructions, or if they
      // are identical, or if scoring failed, or if it is already profitable to
      // vectorize loads or extractelements, early return the current cost.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
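// Illustrative recursion (hypothetical IR): when scoring (load a[0] + x)
// against (load a[1] + y) at level 1, the operand pair a[0]/a[1] contributes
// ScoreConsecutiveLoads at level 2, so this pairing outscores an alternative
// whose operands share no structure.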
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -). Therefore, we can safely use a boolean value for
      /// the APO: it is set to 'true' if 'V' is attached to an inverse
      /// operation.
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we are trying to select the operand at lane
    /// that matches best with the operand at the neighboring lane.
    enum class ReorderingMode {
      Load,     ///< Matching loads, or consecutive loads
      Opcode,   ///< Matching opcode
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    unsigned ArgSize = 0;

    const Loop *L = nullptr;

    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swaps the operand at \p OpIdx1 with that at \p OpIdx2 for \p Lane.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }

    /// \returns a score that favors broadcast-friendly splat candidates.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \returns a bonus score if all users of the candidate are vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices): they are extracts
      // themselves and already externally used.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed,
                          const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                       /*U2=*/nullptr, /*CurrLevel=*/1,
                                       MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to distinguish between different operands and similar
          // operands with all-vectorized vs. not-all-vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
        }
      }
      return Score;
    }
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score. Sometimes we have more than one
      // option (e.g., Opcode and Undefs), so we use the score to differentiate.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the best option is unused by any other operand.
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
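// Illustrative walk-through: with four lanes and ReorderingMode::Opcode for
// operand 0, the lane chosen first keeps its order; for each subsequent lane
// getBestOperand() picks whichever of that lane's operands maximizes the
// look-ahead score against the operand already selected in the previous lane.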
    /// Select the best lane to start reordering from, i.e. the lane whose
    /// operands can move the least.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> implements a simple voting algorithm:
      // the first unsigned is a counter for voting, the second is the lane.
      SmallDenseMap<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
      // Try to be closer to the original results, if there are multiple lanes
      // with the same cost: prefer the one with the highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }

    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APO, so far.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same opcode and parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash of the operand values.
      unsigned Hash = 0;
    };

    /// \returns how many operands of \p Lane can be freely reordered, plus a
    /// hash used for tie-breaking.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting to find the majority opcode and the
        // number of operands with that opcode.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }

    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      // IntrinsicInst::isCommutative returns true if swapping the first "two"
      // arguments to the intrinsic produces the same result.
      constexpr unsigned IntrinsicNumOperands = 2;
      Instruction *MainOp = S.getMainOp();
      unsigned NumOperands = MainOp->getNumOperands();
      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");
          if (isa<PoisonValue>(VL[Lane])) {
            if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
              if (OpIdx == 0) {
                OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
                continue;
              }
            } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
              if (OpIdx == 0) {
                OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
                continue;
              }
            }
            OpsVec[OpIdx][Lane] = {
                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
                false};
            continue;
          }
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// \returns true if the operand \p Op should be broadcast rather than
    /// matched lane-by-lane.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. The same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane): a simple permutation of constant and broadcast.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                // 2.1. If we have only 2 lanes, need to check that value in
                // the next lane does not build the same opcode sequence.
                (Lns == 2 &&
                 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
                 isa<Constant>(Data.V)))) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also loop invariant
              // (though not a constant): the whole vector can be hoisted out.
              (IsInvariant && !isa<Constant>(Data.V) &&
               !getSameOpcode({Op, Data.V}, TLI) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }

    /// Checks if there is at least one compatible operand in lanes other than
    /// \p Lane, compatible with the operand \p Op.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return false;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI) &&
                      allSameBlock({Op, OpILn}));
            }))
          return true;
      }
      return false;
    }

  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
               const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL, S);
    }

    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We use this mode to help select the
      // instructions for each lane, so that they match best with the ones for
      // the other lanes.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm: we go over each lane once and
      // decide on the best order right away with no back-tracking. To increase
      // its effectiveness, we start with the lane whose operands can move the
      // least in the vector.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        } else {
          llvm_unreachable("Unexpected value kind.");
        }
      }

      // Check that we don't have the same operands: no need to reorder if
      // operands are just a perfect (or shuffled) diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        return UniqueValues.size() != 2 &&
               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
                                        UniqueValues.size());
      };

      if (SkipReordering())
        return;

      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. We visit the lanes in a circular fashion, using
      // FirstLane as the center point and increasing the radius.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches the one in the last lane.
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value.
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Check if the operand data has changed; if not, we are done. A failed
      // strategy triggers a second reordering pass (elided in this listing).
      if (!StrategyFailed)
        return;
    }
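// Worked example (hypothetical bundle): for {a0 + b0, b1 + a1}, reorder()
// swaps the second lane's operands so that operand 0 becomes {a0, a1} and
// operand 1 becomes {b0, b1}, producing two clean vector operands instead of
// requiring extra shuffles at codegen.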
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };

  /// Evaluate each pair in \p Candidates and return the index of the pair
  /// with the highest score, deemed to have the best chance to form the root
  /// of a profitable tree to vectorize. Returns std::nullopt if no candidate
  /// scored above \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
                                               Candidates[I].second,
                                               /*U1=*/nullptr, /*U2=*/nullptr,
                                               /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }

  /// Removes an instruction from its block and eventually deletes it. The
  /// actual deletion is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }

  /// Remove instructions from the parent function, clear the operands of
  /// \p DeadVals instructions, and mark trivially dead operands for deletion.
  template <typename T>
  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() ||
              all_of(I->uses(),
                     [&](Use &U) {
                       return isDeleted(cast<Instruction>(U.getUser()));
                     })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
      SE->forgetValue(VI);
    }
  }

  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register given instruction as already analyzed for being a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if the value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
  /// Checks if the specified value was vectorized.
  bool isVectorized(Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  }
  /// Checks if the given tree entry can be demoted to a smaller bitwidth.
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
      const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  /// Check if the operands on the edges of the \p UserTE allow reordering
  /// (i.e. the operands can be reordered because they have only one user and
  /// are reorderable).
  bool canReorderOperands(TreeEntry *UserTE,
                          SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
                          ArrayRef<TreeEntry *> ReorderableGathers,
                          SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Reorders the node with reuses according to the given mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Checks if the specified operand of the user tree entry is vectorized and
  /// returns the corresponding tree entry, if any.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      if (!isa<Instruction>(V))
        return false;
      for (TreeEntry *E : getTreeEntries(V)) {
        if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
          TE = E;
          return true;
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(
      Instruction *I,
      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;

  /// Return information about the vector formed for the specified index of a
  /// vector of (the same) instructions.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  /// Gets the root instruction for the given node. If the node is a strided
  /// load/store node with the reverse order, the root instruction is the last
  /// one.
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  /// \returns Cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// Builds the subgraph rooted at the given values.
                     const EdgeInfo &EI, unsigned InterleaveFactor = 0);

  /// Checks if the gathered \p VL can be represented as a single register
  /// shuffle(s) of previous tree entries.
                        bool ResizeAllowed = false) const;

  /// Checks if the specified operand of the user tree entry is vectorized,
  /// matching the scalars exactly.
  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
                                               unsigned NodeIdx) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
  }

  /// Vectorize a single entry in the tree.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  /// Create a new vector from a list of scalar values.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  /// Create a new vector from a list of scalar values.
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
                           bool PostponedPHIs);

  /// \returns the instruction in the bundle, which can be used as a base point
  /// for scheduling.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Tries to find extractelement instructions with constant indices from
  /// fixed vector type and gather such instructions into a bunch.
  std::optional<TargetTransformInfo::ShuffleKind>
  tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
                                           SmallVectorImpl<int> &Mask) const;

  /// Same as above, but per register: \p NumParts partitions the input.
                             unsigned NumParts) const;

  /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
  /// tree entries (single register).
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Same as above, but for the whole node.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
      unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Set the Builder insert point to one after the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether the VectorizableTree is fully vectorizable and will be
  /// beneficial even the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Run through the list of all gathered loads in the graph and try to find
  /// vector loads/masked gathers instead of regular gathers.
  void tryToVectorizeGatheredLoads(
      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                           8> &GatheredLoads);

  /// Collects stores which use scalars from the given tree entry.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Finds reorder indices derived from external stores using the node's
  /// scalars.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  /// Tries to reorder the gathering node for better vectorization
  /// opportunities.
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching with duplicated values.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && !UserTreeIndices.empty() &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those, used several times in
    /// the entry and counted in the \a ReuseShuffleIndices.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }

    /// List of combined opcodes supported by the vectorizer.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// A reference to the container (the whole vectorizable tree).
    VecTreeTy &Container;

    /// The main/alternate instruction state.
    InstructionsState S = InstructionsState::invalid();

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

  public:
    /// Returns interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(Scalars.size() >= OpVL.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// Set this bundle's operands from \p Scalars.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, S, R);
      if (RequireReorder)
        Ops.reorder();
      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the \p OpIdx operand of this TreeEntry.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main opcode, the key is \p Op; otherwise the
    /// key is the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    /// Returns true if the current state of the tree entry is valid.
    bool hasState() const { return S.valid(); }
    /// When ReuseShuffleIndices is empty it just returns the position of \p V
    /// within the vector of Scalars. Otherwise, try to remap on its reuse
    /// index.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }

    /// Checks if the current node has a non-power-of-2 number of scalars.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    /// Checks if the current node does not form a whole-register vector and
    /// has a non-power-of-2 number of scalars.
    bool hasNonWholeRegisterOrNonPowerOf2Vec(
        const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      }
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };

#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    return E;
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original
    // one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-2 vectors.
    assert(
        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
         ReuseShuffleIndices.empty()) &&
        "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (!Last->isGather()) {
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        auto It = ScalarToTreeEntries.find(V);
        assert(
            (It == ScalarToTreeEntries.end() ||
             (It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
             doesNotNeedToBeScheduled(V)) &&
            "Scalar already in tree!");
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      // Update the scheduler bundle to point to this TreeEntry.
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
              isVectorLikeInstWithConstOps(S.getMainOp()) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (BundleMember) {
        for (Value *V : VL) {
          if (doesNotNeedToBeScheduled(V))
            continue;
          if (!BundleMember)
            continue;
          BundleMember->TE = Last;
          BundleMember = BundleMember->NextInBundle;
        }
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  /// Get the list of vector entries associated with the value \p V.
  ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
      return {};
    return It->getSecond();
  }

  /// Returns the first vector node for value \p V, matching values \p VL.
  TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
                                    bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
        return TE;
    return nullptr;
  }

  /// Checks if the specified list of the instructions/values can be vectorized
  /// with alternate (non-uniform) opcodes profitably.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState
  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
                               bool IsScatterVectorizeUserTE,
                               OrdersType &CurrentOrder,
                               SmallVectorImpl<Value *> &PointerOps);

  /// A map of gathered scalar values to the nodes where those scalars are
  /// used.
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// true if graph nodes transforming mode is on.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizableTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {

  /// Checks if two instructions may access the same memory, caching results.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
      return It->second;
    bool Aliased = isAliased(Loc1, Inst1, Inst2);
    // Store the result in the cache for both orderings.
    AliasCache.try_emplace(Key, Aliased);
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// A list of all external uses of vectorized scalars.
  UserList ExternalUses;
  /// Contains all scheduling relevant data for an instruction.
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Instruction *I) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
      TE = nullptr;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() && "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Gets the number of unscheduled dependencies in the bundle.
    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on.
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies: the number of users of the instruction
    /// plus the number of dependent memory instructions (if any). This value
    /// is calculated on demand; InvalidDeps means "not calculated yet".
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the
    /// instruction/bundle gets ready for scheduling.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
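// Illustrative dependency accounting (hypothetical bundle): UnscheduledDeps
// counts the not-yet-scheduled instructions that depend on a member. For a
// bundle [a; b] with two such dependents on 'a' and one on 'b',
// unscheduledDepsInBundle() returns 3; schedule() decrements the counters as
// dependents are scheduled, and the bundle enters the ready list at 0.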
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(), so we need to get its operands through
        // the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be
          // reordered.
          auto *In = BundleMember->Inst;
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, In));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is built recursively, this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. Known exceptions are extracts, whose second (immediate)
          // operand is not added; immediates do not affect the scheduler.
          assert(In &&
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");

          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }

    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        SD->verify();
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
4281 std::optional<ScheduleData *>
4283 const InstructionsState &S);
4289 ScheduleData *allocateScheduleDataChunks();
4293 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
4298 ScheduleData *PrevLoadStore,
4299 ScheduleData *NextLoadStore);
4303 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
4307 void resetSchedule();
4337 ScheduleData *FirstLoadStoreInRegion =
nullptr;
4341 ScheduleData *LastLoadStoreInRegion =
nullptr;
4346 bool RegionHasStackSave =
false;
4349 int ScheduleRegionSize = 0;
4358 int SchedulingRegionID = 1;
4366 void scheduleBlock(BlockScheduling *BS);
4373 struct OrdersTypeDenseMapInfo {
4386 static unsigned getHashValue(
const OrdersType &V) {
4407 unsigned MaxVecRegSize;
4408 unsigned MinVecRegSize;
4423 unsigned ReductionBitWidth = 0;
4426 unsigned BaseGraphSize = 1;
4430 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4449 struct ChildIteratorType
4451 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4462 return R.VectorizableTree[0].get();
4466 return {
N->UserTreeIndices.begin(),
N->Container};
4470 return {
N->UserTreeIndices.end(),
N->Container};
4475 class nodes_iterator {
4486 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
4490 return nodes_iterator(R->VectorizableTree.begin());
4494 return nodes_iterator(R->VectorizableTree.end());
4497 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
};
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back to be able to erase them from
      // parent and memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
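// Note on the deletion scheme above: erasing scalars eagerly during
// vectorization would invalidate other tree entries that may still reference
// them, so deletion is deferred to the destructor. dropAllReferences() first
// breaks any remaining use cycles among the dead instructions so that
// eraseFromParent() never observes a live user.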
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz ||
                 Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
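// Worked example for the bottom-order branch: with PrevOrder = {0, 1, 2, 3}
// and Mask = {3, 0, 1, 2}, the loop yields Order = {3, 0, 1, 2}, i.e. slot I
// of the new order is whatever PrevOrder held at position Mask[I]. An order
// that ends up as the identity (ignoring "unset" entries equal to Sz) is
// cleared, since an empty order means "no reordering" throughout this file.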
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (NumScalars == 1 || VectorizableTree.size() == 1)
    return std::nullopt;
  unsigned NumParts = TTI->getNumberOfParts(getWidenedType(ScalarTy,
                                                           NumScalars));
  if (NumParts == 0 || NumParts >= NumScalars ||
      bit_ceil(NumScalars) / NumParts <= 1)
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
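// In short: findReusedOrderedScalars tries to derive an element order for a
// gather node from shuffles that can reuse already-vectorized entries or
// extractelement sources. A slot CurrentOrder[P] == NumScalars means
// "undecided"; if every sub-mask turns out to be a multi-vector shuffle, or
// half or more of the slots stay undecided for 3+ scalars, no order is
// returned.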
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
4827 "Order is empty. Please check it before using isReverseOrder.");
4828 unsigned Sz = Order.
size();
4830 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
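// E.g. Order = {3, 2, 1, 0} is a reverse order for Sz == 4: every defined
// entry satisfies Order[I] == Sz - I - 1. Entries equal to Sz act as "unset"
// markers and are accepted unconditionally.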
/// Checks if the provided pointers represent strided accesses with a runtime
/// (non-constant) stride and, if so, computes that stride.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest SCEV values).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size / (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
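// Illustration: for pointers {P, P + S, P + 2*S, P + 3*S} with a runtime
// (non-constant) stride S, the SCEV distance between the lowest and highest
// pointers is 3 * S * ElemSize, from which TryGetStride recovers S. Constant
// strides are deliberately rejected here: the consecutive/constant-stride
// cases are handled by the cheaper getPointersDiff-based paths.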
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

/// Returns the cost of the shuffle with the given kind, vector type and mask.
/// Recognizes masks that actually encode an insert-subvector pattern.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr) {
  if (Kind != TTI::SK_PermuteTwoSrc)
    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(
          TTI::SK_InsertSubvector,
          getWidenedType(Tp->getElementType(), Mask.size()), Mask, CostKind,
          Index, Tp);
  }
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

/// Creates a subvector insert.
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  const unsigned SubVecVF = getNumElements(V->getType());
  if (Index % SubVecVF == 0) {
    Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
                                     Builder.getInt64(Index));
  } else {
    // insert_vector requires the index to be a multiple of the subvector
    // length, so build the insertion out of shuffles instead.
    const unsigned VecVF = getNumElements(Vec->getType());
    SmallVector<int> Mask(VecVF, PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    for (unsigned I : seq<unsigned>(SubVecVF))
      Mask[I + Index] = I + VecVF;
    if (Generator) {
      Vec = Generator(Vec, V, Mask);
    } else {
      // 1. Resize V to the size of Vec. 2. Insert V into Vec.
      SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
      V = Builder.CreateShuffleVector(V, ResizeMask);
      Vec = Builder.CreateShuffleVector(Vec, V, Mask);
    }
  }
  return Vec;
}

/// Creates a subvector extract.
static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  if (Index % SubVecVF == 0) {
    VectorType *SubVecTy =
        getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
    return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
  }
  // extract_subvector requires the index to be a multiple of the subvector
  // length.
  SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           OrdersType &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  if (BestVF)
    *BestVF = 0;
  // Check that a vectorized load would load the same memory as a scalar
  // load. Types that are padded when allocated as scalars (such as i1) must
  // not be vectorized.
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const unsigned Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  if (!IsSorted) {
    if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
        return LoadsState::StridedVectorize;
    }
    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
      return LoadsState::Gather;
  } else {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    // Simple check if not a strided access - clear order.
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate a strided load node if the stride is a power-of-2
    // within the profitable limits, the distance is a multiple of the
    // element count, or any pointer operand has users outside of the graph
    // (for masked gathers extra extractelements might be required).
    auto IsAnyPointerUsedOutGraph =
        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                   return !isVectorized(U) && !MustGather.contains(U);
                 });
        });
    const unsigned AbsoluteDiff = std::abs(*Diff);
    if (IsPossibleStrided &&
        (IsAnyPointerUsedOutGraph ||
         (AbsoluteDiff > Sz &&
          (Sz > MinProfitableStridedLoads ||
           (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
            AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
         *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
        Align Alignment =
            cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
                ->getAlign();
        if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
          // Iterate through all pointers and check if all distances are
          // unique multiples of Stride.
          SmallSet<int, 4> Dists;
          for (Value *Ptr : PointerOps) {
            int Dist = 0;
            if (Ptr == PtrN)
              Dist = *Diff;
            else if (Ptr != Ptr0)
              Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
            // If the strides are not the same or repeated, we can't
            // vectorize.
            if (((Dist / Stride) * Stride) != Dist ||
                !Dists.insert(Dist).second)
              break;
          }
          if (Dists.size() == Sz)
            return LoadsState::StridedVectorize;
        }
      }
    }
  }
  // Compare the cost of masked gather against loads + shuffles.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of masked gather GEPs. If not a splat, add the
    // insertion cost of the pointer operands.
    auto *PtrVecTy = getWidenedType(PointerOps.front()->getType(),
                                    VecTy->getNumElements());
    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        any_of(PointerOps, [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(PointerOps.front());
        }))
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false,
          CostKind);
    // The cost of the scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                        [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         cast<Instruction>(V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of the masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   cast<LoadInst>(VL0)->getPointerOperand(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost =
        TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                     /*Extract=*/false, CostKind) +
        ScalarLoadsCost;
    // The list of loads is small or this is a partial check already -
    // compare masked gather cost and gather cost directly.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    // FIXME: The following code has not been updated for non-power-of-2
    // vectors: the splitting logic does not cover the original vector if
    // the vector factor is not a power of two.
    if (!has_single_bit(VL.size()))
      return false;

    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized
    // + shuffles is better than just gather.
    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      SmallVector<LoadsState> States;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS =
            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
                              /*TryRecursiveCheck=*/false);
        if (LS == LoadsState::Gather) {
          DemandedElts.setBits(Cnt, Cnt + VF);
          continue;
        }
        // If reordering is needed - treat it as a high-cost masked gather.
        if ((LS == LoadsState::Vectorize ||
             LS == LoadsState::StridedVectorize) &&
            !Order.empty() && !isReverseOrder(Order))
          LS = LoadsState::ScatterVectorize;
        States.push_back(LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try a smaller VF.
        continue;
      // The remaining elements can be vectorized later as a series of
      // loads/insertelements.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost =
            TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                         /*Extract=*/false, CostKind) +
            ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            VecLdCost +=
                TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        InstructionCost VectorGEPCost =
            (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
                ? 0
                : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                              LI0->getPointerOperand(),
                              Instruction::GetElementPtr, CostKind, ScalarTy,
                              SubVecTy)
                      .second;
        if (LS == LoadsState::ScatterVectorize &&
            (static_cast<unsigned>(
                 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
                 PointerOps.size() - 1 ||
             any_of(PointerOps, [&](Value *V) {
               return getUnderlyingObject(V) !=
                      getUnderlyingObject(PointerOps.front());
             })))
          VectorGEPCost += TTI.getScalarizationOverhead(
              SubVecTy, APInt::getAllOnes(VF),
              /*Insert=*/true, /*Extract=*/false, CostKind);
        switch (LS) {
        case LoadsState::Vectorize:
          VecLdCost +=
              TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
                                  LI0->getPointerAddressSpace(), CostKind,
                                  TTI::OperandValueInfo()) +
              VectorGEPCost;
          break;
        case LoadsState::StridedVectorize:
          VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
                                                  LI0->getPointerOperand(),
                                                  /*VariableMask=*/false,
                                                  CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::ScatterVectorize:
          VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                  LI0->getPointerOperand(),
                                                  /*VariableMask=*/false,
                                                  CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::Gather:
          // Gathers are already accounted for - ignore.
          continue;
        }
        SmallVector<int> ShuffleMask(VL.size());
        for (int Idx : seq<int>(0, VL.size()))
          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
        if (I > 0)
          VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
                                        ShuffleMask, CostKind, I * VF,
                                        SubVecTy);
      }
      // If the masked gather cost is higher - better to vectorize, so
      // consider it as a gather node. It will be estimated better later.
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve analysis of the pointers; if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return isa<Argument, GlobalValue>(P) ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if a potential masked gather can be represented as a series of
    // loads + insert-subvectors instead.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
5383 "Expected list of pointer operands.");
5393 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5395 SortedIndices.
clear();
5397 auto Key = std::make_pair(BBs[Cnt + 1],
5401 std::optional<int> Diff = getPointersDiff(
5402 ElemTy, std::get<0>(Base.front()), ElemTy,
5408 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5414 if (Bases.
size() > VL.
size() / 2 - 1)
5418 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5425 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5426 Bases.
front().second.size() == VL.
size()))
5431 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5440 FirstPointers.
insert(P1);
5441 SecondPointers.
insert(P2);
5447 "Unable to find matching root.");
5450 for (
auto &
Base : Bases) {
5451 for (
auto &Vec :
Base.second) {
5452 if (Vec.size() > 1) {
5453 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5454 const std::tuple<Value *, int, unsigned> &
Y) {
5455 return std::get<1>(
X) < std::get<1>(
Y);
5457 int InitialOffset = std::get<1>(Vec[0]);
5458 bool AnyConsecutive =
5460 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5464 if (!AnyConsecutive)
5469 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5473 for (
auto &
T : Bases)
5474 for (
const auto &Vec :
T.second)
5475 for (
const auto &
P : Vec)
5479 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
/// Check if two insertelement instructions are from the same buildvector.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to
  // find either VU as the original vector for IE2 or V as the original
  // vector for IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
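// The do/while above walks both insertelement chains toward their base
// vectors simultaneously, recording in ReusedIdx which lanes each chain
// writes. Two inserts belong to the same buildvector only if the walk
// reaches one value from the other without ever writing the same lane twice.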
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if the reuse shuffle indices can be improved by reordering.
    // For this, check that the reuse mask is "clustered", i.e. each scalar
    // value is used once in each submask of size <number_of_scalars>.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
                                             2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of VF size; need to reorder reuses shuffles, they are
    // always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
                                 "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      if (V1->getNumUses() < V2->getNumUses())
        return true;
      if (V1->getNumUses() > V2->getNumUses())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        // Sort inserts by their buildvector heads.
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that the gather of extractelements can be represented as just a
      // shuffle of a single/two vectors.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    //   insertelement poison, v, 0 [+ permute]
    // is cheaper than
    //   insertelement poison, v, n
    // - try to reorder. If rotating the whole graph, exclude the permute
    // cost, the whole graph might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE))
        return CurrentOrder;
  }
  return std::nullopt;
}
/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz repeated several times.
static bool isRepeatedNonIdentityClustersMask(ArrayRef<int> Mask,
                                              unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
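// E.g. with Sz == 4, the mask {3, 0, 2, 1, 3, 0, 2, 1} is a repeated
// non-identity cluster: the first group of four repeats verbatim and is not
// the identity {0, 1, 2, 3}.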
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses - no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClustersMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
5927 "Expected same size of orders");
5928 unsigned Sz = Order.
size();
5930 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5931 if (Order[
Idx] != Sz)
5932 UsedIndices.
set(Order[
Idx]);
5934 if (SecondaryOrder.
empty()) {
5935 for (
unsigned Idx : seq<unsigned>(0, Sz))
5936 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5939 for (
unsigned Idx : seq<unsigned>(0, Sz))
5940 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5941 !UsedIndices.
test(SecondaryOrder[
Idx]))
5942 Order[
Idx] = SecondaryOrder[
Idx];
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Phi nodes can have a preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction on x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // x86. Reordering them into [fsub,fadd] blocks this pattern, so take
    // their order into account when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle()) {
      VectorType *VecTy =
          getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0,
                                                Opcode1));
      // If this pattern is supported by the target, consider its order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
    }

    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include the ordering of nodes used only in alternate opcode
      // vectorization - better to reorder them during bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      if (UserTE->UserTreeIndices.size() == 1 &&
          all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
            return EI.UserTE->State == TreeEntry::Vectorize &&
                   EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
          }))
        return;
      UserTE = UserTE->UserTreeIndices.empty()
                   ? UserTE
                   : UserTE->UserTreeIndices.back().UserTE;
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: the most used one, and reorder
    // the scalar elements in the nodes according to it.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; just merge the reordering shuffle
      // into the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use the natural
        // order to avoid extra reshuffling.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask of the order of the loaded operand.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if a filled identity was found
      // (non-empty order) with the same number of uses as the new candidate,
      // choose the candidate.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        combineOrders(Pair.first, BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set the order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do the actual reordering.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Only reorder the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to correctly support reused masks.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          if (SLPReVec) {
            // ShuffleVectorInst does not do reorderOperands (it supports
            // only a limited set of patterns), so only reorder when none of
            // the users is a ShuffleVectorInst.
            if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                  return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
                }))
              continue;
            assert(none_of(TE->UserTreeIndices,
                           [&](const EdgeInfo &EI) {
                             return isa<ShuffleVectorInst>(
                                 EI.UserTE->getMainOp());
                           }) &&
                   "Does not know how to reorder.");
          }
          // Update ordering of the operands with the smaller VF than the
          // given one.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) ||
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
        assert(!TE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Build the correct orders for extract{element,value}, loads and
        // stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to the gathers, so
      // simply add to the list of gathered ops.
      // If there are reused scalars, process this node as a regular
      // vectorize node, just reorder the reuses mask.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF: currently vectorized
  // loads and extracts without alternate operands + some gathering of
  // extracts.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // 1. Propagate order to the graph nodes which use only reordered nodes:
  // if a node has reordered operands, try to put at least one operand in
  // the natural order, reorder the others, and reorder the user node
  // itself.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only the reordered nodes.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operand order to speed up
      // the search.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase the filtered entries.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that the operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate
      // the most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() &&
            !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully
        // non-ordered orders.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask of the order of the loaded operand.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() ||
              !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(TE))
              return !getReorderingData(*TE, /*TopToBottom=*/false)
                          .value_or(OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate-op vectorize node or its operand nodes
          // require reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if the users allow reordering: look up just one level of
          // operands to avoid an increase of compile time. It is profitable
          // to reorder if definitely more operands allow reordering than
          // those with natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer the identity order over any candidate with the same count.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set the order of the user node.
      if (isIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder the operands of the user node and set the ordering for the
      // user node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert the user node into the list to try to sink the reordering
          // deeper in the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
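// Together, reorderTopToBottom and reorderBottomToTop form a two-pass
// ordering scheme: the first pass pushes the most-used order of each
// vector-factor class down from definitions to uses, while this second pass
// hoists operand orders upward, keeping at most one non-identity order per
// user so that reorders cancel out instead of multiplying into extra
// shuffles.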
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
            !UseEntries.empty()) {
          // Some in-tree scalars will remain as scalars in vectorized
          // instructions. If that is the case, the one in FoundLane will be
          // used.
          if (any_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Scalar with extractelements not supported.");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
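// An ExternalUse records (scalar, user, lane): after vectorization the
// scalar's value must be recreated with an extractelement from lane
// FoundLane of the produced vector for that user. A null user acts as a
// marker meaning "any external user", used once a scalar has too many uses
// (UsesLimit) to enumerate profitably.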
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // If this is not a store, or it was already vectorized, skip it.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      if (isVectorized(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers, so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting, we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is
  // 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // An identity order (e.g., {0,1,2,3}) is represented as an empty OrdersType
  // in reorderTopToBottom() and reorderBottomToTop(); do the same here to
  // avoid unnecessary reordering.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}
#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive, abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
/// Tries to find groups of loads with constant distances between their
/// pointers, suitable for later (re)vectorization attempts.
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
  SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                (has_single_bit(Data.size() + NumUniques) ||
                 bit_ceil(Data.size()) <
                     bit_ceil(Data.size() + NumUniques))))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset, Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        // Reinsert non-matched loads into existing clusters with the same
        // parent and type.
        auto *It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        if (It == GatheredLoads.end()) {
          GatheredLoads.emplace_back().append(Data.begin(), Data.end());
          continue;
        }
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
               VectorizableTree[Idx]->Scalars.end());

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    SmallVector<Value *> Values(Loads.begin(), Loads.end());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads, ValueSet &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                              *TTI, Loads.front()->getType(), NumElts - 1))
          CandidateVFs.push_back(NumElts);

        if (Final && CandidateVFs.empty())
          return Results;

        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
            ArrayRef<LoadInst *> Slice =
                ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts,
                                                              E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                areKnownNonVectorizableLoads(Slice))
              continue;
            // Check if it is profitable to try vectorizing the gathered
            // loads. It is profitable if we have more than 3 consecutive
            // loads, or if we have less but all users are vectorized or
            // deleted.
            bool AllowToVectorize = false;
            if (NumElts == 2) {
              // Check if it is profitable to vectorize 2-element loads.
              bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                  Slice.front()->getType(), ElementCount::getFixed(NumElts));
              auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
                for (LoadInst *LI : Slice) {
                  // If single use/user - allow to vectorize.
                  if (LI->hasOneUse())
                    continue;
                  // 1. Check if the number of uses equals the number of
                  // users. 2. The load broadcast is not allowed or the load
                  // is not broadcasted.
                  if (static_cast<unsigned int>(
                          std::distance(LI->user_begin(), LI->user_end())) !=
                      LI->getNumUses())
                    return false;
                  if (!IsLegalBroadcastLoad)
                    continue;
                  if (LI->hasNUsesOrMore(UsesLimit))
                    return false;
                  for (User *U : LI->users()) {
                    if (auto *UI = dyn_cast<Instruction>(U);
                        UI && isDeleted(UI))
                      continue;
                    for (const TreeEntry *UTE : getTreeEntries(U)) {
                      for (int I : seq<int>(UTE->getNumOperands())) {
                        if (all_of(UTE->getOperand(I), [LI](Value *V) {
                              return V == LI || isa<PoisonValue>(V);
                            }))
                          // Found legal broadcast - do not vectorize.
                          return false;
                      }
                    }
                  }
                }
                return true;
              };
              AllowToVectorize = CheckIfAllowed(Slice);
            } else {
              AllowToVectorize =
                  (NumElts >= 3 ||
                   any_of(ValueToGatherNodes.at(Slice.front()),
                          [=](const TreeEntry *TE) {
                            return TE->Scalars.size() == 2 &&
                                   ((TE->Scalars.front() == Slice.front() &&
                                     TE->Scalars.back() == Slice.back()) ||
                                    (TE->Scalars.front() == Slice.back() &&
                                     TE->Scalars.back() == Slice.front()));
                          })) &&
                  hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                           Slice.size());
            }
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build a vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              LoadsState LS = canVectorizeLoads(Values, Slice.front(),
                                                CurrentOrder, PointerOps,
                                                &BestVF);
              if (LS != LoadsState::Gather ||
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                if (LS != LoadsState::Gather) {
                  Results.emplace_back(Values, LS);
                  VectorizedLoads.insert(Slice.begin(), Slice.end());
                  // If we vectorized the initial block, no need to try it
                  // again.
                  if (Cnt == StartIdx)
                    StartIdx += NumElts;
                }
                // Check if the whole array was vectorized already - exit.
                if (StartIdx >= Loads.size())
                  break;
                // Erase the last masked gather candidate if a better
                // candidate within the range was found.
                if (!MaskedGatherVectorized.empty() &&
                    Cnt < MaskedGatherVectorized.back() + NumElts)
                  MaskedGatherVectorized.pop_back();
                Cnt += NumElts - 1;
                continue;
              }
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked gather candidates as vectorized, if any.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert(Slice.begin(), Slice.end());
            // If we vectorized the initial block, no need to try it again.
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (isVectorized(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results,
                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
                       return P.second == LoadsState::ScatterVectorize;
                     })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
              for (Value *L : Slice)
                if (!isVectorized(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }

            // Select the maximum VF as the maximum of the user gathered
            // nodes and the distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF =
                      std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is part of an interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (isVectorized(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order =
                        (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order,
                                      PointerOps) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // If we cannot represent the loads as consecutive
              // vectorizable nodes - just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           ArrayRef<Value *> VL =
                               VectorizableTree[std::get<0>(P)]->Scalars;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(),
                                                            It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (isVectorized(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry, which
                // is not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>>
          FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into the list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, consider it as if no gathered loads
  // entries must be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
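// Overall flow for gathered loads: cluster loads by (block, base pointer,
// type) with known constant distances, carve each cluster into maximal
// consecutive-or-gatherable ranges, pick a vector factor per range (possibly
// an interleaved one), and re-run buildTree_rec over each accepted slice so
// the new load nodes join the graph after GatheredLoadsEntriesFirst.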
/// Checks if the bundle contains exactly one instruction that requires
/// scheduling; returns it, or nullptr otherwise.
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
/// Generates a key/subkey pair for the given value to provide effective
/// sorting of the values and better detection of vectorizable value
/// sequences.
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between their pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by their vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions by their opcodes and types.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->getOpcode()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
      } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
        SubKey = hash_combine(hash_value(I->getOpcode()),
                              hash_value(Call->getCalledFunction()));
      } else {
        Key = hash_combine(hash_value(Call), Key);
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
      }
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey =
            hash_combine(hash_value(Op.Begin), hash_value(Op.End), SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey =
          hash_combine(hash_value(I->getOpcode()), hash_value(I->getType()));
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
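// The (Key, SubKey) pair is a coarse/fine bucketing used when splitting a
// bundle of mixed values into potentially vectorizable groups: Key roughly
// encodes the "kind" (opcode family, parent block, type), while SubKey
// separates values inside a kind (e.g. loads by base pointer, compares by
// canonical predicate), so equal pairs mark plausible co-vectorization
// candidates.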
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(),
                                          VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for the vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that the operands do not contain the same values; a perfect
  // diamond match or a shuffled match is expected.
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize the node if:
  // 1. at least a single operand is constant or splat;
  // 2. operands have many loop invariants (the instructions are not loop
  //    invariants);
  // 3. at least a single unique operand is supposed to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) ||
                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
                        getSameOpcode(Op, *TLI)))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         isVectorized(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found the first duplicate - need to add a shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return isVectorized(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
7637BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7639 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7642 "Expected instructions with same/alternate opcodes only.");
7644 unsigned ShuffleOrOp =
7645 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7647 switch (ShuffleOrOp) {
7648 case Instruction::PHI: {
7651 return TreeEntry::NeedToGather;
7653 for (
Value *V : VL) {
7654 auto *
PHI = dyn_cast<PHINode>(V);
7659 if (Term &&
Term->isTerminator()) {
7661 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7662 return TreeEntry::NeedToGather;
7667 return TreeEntry::Vectorize;
7669 case Instruction::ExtractValue:
7670 case Instruction::ExtractElement: {
7671 bool Reuse = canReuseExtract(VL, CurrentOrder);
7675 return TreeEntry::NeedToGather;
7676 if (Reuse || !CurrentOrder.empty())
7677 return TreeEntry::Vectorize;
7679 return TreeEntry::NeedToGather;
7681 case Instruction::InsertElement: {
7685 for (
Value *V : VL) {
7686 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7688 "Non-constant or undef index?");
7692 return !SourceVectors.contains(V);
7695 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7696 "different source vectors.\n");
7697 return TreeEntry::NeedToGather;
7702 return SourceVectors.contains(V) && !
V->hasOneUse();
7705 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7706 "multiple uses.\n");
7707 return TreeEntry::NeedToGather;
7710 return TreeEntry::Vectorize;
7712 case Instruction::Load: {
7721 return TreeEntry::Vectorize;
7723 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7725 LoadEntriesToVectorize.insert(VectorizableTree.size());
7726 return TreeEntry::NeedToGather;
7728 return TreeEntry::ScatterVectorize;
7730 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7732 LoadEntriesToVectorize.insert(VectorizableTree.size());
7733 return TreeEntry::NeedToGather;
7735 return TreeEntry::StridedVectorize;
7739 if (
DL->getTypeSizeInBits(ScalarTy) !=
7740 DL->getTypeAllocSizeInBits(ScalarTy))
7741 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7743 auto *LI = dyn_cast<LoadInst>(V);
7744 return !LI || !LI->isSimple();
7751 return TreeEntry::NeedToGather;
7755 case Instruction::ZExt:
7756 case Instruction::SExt:
7757 case Instruction::FPToUI:
7758 case Instruction::FPToSI:
7759 case Instruction::FPExt:
7760 case Instruction::PtrToInt:
7761 case Instruction::IntToPtr:
7762 case Instruction::SIToFP:
7763 case Instruction::UIToFP:
7764 case Instruction::Trunc:
7765 case Instruction::FPTrunc:
7766 case Instruction::BitCast: {
7768 for (
Value *V : VL) {
7769 if (isa<PoisonValue>(V))
7771 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7774 dbgs() <<
"SLP: Gathering casts with different src types.\n");
7775 return TreeEntry::NeedToGather;
7778 return TreeEntry::Vectorize;
7780 case Instruction::ICmp:
7781 case Instruction::FCmp: {
7786 for (
Value *V : VL) {
7787 if (isa<PoisonValue>(V))
7789 auto *
Cmp = cast<CmpInst>(V);
7790 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
7791 Cmp->getOperand(0)->getType() != ComparedTy) {
7792 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
7793 return TreeEntry::NeedToGather;
7796 return TreeEntry::Vectorize;
7798 case Instruction::Select:
7799 case Instruction::FNeg:
7800 case Instruction::Add:
7801 case Instruction::FAdd:
7802 case Instruction::Sub:
7803 case Instruction::FSub:
7804 case Instruction::Mul:
7805 case Instruction::FMul:
7806 case Instruction::UDiv:
7807 case Instruction::SDiv:
7808 case Instruction::FDiv:
7809 case Instruction::URem:
7810 case Instruction::SRem:
7811 case Instruction::FRem:
7812 case Instruction::Shl:
7813 case Instruction::LShr:
7814 case Instruction::AShr:
7815 case Instruction::And:
7816 case Instruction::Or:
7817 case Instruction::Xor:
7818 case Instruction::Freeze:
7819 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7821 auto *
I = dyn_cast<Instruction>(V);
7822 return I &&
I->isBinaryOp() && !
I->isFast();
7824 return TreeEntry::NeedToGather;
7825 return TreeEntry::Vectorize;
7826 case Instruction::GetElementPtr: {
7828 for (
Value *V : VL) {
7829 auto *
I = dyn_cast<GetElementPtrInst>(V);
7832 if (
I->getNumOperands() != 2) {
7833 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7834 return TreeEntry::NeedToGather;
7840 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7841 for (
Value *V : VL) {
7842 auto *
GEP = dyn_cast<GEPOperator>(V);
7845 Type *CurTy =
GEP->getSourceElementType();
7847 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7848 return TreeEntry::NeedToGather;
7854 for (
Value *V : VL) {
7855 auto *
I = dyn_cast<GetElementPtrInst>(V);
7858 auto *
Op =
I->getOperand(1);
7859 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7860 (
Op->getType() != Ty1 &&
7861 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7862 Op->getType()->getScalarSizeInBits() >
7863 DL->getIndexSizeInBits(
7864 V->getType()->getPointerAddressSpace())))) {
7866 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7867 return TreeEntry::NeedToGather;
7871 return TreeEntry::Vectorize;
7873 case Instruction::Store: {
7875 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7878 if (
DL->getTypeSizeInBits(ScalarTy) !=
7879 DL->getTypeAllocSizeInBits(ScalarTy)) {
7880 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
7881 return TreeEntry::NeedToGather;
7885 for (
Value *V : VL) {
7886 auto *
SI = cast<StoreInst>(V);
7887 if (!
SI->isSimple()) {
7889 return TreeEntry::NeedToGather;
7898 if (CurrentOrder.empty()) {
7899 Ptr0 = PointerOps.
front();
7900 PtrN = PointerOps.
back();
7902 Ptr0 = PointerOps[CurrentOrder.front()];
7903 PtrN = PointerOps[CurrentOrder.back()];
7905 std::optional<int> Dist =
7908 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
7909 return TreeEntry::Vectorize;
7913 return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(), ElementCount::getFixed(VL.size()),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and they have to be the same in
      // order for the calls to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J]) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
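  // For illustration (hypothetical IR): calls to @llvm.powi.f32.i32 carry the
  // exponent as a scalar operand, so {powi(%a, 2), powi(%b, 2)} passes the
  // scalar-argument check above, while {powi(%a, 2), powi(%b, 3)} hits the
  // mismatched-arguments path and is gathered.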
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcodes like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
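// For illustration (hypothetical IR): the alternate-shuffle path accepts
// bundles such as {fadd %a0, %b0; fsub %a1, %b1}, which become one vector
// fadd, one vector fsub, and a blending shufflevector, provided the early
// profitability check on the operands does not reject the sequence.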
namespace {
/// Helper to collect the operand bundles of a vectorized group of PHI nodes,
/// grouping incoming values by incoming basic block.
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  ArrayRef<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        if (!DT.isReachableFromEntry(InBB)) {
          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        if (InBB == Main->getIncomingBlock(I)) {
          Operands[I][Idx] = P->getIncomingValue(I);
          continue;
        }
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndices.emplace_back(
            isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    bool IsFullVectors = hasFullVectorsOrPowerOf2(
        *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
    if (NumUniqueScalarValues == VL.size() &&
        (VectorizeNonPowerOf2 || IsFullVectors)) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not supported yet for "wide" nodes.
      if ((UserTreeIdx.UserTE &&
           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
          !hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
                                    VL.size())) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
             return isa<UndefValue>(V) || !isConstant(V);
           }))) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
          // Find the number of elements, which forms full vectors.
          unsigned PWSz = getFullVectorNumberOfElements(
              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(
                PWSz - UniqueValues.size(),
                PoisonValue::get(UniqueValues.front()->getType()));
            // Check that extended with poisons operations are still valid
            // for vectorization (div/rem are not allowed).
            if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
              LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
              newTreeEntry(VL, std::nullopt /*not vectorized*/, S,
                           UserTreeIdx);
              return false;
            }
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };
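  // For illustration: for a bundle {%x, %y, %x, %z}, TryToFindDuplicates keeps
  // the unique scalars {%x, %y, %z} and records ReuseShuffleIndices
  // {0, 1, 0, 2}, so the duplicate lane is re-created by a shuffle of the
  // vectorized unique values instead of vectorizing %x twice.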
  InstructionsState S = getSameOpcode(VL, *TLI);

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no place to
  // insert a shuffle if we need to, so just avoid that issue.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        // Record the reuse of the tree node.
        E->UserTreeIndices.push_back(UserTreeIdx);
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return;
      }
      SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end());
      if (all_of(VL, [&](Value *V) {
            return isa<PoisonValue>(V) || Values.contains(V);
          })) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }

  // Check that the depth of the tree does not exceed the maximum, but keep
  // wide load chains (and load-extend chains) together.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // If all of the operands are identical or constant we have a simple
  // solution. If we deal with insert/extract instructions, they all must have
  // constant indices, otherwise we should gather them, not try to vectorize.
  // If an alternate op node has only 2 elements with gathered operands - do
  // not vectorize.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };

  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  BasicBlock *BB = nullptr;
  bool AreAllSameBlock = S && allSameBlock(VL);
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                       SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (isVectorized(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should not be
  // vectorized.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  Instruction *VL0 = S.getMainOp();
  BB = VL0->getParent();
  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;

  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);

    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    // Keep the reordered operands to avoid code duplication.
    PHIHandler Handler(*DT, PH, VL);
    Handler.buildOperands();
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      TE->setOperand(I, Handler.getOperands(I));
    SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      Operands[I] = Handler.getOperands(I);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTree_rec() towards the operands.
    TE->setOperand(*this);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
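  // For illustration: if the bundle inserts into lanes {2, 0, 3, 1}, the
  // priority queue pops the inserts in lane order and CurrentOrder becomes
  // {2, 0, 3, 1} (each element's rank by lane); only when the bundle is
  // already in lane order is the order an identity, and then it is cleared.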
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    TE->setOperand(*this);
    if (State == TreeEntry::ScatterVectorize)
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperand(*this);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
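  // For illustration: a Trunc node records its operand entry in
  // ExtraBitWidthNodes so the bitwidth-minimization analysis can try to narrow
  // the whole chain, e.g. a trunc i64 -> i32 fed by values that only carry 32
  // significant bits; similarly, [SU]IToFP operands whose sign bits fill at
  // least half of the type are candidates for narrowing.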
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    ValueList Left, Right;
    VLOperands Ops(VL, S, *this);
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Left = Ops.getVL(0);
      Right = Ops.getVL(1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (Cmp->getPredicate() != P0)
          std::swap(LHS, RHS);
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
    }
    TE->setOperand(0, Left);
    TE->setOperand(1, Right);
    buildTree_rec(Left, Depth + 1, {TE, 0});
    buildTree_rec(Right, Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    SmallVector<ValueList, 2> Operands(2);
    // Prepare the operand vector for pointer operands.
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      if (!GEP) {
        Operands.front().push_back(V);
        continue;
      }
      Operands.front().push_back(GEP->getPointerOperand());
    }
    TE->setOperand(0, Operands.front());
    // Need to cast all indices to the same type before vectorization to
    // avoid crash.
    // Required to be able to find correct matches between different gather
    // nodes and reuse the vectorized values rather than trying to gather them
    // again.
    int IndexIdx = 1;
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
    Type *Ty = all_of(VL,
                      [VL0Ty, IndexIdx](Value *V) {
                        auto *GEP = dyn_cast<GetElementPtrInst>(V);
                        if (!GEP)
                          return true;
                        return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                      })
                   ? VL0Ty
                   : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                          ->getPointerOperandType()
                                          ->getScalarType());
    // Prepare the operand vector.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I) {
        Operands.back().push_back(
            ConstantInt::get(Ty, 0, /*isSigned=*/false));
        continue;
      }
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
      if (!CI)
        Operands.back().push_back(Op);
      else
        Operands.back().push_back(ConstantFoldIntegerCast(
            CI, Ty, CI->getValue().isSignBitSet(), *DL));
    }
    TE->setOperand(IndexIdx, Operands.back());

    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    TE->setOperand(*this, isCommutative(VL0));
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
        continue;
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (CI && any_of(VL, [](Value *V) {
          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      ValueList Left, Right;
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);

        if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        }
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }

    TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}

bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
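// For illustration: a bundle extracting lanes {1, 2, 3, 0} of one source
// vector yields CurrentOrder = {3, 0, 1, 2} and returns false (a reorder is
// still required), while lanes {0, 1, 2, 3} return true with an empty
// CurrentOrder, meaning the source vector can be reused as-is.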
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}

static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  auto LibCost = IntrinsicCost;
  if (!CI->isNoBuiltin() && VecFunc) {
    // Calculate the cost of the vector library call.
    // If the corresponding vector call is cheaper, return its cost.
    LibCost =
        TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
  }
  return {IntrinsicCost, LibCost};
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
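// For illustration: for an alternate node {add, sub, add, sub} with Sz == 4,
// buildAltOpShuffleMask produces the mask {0, 5, 2, 7}; main-opcode lanes pick
// from the vector add (elements 0..3) and alternate lanes from the vector sub
// (elements 4..7), which is exactly the blend shufflevector emitted for
// alt-shuffle nodes.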
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here.
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}

/// The base class for shuffle instruction emission and shuffle cost
/// estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison> etc. for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }

  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code. The function looks through the previously emitted shuffle
  /// instructions and properly marks indices in the mask as undef.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so later we did not lose
          // this info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this
      // Op and Mask will be used in the final shuffle.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(0), ExtMask, UseMask::FirstArg)
                            .all();
      bool IsOp2Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(1), ExtMask, UseMask::SecondArg)
                            .all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
      Mask = NewMask;
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(V2, Mask,
                                                    UseMask::SecondArg)
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    SmallVector<int> SingleOpMask(Mask);
    bool IsIdentity =
        peekThroughShuffles(V1, SingleOpMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, SingleOpMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plain wide loads/stores.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since its cost is considered free.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have any
      // savings on that.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into a masked gather load intrinsic. All the scalar GEPs will be removed
    // as a result of vectorization.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
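// For illustration: for four consecutive loads from %base, the scalar side is
// costed as a unit-stride pointer chain (the %base+1..3 GEPs typically fold
// into addressing modes and come out close to TCC_Free), while the vector side
// keeps only %base, so the returned pair usually shows a saving for case (2);
// for the masked-gather case (1), the cost of a single wide GEP is compared
// against the whole scalar pointer chain.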
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  MapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it is small (just 2 elements), all-constant or all
  // instructions have the same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2)
          return hash_value(LIt->second.back()->getPointerOperand());
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra vectorized
  // nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        !isDeleted(Inst) && !isVectorized(V)) {
      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
                                             /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider it as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                             Idx, getWidenedType(ScalarTy, Sz));
  }
  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    // When REVEC is enabled, insertion is done per subvector instead of per
    // element.
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
      if (DemandedElts[I])
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, I * ScalarTyNumElements, FTy);
  } else {
    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                          /*Extract=*/false, CostKind);
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (!isa<PoisonValue>(V))
      DemandedElts.clearBit(I);
    else
      ReorderMask[I] = I + Sz;
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);

  // Operands are profitable if they are:
  // 1. At least one constant
  // or
  // 2. Splats
  // or
  // 3. Results in good vectorization opportunity, i.e. may generate vector
  // nodes and reduce cost of the graph.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // The tree may grow here, so iterate over nodes, built before.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      // Try to find vectorizable sequences and transform them into a series of
      // insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          // Reuse the existing node, if it fully matches the slice.
          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          // Constant already handled effectively - skip.
          if (allConstant(Slice))
            continue;
          // Do not try to vectorize small splats (less than vector register
          // and only with the single non-undef element).
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            unsigned NumRegs2VF = ::getNumberOfParts(
                *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              count(Slice, Slice.front()) ==
                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                                   : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load &&
                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                           VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are
              // vectorized. For expensive instructions extra extracts might
              // be profitable.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  TTI->getInstructionCost(S.getMainOp(), CostKind) <
                      TTI::TCC_Expensive &&
                  !all_of(Slice, [&](Value *V) {
                    if (isa<PoisonValue>(V))
                      return true;
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                LoadsState Res =
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
                // Do not vectorize gathers.
                if (Res == LoadsState::ScatterVectorize ||
                    Res == LoadsState::Gather) {
                  if (Res == LoadsState::Gather) {
                    registerNonVectorizableLoads(Slice);
                    // If reductions and the scalars from the root node are
                    // analyzed - mark as non-vectorizable reduction.
                    if (UserIgnoreList && E.Idx == 0)
                      analyzedReductionVals(Slice);
                  }
                  continue;
                }
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(
                                  reverse(Slice), IsaPred<Instruction>)),
                              S))) {
                // Do not vectorize extractelements (handled effectively
                // already). Do not vectorize non-profitable instructions
                // (with low cost and non-vectorizable operands).
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          // If any instruction is vectorized already - do not try again.
          if (TreeEntry *SE = getSameValuesTreeEntry(Slice.front(), Slice,
                                                     /*SameVF=*/true)) {
            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
            AddCombinedNode(SE->Idx, Cnt, Sz);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore ordering, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
      continue;
    }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
/// Merges shuffle masks and estimates the cost of the final shuffle, if
/// required. Supports shuffling of 2 input vectors. It implements lazy
/// shuffle emission: the actual shuffle cost is accounted only when it is
/// actually required.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, common mask is not empty, but it is still considered as the
  /// same nodes as the first one in the list of the vector nodes.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (any_of(VL, IsaPred<UndefValue>) || !is_contained(VL, *It));
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, std::distance(VL.begin(), It),
                                    cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                 /*Index=*/0, PoisonValue::get(VecTy), *It);
      return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
                                           ShuffleMask, CostKind,
                                           /*Index=*/0, /*SubTp=*/nullptr,
                                           /*Args=*/*It);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };

  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, *RegShuffleKind,
                             getWidenedType(ScalarTy, EltsPerVector), SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {},
                                 CostKind, Idx,
                                 getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }

  /// Adds the cost of reshuffling of the nodes \p E1 and \p E2 (if present),
  /// keeping common mask of the estimated shuffles.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before,
      // no need to estimate another cost with the sub-Mask, instead include
      // this sub-Mask into the CommonMask to estimate it later and avoid
      // double cost estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }

  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          any_of(CommonMask,
                 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }

public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // extractelement as dead and remove its cost from the final cost of
        // the vectorized tree.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
                Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // Skipped if a previous node already estimated the same extracts.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF,
          cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
              ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        unsigned VecTyNumElements = VecTy->getNumElements();
        SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
        for (auto [I, V] : enumerate(Vals)) {
          Type *ScalarTy = V->getType()->getScalarType();
          Constant *NewVal;
          if (isa<PoisonValue>(V))
            NewVal = PoisonValue::get(ScalarTy);
          else if (isa<UndefValue>(V))
            NewVal = UndefValue::get(ScalarTy);
          else
            NewVal = Constant::getNullValue(ScalarTy);
          std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
                      NewVal);
        }
        Vals.swap(NewVals);
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}

/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
        ArgTys.push_back(Arg->getType());
        continue;
      }
      if (MinBW > 0 && !isVectorIntrinsicWithOverloadTypeAtArg(ID, Idx)) {
        ArgTys.push_back(
            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
        continue;
      }
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}
11084 auto It = MinBWs.
find(E);
11085 Type *OrigScalarTy = ScalarTy;
11086 if (It != MinBWs.
end()) {
11087 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11093 unsigned EntryVF = E->getVectorFactor();
11096 if (E->isGather()) {
11099 if (isa<InsertElementInst>(VL[0]))
11101 if (isa<CmpInst>(VL.
front()))
11102 ScalarTy = VL.
front()->getType();
11103 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11104 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
11108 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11111 if (E->getOpcode() == Instruction::Store) {
11113 NewMask.
resize(E->ReorderIndices.size());
11114 copy(E->ReorderIndices, NewMask.
begin());
11120 if (!E->ReuseShuffleIndices.empty())
11121 ::addMask(Mask, E->ReuseShuffleIndices);
11125 assert((E->State == TreeEntry::Vectorize ||
11126 E->State == TreeEntry::ScatterVectorize ||
11127 E->State == TreeEntry::StridedVectorize) &&
11128 "Unhandled state");
11129 assert(E->getOpcode() &&
11131 (E->getOpcode() == Instruction::GetElementPtr &&
11132 E->getMainOp()->getType()->isPointerTy())) &&
11135 unsigned ShuffleOrOp =
11136 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
11137 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11138 ShuffleOrOp = E->CombinedOp;
11140 const unsigned Sz = UniqueValues.
size();
11142 for (
unsigned I = 0;
I < Sz; ++
I) {
11143 if (isa<Instruction>(UniqueValues[
I]) &&
11146 UsedScalars.set(
I);
11148 auto GetCastContextHint = [&](
Value *
V) {
11150 return getCastContextHint(*OpTEs.front());
11151 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
11152 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11153 !SrcState.isAltShuffle())
11162 if (isa<CastInst, CallInst>(VL0)) {
11166 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11168 for (
unsigned I = 0;
I < Sz; ++
I) {
11169 if (UsedScalars.test(
I))
11171 ScalarCost += ScalarEltCost(
I);
11180 (E->getOpcode() != Instruction::Load ||
11181 !E->UserTreeIndices.empty())) {
11182 const EdgeInfo &EI =
11183 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
11184 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11186 if (EI.UserTE->getOpcode() != Instruction::Select ||
11188 auto UserBWIt = MinBWs.
find(EI.UserTE);
11189 Type *UserScalarTy =
11190 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11191 if (UserBWIt != MinBWs.
end())
11193 UserBWIt->second.first);
11194 if (ScalarTy != UserScalarTy) {
11195 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11196 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
11197 unsigned VecOpcode;
11198 auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
11199 if (BWSz > SrcBWSz)
11200 VecOpcode = Instruction::Trunc;
11203 It->second.second ? Instruction::SExt : Instruction::ZExt;
11210 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11211 ScalarCost,
"Calculated costs for Tree"));
11212 return VecCost - ScalarCost;
11217 assert((E->State == TreeEntry::Vectorize ||
11218 E->State == TreeEntry::StridedVectorize) &&
11219 "Entry state expected to be Vectorize or StridedVectorize here.");
11223 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11224 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11225 "Calculated GEPs cost for Tree"));
11227 return VecCost - ScalarCost;
11234 Type *CanonicalType = Ty;
11241 {CanonicalType, CanonicalType});
11246 if (VI && SelectOnly) {
11248 "Expected only for scalar type.");
11249 auto *CI = cast<CmpInst>(
VI->getOperand(0));
11251 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11252 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11253 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11255 return IntrinsicCost;
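  // Illustrative note (commentary added for this write-up): every per-opcode
  // case below reports "vector cost minus scalar cost", so a negative result
  // argues for vectorization. For example, assuming a target where four
  // scalar adds cost 4 and one <4 x i32> add costs 1, with no extra shuffles
  // (CommonCost == 0), the Add case would return 1 - 4 = -3.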
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          // ...
          // Subtract the cost of s|zext which is subtracted separately.
          return Cost - TTI->getCastInstrCost(
                            Ext->getOpcode(), Ext->getType(), I->getType(),
                            TTI::getCastContextHint(Ext), CostKind, Ext);
        }
      }
      return TTI->getVectorInstrCost(*I, SrcVecTy, CostKind,
                                     *getExtractIndex(I));
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
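  // Illustrative example (commentary added for this write-up): the
  // extract-with-extend special case models pairs such as
  //   %e = extractelement <4 x i32> %v, i32 1
  //   %z = zext i32 %e to i64        ; only feeds GEPs
  // as one target operation, so the zext cost is subtracted here to avoid
  // double-counting it against the tree.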
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    // ...
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-size vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate the correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                               InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      } else {
        // ...
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          // ...
      }
    }
    return Cost;
  }
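  // Illustrative example (commentary added for this write-up): for a
  // buildvector such as
  //   %v0 = insertelement <8 x float> poison, float %a, i32 2
  //   %v1 = insertelement <8 x float> %v0,    float %b, i32 3
  // OffsetBeg/OffsetEnd span elements [2, 3], so the cost is modeled as
  // inserting a small subvector instead of rebuilding all 8 lanes.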
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind, VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
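  // Illustrative example (commentary added for this write-up): if MinBWs
  // demoted both this node and its operand to i8 while the IR cast was
  // i32 -> i64, the widened operation becomes a no-op bitcast between
  // <N x i8> vectors and only CommonCost is charged.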
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
        CurrentPred = ScalarTy->isFloatingPointTy()
                          ? CmpInst::BAD_FCMP_PREDICATE
                          : CmpInst::BAD_ICMP_PREDICATE;
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;
      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred,
          CostKind, getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is a fixed vector type,
          // the condition value needs to be replicated.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
          TTI::getOperandInfo(VI->getOperand(OpIdx));
      SmallVector<const Value *> Operands(VI->operand_values());
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         Op1Info, Op2Info, Operands, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
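  // Illustrative example (commentary added for this write-up): the
  // Instruction::And special case above treats `and %x, 255` as free when
  // MinBWs already narrowed the node to 8 bits, because the constant mask
  // keeps at least as many trailing ones as the demoted bit width.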
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal
    // node. Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
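  // Illustrative note (commentary added for this write-up): the three load
  // shapes are priced differently, e.g. for IR like
  //   %a = load i32, ptr %p           ; consecutive -> one wide load
  //   %b = load i32, ptr %p.strided   ; constant stride -> strided load
  //   %c = load i32, ptr %p.random    ; arbitrary pointers -> masked gather
  // matching TreeEntry::Vectorize, StridedVectorize and ScatterVectorize
  // respectively.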
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, std::nullopt,
              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
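  // Illustrative note (commentary added for this write-up): unlike loads, a
  // store node's ReorderIndices act directly as a mask, so the base store
  // used for address costing is the lexically first pointer,
  // VL[E->ReorderIndices.front()], not necessarily VL0.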
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
      return TTI->getCallInstrCost(CI->getCalledFunction(),
                                   CI->getFunctionType()->getReturnType(),
                                   CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse
        // the same main/alternate vector ops, just do different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      // ...
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
      // need to take into account their order when looking for the most used
      // order.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If a group uses the mask in order, the shufflevector can be
            // eliminated by instcombine. Then the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
                    assert(isa<ShuffleVectorInst>(V) &&
                           "Not supported shufflevector usage.");
                    auto *SV = cast<ShuffleVectorInst>(V);
                    int Index;
                    [[maybe_unused]] bool IsExtractSubvectorMask =
                        SV->isExtractSubvectorMask(Index);
                    assert(IsExtractSubvectorMask &&
                           "Not supported shufflevector usage.");
                    if (NextIndex != Index)
                      return false;
                    NextIndex += SV->getShuffleMask().size();
                    return true;
                  }))
                return ::getShuffleCost(
                    *TTI, TTI::SK_PermuteSingleSrc, VecTy,
                    calculateShufflevectorMask(E->Scalars));
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
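// Illustrative example (commentary added for this write-up): an alternating
// sequence such as
//   %a = fadd float %x0, %y0
//   %b = fsub float %x1, %y1
// is costed in the ShuffleVector case above as one fadd vector, one fsub
// vector and a blending shuffle; on targets with addsub-style instructions
// isLegalAltInstr() lets the cheaper combined cost win instead.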
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable.\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask)) ||
            ((TE->hasState() && TE->getOpcode() == Instruction::Load &&
              !TE->isAltShuffle()) ||
             any_of(TE->Scalars, IsaPred<LoadInst>)));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it is not a load.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;

  return true;
}
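// Illustrative note (commentary added for this write-up): "tiny" trees of
// height 1-2 are only accepted when the gather side is trivially cheap,
// e.g. a store of a splat:
//   store <2 x i32> <i32 %x, i32 %x>, ptr %p
// needs at most one shuffle to build, so the two-node tree stays profitable.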
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /* MatchOr */ false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
      return false;
  }
  return true;
}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold
  // is default.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // If we have a tiny tree, we can vectorize it if we can prove it fully
  // vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return false;

  // Check if the graph is a small tree with non-power-of-2 root, mostly made
  // of gathered loads.
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
    return true;

  // ...
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (!E.isGather())
      continue;
    if (E.hasState() && E.getOpcode() != Instruction::Load)
      return false;
    // ...
  }
  return true;
}
InstructionCost BoUpSLP::getSpillCost() const {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree, query
  // TTI to see if there is a cost to keeping values live over it (for
  // example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  InstructionCost Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so
  // later instructions are guaranteed to be visited first.
  SmallVector<Instruction *, 16> OrderedScalars;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    if (!Inst)
      continue;
    OrderedScalars.push_back(Inst);
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && isVectorized(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          FastMathFlags FMF;
          SmallVector<Type *, 4> Tys;
          for (auto &ArgOp : II->args())
            Tys.push_back(ArgOp->getType());
          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
            FMF = FPMO->getFastMathFlags();
          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
                                      FMF);
          InstructionCost IntrCost =
              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
          InstructionCost CallCost = TTI->getCallInstrCost(
              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };

      // Debug information does not impact spill cost.
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;

      ++PrevInstIt;
    }

    if (NumCalls) {
      SmallVector<Type *, 4> V;
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        V.push_back(getWidenedType(ScalarTy, BundleWidth));
      }
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
    }

    PrevInst = Inst;
  }

  return Cost;
}

/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
/// buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(IE1);
  unsigned Idx2 = *getElementIndex(IE2);
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
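// Illustrative example (commentary added for this write-up): the do/while
// walk above follows a buildvector chain backwards through operand 0, e.g.
//   %i0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %i1 = insertelement <4 x i32> %i0,    i32 %b, i32 1
// to decide which of two insertelement chains comes first in the sequence.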
namespace {
/// Returns the incoming Value *, if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining the shuffles between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem)
          Mask[I] = (Res1.second ? I : Mask[I]);
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.isGather() && TE.hasState()) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle.\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    // Exclude cost of gather loads nodes which are not used.
    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track {Scalar, Index, User} tuples.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses)
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  for (ExternalUser &EU : ExternalUses) {
    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be removed
    // as well).
    if (EphValues.count(EU.User))
      continue;

    // Used in unreachable blocks or in a block terminated with unreachable
    // (rarely executed).
    if (BasicBlock *UserParent =
            EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
        UserParent &&
        isa_and_present<UnreachableInst>(UserParent->getTerminator()))
      continue;

    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // No extract cost for a vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;

    // If found user is an insertelement, do not calculate extract cost but
    // try to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = &EU.E;
          auto *It = find_if(
              ShuffledInserts,
              [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                InsertElementInst *VecInsert = Data.InsertElements.front();
                return areTwoInsertFromSameBuildVector(
                    VU, VecInsert, [](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      // ...
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == ShuffledInserts.end()) {
            auto &Data = ShuffledInserts.emplace_back();
            Data.InsertElements.emplace_back(VU);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = ShuffledInserts.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                 FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                                << " for extending externally used vector with "
                                   "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, It->InsertElements.front()))
              It->InsertElements.front() = VU;
            VecId = std::distance(ShuffledInserts.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask =
              ShuffledInserts[VecId].ValueMasks[ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign/zero extend if needed.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
      unsigned Extend = It->second.second ? Instruction::SExt
                                          : Instruction::ZExt;
      VecTy = getWidenedType(MinTy, BundleWidth);
      ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                VecTy, EU.Lane);
    } else {
      ExtraCost =
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                                  EU.Lane, EU.Scalar, ScalarUserAndIdx);
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for_each(enumerate(ExternalUses), [&](const auto &P) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            return;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        });
      }
      // Can use the original instruction, if no operands vectorized or they
      // are marked as externally used already.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements might be not vectorized, but transformed
          // into a shuffle and removed from the function, consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              isVectorized(Op) ? TTI->getInstructionCost(Op, CostKind) : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Check if all uses of the scalar are profitable to keep as scalars.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !isVectorized(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2. It may help
          // to do some extra vectorization for now.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for_each(Inst->operands(), [&](Value *V) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          });
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand to avoid
            // compiler crash.
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for_each(IOp->operands(), [&](Value *V) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // Replace all uses to avoid compiler crash.
                  ExternalUses[It->second].User = nullptr;
                }
              });
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externally used values into the in-tree scalars list.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  }
  // Add the reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          assert(SLPReVec && "Only supported by REVEC.");
          SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
        }
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
      }
    }
  }
  InstructionCost SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                           getWidenedType(TE->getMainOp()->getType(), VecVF),
                           OrigMask);
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as extended reduction.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        InstructionCost CastCost =
            TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, CostKind);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
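// Illustrative note (commentary added for this write-up): the final tree
// cost is roughly
//   sum(per-node VecCost - ScalarCost) + SpillCost + ExtractCost
//   - cost recovered by rewriting insertelement chains as shuffles,
// and vectorization proceeds only when this total beats the -slp-threshold
// knob.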
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list to determine if all of the values come from the same vector.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if better to perform a shuffle of 2 vectors or just of a single
  // vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a shuffle
  // of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements were
  // not selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
    // ...
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list to determine if all of the values come from the same
    // vector.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
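// Illustrative example (commentary added for this write-up): for a
// 16-element gather on a target with 128-bit registers and i32 elements,
// NumParts == 4 and each 4-element slice is matched separately, so one
// slice can become a shuffle of two source vectors while another slice
// stays a plain gather.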
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
                                : TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Returns true if the insertion point of vector code for TE dominates
    // the point where the other entry's vector code will be emitted, making
    // it correct to reuse that vector code.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found - not a shuffle.
  DenseMap<Value *, int> UsedValuesEntry;
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  for (Value *V : VL) {
    if (isConstant(V))
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();

      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If 2 gathers are operands of the same entry, compare the operand
        // indices and use the earlier one as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction is used for some reason in different
        // vectorized nodes - make it depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }

      // Check if the user node of TE comes after the user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration, just insert the list of nodes to the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use
      // V. If there are no such nodes, consider that we have another one
      // input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need to add a second set of
      // possible source vectors.
      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2 - not a
        // permutation, fall back to the regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Same, keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI))
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements or may form a vector node in the
  // future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !isVectorized(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI) &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that may form a full vector node on their own later.
    if (isConstant(V) ||
        (MightBeIgnored(V) &&
         ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
          (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries which can be used
  // for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. These indices are used as
    // the vector offset when calculating the final shuffle mask.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have here 1 or 2 entries only. If the number of scalars is equal
    // to the number of entries, no need to do the analysis, it is not very
    // profitable.
    Entries.clear();
    return std::nullopt;
  }
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // the scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if a shuffle is more beneficial than a
    // buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                                 (MaxElement % VF) -
                                                     (MinElement % VF) + 1));
    if (NewVF < VF) {
      for_each(SubMask, [&](int &Idx) {
        if (Idx == PoisonMaskElem)
          return;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      });
      VF = NewVF;
    }

    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto *VecTy = getWidenedType(VL.front()->getType(), VF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                 : TTI::SK_PermuteSingleSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    InstructionCost FirstShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem)
            IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem) {
            Idx -= NewVF;
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost BuildVectorCost = TTI->getScalarizationOverhead(
        MaskVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert((TE->Idx == 0 ||
            (TE->hasState() &&
             TE->getOpcode() == Instruction::ExtractElement) ||
            isSplat(TE->Scalars)) &&
           "Expected splat or extractelements only node.");
    return {};
  }
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }

    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }

    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "Only supported by REVEC.");
      // We don't need to insert elements one by one. Instead, we can insert
      // the entire vector into the destination.
      Cost = 0;
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += ::getShuffleCost(
              *TTI, TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      Cost = TTI->getScalarizationOverhead(VecTy,
                                           /*DemandedElts=*/~ShuffledElements,
                                           /*Insert=*/true,
                                           /*Extract=*/false, CostKind, VL);
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
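// Illustrative example (commentary added for this write-up): gathering
// {%a, %b, %a, poison} prices two inserts (for %a and %b), leaves the
// poison lane free, and charges one SK_PermuteSingleSrc shuffle with mask
// <0, 1, 0, poison> for the duplicated %a.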
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    return *Res;
  }

  // ...
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars,
              [](Value *V) {
                return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
              })) ||
      all_of(E->Scalars,
             [](Value *V) {
               return isa<PoisonValue>(V) ||
                      (!isVectorLikeInstWithConstOps(V) &&
                       isUsedOutsideBlock(V));
             }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();
  if (Res)
    return *Res;

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    if (doesNotNeedToBeScheduled(V))
      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }

  // ...
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from the current block and/or the blocks which
  // are part of the current loop. These instructions will be inserted at the
  // end to make it possible to optimize loops and hoist invariant
  // instructions out of the loop body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Value *V = Scalar;
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op); !IOp || !isDeleted(IOp))
          V = Op;
      }
      Scalar = Builder.CreateIntCast(
          V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // ...
      auto *II = dyn_cast<IntrinsicInst>(Vec);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entries.front()->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions which are/may be part of the loop at the end, to
  // make it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
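// Illustrative example (commentary added for this write-up): gathering
// {%inv, %x} inside a loop where %inv is loop-invariant emits the
// insertelement for %inv first and postpones the one for %x to the end of
// the sequence, so LICM can still hoist the invariant part of the
// buildvector out of the loop.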
14077 bool IsFinalized =
false;
14090 class ShuffleIRBuilder {
14103 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14104 CSEBlocks(CSEBlocks),
DL(
DL) {}
14105 ~ShuffleIRBuilder() =
default;
14108 if (V1->
getType() != V2->getType()) {
14111 "Expected integer vector types only.");
14112 if (V1->
getType() != V2->getType()) {
14113 if (cast<VectorType>(V2->getType())
14115 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
14117 ->getIntegerBitWidth())
14126 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14127 GatherShuffleExtractSeq.
insert(
I);
14128 CSEBlocks.
insert(
I->getParent());
14137 unsigned VF = Mask.size();
14138 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14142 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14143 GatherShuffleExtractSeq.
insert(
I);
14144 CSEBlocks.
insert(
I->getParent());
14148 Value *createIdentity(
Value *V) {
return V; }
14149 Value *createPoison(
Type *Ty,
unsigned VF) {
    /// Resizes the two input vectors to match, if they are not equal yet. The
    /// smaller vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };
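  /// createShuffle forwards to the target-independent two-vector shuffle
  /// emission in BaseShuffleAnalysis, using the IR-emitting ShuffleIRBuilder
  /// above so the resulting instructions are tracked for CSE.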
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }

  /// Cast value \p V to the vector element type \p ScalarTy, if needed.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V, VecTy->getWithNewType(ScalarTy->getScalarType()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
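  /// adjustExtracts rewrites a gather of extractelement instructions into
  /// shuffles of their source vectors: it picks the vectorized base(s),
  /// erases extracts that become dead, and reports via UseVecBaseAsInput
  /// whether a single common base can feed the shuffle directly.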
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only use is vectorized, the extractelement itself can be
      // deleted; keep it otherwise.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTEs.front();
                                            }) &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register shuffle of the vector bases.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask = Mask.slice(
                             P * SliceSize,
                             getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
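  /// needToDelay reports whether emission of this gather must be postponed
  /// because one of the tree entries it depends on has no vectorized value
  /// yet; in that case a stub value is returned and patched up later.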
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order. Emit a stub load as a placeholder.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }
  /// Adds two input vectors (in the form of tree entries) and the mask for
  /// their shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in the form of a tree entry) and the mask
  /// for its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, Mask);
  }
  /// Adds two input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
14432 "castToScalarTyElem expects V1 to be FixedVectorType");
14433 V1 = castToScalarTyElem(V1);
14434 if (InVectors.
empty()) {
14436 CommonMask.
assign(Mask.begin(), Mask.end());
14439 const auto *It =
find(InVectors, V1);
14440 if (It == InVectors.
end()) {
14441 if (InVectors.
size() == 2 ||
14444 if (InVectors.
size() == 2) {
14445 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14446 transformMaskAfterShuffle(CommonMask, CommonMask);
14447 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14448 CommonMask.
size()) {
14449 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14450 transformMaskAfterShuffle(CommonMask, CommonMask);
14452 unsigned VF = std::max(CommonMask.
size(), Mask.size());
14453 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14456 V->getType() != V1->
getType()
14458 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14459 ->getNumElements();
14460 if (V->getType() != V1->
getType())
14461 V1 = createShuffle(V1,
nullptr, Mask);
14462 InVectors.
front() = V;
14463 if (InVectors.
size() == 2)
14464 InVectors.
back() = V1;
14471 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14478 for (
Value *V : InVectors)
14479 VF = std::max(VF, getVF(V));
14480 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14482 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.begin() ? 0 : VF);
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
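  /// finalize flushes the accumulated input vectors and masks: it runs the
  /// optional Action callback on the combined vector, inserts the combined
  /// subvectors, applies the external extension mask, and returns the final
  /// shuffled value.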
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                                     if (isa<PoisonValue>(V))
                                       return false;
                                     return !isKnownNonNegative(
                                         V, SimplifyQuery(*R.DL));
                                   }));
          const unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
                        _3));
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return any_of(VE->UserTreeIndices,
                  [E, NodeIdx](const EdgeInfo &EI) {
                    return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                  }) ||
           any_of(VectorizableTree,
                  [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->isOperandGatherNode(
                               {const_cast<TreeEntry *>(E), NodeIdx}) &&
                           VE->isSame(TE->Scalars);
                  });
  };
  TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
  if (VE && CheckSameVE(VE))
    return VE;
  return nullptr;
}
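// vectorizeOperand returns the vectorized value for the NodeIdx-th operand of
// E, either by reusing a matched vectorized entry (reshuffling it to the
// expected vectorization factor) or by emitting the corresponding operand
// gather node.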
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      // V may be affected by MinBWs; take the element type from the value.
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // Reshuffle to get only unique values.
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Need to update the operand gather node, if actually the operand is not a
    // vectorized node, but a buildvector/gather node, which matches one of the
    // vectorized nodes.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  // Check for reused splats among the gathered non-constant scalars.
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                   }) != TE->UserTreeIndices.end();
          });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
  SmallVector<Value *> StoredGS(GatheredScalars);
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(TEs.begin(), TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Check matching vectorized entries only after extracts were handled.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars, E->Idx) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        // Full matched entry found, no need to insert subvectors.
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  // TryPackScalars packs constants and unique scalars into the front of the
  // vector and builds the matching reuse mask.
  auto &&TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                              SmallVectorImpl<int> &ReuseMask,
                              bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (std::next(Scalars.begin()) == Scalars.end() ||
                    count(Scalars, Scalars.front()) > 1);
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-constant values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast, which
      // is valid if the broadcasted value is guaranteed non-poisonous, or by
      // freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes already.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by poisons, emit broadcast and then emit freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than one scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants keeps only non-constant values; GatheredScalars keeps only
    // constants, to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
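// createBuildVector is the IR-emitting instantiation of processBuildVector;
// combined subnodes are vectorized first so their values can later be
// inserted as subvectors.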
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}

/// \returns \p I after propagating metadata from \p VL only for instructions
/// in \p VL.
static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
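  // Each opcode below emits the vector counterpart of the bundle, applies
  // FinalShuffle for reuses/reordering, caches the result in
  // E->VectorizedValue, and bumps NumVectorInstructions.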
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      // Adjust insertion point once all PHIs have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());

      V = FinalShuffle(NewPhi, E);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
      V = TEs.front()->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is an i1 vector narrower than the operands, the
      // condition value has to be replicated per operand element.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          // The masking 'and' is redundant after bitwidth minimization.
          V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      std::optional<int> Diff =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE,
                              &*Builder.GetInsertPoint());
        StrideVal = Builder.CreateMul(
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true),
            ConstantInt::get(StrideTy,
                             (IsReverseOrder ? -1 : 1) *
                                 static_cast<int>(
                                     DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(/*ArgNo=*/0,
                         Attribute::getWithAlignment(Inst->getContext(),
                                                     CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (isa<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // CreateMaskedGather expects VecTy and VecPtr to have the same size;
        // expand VecPtr if ScalarTy is a vector type.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(/*ArgNo=*/1,
                         Attribute::getWithAlignment(Inst->getContext(),
                                                     CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if the intrinsic is overloaded on it.
    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
      TysForDecl.push_back(VecTy);
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. Those arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      // Emit a call to the library vector variant.
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
               "Not supported shufflevector usage.");
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        RHS = vectorizeOperand(E, 1, PostponedPHIs);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
      }
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis, to try to find and remove a matching
      // instruction, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to each
      // vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
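// Driver for the whole tree: schedules the bundles per block, emits each tree
// entry, fixes up postponed gathers and PHIs, extracts externally used
// scalars, and finally erases the now-dead scalar instructions.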
Value *
BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                       Instruction *ReductionRoot) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean the Entry-to-LastInstruction table; it can be affected after
  // scheduling and needs to be rebuilt.
  EntryToLastInstruction.clear();

  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHI operands to avoid cyclic dependency issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary stub values with the actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getSameValuesTreeEntry(
            TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
                                     TE->UserTreeIndices.front().EdgeIdx));
        VecTE && VecTE->isSame(TE->Scalars))
      // Found a gather node which is absolutely the same as one of the
      // vectorized nodes. It may happen after reordering.
      continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator, since the source vector feeding this
    // gather node may have been emitted after the stub instruction.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        // Scan through the gather nodes.
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
          IsSigned =
              IsSigned.value_or(false) ||
              !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
          continue;
        }
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
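  // Emit extracts for every scalar that is still used outside the tree,
  // extending it back to the original width when minimum-bitwidth analysis
  // narrowed the vectorized value, and reusing a single extract per block.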
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it is cheaper than extracts.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            assert(SLPReVec && "FixedVectorType is not expected.");
            unsigned VecTyNumElements = VecTy->getNumElements();
            // When REVEC is enabled, we need to extract a vector.
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot to the larger
          // type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in the vectorized
    // instructions or is used as an extra argument. Generate an
    // ExtractElement instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert((ExternallyUsedValues.count(Scalar) ||
              ExternalUsesAsOriginalScalar.contains(Scalar) ||
              any_of(Scalar->users(),
                     [&](llvm::User *U) {
                       if (ExternalUsesAsOriginalScalar.contains(U))
                         return true;
                       ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                       return !UseEntries.empty() &&
                              (E->State == TreeEntry::Vectorize ||
                               E->State == TreeEntry::StridedVectorize) &&
                              any_of(UseEntries, [&, TTI = TTI](
                                                     TreeEntry *UseEntry) {
                                return (UseEntry->State ==
                                            TreeEntry::Vectorize ||
                                        UseEntry->State ==
                                            TreeEntry::StridedVectorize) &&
                                       doesInTreeUserNeedToExtract(
                                           Scalar,
                                           getRootEntryInstruction(*UseEntry),
                                           TLI, TTI);
                              });
                     })) &&
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
             "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }
    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if two insertelements are from the same
                  // buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }
    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }

    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<int> Mask, ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

          // It is legal to delete users in the ignorelist.
          assert((isVectorized(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() &&
            !IE->UserTreeIndices.empty() &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             any_of(IE->UserTreeIndices,
                    [&](const EdgeInfo &EI) {
                      return EI.UserTE == VectorizableTree.front().get() &&
                             EI.EdgeIdx == UINT_MAX;
                    }))) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition of a logical op in select form.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with a non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    // Cast the final reduction value to the expected reduction bit width.
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or any operand we would hoist is defined inside this
    // loop, we can't hoist.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }
  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Between two shuffles, one is "less defined" if it has the same vector
  // operands and its mask indices are either the same as in the other one or
  // undef. Such a shuffle can be replaced by the more defined copy.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform an O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the previously
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          // The existing instruction is less defined; replace it with In
          // instead.
          V->replaceAllUsesWith(&In);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    // ...
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle)
      PrevInBundle->NextInBundle = BundleMember;
    else
      Bundle = BundleMember;

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it's important
    // that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    // ...
    if (!extendSchedulingRegion(V, S)) {
      // If the region cannot be extended, still recalculate dependencies:
      // otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    // ...
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
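// Illustrative note (not from the original source): a bundle typically fails
// to become ready when its members sit on a dependency cycle through scalar
// users, e.g. when one member's result feeds, through intermediate scalars,
// an operand of another member. No placement of the bundle as a single unit
// satisfies both edges, so scheduling is cancelled and std::nullopt is
// returned to make the caller gather instead.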
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
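// ScheduleData objects are allocated in fixed-size chunks and never freed
// individually, so pointers handed out here stay valid for the lifetime of
// the block scheduler even as more chunks are appended.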
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // assume-like intrinsics so they don't count against the region budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore)
        CurrentLoadStore->NextLoadStore = SD;
      else
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      }

      if (RegionHasStackSave) {
        // An inalloca alloca must be scheduled after any preceding stacksave
        // and must not be reordered above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;
            if (!isa<AllocaInst>(I))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // Similarly, prevent allocas and loads/stores from moving below a
        // stacksave or a stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
        // Only add dependencies that may alias, up to the distance and
        // alias-check limits.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // ...
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
        // ...
        ++DistToSrc;
      }
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.insert(SD);
      // ...
    }
  }
}
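// Worked example for the memory edges above (illustrative, not from the
// original source): given
//   store i32 0, ptr %p      ; BundleMember
//   %v = load i32, ptr %q    ; a later entry on the NextLoadStore chain
// a may-alias result makes the store record a dependency on the load
// (MemoryDependencies plus the Dependencies counter), so the bottom-up
// scheduler must place the load below the store before the store becomes
// ready, and the two memory operations can never swap.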
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      assert(SD->isPartOfBundle() == !getTreeEntries(SD->Inst).empty() &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      // ...
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
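// The priority scheme above keeps the final schedule close to source order:
// SchedulingPriority records each bundle head's original position in the
// block, and the comparator orders the ready set so that later bundles are
// picked first, which matches the bottom-up placement at LastScheduledInst.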
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type, since we want to base the vector element size on
  // the width of memory operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip it.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load/extract, update Width to reflect
    // the width of the accessed value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    else {
      // Otherwise, visit the operands of the instruction: queue operands
      // from the same basic block (or through PHI nodes).
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    }
  }

  // If we didn't encounter a memory access in the expression tree, just
  // return the width of V.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
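// Worked example (illustrative, not from the original source): for
//   %l = load i16, ptr %p
//   %e = sext i16 %l to i64
// querying the element size of %e walks through the sext to the i16 load,
// so the answer is 16 bits rather than the 64 bits of %e's own type,
// letting the vectorizer pick a VF based on the narrower memory width.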
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...
  // Check if the node was analyzed already and must keep its bit width.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      // ...
      if (auto *I = dyn_cast<Instruction>(V)) {
        APInt Mask = DB->getDemandedBits(I);
        unsigned BitWidth2 =
            std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
        while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
          // ...
        }
        BitWidth1 = std::min(BitWidth1, BitWidth2);
      }
      BitWidth = std::max(BitWidth, BitWidth1);
    }
    return true;
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bit widths below OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          NeedToExit = true;
          return false;
        }
        // ...
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // ...
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  switch (E.getOpcode()) {
  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and it is a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate an lshr to a smaller lshr iff the bits we would
    // otherwise shift in are already zero.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an ashr iff the truncated bits are all sign bits of
    // the original value.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned VF = E.Scalars.size();
    // Choose the best bit width based on cost estimations; the interior of
    // this search is elided in this listing:
    //   ... if (Cost < BestCost) { BestCost = Cost; BestBitWidth = BitWidth; }
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      // ...
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
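// Worked example (illustrative, not from the original source): in
//   %x = zext i8 %a to i32
//   %y = zext i8 %b to i32
//   %s = add i32 %x, %y
//   %t = trunc i32 %s to i8
// collectValuesToDemote can mark the add and both extensions as demotable
// to 8 bits, because every use is ultimately truncated; MinBWs then lets
// the whole tree be emitted as <N x i8> operations instead of <N x i32>.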
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a combined shuffle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx > NodeIdx;
                              })))
    return;

  // The first value node for a store/insertelement may be a trunc; mark it
  // as a truncation root.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 none_of(V->users(), [&](User *U) {
                   ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                   const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                   if (TEs.empty() || is_contained(TEs, UserTE))
                     return false;
                   if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(U) ||
                       !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(UserTE->getMainOp()))
                     return true;
                   unsigned UserTESz = DL->getTypeSizeInBits(
                       UserTE->Scalars.front()->getType());
                   if (all_of(TEs, [&](const TreeEntry *TE) {
                         auto It = MinBWs.find(TE);
                         return It != MinBWs.end() &&
                                It->second.first > UserTESz;
                       }))
                     return true;
                   return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                 });
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading sign bit is
    // known zero we can safely zero-extend.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove the sign bit is zero, add one bit to preserve it so
      // we can safely sign-extend the root back to the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large but the reduced type does not improve
    // register use, ignore it. (Part of this check is elided in this
    // listing.)
    unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
    if (NumParts > 1 &&
        NumParts == TTI->getNumberOfParts(getWidenedType(
                        IntegerType::get(F->getContext(),
                                         bit_ceil(MaxBitWidth)),
                        VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (/* reduction kind permits demanded-bits analysis; condition
               elided in this listing */ true) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            unsigned OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(
              VectorizableTree[NodeIdx]->UserTreeIndices,
              [&](const EdgeInfo &EI) {
                return (EI.UserTE->hasState() &&
                        EI.UserTE->getOpcode() == Instruction::ICmp) &&
                       any_of(EI.UserTE->Scalars, [&](Value *V) {
                         auto *IC = dyn_cast<ICmpInst>(V);
                         return IC &&
                                (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                       });
              });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // ... (fetch SE, TTI, TLI, AA, LI, DT, AC, DB, ORE from the analysis
  // manager)
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  // ...
  DL = &F.getDataLayout();
  // ...
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    // ...
  }
  return Changed;
}
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  // ...
  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alternate opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size());
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree is tiny and the store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
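// Illustrative walk-through (not from the original source): four consecutive
// i32 stores at offsets 0, 4, 8 and 12 from %p form a chain with VF = 4; if
// buildTree finds a profitable graph for the four stored values, the chain
// is replaced by a single <4 x i32> store, and the remark above reports the
// negative cost that justified it.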
/// Checks if the quadratic mean deviation of the recorded tree sizes is small
/// relative to their mean.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
}
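// The final check is an integer form of a relative-variance test: Dev is the
// variance of the recorded tree sizes, and "Dev * 81 / (Mean * Mean) == 0"
// holds exactly when Dev < Mean^2 / 81, i.e. when the standard deviation is
// below roughly one ninth of the mean. Only store ranges whose tree sizes
// are this uniform are considered as one candidate block.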
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into the list of operands.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          2, R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      // Build the list of candidate VFs, from MinVF up to MaxVF (and the
      // optional non-power-of-2 VF).
      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                // Remember the slice as unschedulable for this VF range.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  // ...
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // For the very large VFs, check that we are not rebuilding the
              // same trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or no need for the last attempt.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        // ...
        CandidateVFs.clear();
        if (/* a non-power-of-2 limit applies; condition elided */ false)
          CandidateVFs.push_back(MaxTotalNum);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the maximal number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Stores pair (first: index of the store into the Stores array ref, address
  // of which is taken as base, second: sorted set of pairs {index, dist},
  // which are indices of stores in the set and their store location distances
  // relative to the base address).
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Hit the same distance again - try to vectorize what we have and
      // restart the set from this store.
      TryToVectorize(Set.second);
      unsigned ItIdx = It->first;
      int ItDist = It->second;
      StoreIndexToDistSet PrevSet;
      copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
              [&](const std::pair<unsigned, int> &Pair) {
                return Pair.first > ItIdx;
              });
      Set.second.clear();
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match to try to vectorize
      // them with this store.
      unsigned StartIdx = ItIdx + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences we already tried.
        if (VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - ItDist;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      // ...
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      // ...
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal llvm type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = /* next smaller candidate VF */ VF / 2) {
    // ...
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
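// Example (illustrative, not from the original source): for a root
//   %r = add i32 %a, %b
// with %a = mul i32 .. and %b = mul i32 .., the candidate list starts with
// the pair (%a, %b); if %b has a single use, %b's own operands are also
// offered as pairings with %a, and findBestRootPair picks the pairing whose
// operands look most alike before tryToVectorizeList is invoked on it.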
/// Model horizontal reductions: trees of reduction operations whose leaves
/// can be put into a vector.
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operations.
  DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return isa<SelectInst>(I) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not have to
      // rule out -0.0 here because the intrinsic semantics do not specify a
      // fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y; to make that work
    // with the normal operand processing, skip the true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    Type *OpTy = LHS->getType();
    switch (Kind) {
    case RecurKind::Or: {
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    }
    case RecurKind::And: {
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode, propagating the IR
  /// flags from \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
        propagateIRFlags(Op, ReductionOps[1]);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0]);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;

    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;

    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;

    // This matches either cmp+select or intrinsics.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Handle selects whose compare operands no longer textually match the
      // select operands (e.g. after GVN replaced one side with an equivalent
      // extractelement). The detailed matching is elided in this listing; it
      // only accepts identical extractelement pairs and then classifies the
      // predicate:
      //   SGT/SGE -> SMax, SLT/SLE -> SMin,
      //   UGT/UGE -> UMax, ULT/ULE -> UMin,
      // anything else -> RecurKind::None.
      // ...
      return RecurKind::None;
    }
    return RecurKind::None;
  }

  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

private:
  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have a
      // single use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && /* further instruction-based condition elided in this
                    listing */ false);
  }
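  // Note on the helpers above (summary): integer min/max reductions may
  // appear either as cmp+select pairs or as min/max intrinsics. The helpers
  // therefore track the select *and* its compare as reduction ops (two
  // ReductionOps lists), require the select to have exactly two uses with a
  // single-use condition, and re-emit reduction steps via createOp in
  // whichever of the two forms the input originally used.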
public:
  HorizontalReduction() = default;

  /// Try to find a reduction tree rooted at \p Root.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers. (The exact type check is elided in this listing.)
    // ...

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it differs from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by their
    // pointer operands.
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI))
              return hash_value(RLI->getPointerOperand());
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by number of same/alternate opcode and/or
    // pointer operand.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
19816 constexpr unsigned RegMaxNumber = 4;
19817 constexpr unsigned RedValsMaxNumber = 128;
19821 if (
unsigned NumReducedVals = std::accumulate(
19822 ReducedVals.
begin(), ReducedVals.
end(), 0,
19824 if (!isGoodForReduction(Vals))
19826 return Num + Vals.size();
19828 NumReducedVals < ReductionLimit &&
19832 for (ReductionOpsType &RdxOps : ReductionOps)
19833 for (
Value *RdxOp : RdxOps)
19834 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19845 ReducedVals.
front().size());
19849 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19850 assert(isa<SelectInst>(RdxRootInst) &&
19851 "Expected min/max reduction to have select root instruction");
19852 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19853 assert(isa<Instruction>(ScalarCond) &&
19854 "Expected min/max reduction to have compare condition");
19855 return cast<Instruction>(ScalarCond);
19858 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19859 return isBoolLogicOp(cast<Instruction>(V));
19862 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19863 if (VectorizedTree) {
19866 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19867 if (AnyBoolLogicOp) {
19868 auto It = ReducedValsToOps.
find(VectorizedTree);
19869 auto It1 = ReducedValsToOps.
find(Res);
19870 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19872 (It != ReducedValsToOps.
end() &&
19874 return isBoolLogicOp(I) &&
19875 getRdxOperand(I, 0) == VectorizedTree;
19879 (It1 != ReducedValsToOps.
end() &&
19881 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
VectorizedTree = Builder.CreateFreeze(VectorizedTree);
return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
ReductionOps.front().size());
for (ReductionOpsType &RdxOps : ReductionOps)
  for (Value *RdxOp : RdxOps) {
    IgnoreList.insert(RdxOp);
for (Value *U : IgnoreList)
  if (auto *FPMO = dyn_cast<FPMathOperator>(U))
    RdxFMF &= FPMO->getFastMathFlags();
bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
for (Value *V : Candidates)
  TrackedVals.try_emplace(V, V);
Value *V) -> unsigned & {
  auto *It = MV.find(V);
  assert(It != MV.end() && "Unable to find given key.");
bool CheckForReusedReductionOps = false;
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
  InstructionsState S = States[I];
  for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
    Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
    auto *Inst = dyn_cast<Instruction>(RdxVal);
    (!S || !S.isOpcodeOrAlt(Inst))) ||
    TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
  bool ShuffledExtracts = false;
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      !S.isAltShuffle() && I + 1 < E) {
    for (Value *RV : ReducedVals[I + 1]) {
      Value *RdxVal = TrackedVals.at(RV);
      auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
      CommonCandidates.push_back(RdxVal);
      TrackedToOrig.try_emplace(RdxVal, RV);
    Candidates.swap(CommonCandidates);
    ShuffledExtracts = true;
  Value *OrigV = TrackedToOrig.at(Candidates.front());
  ++VectorizedVals.try_emplace(OrigV).first->getSecond();
  Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
  Value *OrigV = TrackedToOrig.at(VC);
  ++VectorizedVals.try_emplace(OrigV).first->getSecond();
  if (auto *ResI = dyn_cast<Instruction>(Res))
    V.analyzedReductionRoot(ResI);
  VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
  unsigned NumReducedVals = Candidates.size();
  if (NumReducedVals < ReductionLimit &&
      (NumReducedVals < 2 || !isSplat(Candidates)))
  IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                RdxKind != RecurKind::FMul &&
                                RdxKind != RecurKind::FMulAdd;
  if (IsSupportedHorRdxIdentityOp)
    for (Value *V : Candidates) {
      Value *OrigV = TrackedToOrig.at(V);
      ++SameValuesCounter.try_emplace(OrigV).first->second;
  bool SameScaleFactor = false;
  bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                          SameValuesCounter.size() != Candidates.size();
  if (OptReusedScalars) {
    (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
     RdxKind == RecurKind::Xor) &&
    [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
      return P.second == SameValuesCounter.front().second;
    Candidates.resize(SameValuesCounter.size());
    transform(SameValuesCounter, Candidates.begin(),
              [&](const auto &P) { return TrackedVals.at(P.first); });
    NumReducedVals = Candidates.size();
  if (NumReducedVals == 1) {
    Value *OrigV = TrackedToOrig.at(Candidates.front());
    unsigned Cnt = At(SameValuesCounter, OrigV);
    emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
    VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
    VectorizedVals.try_emplace(OrigV, Cnt);
    ExternallyUsedValues.insert(OrigV);
  unsigned MaxVecRegSize = V.getMaxVecRegSize();
  unsigned EltSize = V.getVectorElementSize(Candidates[0]);
  const unsigned MaxElts = std::clamp<unsigned>(
      RegMaxNumber * RedValsMaxNumber);
  unsigned ReduxWidth = NumReducedVals;
  auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
    unsigned NumParts, NumRegs;
    Type *ScalarTy = Candidates.front()->getType();
    while (NumParts > NumRegs) {
      assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
      ReduxWidth = bit_floor(ReduxWidth - 1);
    if (NumParts > NumRegs / 2)
  ReduxWidth = GetVectorFactor(ReduxWidth);
  ReduxWidth = std::min(ReduxWidth, MaxElts);
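// Illustrative sketch (not part of the original file): how GetVectorFactor
// shrinks the reduction width. Each step drops ReduxWidth to the largest
// power of two strictly below it until the resulting vector fits the
// register budget; PartsForWidth is a hypothetical stand-in for the TTI
// register-count query.
#include <bit>
#include <cassert>

unsigned shrinkReduxWidth(unsigned ReduxWidth, unsigned NumRegs,
                          auto PartsForWidth) {
  unsigned NumParts = PartsForWidth(ReduxWidth);
  while (NumParts > NumRegs) {
    assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
    // bit_floor(W - 1) is the largest power of two strictly below W.
    ReduxWidth = std::bit_floor(ReduxWidth - 1);
    NumParts = PartsForWidth(ReduxWidth);
  }
  return ReduxWidth;
}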
unsigned Start = 0;
unsigned Pos = Start;
unsigned PrevReduxWidth = ReduxWidth;
bool CheckForReusedReductionOpsLocal = false;
auto AdjustReducedVals = [&](bool IgnoreVL = false) {
  bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
  if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
    CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
  if (Pos < NumReducedVals - ReduxWidth + 1)
    return IsAnyRedOpGathered;
  if (ReduxWidth > 1)
    ReduxWidth = GetVectorFactor(ReduxWidth);
  return IsAnyRedOpGathered;
bool AnyVectorized = false;
while (Pos < NumReducedVals - ReduxWidth + 1 &&
       ReduxWidth >= ReductionLimit) {
  if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
    CheckForReusedReductionOps = true;
  PrevReduxWidth = ReduxWidth;
  if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
      std::make_pair(Pos, bit_floor(ReduxWidth))) ||
      std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
      V.areAnalyzedReductionVals(VL)) {
    (void)AdjustReducedVals(true);
  auto *RedValI = dyn_cast<Instruction>(RedVal);
  return V.isDeleted(RedValI);
  V.buildTree(VL, IgnoreList);
  if (V.isTreeTinyAndNotFullyVectorizable(true)) {
    if (!AdjustReducedVals())
      V.analyzedReductionVals(VL);
  if (V.isLoadCombineReductionCandidate(RdxKind)) {
    if (!AdjustReducedVals())
      V.analyzedReductionVals(VL);
  V.reorderTopToBottom();
  V.reorderBottomToTop(true);
  ExternallyUsedValues);
  LocalExternallyUsedValues.insert(ReductionRoot);
  for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
    if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
    for (Value *V : ReducedVals[Cnt])
      if (isa<Instruction>(V))
        LocalExternallyUsedValues.insert(TrackedVals[V]);
  if (!IsSupportedHorRdxIdentityOp) {
    "Reused values counter map is not empty");
    for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
      if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
      Value *V = Candidates[Cnt];
      Value *OrigV = TrackedToOrig.at(V);
      ++SameValuesCounter.try_emplace(OrigV).first->second;
  V.transformNodes();
  for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
    if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
    Value *RdxVal = Candidates[Cnt];
    if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
      RdxVal = It->second;
    if (!Visited.insert(RdxVal).second)
    if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
      LocalExternallyUsedValues.insert(RdxVal);
    Value *OrigV = TrackedToOrig.at(RdxVal);
    VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
    if (NumOps != ReducedValsToOps.at(OrigV).size())
      LocalExternallyUsedValues.insert(RdxVal);
  if (!IsSupportedHorRdxIdentityOp)
    SameValuesCounter.clear();
  for (Value *RdxVal : VL)
    if (RequiredExtract.contains(RdxVal))
      LocalExternallyUsedValues.insert(RdxVal);
  V.buildExternalUses(LocalExternallyUsedValues);
  V.computeMinimumValueSizes();
  getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
  << " for reduction\n");
  V.getORE()->emit([&]() {
    ReducedValsToOps.at(VL[0]).front())
    << "Vectorizing horizontal reduction is possible "
    << "but not beneficial with cost " << ore::NV("Cost", Cost)
    << " and threshold "
  if (!AdjustReducedVals()) {
    V.analyzedReductionVals(VL);
    unsigned Offset = Pos == Start ? Pos : Pos - 1;
    if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
      *TTI, VL.front()->getType(), ReduxWidth - 1);
      VF >= ReductionLimit;
      *TTI, VL.front()->getType(), VF - 1)) {
      V.getCanonicalGraphSize() != V.getTreeSize())
      for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
  LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                    << Cost << ". (HorRdx)\n");
  V.getORE()->emit([&]() {
    ReducedValsToOps.at(VL[0]).front())
    << "Vectorized horizontal reduction with cost "
    << ore::NV("Cost", Cost) << " and with tree size "
    << ore::NV("TreeSize", V.getTreeSize());
  Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
  if (IsCmpSelMinMax)
    InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
  Value *VectorizedRoot =
      V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
  for (Value *RdxVal : Candidates) {
    Value *OrigVal = TrackedToOrig.at(RdxVal);
    Value *TransformedRdxVal = TrackedVals.at(OrigVal);
    if (TransformedRdxVal != RdxVal)
      TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
  VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
  if (OptReusedScalars && !SameScaleFactor) {
    VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                   SameValuesCounter, TrackedToOrig);
  Value *ReducedSubTree;
  Type *ScalarTy = VL.front()->getType();
  if (isa<FixedVectorType>(ScalarTy)) {
    for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
      emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
  ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
  if (ReducedSubTree->getType() != VL.front()->getType()) {
    assert(ReducedSubTree->getType() != VL.front()->getType() &&
           "Expected different reduction type.");
    Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                          V.isSignedMinBitwidthRootNode());
  if (OptReusedScalars && SameScaleFactor)
    ReducedSubTree = emitScaleForReusedOps(
        ReducedSubTree, Builder, SameValuesCounter.front().second);
  VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
  for (Value *RdxVal : VL) {
    Value *OrigV = TrackedToOrig.at(RdxVal);
    if (IsSupportedHorRdxIdentityOp) {
      VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
    ++VectorizedVals.try_emplace(OrigV).first->getSecond();
    if (!V.isVectorized(RdxVal))
      RequiredExtract.insert(RdxVal);
  ReduxWidth = NumReducedVals - Pos;
  if (ReduxWidth > 1)
    ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
  AnyVectorized = true;
if (OptReusedScalars && !AnyVectorized) {
  for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
    Value *RdxVal = TrackedVals.at(P.first);
    Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
    VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
    VectorizedVals.try_emplace(P.first, P.second);
if (VectorizedTree) {
  if (!AnyBoolLogicOp)
  if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
      getRdxOperand(RedOp1, 0) == LHS ||
  if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
      getRdxOperand(RedOp2, 0) == RHS ||
  if (LHS != VectorizedTree)
  unsigned Sz = InstVals.size();
  for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
    Value *RdxVal1 = InstVals[I].second;
    Value *StableRdxVal1 = RdxVal1;
    auto It1 = TrackedVals.find(RdxVal1);
    if (It1 != TrackedVals.end())
      StableRdxVal1 = It1->second;
    Value *RdxVal2 = InstVals[I + 1].second;
    Value *StableRdxVal2 = RdxVal2;
    auto It2 = TrackedVals.find(RdxVal2);
    if (It2 != TrackedVals.end())
      StableRdxVal2 = It2->second;
    FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
    Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                               StableRdxVal2, "op.rdx", ReductionOps);
    ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
  ExtraReds[Sz / 2] = InstVals.back();
  ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
  for (Value *RdxVal : Candidates) {
    if (!Visited.insert(RdxVal).second)
    unsigned NumOps = VectorizedVals.lookup(RdxVal);
  bool InitStep = true;
  while (ExtraReductions.size() > 1) {
    FinalGen(ExtraReductions, InitStep);
    ExtraReductions.swap(NewReds);
  VectorizedTree = ExtraReductions.front().second;
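// Illustrative sketch: the FinalGen loop above halves the worklist of
// partial reduction values each round by combining adjacent pairs; an odd
// trailing element is carried over unchanged until only one value remains.
// Combine is a hypothetical stand-in for createOp.
#include <functional>
#include <vector>

int reducePairwise(std::vector<int> Vals,
                   const std::function<int(int, int)> &Combine) {
  if (Vals.empty())
    return 0;
  while (Vals.size() > 1) {
    std::vector<int> Next;
    unsigned Sz = Vals.size();
    for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2)
      Next.push_back(Combine(Vals[I], Vals[I + 1]));
    if (Sz % 2 == 1)
      Next.push_back(Vals.back()); // Odd element survives to the next round.
    Vals.swap(Next);
  }
  return Vals.front();
}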
ReductionRoot->replaceAllUsesWith(VectorizedTree);
IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
for (auto *U : Ignore->users()) {
  "All users must be either in the reduction ops list.");
if (!Ignore->use_empty()) {
  Ignore->replaceAllUsesWith(P);
V.removeInstructionsAndOperands(RdxOps);
} else if (!CheckForReusedReductionOps) {
  for (ReductionOpsType &RdxOps : ReductionOps)
    for (Value *RdxOp : RdxOps)
      V.analyzedReductionRoot(cast<Instruction>(RdxOp));
return VectorizedTree;
Type *ScalarTy = ReducedVals.front()->getType();
unsigned ReduxWidth = ReducedVals.size();
int Cnt = ReducedVals.size();
for (Value *RdxVal : ReducedVals) {
  Cost += GenCostFn();
  auto *RdxOp = cast<Instruction>(U);
  if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
    Cost += ScalarCost;
  Cost += GenCostFn();
case RecurKind::Add:
case RecurKind::Mul:
case RecurKind::Or:
case RecurKind::And:
case RecurKind::Xor:
case RecurKind::FAdd:
case RecurKind::FMul: {
  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    for (unsigned I : seq<unsigned>(ReducedVals.size())) {
  auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
      std::make_pair(RedTy, true));
  if (RType == RedTy) {
  ScalarCost = EvaluateScalarCost([&]() {
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin: {
  ScalarCost = EvaluateScalarCost([&]() {
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                  << " (It is a splitting reduction)\n");
return VectorCost - ScalarCost;
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(RdxKind != RecurKind::FMulAdd &&
       "A call to the llvm.fmuladd intrinsic is not handled yet");
auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
if (FTy->getScalarType() == Builder.getInt1Ty() &&
    RdxKind == RecurKind::Add &&
  VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
++NumVectorInstructions;
++NumVectorInstructions;
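// Illustrative sketch: the i1 special case above bitcasts a <N x i1> mask to
// an N-bit integer; an add reduction over one-bit lanes can then be done as
// a population count, since the sum of the lanes equals the number of set
// bits. Plain-integer version, assuming the mask already sits in a uint32_t.
#include <bit>
#include <cstdint>

unsigned addReduceI1Mask(uint32_t MaskBits, unsigned NumLanes) {
  // Keep only the lanes that actually exist, then count the set bits.
  uint32_t Live = NumLanes >= 32
                      ? MaskBits
                      : MaskBits & ((uint32_t{1} << NumLanes) - 1);
  return static_cast<unsigned>(std::popcount(Live));
}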
assert(IsSupportedHorRdxIdentityOp &&
       "The optimization of matched scalar identity horizontal reductions "
       "must be supported.");
return VectorizedValue;
case RecurKind::Add: {
  Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
  << VectorizedValue << ". (HorRdx)\n");
  return Builder.CreateMul(VectorizedValue, Scale);
case RecurKind::Xor: {
  LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                    << ". (HorRdx)\n");
  return VectorizedValue;
case RecurKind::FAdd: {
  Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
  << VectorizedValue << ". (HorRdx)\n");
  return Builder.CreateFMul(VectorizedValue, Scale);
case RecurKind::And:
case RecurKind::Or:
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
  return VectorizedValue;
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
case RecurKind::IFindLastIV:
case RecurKind::FFindLastIV:
case RecurKind::None:
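// Illustrative sketch: the scalar identities emitScaleForReusedOps relies on
// when one value X occurs Cnt times in a reduction. Add/FAdd scale by
// multiplication, Xor depends only on the parity of the count, and the
// idempotent kinds (and/or/min/max) ignore repetition entirely.
#include <cstdint>

int64_t scaleRepeatedAdd(int64_t X, unsigned Cnt) { return X * int64_t(Cnt); }

int64_t scaleRepeatedXor(int64_t X, unsigned Cnt) {
  return (Cnt % 2 == 0) ? 0 : X; // x ^ x == 0, so only parity matters.
}

double scaleRepeatedFAdd(double X, unsigned Cnt) { return X * double(Cnt); }

int64_t scaleRepeatedIdempotent(int64_t X, unsigned /*Cnt*/) {
  return X; // min(x, x, ...) == x; likewise for max, and, or.
}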
assert(IsSupportedHorRdxIdentityOp &&
       "The optimization of matched scalar identity horizontal reductions "
       "must be supported.");
auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
if (VTy->getElementType() != VL.front()->getType()) {
  R.isSignedMinBitwidthRootNode());
case RecurKind::Add: {
  for (Value *V : VL) {
    unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
    Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
  << VectorizedValue << ". (HorRdx)\n");
  return Builder.CreateMul(VectorizedValue, Scale);
case RecurKind::And:
case RecurKind::Or:
  << ". (HorRdx)\n");
  return VectorizedValue;
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
  << ". (HorRdx)\n");
  return VectorizedValue;
case RecurKind::Xor: {
  cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
  std::iota(Mask.begin(), Mask.end(), 0);
  bool NeedShuffle = false;
  for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
    unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
    if (Cnt % 2 == 0) {
      NeedShuffle = true;
  dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
  ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
  return VectorizedValue;
case RecurKind::FAdd: {
  for (Value *V : VL) {
    unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
    Vals.push_back(ConstantFP::get(V->getType(), Cnt));
  return Builder.CreateFMul(VectorizedValue, Scale);
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
case RecurKind::IFindLastIV:
case RecurKind::FFindLastIV:
case RecurKind::None:
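// Illustrative sketch: the mask built for the Xor case above. Starting from
// the identity mask, every lane whose scalar occurs an even number of times
// is redirected into an all-zero second vector, because x ^ x ^ ... cancels
// in pairs. Counts is a hypothetical per-lane view of SameValuesCounter.
#include <numeric>
#include <vector>

std::vector<int> buildXorParityMask(const std::vector<unsigned> &Counts) {
  std::vector<int> Mask(Counts.size());
  std::iota(Mask.begin(), Mask.end(), 0); // Identity shuffle to start with.
  int ZeroBase = static_cast<int>(Counts.size()); // Lanes of the zero vector.
  for (int I = 0, VF = static_cast<int>(Counts.size()); I < VF; ++I)
    if (Counts[I] % 2 == 0)
      Mask[I] = ZeroBase + I; // Even count: select the zero lane instead.
  return Mask;
}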
return HorizontalReduction::getRdxKind(V);
if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
  return cast<FixedVectorType>(IE->getType())->getNumElements();
unsigned AggregateSize = 1;
auto *IV = cast<InsertValueInst>(InsertInst);
Type *CurrentType = IV->getType();
if (auto *ST = dyn_cast<StructType>(CurrentType)) {
  for (auto *Elt : ST->elements())
    if (Elt != ST->getElementType(0))
      return std::nullopt;
  AggregateSize *= ST->getNumElements();
  CurrentType = ST->getElementType(0);
} else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
  AggregateSize *= AT->getNumElements();
  CurrentType = AT->getElementType();
} else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
  AggregateSize *= VT->getNumElements();
  return AggregateSize;
return AggregateSize;
return std::nullopt;
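// Illustrative sketch of the getAggregateSize walk above: multiply element
// counts down through nested arrays and homogeneous structs, stop at a
// vector or scalar leaf, and give up on heterogeneous structs. Ty is a tiny
// hypothetical stand-in for llvm::Type; the homogeneity check here is
// shallower than the real type-pointer comparison.
#include <optional>
#include <vector>

struct Ty {
  enum { Scalar, Vector, Array, Struct } Kind = Scalar;
  unsigned Count = 1;     // Lane or element count at this level.
  std::vector<Ty> Fields; // Struct fields; Array uses Fields[0] as element.
};

std::optional<unsigned> aggregateSize(const Ty &T) {
  unsigned Size = 1;
  const Ty *Cur = &T;
  while (true) {
    switch (Cur->Kind) {
    case Ty::Struct:
      if (Cur->Fields.empty())
        return std::nullopt;
      for (const Ty &F : Cur->Fields)
        if (F.Kind != Cur->Fields.front().Kind ||
            F.Count != Cur->Fields.front().Count)
          return std::nullopt; // Heterogeneous struct: not flattenable.
      Size *= Cur->Fields.size();
      Cur = &Cur->Fields.front();
      break;
    case Ty::Array:
      if (Cur->Fields.empty())
        return std::nullopt;
      Size *= Cur->Count;
      Cur = &Cur->Fields.front();
      break;
    case Ty::Vector:
      return Size * Cur->Count; // A vector leaf ends the walk.
    case Ty::Scalar:
      return Size; // One element per flattened slot.
    }
  }
}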
unsigned OperandOffset, const BoUpSLP &R) {
std::optional<unsigned> OperandIndex =
if (!OperandIndex || R.isDeleted(LastInsertInst))
if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
  BuildVectorOpds, InsertElts, *OperandIndex, R);
BuildVectorOpds[*OperandIndex] = InsertedOperand;
InsertElts[*OperandIndex] = LastInsertInst;
LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
} while (LastInsertInst != nullptr &&
         isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
assert((isa<InsertElementInst>(LastInsertInst) ||
        isa<InsertValueInst>(LastInsertInst)) &&
       "Expected insertelement or insertvalue instruction!");
"Expected empty result vectors!");
if (!AggregateSize)
BuildVectorOpds.resize(*AggregateSize);
InsertElts.resize(*AggregateSize);
if (BuildVectorOpds.size() >= 2)
auto DominatedReduxValue = [&](Value *R) {
  return isa<Instruction>(R) &&
         DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
if (P->getIncomingBlock(0) == ParentBB) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == ParentBB) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
if (Rdx && DominatedReduxValue(Rdx))
if (P->getIncomingBlock(0) == BBLatch) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == BBLatch) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
if (Rdx && DominatedReduxValue(Rdx))
assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
        isa<IntrinsicInst>(Root)) &&
       "Expected binop, select, or intrinsic for reduction matching");
Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
return dyn_cast<Instruction>(RHS);
return dyn_cast<Instruction>(LHS);
Value *Op0 = nullptr;
Value *Op1 = nullptr;
return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
Value *B0 = nullptr, *B1 = nullptr;
bool SLPVectorizerPass::vectorizeHorReduction(
bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
if (Root->getParent() != BB || isa<PHINode>(Root))
auto SelectRoot = [&]() {
std::queue<std::pair<Instruction *, unsigned>> Stack;
Stack.emplace(SelectRoot(), 0);
if (R.isAnalyzedReductionRoot(Inst))
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
  if (TryOperandsAsNewSeeds && FutureSeed == Root) {
  if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
while (!Stack.empty()) {
  std::tie(Inst, Level) = Stack.front();
  if (R.isDeleted(Inst))
  if (Value *VectorizedV = TryToReduce(Inst)) {
    if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
      Stack.emplace(I, Level);
  if (R.isDeleted(Inst))
  if (!TryAppendToPostponedInsts(Inst)) {
  if (VisitedInstrs.insert(Op).second)
    if (auto *I = dyn_cast<Instruction>(Op))
      if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
          !R.isDeleted(I) && I->getParent() == BB)
        Stack.emplace(I, Level);
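// Illustrative sketch of the traversal driving vectorizeHorReduction: a
// queue of (instruction, level) pairs seeded with the root; a node that
// fails to reduce feeds its operands back in as new seeds, bounded by a
// depth limit. TryReduce, OperandsOf, and MaxLevel are hypothetical
// stand-ins for TryToReduce, the operand scan, and RecursionMaxDepth.
#include <functional>
#include <queue>
#include <set>
#include <utility>
#include <vector>

bool exploreReductions(int Root, const std::function<bool(int)> &TryReduce,
                       const std::function<std::vector<int>(int)> &OperandsOf,
                       unsigned MaxLevel) {
  std::queue<std::pair<int, unsigned>> Stack; // Named Stack in the source.
  std::set<int> Visited{Root};
  Stack.emplace(Root, 0);
  bool Changed = false;
  while (!Stack.empty()) {
    auto [Node, Level] = Stack.front();
    Stack.pop();
    if (TryReduce(Node)) {
      Changed = true;
      continue;
    }
    if (Level + 1 > MaxLevel)
      continue; // Depth limit: stop seeding from this node.
    for (int Op : OperandsOf(Node)) // Operands become new candidates.
      if (Visited.insert(Op).second)
        Stack.emplace(Op, Level + 1);
  }
  return Changed;
}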
bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
Res |= tryToVectorize(PostponedInsts, R);
for (Value *V : Insts)
  if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
    Res |= tryToVectorize(Inst, R);
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!R.canMapToVector(IVI->getType()))
if (MaxVFOnly && BuildVectorOpds.size() == 2) {
  R.getORE()->emit([&]() {
    << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
       "trying reduction first.";
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
(all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
if (MaxVFOnly && BuildVectorInsts.size() == 2) {
  R.getORE()->emit([&]() {
    << "Cannot SLP vectorize list: only 2 elements of buildvector, "
       "trying reduction first.";
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
template <typename T>
bool MaxVFOnly, BoUpSLP &R) {
bool Changed = false;
auto *I = dyn_cast<Instruction>(*IncIt);
if (!I || R.isDeleted(I)) {
auto *SameTypeIt = IncIt;
while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                           R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                           AreCompatible(*SameTypeIt, *IncIt))) {
  auto *I = dyn_cast<Instruction>(*SameTypeIt);
  if (I && !R.isDeleted(I))
unsigned NumElts = VL.size();
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                  << NumElts << ")\n");
if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
  VL.swap(Candidates);
  Candidates.clear();
  if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
auto GetMinNumElements = [&R](Value *V) {
  unsigned EltSize = R.getVectorElementSize(V);
  return std::max(2U, R.getMaxVecRegSize() / EltSize);
if (NumElts < GetMinNumElements(*IncIt) &&
    (Candidates.empty() ||
     Candidates.front()->getType() == (*IncIt)->getType())) {
  if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
if (Candidates.size() > 1 &&
    (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
  if (TryToVectorizeHelper(Candidates, false)) {
  } else if (MaxVFOnly) {
    for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
      auto *I = dyn_cast<Instruction>(*It);
      if (!I || R.isDeleted(I)) {
      auto *SameTypeIt = It;
      while (SameTypeIt != End &&
             (!isa<Instruction>(*SameTypeIt) ||
              R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
              AreCompatible(*SameTypeIt, *It))) {
        auto *I = dyn_cast<Instruction>(*SameTypeIt);
        if (I && !R.isDeleted(I))
      unsigned NumElts = VL.size();
      if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
  Candidates.clear();
IncIt = SameTypeIt;
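// Illustrative sketch of the tryToVectorizeSequence driver: sort the values
// so that compatible ones become neighbors, then sweep maximal compatible
// runs and hand each run of length > 1 to the vectorization callback. The
// three callbacks are stand-ins for Comparator, AreCompatible, and
// TryToVectorizeHelper.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

bool vectorizeRuns(
    std::vector<int> Incoming,
    const std::function<bool(int, int)> &Comparator,
    const std::function<bool(int, int)> &AreCompatible,
    const std::function<bool(const std::vector<int> &)> &TryVec) {
  std::stable_sort(Incoming.begin(), Incoming.end(), Comparator);
  bool Changed = false;
  for (std::size_t I = 0, E = Incoming.size(); I < E;) {
    std::size_t J = I + 1;
    while (J < E && AreCompatible(Incoming[J], Incoming[I]))
      ++J; // Extend the run of mutually compatible values.
    if (J - I > 1) {
      std::vector<int> Run(Incoming.begin() + I, Incoming.begin() + J);
      Changed |= TryVec(Run);
    }
    I = J; // Resume after the run; no element is rescanned.
  }
  return Changed;
}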
template <bool IsCompatibility>
"Expected valid element types only.");
return IsCompatibility;
auto *CI1 = cast<CmpInst>(V);
auto *CI2 = cast<CmpInst>(V2);
if (CI1->getOperand(0)->getType()->getTypeID() <
  return !IsCompatibility;
if (CI1->getOperand(0)->getType()->getTypeID() >
if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
  return !IsCompatibility;
if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
if (BasePred1 < BasePred2)
  return !IsCompatibility;
if (BasePred1 > BasePred2)
bool CI1Preds = Pred1 == BasePred1;
bool CI2Preds = Pred2 == BasePred1;
for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
  auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
  auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
  return !IsCompatibility;
  if (auto *I1 = dyn_cast<Instruction>(Op1))
    if (auto *I2 = dyn_cast<Instruction>(Op2)) {
      if (IsCompatibility) {
        if (I1->getParent() != I2->getParent())
      return NodeI2 != nullptr;
      assert((NodeI1 == NodeI2) ==
             "Different nodes should have different DFS numbers");
      if (NodeI1 != NodeI2)
      if (S && (IsCompatibility || !S.isAltShuffle()))
      if (IsCompatibility)
      if (I1->getOpcode() != I2->getOpcode())
        return I1->getOpcode() < I2->getOpcode();
return IsCompatibility;
template <typename ItT>
bool Changed = false;
if (R.isDeleted(I))
if (auto *RootOp = dyn_cast<Instruction>(Op)) {
  Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
  if (R.isDeleted(I))
if (R.isDeleted(I))
Changed |= tryToVectorize(I, R);
return compareCmp<false>(V, V2, *TLI, *DT);
auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
  return compareCmp<true>(V1, V2, *TLI, *DT);
if (Vals.size() <= 1)
Changed |= tryToVectorizeSequence<Value>(
    Vals, CompareSorter, AreCompatibleCompares,
bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
  auto *Select = dyn_cast<SelectInst>(U);
  Select->getParent() != cast<Instruction>(V)->getParent();
if (ArePossiblyReducedInOtherBlock)
return tryToVectorizeList(Candidates, R, MaxVFOnly);
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
       "This function only accepts Insert instructions");
bool OpsChanged = false;
for (auto *I : reverse(Instructions)) {
  if (R.isDeleted(I) || isa<CmpInst>(I))
  if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
    vectorizeInsertValueInst(LastInsertValue, BB, R, true);
  } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
    vectorizeInsertElementInst(LastInsertElem, BB, R, true);
  if (R.isDeleted(I))
  OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
  if (R.isDeleted(I) || isa<CmpInst>(I))
  if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
    vectorizeInsertValueInst(LastInsertValue, BB, R, false);
  } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
    OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
OpsChanged |= tryToVectorize(PostponedInsts, R);
bool Changed = false;
auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
  "Expected vectorizable types only.");
  V2->getType()->getScalarSizeInBits())
  V2->getType()->getScalarSizeInBits())
  if (Opcodes1.size() < Opcodes2.size())
  if (Opcodes1.size() > Opcodes2.size())
  for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
    auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
    auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
    return NodeI2 != nullptr;
    assert((NodeI1 == NodeI2) ==
           "Different nodes should have different DFS numbers");
    if (NodeI1 != NodeI2)
    if (S && !S.isAltShuffle())
      return I1->getOpcode() < I2->getOpcode();
    bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
    bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
    bool U1 = isa<UndefValue>(Opcodes1[I]);
    bool U2 = isa<UndefValue>(Opcodes2[I]);
    auto ValID1 = Opcodes1[I]->getValueID();
    auto ValID2 = Opcodes2[I]->getValueID();
    if (ValID1 == ValID2)
    if (ValID1 < ValID2)
    if (ValID1 > ValID2)
    assert(U1 && U2 && "The only thing left should be undef & undef.");
auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
  if (V1->getType() != V2->getType())
  if (Opcodes1.size() != Opcodes2.size())
  for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
    if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
    if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
      if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
        if (R.isDeleted(I1) || R.isDeleted(I2))
        if (I1->getParent() != I2->getParent())
    if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
    if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
bool HaveVectorizedPhiNodes = false;
auto *P = dyn_cast<PHINode>(&I);
if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
if (!Opcodes.empty())
while (!Nodes.empty()) {
  auto *PHI = cast<PHINode>(Nodes.pop_back_val());
  for (Value *V : PHI->incoming_values()) {
    if (auto *PHI1 = dyn_cast<PHINode>((V))) {
      Nodes.push_back(PHI1);
HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
    Incoming, PHICompare, AreCompatiblePHIs,
    return tryToVectorizeList(Candidates, R, MaxVFOnly);
Changed |= HaveVectorizedPhiNodes;
if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
      auto *PHI = dyn_cast<PHINode>(P.first);
      return !PHI || R.isDeleted(PHI);
  PHIToOpcodes.clear();
} while (HaveVectorizedPhiNodes);
VisitedInstrs.clear();
InstSetVector PostProcessInserts;
auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
  bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
  if (VectorizeCmps) {
    Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
    PostProcessCmps.clear();
  PostProcessInserts.clear();
if (auto *Cmp = dyn_cast<CmpInst>(I))
  return PostProcessCmps.contains(Cmp);
return isa<InsertElementInst, InsertValueInst>(I) &&
       PostProcessInserts.contains(I);
return I->use_empty() &&
       (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
if (isa<ScalableVectorType>(It->getType()))
if (R.isDeleted(&*It))
if (!VisitedInstrs.insert(&*It).second) {
  if (HasNoUsers(&*It) &&
      VectorizeInsertsAndCmps(It->isTerminator())) {
if (isa<DbgInfoIntrinsic>(It))
if (PHINode *P = dyn_cast<PHINode>(It)) {
  if (P->getNumIncomingValues() == 2) {
    if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
  for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
    if (BB == P->getIncomingBlock(I) ||
    if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
        PI && !IsInPostProcessInstrs(PI)) {
      vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
      if (Res && R.isDeleted(P)) {
if (HasNoUsers(&*It)) {
  bool OpsChanged = false;
  auto *SI = dyn_cast<StoreInst>(It);
  TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                        SI->getValueOperand()->hasOneUse();
  if (TryToVectorizeRoot) {
    for (auto *V : It->operand_values()) {
      if (auto *VI = dyn_cast<Instruction>(V);
          VI && !IsInPostProcessInstrs(VI))
        OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
  VectorizeInsertsAndCmps(It->isTerminator());
if (isa<InsertElementInst, InsertValueInst>(It))
  PostProcessInserts.insert(&*It);
else if (isa<CmpInst>(It))
  PostProcessCmps.insert(cast<CmpInst>(&*It));
auto Changed = false;
for (auto &Entry : GEPs) {
  if (Entry.second.size() < 2)
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                    << Entry.second.size() << ".\n");
  return !R.isDeleted(GEP);
  if (It == Entry.second.end())
  unsigned MaxVecRegSize = R.getMaxVecRegSize();
  unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
  if (MaxVecRegSize < EltSize)
  unsigned MaxElts = MaxVecRegSize / EltSize;
  for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
    auto Len = std::min<unsigned>(BE - BI, MaxElts);
    Candidates.remove_if([&R](Value *I) {
      return R.isDeleted(cast<Instruction>(I)) ||
             isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
    for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
      auto *GEPI = GEPList[I];
      if (!Candidates.count(GEPI))
      for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
        auto *GEPJ = GEPList[J];
        if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
          Candidates.remove(GEPI);
          Candidates.remove(GEPJ);
        } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
          Candidates.remove(GEPJ);
    if (Candidates.size() < 2)
    auto BundleIndex = 0u;
    for (auto *V : Candidates) {
      auto *GEP = cast<GetElementPtrInst>(V);
      auto *GEPIdx = GEP->idx_begin()->get();
      assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
      Bundle[BundleIndex++] = GEPIdx;
    Changed |= tryToVectorizeList(Bundle, R);
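// Illustrative sketch of the candidate pruning above, with a pointer SCEV
// modeled as a hypothetical (Base, Offset) pair: two GEPs differ by a
// compile-time constant exactly when their symbolic bases match, and such
// pairs are dropped (consecutive accesses are left to load/store
// vectorization); GEPs reusing the same index value are deduplicated.
#include <set>
#include <vector>

struct GEPInfo {
  int Base;    // Symbolic part of the pointer SCEV (stand-in).
  long Offset; // Constant part of the pointer SCEV.
  int IndexId; // Identity of the single non-constant index operand.
};

void pruneGEPCandidates(const std::vector<GEPInfo> &List,
                        std::set<int> &Candidates /* indices into List */) {
  int E = static_cast<int>(List.size());
  for (int I = 0; I < E && Candidates.size() > 1; ++I) {
    if (!Candidates.count(I))
      continue;
    for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
      if (!Candidates.count(J))
        continue;
      if (List[I].Base == List[J].Base) {
        Candidates.erase(I); // Constant difference: drop the whole pair.
        Candidates.erase(J);
        break;
      }
      if (List[I].IndexId == List[J].IndexId)
        Candidates.erase(J); // Same index computed twice: keep one copy.
    }
  }
}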
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
if (V->getValueOperand()->getType()->getTypeID() <
    V2->getValueOperand()->getType()->getTypeID())
if (V->getValueOperand()->getType()->getTypeID() >
    V2->getValueOperand()->getType()->getTypeID())
if (V->getPointerOperandType()->getTypeID() <
    V2->getPointerOperandType()->getTypeID())
if (V->getPointerOperandType()->getTypeID() >
    V2->getPointerOperandType()->getTypeID())
if (V->getValueOperand()->getType()->getScalarSizeInBits() <
    V2->getValueOperand()->getType()->getScalarSizeInBits())
if (V->getValueOperand()->getType()->getScalarSizeInBits() >
    V2->getValueOperand()->getType()->getScalarSizeInBits())
if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
  if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
    DT->getNode(I2->getParent());
    assert(NodeI1 && "Should only process reachable instructions");
    assert(NodeI2 && "Should only process reachable instructions");
    assert((NodeI1 == NodeI2) ==
           "Different nodes should have different DFS numbers");
    if (NodeI1 != NodeI2)
    return I1->getOpcode() < I2->getOpcode();
return V->getValueOperand()->getValueID() <
       V2->getValueOperand()->getValueID();
isa<UndefValue>(V2->getValueOperand()))
if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
  if (I1->getParent() != I2->getParent())
isa<Constant>(V2->getValueOperand()))
V2->getValueOperand()->getValueID();
for (auto &Pair : Stores) {
  if (Pair.second.size() < 2)
  << Pair.second.size() << ".\n");
  Pair.second.rend());
  Changed |= tryToVectorizeSequence<StoreInst>(
      ReversedStores, StoreSorter, AreCompatibleStores,
      return vectorizeStores(Candidates, R, Attempted);
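// Illustrative sketch of the StoreSorter pattern above: a strict weak
// ordering assembled from a sequence of keys, each consulted only when all
// earlier keys tie. StoreKey is a hypothetical condensed view of the checks
// (value type id, pointer type id, scalar width, defining opcode).
#include <tuple>

struct StoreKey {
  unsigned ValueTypeID;
  unsigned PtrTypeID;
  unsigned ScalarBits;
  unsigned Opcode; // Opcode of the stored value's defining instruction.
};

bool storeLess(const StoreKey &A, const StoreKey &B) {
  // std::tuple compares lexicographically, mirroring the chained ifs.
  return std::tie(A.ValueTypeID, A.PtrTypeID, A.ScalarBits, A.Opcode) <
         std::tie(B.ValueTypeID, B.PtrTypeID, B.ScalarBits, B.Opcode);
}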
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed, or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, OGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
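A short sketch of the DenseMap operations listed above (names illustrative):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

static void denseMapDemo() {
  DenseMap<int, unsigned> Count;
  Count.try_emplace(7, 0u);            // inserted: {7 -> 0}
  ++Count.find(7)->second;             // {7 -> 1}
  unsigned Missing = Count.lookup(42); // 0: default value, key absent
  bool Has = Count.contains(7);        // true
  Count.erase(7);
  (void)Missing; (void)Has;
}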
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
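A hedged sketch of the insertelement + shufflevector splat idiom these IRBuilder entry points support; emitSplat is a hypothetical helper, not an API of this file:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: broadcast scalar V into a VF-wide vector.
static Value *emitSplat(IRBuilderBase &Builder, Value *V, unsigned VF) {
  auto *VecTy = FixedVectorType::get(V->getType(), VF);
  // Insert V into lane 0 of a poison vector...
  Value *Vec = Builder.CreateInsertElement(VecTy, V, Builder.getInt32(0));
  // ...then replicate lane 0 across all lanes.
  SmallVector<int> Mask(VF, 0);
  return Builder.CreateShuffleVector(Vec, PoisonValue::get(VecTy), Mask);
}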
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
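A brief sketch of SetVector's set-plus-order semantics (names illustrative):

#include "llvm/ADT/SetVector.h"
using namespace llvm;

static void worklistDemo() {
  SetVector<int> Worklist;
  Worklist.insert(3);                // true: newly inserted
  Worklist.insert(5);
  Worklist.insert(3);                // false: duplicate, order unchanged
  int First = Worklist.front();      // 3: insertion order is preserved
  bool Known = Worklist.contains(5); // true
  Worklist.clear();
  (void)First; (void)Known;
}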
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
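A small sketch of the static mask classifiers above; the masks and expected results are illustrative:

#include "llvm/IR/Instructions.h"
using namespace llvm;

static void maskKinds() {
  int SubIdx = -1;
  int Extract[] = {2, 3}; // reads elements 2..3 of a 4-element source
  bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(
      Extract, /*NumSrcElts=*/4, SubIdx); // true, SubIdx == 2
  int Identity[] = {0, 1, 2, 3};
  bool IsIdentity =
      ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4); // true
  (void)IsExtract; (void)IsIdentity;
}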
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
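A short sketch of the bit-vector operations above, assuming the SmallBitVector flavor (names illustrative):

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

static void bitVectorDemo() {
  SmallBitVector Seen(8);             // 8 bits, all clear
  Seen.set(1);
  Seen.set(4);
  bool Hit = Seen.test(4);            // true
  int First = Seen.find_first();      // 1
  int Next = Seen.find_next(1);       // 4
  bool Full = Seen.all();             // false
  unsigned Population = Seen.count(); // 2
  (void)Hit; (void)First; (void)Next; (void)Full; (void)Population;
}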
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
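A hedged sketch of the use-list helpers above; migrateUses is a hypothetical helper, not part of this file:

#include "llvm/IR/Value.h"
using namespace llvm;

// Hypothetical helper: hand all uses of a scalar over to a replacement
// value, keeping the original name for readable IR.
static void migrateUses(Value *Scalar, Value *Replacement) {
  if (Scalar->hasNUsesOrMore(1)) {
    Replacement->takeName(Scalar);
    Scalar->replaceAllUsesWith(Replacement);
  }
}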
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter-vectorized load, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed as a possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed as a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if any of the given values is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
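A condensed, non-authoritative sketch of how the BoUpSLP entry points above compose into one vectorization attempt; tryVectorizeBundle and Threshold are illustrative, and the pass's real control flow orders the reorder/transform steps with more nuance:

// Sketch only; relies on this file's BoUpSLP type. R is a fully
// constructed analyzer, VL a candidate bundle of scalars, and Threshold
// stands in for the pass's cost cutoff.
static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> VL,
                               int Threshold) {
  SmallDenseSet<Value *> Ignore;
  R.buildTree(VL, Ignore);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= -Threshold) // vectorize only when the modeled gain suffices
    return false;
  R.vectorizeTree();
  return true;
}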
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
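A minimal sketch of the pattern-match combinators above; matchShlOr is a hypothetical helper recognizing (X << C) | Y:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Illustrative: recognize (X << C) | Y and capture the pieces; the shift
// is additionally required to have a single use.
static bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(C))), m_Value(Y)));
}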
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
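A hedged sketch of using getPointersDiff to test whether two loads are consecutive; areConsecutive is hypothetical, and DL/SE would come from the surrounding pass:

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical check: are two loads exactly one element apart?
static bool areConsecutive(LoadInst *LdA, LoadInst *LdB,
                           const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(LdA->getType(), LdA->getPointerOperand(),
                      LdB->getType(), LdB->getPointerOperand(), DL, SE);
  return Diff && *Diff == 1; // distance is measured in elements
}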
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
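A few worked values for the power-of-two helpers above (all illustrative):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static void widthMath() {
  unsigned NumElts = 6;
  uint64_t WidenedVF = PowerOf2Ceil(NumElts); // 8
  unsigned Ceil = bit_ceil(NumElts);          // 8, same for nonzero input
  unsigned FloorLog = Log2_32(NumElts);       // 2
  bool IsPow2 = has_single_bit(NumElts);      // false
  (void)WidenedVF; (void)Ceil; (void)FloorLog; (void)IsPow2;
}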
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
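A sketch consistent with how inversePermutation's result is used, assuming Indices maps new position to old position; this is an assumption about the body, not a copy of it:

// If Indices maps new position -> old position, make Mask map old -> new,
// so shuffling by Mask undoes the reordering. Unset lanes stay poison.
static void buildInverseMask(ArrayRef<unsigned> Indices,
                             SmallVectorImpl<int> &Mask) {
  Mask.assign(Indices.size(), PoisonMaskElem);
  for (unsigned I = 0, E = Indices.size(); I != E; ++I)
    Mask[Indices[I]] = I;
}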
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
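Worked examples for the mask builders above (expected contents shown in comments):

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static void maskDemo() {
  // Deinterleave the even lanes of two interleaved 4-element groups.
  auto Even = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Even == {0, 2, 4, 6}
  auto Rep = createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/4);
  // Rep == {0, 0, 1, 1, 2, 2, 3, 3}
  (void)Even; (void)Rep;
}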
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.