#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
115 "Controls which SLP graphs should be vectorized.");
119 cl::desc(
"Run the SLP vectorization passes"));
123 cl::desc(
"Enable vectorization for wider vector utilization"));
127 cl::desc(
"Only vectorize if you gain more than this "
132 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
137 cl::desc(
"Attempt to vectorize horizontal reductions"));
142 "Attempt to vectorize horizontal reductions feeding into a store"));
146 cl::desc(
"Attempt to vectorize for this register size in bits"));
150 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
158 cl::desc(
"Limit the size of the SLP scheduling region per block"));
162 cl::desc(
"Attempt to vectorize for this register size in bits"));
166 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
170 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
176 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
185 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
189 cl::desc(
"The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
194 cl::desc(
"The maximum stride, considered to be profitable."));
198 cl::desc(
"Display the SLP trees with Graphviz"));
202 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
                                              Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);

  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);

  return (Sz / RegVF) * RegVF;
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                     : Mask[I] * VecTyNumElements + J;
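// Worked example (illustrative): a scalar mask {1, 0} over 2-element
// subvectors (VecTyNumElements == 2) expands to the vector mask {2, 3, 0, 1},
// since entry I contributes Mask[I] * VecTyNumElements + J for each J.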
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return false;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return false;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return false;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);

      auto *SV = cast<ShuffleVectorInst>(V);

      if (SV->getOperand(0) != Src)
        return false;

      if (!SV->isExtractSubvectorMask(Index))
        return false;
      ExpectedIndex.set(Index / ShuffleMaskSize);

    if (!ExpectedIndex.all())
      return false;

  assert(NumGroup == (VL.size() / GroupSize) &&
         "Unexpected number of groups");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
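// Worked example (illustrative): two shuffles over 4-element sources with
// masks {0, 1} and {1, 0} concatenate to {0, 1, 5, 4}; the second mask is
// shifted by AccumulateLength == 4 so its lanes index the combined vector.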
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  auto *It = find_if(VL, IsaPred<Instruction>);

    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);

    if (BB != II->getParent())
      return false;
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
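// Illustrative behavior: {undef, %a, %a} is treated as a splat of %a (returns
// true), {%a, %b} is not, and an all-undef list returns false because
// FirstNonUndef remains null.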
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&

              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))

              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||

           (BO->getOpcode() == Instruction::FSub &&

              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,

  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());

    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));

    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    }
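// Worked example (illustrative, assuming the elided "Index += I" accumulation
// between iterations): for an insertvalue into [2 x [3 x i32]] at indices
// {1, 2}, the flattened index is (0 * 2 + 1) * 3 + 2 == 5, i.e. row-major
// flattening of the nested aggregate.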
      if (MaskArg == UseMask::UndefsAsMask)

      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>

  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;

  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());

  auto *C = dyn_cast<Constant>(V);

  if (!UseMask.empty()) {

    while (auto *II = dyn_cast<InsertElementInst>(Base)) {

      if (isa<T>(II->getOperand(1)))

      if (*Idx < UseMask.size() && !UseMask.test(*Idx))

    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);

  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))

        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>

  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);

      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);

        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());

        return std::max(S, VTy->getNumElements());

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;

    auto *EE = dyn_cast<ExtractElementInst>(V);

    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))

  ShuffleMode CommonShuffleMode = Unknown;

  for (unsigned I = 0, E = VL.size(); I < E; ++I) {

    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();

    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;

    if (isa<UndefValue>(Vec)) {

    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());

    unsigned IntIdx = Idx->getValue().getZExtValue();

    if (!Vec1 || Vec1 == Vec) {

    } else if (!Vec2 || Vec2 == Vec) {

    if (CommonShuffleMode == Permute)
      continue;

    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }

  if (CommonShuffleMode == Select && Vec2)
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
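// Usage sketch (illustrative): extractelement <4 x i32> %v, i32 3 yields 3;
// extractvalue %agg, 1 yields 1; a multi-index extractvalue has no single
// extract index, hence the getNumIndices() != 1 bail-out above.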
class InstructionsState {

  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
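// Usage sketch (illustrative): for a bundle {add, sub, add, sub} the state
// holds MainOp == add and AltOp == sub, so getOpcode() == Instruction::Add,
// getAltOpcode() == Instruction::Sub, and isAltShuffle() is true. A bundle
// with no such main/alternate pair is InstructionsState::invalid(), for which
// valid() is false.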
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

         "Assessing comparisons of different types?");

  return (BasePred == Pred &&

         (BasePred == SwappedPred &&
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);

    return InstructionsState::invalid();

  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);

  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {

    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);

      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }

    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;

  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {

      return InstructionsState::invalid();

  bool AnyPoison = InstCnt != VL.size();

    auto *I = dyn_cast<Instruction>(V);

    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;

        AltOpcode = InstOpcode;

    } else if (IsCastOp && isa<CastInst>(I)) {

      Value *Op1 = I->getOperand(0);

      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode) {

               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;

    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();

        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {

        } else if (BasePred != CurrentPred) {

                 "CmpInst isn't safe for alternation, logic needs to be updated!");

        }
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;

    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "

      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||

          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {

          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);

          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&

            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),

          return InstructionsState::invalid();

          return InstructionsState::invalid();

        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      }

      return InstructionsState::invalid();

  return InstructionsState(MainOp, AltOp);
  unsigned Opcode = UserInst->getOpcode();

  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);

      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
  if (LoadInst *LI = dyn_cast<LoadInst>(I))

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();

    return SI->isSimple();

  return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;

         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||

         "SubMask with many inputs support must be larger than the mask.");

    Mask.append(SubMask.begin(), SubMask.end());

  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {

        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))

    NewMask[I] = Mask[SubMask[I]];
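// Worked example (illustrative): composing Mask = {1, 0, 2} with
// SubMask = {2, 1} yields NewMask = {Mask[2], Mask[1]} = {2, 0}.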
  const unsigned Sz = Order.size();

  for (unsigned I = 0; I < Sz; ++I) {

      UnusedIndices.reset(Order[I]);

      MaskedIndices.set(I);

  if (MaskedIndices.none())
    return;

         "Non-synced masked/available indices.");

    assert(Idx >= 0 && "Indices must be synced.");
  Type *ScalarTy = VL[0]->getType();

  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  const unsigned E = Indices.size();

  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
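// Worked example (illustrative): Indices = {2, 0, 1} inverts to
// Mask = {1, 2, 0}, since Mask[Indices[I]] = I for every position I.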
  assert(!Mask.empty() && "Expected non-empty mask.");

  for (unsigned I = 0, E = Prev.size(); I < E; ++I)

      Scalars[Mask[I]] = Prev[I];
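// Worked example (illustrative): with Mask = {2, 0, 1}, Prev[0] moves to
// Scalars[2], Prev[1] to Scalars[0], and Prev[2] to Scalars[1].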
  auto *I = dyn_cast<Instruction>(V);

    auto *IO = dyn_cast<Instruction>(V);

    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&

           auto *IU = dyn_cast<Instruction>(U);

           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&

  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&

    const unsigned Limit = std::numeric_limits<unsigned>::max()) {

  if (NumParts == 0 || NumParts >= Limit)

  if (NumParts >= Sz || Sz % NumParts != 0 ||
namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() &&
           "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())

    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))

          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());

          VectorizableTree.front()->Scalars.front()->getContext(),

          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();

    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();

    ReductionBitWidth = 0;

    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();

      return P.value() == P.index() || P.value() == Sz;

    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?

    return MaxVF ? MaxVF : UINT_MAX;

                     unsigned *BestVF = nullptr,
                     bool TryRecursiveCheck = true) const;
  template <typename T>

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
    if (isa<LoadInst>(V1)) {

      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {

        auto AllUsersVectorized = [U1, U2, this](Value *V) {

            return U == U1 || U == U2 || R.isVectorized(U);

        return AllUsersVectorized(V1) && AllUsersVectorized(V2);

      if (R.TTI->isLegalBroadcastLoad(V1->getType(),

          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

    auto CheckSameEntryOrFail = [&]() {

          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);

      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||

        return CheckSameEntryOrFail();

          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {

            R.TTI->isLegalMaskedGather(

          return CheckSameEntryOrFail();

      if (std::abs(*Dist) > NumLanes / 2)

    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);

      if (isa<UndefValue>(V2))

      Value *EV2 = nullptr;

        int Dist = Idx2 - Idx1;

        if (std::abs(Dist) == 0)

        if (std::abs(Dist) > NumLanes / 2)

      return CheckSameEntryOrFail();

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);

      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();

          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&

            return isa<PoisonValue>(V) ||
                   cast<Instruction>(V)->getNumOperands() ==
                       S.getMainOp()->getNumOperands();

    if (I1 && isa<PoisonValue>(V2))

    if (isa<UndefValue>(V2))

    return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =

    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||

        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {

      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;

              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {

        if (Op2Used.count(OpIdx2))
          continue;

            I1, I2, CurrLevel + 1, {});

            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;

        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;

    return ShallowScoreAtThisLevel;
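// Illustrative walk-through: for %a = add %x, 1 and %b = add %y, 1, the
// recursion pairs up the operands of the two adds, scores the matching
// constants and the %x/%y definitions one level deeper, keeps the best
// per-operand pairing, and accumulates it into ShallowScoreAtThisLevel,
// stopping once CurrLevel reaches MaxLevel.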
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}

    bool IsUsed = false;

  enum class ReorderingMode {

  unsigned ArgSize = 0;

  const Loop *L = nullptr;
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  }

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
  }

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
         OpIdx != NumOperands; ++OpIdx)
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
           ++Lane)
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  }
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,

    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
        isa<ExtractElementInst>(IdxLaneV))

    for (unsigned Ln : seq<unsigned>(getNumLanes())) {

      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))

    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)

    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -

           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;

    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))

    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

      int Lane, unsigned OpIdx, unsigned Idx,

    int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
    if (Score <= -SplatScore) {

      Score += SplatScore;

    Score *= ScoreScaleFactor;
    Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,

    unsigned NumOperands = getNumOperands();

    Value *OpLastLane = getData(OpIdx, LastLane).V;

    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    bool OpIdxAPO = getData(OpIdx, Lane).APO;

    std::optional<unsigned> Idx;

        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)

    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;

    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {

      OperandData &OpData = getData(Idx, Lane);

      bool OpAPO = OpData.APO;

      if (OpAPO != OpIdxAPO)
        continue;

      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&

          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;

      case ReorderingMode::Constant:
        if (isa<Constant>(Op) ||
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {

          if (isa<Constant>(Op)) {

            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =

        if (isa<UndefValue>(Op) || !isa<Constant>(Op))

      case ReorderingMode::Splat:
        if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {

            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =

      case ReorderingMode::Failed:

      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;

    return std::nullopt;
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;

    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);

      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;

        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {

        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);

    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;

      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {

    unsigned NumOfAPOs = UINT_MAX;

    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();

    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;

    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);

      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;

          Parent = I->getParent();

          --NumOpsWithSameOpcodeParent;

          ++NumOpsWithSameOpcodeParent;

          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);

    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");

    constexpr unsigned IntrinsicNumOperands = 2;

    ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
               "Expected instruction or poison value");

        if (isa<PoisonValue>(VL[Lane])) {
          if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {

            OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};

          } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {

            OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};

          OpsVec[OpIdx][Lane] = {

        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
  unsigned getNumOperands() const { return ArgSize; }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  }

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    assert(Op == getValue(OpIdx, Lane) &&
           "Op is expected to be getValue(OpIdx, Lane).");

    if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)

    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);

    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {

      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
          continue;
        Value *OpILane = getValue(OpI, Lane);
        bool IsConstantOp = isa<Constant>(OpILane);

            ((Lns > 2 && isa<Constant>(Data.V)) ||

             isa<Constant>(Data.V)))) ||

            (IsInvariant && !isa<Constant>(Data.V) &&

             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;

      if (!FoundCandidate)

    return getNumLanes() == 2 || Cnt > 1;
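// Illustrative decision: if this operand's value reappears as a candidate in
// other lanes (Cnt > 1), or there are only two lanes, a single broadcast
// (splat) is preferred over gathering a distinct value per lane.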
  bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
    assert(Op == getValue(OpIdx, Lane) &&
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {

      if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
            const OperandData &Data = getData(OpI, Ln);
            if (Data.APO != OpAPO || Data.IsUsed)

            Value *OpILn = getValue(OpI, Ln);
            return (L && L->isLoopInvariant(OpILn)) ||
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(S.getMainOp()->getParent())) {

    appendOperandsOfVL(RootVL, S);

    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();

    unsigned FirstLane = getBestLaneToStartReordering();

    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);

      if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {

        if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
            !canBeVectorized(OpILane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else if (isa<LoadInst>(OpILane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0)) {
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      } else if (isa<Argument>(OpLane0)) {

        ReorderingModes[OpIdx] = ReorderingMode::Splat;

    auto &&SkipReordering = [this]() {

      for (const OperandData &Data : Op0)

        if (any_of(Op, [&UniqueValues](const OperandData &Data) {

      return UniqueValues.size() != 2 &&

                     UniqueValues.size());

    if (SkipReordering())

    bool StrategyFailed = false;

    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);

    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {

      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= (int)NumLanes)

      UsedLanes.set(Lane);

      assert(LastLane >= 0 && LastLane < (int)NumLanes &&

      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {

        std::optional<unsigned> BestIdx =
            getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                           MainAltOps[OpIdx], UsedLanes);

          swap(OpIdx, *BestIdx, Lane);

          StrategyFailed = true;

        if (MainAltOps[OpIdx].size() != 2) {
          OperandData &AltOp = getData(OpIdx, Lane);
          InstructionsState OpS =

          if (OpS && OpS.isAltShuffle())

    if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

    case ReorderingMode::Load:

    case ReorderingMode::Opcode:

    case ReorderingMode::Constant:

    case ReorderingMode::Splat:

    case ReorderingMode::Failed:

    const unsigned Indent = 2;

      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {

        if (Value *V = OpData.V)

        OS << ", APO:" << OpData.APO << "}\n";
  int BestScore = Limit;
  std::optional<int> Index;
  for (int I : seq<int>(0, Candidates.size())) {

        Candidates[I].second,

    if (Score > BestScore) {
    DeletedInstructions.insert(I);

  template <typename T>

    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);

    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)

      auto *I = cast<Instruction>(V);

      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&

            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;

      I->dropAllReferences();

    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())

               cast<Instruction>(U.getUser()));

             "trying to erase instruction with users.");
      I->removeFromParent();

    while (!DeadInsts.empty()) {

      if (!VI || !VI->getParent())

             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();

        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return NonScheduledFirst.contains(V);

    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,

      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {

    TreeEntry *TE = nullptr;

    for (TreeEntry *E : getTreeEntries(V)) {
      if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {

    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  getCastContextHint(const TreeEntry &TE) const;

      const EdgeInfo &EI, unsigned InterleaveFactor = 0);

      bool ResizeAllowed = false) const;

  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
                                               unsigned NodeIdx) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
  }

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
                           bool PostponedPHIs);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>

      unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(

      unsigned NumParts, bool ForOrder = false);

      Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

                       [Scalars](Value *V, int Idx) {
                         return (isa<UndefValue>(V) &&
                                 Idx == PoisonMaskElem) ||
                                (Idx != PoisonMaskElem && V == Scalars[Idx]);

      if (!ReorderIndices.empty()) {

        return IsSame(Scalars, Mask);
      if (VL.size() == ReuseShuffleIndices.size()) {

        return IsSame(Scalars, Mask);

      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && !UserTreeIndices.empty() &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;

      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {

          if (getOperand(K) == TE.getOperand(I)) {

        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    bool isGather() const { return State == NeedToGather; }
    enum CombinedOpcode {

      MinMax = Instruction::OtherOpsEnd + 1,

    CombinedOpcode CombinedOp = NotCombinedOp;

    VecTreeTy &Container;

    InstructionsState S = InstructionsState::invalid();

    unsigned InterleaveFactor = 0;

    unsigned getInterleaveFactor() const { return InterleaveFactor; }

    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

      assert(Operands[OpIdx].empty() && "Already resized?");

             "Number of operands is greater than the number of scalars.");
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, S, R);

      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
        setOperand(I, Ops.getVL(I));

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {

      assert(!Operands[OpIdx].empty() && "No operand available");

    bool isAltShuffle() const { return S.isAltShuffle(); }

    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))

      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {

        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())

        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);

      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    bool isNonPowOf2Vec() const {

      return IsNonPowerOf2;

      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])

      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)

      dbgs() << "State: ";

        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor

          dbgs() << "Vectorize\n";

      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";

      case StridedVectorize:
        dbgs() << "StridedVectorize\n";

        dbgs() << "NeedToGather\n";

      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";

        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";

        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";

      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";

      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())

      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";

      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";

      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";

      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";

          dbgs() << "Entry index " << P.first << " with offset " << P.second;
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,

    dbgs() << "SLP: " << Banner << ":\n";

    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,

                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,

    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");

    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)

    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;

            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {

      Last->setOperations(S);

      Last->Scalars.assign(VL.size(), nullptr);

        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());

      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());

    if (!Last->isGather()) {

      for (Value *V : VL) {
        if (isa<PoisonValue>(V))

        auto It = ScalarToTreeEntries.find(V);

               (It == ScalarToTreeEntries.end() ||
                (It->getSecond().size() == 1 &&
                 It->getSecond().front() == Last) ||

               "Scalar already in tree!");
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {

                 "Value already associated with the node.");
          It->getSecond().push_back(Last);

      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||

             "Bundle and VL out of sync");

      for (Value *V : VL) {

        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      }

      assert(!BundleMember && "Bundle and VL out of sync");

      bool AllConstsOrCasts = true;

        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())

      if (AllConstsOrCasts)

          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())

    return It->getSecond();

                                bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,

  TreeEntry::EntryState

      bool IsScatterVectorizeUserTE,

  using ValueToGatherNodesMap =

  ValueToGatherNodesMap ValueToGatherNodes;

  bool IsGraphTransformMode = false;

  std::optional<unsigned> GatheredLoadsEntriesFirst;

  struct ExternalUser {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())

    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
  struct ScheduleData {

    enum { InvalidDeps = -1 };

    ScheduleData() = default;

      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");

        assert(UnscheduledDeps == Dependencies && "invariant");

        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");

      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)

        Sum += BundleMember->UnscheduledDeps;
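// Illustrative: for a bundle {%a = load, %b = load} where %a still waits on
// one memory dependence and %b on none, unscheduledDepsInBundle() == 1; once
// that dependence is scheduled, incrementUnscheduledDeps(-1) drops the sum to
// 0 and the whole bundle becomes ready.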
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {

        ScheduleData *SD = NextInBundle;

          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;

                       const BoUpSLP::ScheduleData &SD) {
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      ScheduleRegionSizeLimit -= ScheduleRegionSize;

      ScheduleRegionSize = 0;

      ++SchedulingRegionID;

      if (BB != I->getParent())

      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {

            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);

                       << "SLP: gets ready (def): " << *DepBundle << "\n");

        if (TreeEntry *TE = BundleMember->TE) {

          auto *In = BundleMember->Inst;
          int Lane = std::distance(TE->Scalars.begin(),

          assert(Lane >= 0 && "Lane not set");

                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");

          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))

          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))

        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {

            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);

                       << "SLP: gets ready (mem): " << *DepBundle << "\n");

        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {

            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);

                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);

        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");

    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&

          ReadyList.insert(SD);

                     << "SLP: initially in ready list: " << *SD << "\n");

    std::optional<ScheduleData *>

        const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

        ScheduleData *PrevLoadStore,
        ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

    void resetSchedule();

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;

  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {

    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  unsigned ReductionBitWidth = 0;

  unsigned BaseGraphSize = 1;

  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
  struct ChildIteratorType
      ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

    return R.VectorizableTree[0].get();

    return {N->UserTreeIndices.begin(), N->Container};

    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {

    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

    return nodes_iterator(R->VectorizableTree.begin());

    return nodes_iterator(R->VectorizableTree.end());

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
    OS << Entry->Idx << ".\n";

    for (auto *V : Entry->Scalars) {

      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;

    if (Entry->isGather())

    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
4537 for (
auto *
I : DeletedInstructions) {
4538 if (!
I->getParent()) {
4541 if (isa<PHINode>(
I))
4543 I->insertBefore(
F->getEntryBlock(),
4544 F->getEntryBlock().getFirstNonPHIIt());
4546 I->insertBefore(
F->getEntryBlock().getTerminator()->getIterator());
4549 for (
Use &U :
I->operands()) {
4550 auto *
Op = dyn_cast<Instruction>(U.get());
4551 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
4555 I->dropAllReferences();
4557 for (
auto *
I : DeletedInstructions) {
4559 "trying to erase instruction with users.");
4560 I->eraseFromParent();
4566#ifdef EXPENSIVE_CHECKS
4577 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4578 "Expected non-empty mask.");
4581 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4583 Reuses[Mask[
I]] = Prev[
I];
4591 bool BottomOrder =
false) {
4592 assert(!Mask.empty() &&
"Expected non-empty mask.");
4593 unsigned Sz = Mask.size();
4596 if (Order.
empty()) {
4598 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
4600 PrevOrder.
swap(Order);
4603 for (
unsigned I = 0;
I < Sz; ++
I)
4605 Order[
I] = PrevOrder[Mask[
I]];
4607 return Data.value() == Sz ||
Data.index() ==
Data.value();
4616 if (Order.
empty()) {
4618 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4628 for (
unsigned I = 0;
I < Sz; ++
I)
4630 Order[MaskOrder[
I]] =
I;
4634std::optional<BoUpSLP::OrdersType>
4636 assert(TE.isGather() &&
"Expected gather node only.");
4640 Type *ScalarTy = GatheredScalars.
front()->getType();
4641 int NumScalars = GatheredScalars.
size();
4643 return std::nullopt;
4650 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4652 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4655 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4656 return std::nullopt;
4657 OrdersType CurrentOrder(NumScalars, NumScalars);
4658 if (GatherShuffles.
size() == 1 &&
4660 Entries.front().front()->isSame(TE.Scalars)) {
4663 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4664 return CurrentOrder;
4668 return all_of(Mask, [&](
int I) {
4675 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4676 (Entries.size() != 1 ||
4677 Entries.front().front()->ReorderIndices.empty())) ||
4678 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4679 return std::nullopt;
4684 for (
int I : seq<int>(0, NumParts)) {
4685 if (ShuffledSubMasks.
test(
I))
4687 const int VF = GetVF(
I);
4693 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4694 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4695 ShuffledSubMasks.
set(
I);
4699 int FirstMin = INT_MAX;
4700 int SecondVecFound =
false;
4701 for (
int K : seq<int>(Limit)) {
4702 int Idx = Mask[
I * PartSz + K];
4704 Value *V = GatheredScalars[
I * PartSz + K];
4706 SecondVecFound =
true;
4715 SecondVecFound =
true;
4719 FirstMin = (FirstMin / PartSz) * PartSz;
4721 if (SecondVecFound) {
4722 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4723 ShuffledSubMasks.
set(
I);
4726 for (
int K : seq<int>(Limit)) {
4727 int Idx = Mask[
I * PartSz + K];
4731 if (
Idx >= PartSz) {
4732 SecondVecFound =
true;
4735 if (CurrentOrder[
I * PartSz +
Idx] >
4736 static_cast<unsigned>(
I * PartSz + K) &&
4737 CurrentOrder[
I * PartSz +
Idx] !=
4738 static_cast<unsigned>(
I * PartSz +
Idx))
4739 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4742 if (SecondVecFound) {
4743 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4744 ShuffledSubMasks.
set(
I);
4750 if (!ExtractShuffles.
empty())
4751 TransformMaskToOrder(
4752 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4753 if (!ExtractShuffles[
I])
4756 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
4757 for (
unsigned Idx : seq<unsigned>(Sz)) {
4758 int K =
I * PartSz +
Idx;
4761 if (!TE.ReuseShuffleIndices.empty())
4762 K = TE.ReuseShuffleIndices[K];
4765 if (!TE.ReorderIndices.empty())
4766 K = std::distance(TE.ReorderIndices.begin(),
4767 find(TE.ReorderIndices, K));
4768 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4771 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4773 .getKnownMinValue());
4778 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4779 if (ShuffledSubMasks.
any())
4780 return std::nullopt;
4781 PartSz = NumScalars;
4784 if (!Entries.empty())
4785 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4786 if (!GatherShuffles[
I])
4788 return std::max(Entries[
I].front()->getVectorFactor(),
4789 Entries[
I].back()->getVectorFactor());
4792 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4793 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4794 return std::nullopt;
4795 return std::move(CurrentOrder);
4800 bool CompareOpcodes =
true) {
4804 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4805 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4806 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4807 (!GEP2 || GEP2->getNumOperands() == 2) &&
4808 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
4809 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
4812 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4816template <
typename T>
4818 Align CommonAlignment = cast<T>(VL.
front())->getAlign();
4820 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->
getAlign());
4821 return CommonAlignment;
4827 "Order is empty. Please check it before using isReverseOrder.");
4828 unsigned Sz = Order.
size();
4830 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4841static std::optional<Value *>
4847 const SCEV *PtrSCEVLowest =
nullptr;
4848 const SCEV *PtrSCEVHighest =
nullptr;
4854 return std::nullopt;
4856 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4857 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4861 if (isa<SCEVCouldNotCompute>(Diff))
4862 return std::nullopt;
4864 PtrSCEVLowest = PtrSCEV;
4868 if (isa<SCEVCouldNotCompute>(Diff1))
4869 return std::nullopt;
4871 PtrSCEVHighest = PtrSCEV;
4877 if (isa<SCEVCouldNotCompute>(Dist))
4878 return std::nullopt;
4879 int Size =
DL.getTypeStoreSize(ElemTy);
4880 auto TryGetStride = [&](
const SCEV *Dist,
4881 const SCEV *Multiplier) ->
const SCEV * {
4882 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4883 if (M->getOperand(0) == Multiplier)
4884 return M->getOperand(1);
4885 if (M->getOperand(1) == Multiplier)
4886 return M->getOperand(0);
4889 if (Multiplier == Dist)
4894 const SCEV *Stride =
nullptr;
4895 if (
Size != 1 || SCEVs.
size() > 2) {
4897 Stride = TryGetStride(Dist, Sz);
4899 return std::nullopt;
4901 if (!Stride || isa<SCEVConstant>(Stride))
4902 return std::nullopt;
4905 using DistOrdPair = std::pair<int64_t, int>;
4907 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4909 bool IsConsecutive =
true;
4910 for (
const SCEV *PtrSCEV : SCEVs) {
4912 if (PtrSCEV != PtrSCEVLowest) {
4914 const SCEV *Coeff = TryGetStride(Diff, Stride);
4916 return std::nullopt;
4917 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4918 if (!SC || isa<SCEVCouldNotCompute>(SC))
4919 return std::nullopt;
4923 return std::nullopt;
4924 Dist = SC->getAPInt().getZExtValue();
4928 return std::nullopt;
4929 auto Res = Offsets.emplace(Dist, Cnt);
4931 return std::nullopt;
4933 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4936 if (Offsets.size() != SCEVs.
size())
4937 return std::nullopt;
4938 SortedIndices.
clear();
4939 if (!IsConsecutive) {
4943 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4944 SortedIndices[Cnt] = Pair.second;
4954static std::pair<InstructionCost, InstructionCost>
4970 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4973 Mask, NumSrcElts, NumSubElts,
Index)) {
4974 if (
Index + NumSubElts > NumSrcElts &&
4975 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
4991 if (
Index % SubVecVF == 0) {
4999 std::iota(
Mask.begin(),
Mask.end(), 0);
5000 for (
unsigned I : seq<unsigned>(SubVecVF))
5003 Vec = Generator(Vec, V, Mask);
5007 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
5019 unsigned SubVecVF,
unsigned Index) {
5020 if (
Index % SubVecVF == 0) {
5028 std::iota(Mask.begin(), Mask.end(),
Index);
5036 unsigned *BestVF,
bool TryRecursiveCheck)
const {
5049 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
5055 const unsigned Sz = VL.
size();
5057 auto *POIter = PointerOps.
begin();
5058 for (
Value *V : VL) {
5059 auto *L = dyn_cast<LoadInst>(V);
5060 if (!L || !L->isSimple())
5062 *POIter = L->getPointerOperand();
5071 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5091 if (Order.
empty()) {
5092 Ptr0 = PointerOps.
front();
5093 PtrN = PointerOps.
back();
5095 Ptr0 = PointerOps[Order.
front()];
5096 PtrN = PointerOps[Order.
back()];
5098 std::optional<int> Diff =
5101 if (
static_cast<unsigned>(*Diff) == Sz - 1)
5107 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5121 auto IsAnyPointerUsedOutGraph =
5122 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
5123 return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
5124 return !isVectorized(U) && !MustGather.contains(U);
5127 const unsigned AbsoluteDiff = std::abs(*Diff);
5128 if (IsPossibleStrided &&
5129 (IsAnyPointerUsedOutGraph ||
5130 (AbsoluteDiff > Sz &&
5133 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
5134 *Diff == -(
static_cast<int>(Sz) - 1))) {
5135 int Stride = *Diff /
static_cast<int>(Sz - 1);
5136 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
5148 else if (
Ptr != Ptr0)
5152 if (((Dist / Stride) * Stride) != Dist ||
5153 !Dists.
insert(Dist).second)
5156 if (Dists.
size() == Sz)
5165 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment,
5167 bool ProfitableGatherPointers) {
5172 auto [ScalarGEPCost, VectorGEPCost] =
5174 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
5180 VecTy->getNumElements());
5181 if (
static_cast<unsigned>(
count_if(
5182 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.
size() - 1 ||
5188 PtrVecTy, DemandedElts,
true,
false,
CostKind);
5207 false, CommonAlignment,
CostKind) +
5208 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5215 constexpr unsigned ListLimit = 4;
5216 if (!TryRecursiveCheck || VL.
size() < ListLimit)
5225 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
5235 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End; Cnt += VF) {
5248 DemandedElts.
setBits(Cnt, Cnt + VF);
5263 if (!DemandedElts.
isZero()) {
5268 for (
unsigned Idx : seq<unsigned>(VL.
size()))
5269 if (DemandedElts[
Idx])
5276 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
5281 LI0->getPointerOperand(),
5282 Instruction::GetElementPtr,
CostKind, ScalarTy,
5286 if (
static_cast<unsigned>(
5287 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5288 PointerOps.
size() - 1 ||
5308 LI0->getPointerAddressSpace(),
CostKind,
5314 LI0->getPointerOperand(),
5321 LI0->getPointerOperand(),
5331 for (
int Idx : seq<int>(0, VL.
size()))
5341 if (MaskedGatherCost >= VecLdCost &&
5354 bool ProfitableGatherPointers =
5355 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
5356 return L->isLoopInvariant(V);
5358 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
5359 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
5361 (
GEP &&
GEP->getNumOperands() == 2 &&
5362 isa<Constant, Instruction>(
GEP->getOperand(1)));
5369 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5370 ProfitableGatherPointers))
5383 "Expected list of pointer operands.");
5393 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5395 SortedIndices.
clear();
5397 auto Key = std::make_pair(BBs[Cnt + 1],
5401 std::optional<int> Diff = getPointersDiff(
5402 ElemTy, std::get<0>(Base.front()), ElemTy,
5408 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5414 if (Bases.
size() > VL.
size() / 2 - 1)
5418 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5425 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5426 Bases.
front().second.size() == VL.
size()))
5431 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5440 FirstPointers.
insert(P1);
5441 SecondPointers.
insert(P2);
5447 "Unable to find matching root.");
5450 for (
auto &
Base : Bases) {
5451 for (
auto &Vec :
Base.second) {
5452 if (Vec.size() > 1) {
5453 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5454 const std::tuple<Value *, int, unsigned> &
Y) {
5455 return std::get<1>(
X) < std::get<1>(
Y);
5457 int InitialOffset = std::get<1>(Vec[0]);
5458 bool AnyConsecutive =
5460 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5464 if (!AnyConsecutive)
5469 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5473 for (
auto &
T : Bases)
5474 for (
const auto &Vec :
T.second)
5475 for (
const auto &
P : Vec)
5479 "Expected SortedIndices to be the size of VL");
5483std::optional<BoUpSLP::OrdersType>
5485 assert(TE.isGather() &&
"Expected gather node only.");
5486 Type *ScalarTy = TE.Scalars[0]->getType();
5489 Ptrs.
reserve(TE.Scalars.size());
5491 BBs.
reserve(TE.Scalars.size());
5492 for (
Value *V : TE.Scalars) {
5493 auto *L = dyn_cast<LoadInst>(V);
5494 if (!L || !L->isSimple())
5495 return std::nullopt;
5501 if (!LoadEntriesToVectorize.
contains(TE.Idx) &&
5503 return std::move(Order);
5504 return std::nullopt;
5515 if (VU->
getType() != V->getType())
5518 if (!VU->
hasOneUse() && !V->hasOneUse())
5524 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5530 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
5531 bool IsReusedIdx =
false;
5533 if (IE2 == VU && !IE1)
5535 if (IE1 == V && !IE2)
5536 return V->hasOneUse();
5537 if (IE1 && IE1 != V) {
5539 IsReusedIdx |= ReusedIdx.
test(Idx1);
5540 ReusedIdx.
set(Idx1);
5541 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5544 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5546 if (IE2 && IE2 != VU) {
5548 IsReusedIdx |= ReusedIdx.
test(Idx2);
5549 ReusedIdx.
set(Idx2);
5550 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5553 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5555 }
while (!IsReusedIdx && (IE1 || IE2));
5559std::optional<BoUpSLP::OrdersType>
5563 if (!TE.ReuseShuffleIndices.empty()) {
5565 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI) &&
5566 "Reshuffling scalars not yet supported for nodes with padding");
5569 return std::nullopt;
5577 unsigned Sz = TE.Scalars.size();
5578 if (TE.isGather()) {
5579 if (std::optional<OrdersType> CurrentOrder =
5584 ::addMask(Mask, TE.ReuseShuffleIndices);
5585 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5586 unsigned Sz = TE.Scalars.size();
5587 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
5590 Res[
Idx + K * Sz] =
I + K * Sz;
5592 return std::move(Res);
5595 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5597 2 * TE.getVectorFactor())) == 1)
5598 return std::nullopt;
5602 if (TE.ReorderIndices.empty())
5603 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5606 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5607 unsigned VF = ReorderMask.
size();
5611 for (
unsigned I = 0;
I < VF;
I += Sz) {
5613 unsigned UndefCnt = 0;
5614 unsigned Limit = std::min(Sz, VF -
I);
5623 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5625 return std::nullopt;
5627 for (
unsigned K = 0; K < NumParts; ++K) {
5628 unsigned Idx = Val + Sz * K;
5630 ResOrder[
Idx] =
I + K;
5633 return std::move(ResOrder);
5635 unsigned VF = TE.getVectorFactor();
5638 TE.ReuseShuffleIndices.end());
5639 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5641 if (isa<PoisonValue>(V))
5643 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5644 return Idx && *Idx < Sz;
5646 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
5647 "by BinaryOperator and CastInst.");
5649 if (TE.ReorderIndices.empty())
5650 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5653 for (
unsigned I = 0;
I < VF; ++
I) {
5654 int &
Idx = ReusedMask[
I];
5657 Value *V = TE.Scalars[ReorderMask[
Idx]];
5659 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5665 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5666 auto *It = ResOrder.
begin();
5667 for (
unsigned K = 0; K < VF; K += Sz) {
5671 std::iota(SubMask.begin(), SubMask.end(), 0);
5673 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5674 std::advance(It, Sz);
5677 return Data.index() ==
Data.value();
5679 return std::nullopt;
5680 return std::move(ResOrder);
5682 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5683 any_of(TE.UserTreeIndices,
5685 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5687 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
5688 return std::nullopt;
5689 if ((TE.State == TreeEntry::Vectorize ||
5690 TE.State == TreeEntry::StridedVectorize) &&
5691 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5692 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5693 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported by "
5694 "BinaryOperator and CastInst.");
5695 return TE.ReorderIndices;
5697 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5698 if (!TE.ReorderIndices.empty())
5699 return TE.ReorderIndices;
5702 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
5703 if (!V->hasNUsesOrMore(1))
5705 auto *
II = dyn_cast<InsertElementInst>(*V->user_begin());
5710 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
5712 II = dyn_cast<InsertElementInst>(
II->getOperand(0));
5718 assert(BB1 != BB2 &&
"Expected different basic blocks.");
5719 auto *NodeA = DT->
getNode(BB1);
5720 auto *NodeB = DT->
getNode(BB2);
5721 assert(NodeA &&
"Should only process reachable instructions");
5722 assert(NodeB &&
"Should only process reachable instructions");
5723 assert((NodeA == NodeB) ==
5724 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5725 "Different nodes should have different DFS numbers");
5726 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5728 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5729 Value *V1 = TE.Scalars[I1];
5730 Value *V2 = TE.Scalars[I2];
5731 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5733 if (isa<PoisonValue>(V1))
5735 if (isa<PoisonValue>(V2))
5741 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
5742 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5743 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5744 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5745 FirstUserOfPhi2->getParent());
5746 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5747 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5748 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5749 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5755 if (UserBVHead[I1] && !UserBVHead[I2])
5757 if (!UserBVHead[I1])
5759 if (UserBVHead[I1] == UserBVHead[I2])
5762 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
5764 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5771 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5772 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5773 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5774 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5777 if (EE1->getOperand(0) == EE2->getOperand(0))
5779 if (!Inst1 && Inst2)
5781 if (Inst1 && Inst2) {
5789 "Expected either instructions or arguments vector operands.");
5790 return P1->getArgNo() < P2->getArgNo();
5795 std::iota(Phis.
begin(), Phis.
end(), 0);
5798 return std::nullopt;
5799 return std::move(Phis);
5801 if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5805 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5806 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5807 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5809 auto *EE = dyn_cast<ExtractElementInst>(V);
5810 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5816 canReuseExtract(TE.Scalars, CurrentOrder,
true);
5817 if (Reuse || !CurrentOrder.
empty())
5818 return std::move(CurrentOrder);
5826 int Sz = TE.Scalars.size();
5828 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5830 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5831 if (It == TE.Scalars.begin())
5834 if (It != TE.Scalars.end()) {
5836 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5851 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5854 return std::move(Order);
5859 return std::nullopt;
5860 if (TE.Scalars.size() >= 3)
5865 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5869 CurrentOrder, PointerOps);
5871 return std::move(CurrentOrder);
5877 return CurrentOrder;
5879 return std::nullopt;
5889 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5891 if (Cluster != FirstCluster)
5897void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
5900 const unsigned Sz =
TE.Scalars.size();
5902 if (!
TE.isGather() ||
5909 addMask(NewMask,
TE.ReuseShuffleIndices);
5911 TE.ReorderIndices.clear();
5918 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5919 *
End =
TE.ReuseShuffleIndices.end();
5920 It !=
End; std::advance(It, Sz))
5921 std::iota(It, std::next(It, Sz), 0);
5927 "Expected same size of orders");
5928 unsigned Sz = Order.
size();
5930 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5931 if (Order[
Idx] != Sz)
5932 UsedIndices.
set(Order[
Idx]);
5934 if (SecondaryOrder.
empty()) {
5935 for (
unsigned Idx : seq<unsigned>(0, Sz))
5936 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5939 for (
unsigned Idx : seq<unsigned>(0, Sz))
5940 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5941 !UsedIndices.
test(SecondaryOrder[
Idx]))
5942 Order[
Idx] = SecondaryOrder[
Idx];
5962 ExternalUserReorderMap;
5967 const std::unique_ptr<TreeEntry> &TE) {
5970 findExternalStoreUsersReorderIndices(TE.get());
5971 if (!ExternalUserReorderIndices.
empty()) {
5972 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5974 std::move(ExternalUserReorderIndices));
5980 if (TE->hasState() && TE->isAltShuffle()) {
5983 unsigned Opcode0 = TE->getOpcode();
5984 unsigned Opcode1 = TE->getAltOpcode();
5987 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5988 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5994 if (std::optional<OrdersType> CurrentOrder =
6004 const TreeEntry *UserTE = TE.get();
6006 if (UserTE->UserTreeIndices.size() != 1)
6009 return EI.UserTE->State == TreeEntry::Vectorize &&
6010 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
6013 UserTE = UserTE->UserTreeIndices.back().UserTE;
6016 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
6017 if (!(TE->State == TreeEntry::Vectorize ||
6018 TE->State == TreeEntry::StridedVectorize) ||
6019 !TE->ReuseShuffleIndices.empty())
6020 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
6021 if (TE->State == TreeEntry::Vectorize &&
6022 TE->getOpcode() == Instruction::PHI)
6023 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
6028 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
6029 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6030 auto It = VFToOrderedEntries.
find(VF);
6031 if (It == VFToOrderedEntries.
end())
6046 for (
const TreeEntry *OpTE : OrderedEntries) {
6049 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6052 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6054 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6055 auto It = GathersToOrders.find(OpTE);
6056 if (It != GathersToOrders.end())
6059 if (OpTE->hasState() && OpTE->isAltShuffle()) {
6060 auto It = AltShufflesToOrders.find(OpTE);
6061 if (It != AltShufflesToOrders.end())
6064 if (OpTE->State == TreeEntry::Vectorize &&
6065 OpTE->getOpcode() == Instruction::PHI) {
6066 auto It = PhisToOrders.
find(OpTE);
6067 if (It != PhisToOrders.
end())
6070 return OpTE->ReorderIndices;
6073 auto It = ExternalUserReorderMap.
find(OpTE);
6074 if (It != ExternalUserReorderMap.
end()) {
6075 const auto &ExternalUserReorderIndices = It->second;
6079 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6080 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
6081 ExternalUserReorderIndices.size();
6083 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
6084 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6091 if (OpTE->State == TreeEntry::Vectorize &&
6092 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6093 assert(!OpTE->isAltShuffle() &&
6094 "Alternate instructions are only supported by BinaryOperator "
6098 unsigned E = Order.size();
6101 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6104 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6106 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6109 if (OrdersUses.empty())
6112 unsigned IdentityCnt = 0;
6113 unsigned FilledIdentityCnt = 0;
6115 for (
auto &Pair : OrdersUses) {
6117 if (!Pair.first.empty())
6118 FilledIdentityCnt += Pair.second;
6119 IdentityCnt += Pair.second;
6124 unsigned Cnt = IdentityCnt;
6125 for (
auto &Pair : OrdersUses) {
6129 if (Cnt < Pair.second ||
6130 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6131 Cnt == Pair.second && !BestOrder.
empty() &&
6134 BestOrder = Pair.first;
6147 unsigned E = BestOrder.
size();
6149 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6152 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6154 if (TE->Scalars.size() != VF) {
6155 if (TE->ReuseShuffleIndices.size() == VF) {
6161 return EI.UserTE->Scalars.size() == VF ||
6162 EI.UserTE->Scalars.size() ==
6165 "All users must be of VF size.");
6173 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6178 return isa<ShuffleVectorInst>(
6179 EI.UserTE->getMainOp());
6181 "Does not know how to reorder.");
6185 reorderNodeWithReuses(*TE, Mask);
6189 if ((TE->State == TreeEntry::Vectorize ||
6190 TE->State == TreeEntry::StridedVectorize) &&
6193 (
SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6194 assert(!TE->isAltShuffle() &&
6195 "Alternate instructions are only supported by BinaryOperator "
6200 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6201 TE->reorderOperands(Mask);
6204 TE->reorderOperands(Mask);
6205 assert(TE->ReorderIndices.empty() &&
6206 "Expected empty reorder sequence.");
6209 if (!TE->ReuseShuffleIndices.empty()) {
6216 addMask(NewReuses, TE->ReuseShuffleIndices);
6217 TE->ReuseShuffleIndices.swap(NewReuses);
6223bool BoUpSLP::canReorderOperands(
6224 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6227 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
6228 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
6229 return OpData.first ==
I &&
6230 (OpData.second->State == TreeEntry::Vectorize ||
6231 OpData.second->State == TreeEntry::StridedVectorize);
6234 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
6236 if (
any_of(TE->UserTreeIndices,
6237 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6241 Edges.emplace_back(
I, TE);
6247 if (TE->State != TreeEntry::Vectorize &&
6248 TE->State != TreeEntry::StridedVectorize &&
6249 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6253 TreeEntry *
Gather =
nullptr;
6255 [&
Gather, UserTE,
I](TreeEntry *TE) {
6256 assert(TE->State != TreeEntry::Vectorize &&
6257 TE->State != TreeEntry::StridedVectorize &&
6258 "Only non-vectorized nodes are expected.");
6259 if (
any_of(TE->UserTreeIndices,
6260 [UserTE,
I](
const EdgeInfo &EI) {
6261 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6263 assert(TE->isSame(UserTE->getOperand(
I)) &&
6264 "Operand entry does not match operands.");
6285 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6286 if (TE->State != TreeEntry::Vectorize &&
6287 TE->State != TreeEntry::StridedVectorize)
6289 if (std::optional<OrdersType> CurrentOrder =
6291 OrderedEntries.
insert(TE.get());
6292 if (!(TE->State == TreeEntry::Vectorize ||
6293 TE->State == TreeEntry::StridedVectorize) ||
6294 !TE->ReuseShuffleIndices.empty())
6295 GathersToOrders.
insert(TE.get());
6304 while (!OrderedEntries.
empty()) {
6309 for (TreeEntry *TE : OrderedEntries) {
6310 if (!(TE->State == TreeEntry::Vectorize ||
6311 TE->State == TreeEntry::StridedVectorize ||
6312 (TE->isGather() && GathersToOrders.
contains(TE))) ||
6313 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6316 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6318 !Visited.
insert(TE).second) {
6324 for (
EdgeInfo &EI : TE->UserTreeIndices)
6328 for (TreeEntry *TE : Filtered)
6329 OrderedEntries.remove(TE);
6331 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6333 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
6334 return Data1.first->Idx > Data2.first->Idx;
6336 for (
auto &
Data : UsersVec) {
6339 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
6341 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6342 OrderedEntries.remove(
Op.second);
6355 for (
const auto &
Op :
Data.second) {
6356 TreeEntry *OpTE =
Op.second;
6357 if (!VisitedOps.
insert(OpTE).second)
6359 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6361 const auto Order = [&]() ->
const OrdersType {
6362 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6365 return OpTE->ReorderIndices;
6369 if (Order.size() == 1)
6372 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
6373 return P.second == OpTE;
6376 if (OpTE->State == TreeEntry::Vectorize &&
6377 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6378 assert(!OpTE->isAltShuffle() &&
6379 "Alternate instructions are only supported by BinaryOperator "
6383 unsigned E = Order.size();
6386 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6389 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6392 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6394 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
6395 const auto AllowsReordering = [&](
const TreeEntry *TE) {
6396 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6397 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6398 (IgnoreReorder && TE->Idx == 0))
6400 if (TE->isGather()) {
6409 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
6410 TreeEntry *UserTE = EI.
UserTE;
6411 if (!VisitedUsers.
insert(UserTE).second)
6416 if (AllowsReordering(UserTE))
6424 if (
static_cast<unsigned>(
count_if(
6425 Ops, [UserTE, &AllowsReordering](
6426 const std::pair<unsigned, TreeEntry *> &
Op) {
6427 return AllowsReordering(
Op.second) &&
6430 return EI.UserTE == UserTE;
6432 })) <= Ops.
size() / 2)
6433 ++Res.first->second;
6436 if (OrdersUses.empty()) {
6437 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6438 OrderedEntries.remove(
Op.second);
6442 unsigned IdentityCnt = 0;
6443 unsigned VF =
Data.second.front().second->getVectorFactor();
6445 for (
auto &Pair : OrdersUses) {
6447 IdentityCnt += Pair.second;
6452 unsigned Cnt = IdentityCnt;
6453 for (
auto &Pair : OrdersUses) {
6457 if (Cnt < Pair.second) {
6459 BestOrder = Pair.first;
6467 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6468 OrderedEntries.remove(
Op.second);
6477 unsigned E = BestOrder.
size();
6479 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6481 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
6482 TreeEntry *TE =
Op.second;
6483 OrderedEntries.remove(TE);
6484 if (!VisitedOps.
insert(TE).second)
6486 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
6487 reorderNodeWithReuses(*TE, Mask);
6491 if (TE->State != TreeEntry::Vectorize &&
6492 TE->State != TreeEntry::StridedVectorize &&
6493 (TE->State != TreeEntry::ScatterVectorize ||
6494 TE->ReorderIndices.empty()))
6496 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
6497 TE->ReorderIndices.empty()) &&
6498 "Non-matching sizes of user/operand entries.");
6500 if (IgnoreReorder && TE == VectorizableTree.front().get())
6501 IgnoreReorder =
false;
6504 for (TreeEntry *
Gather : GatherOps) {
6506 "Unexpected reordering of gathers.");
6507 if (!
Gather->ReuseShuffleIndices.empty()) {
6513 OrderedEntries.remove(
Gather);
6517 if (
Data.first->State != TreeEntry::Vectorize ||
6518 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6519 Data.first->getMainOp()) ||
6520 Data.first->isAltShuffle())
6521 Data.first->reorderOperands(Mask);
6522 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
6523 Data.first->isAltShuffle() ||
6524 Data.first->State == TreeEntry::StridedVectorize) {
6528 if (
Data.first->ReuseShuffleIndices.empty() &&
6529 !
Data.first->ReorderIndices.empty() &&
6530 !
Data.first->isAltShuffle()) {
6533 OrderedEntries.insert(
Data.first);
6541 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6542 VectorizableTree.front()->ReuseShuffleIndices.empty())
6543 VectorizableTree.front()->ReorderIndices.clear();
6546Instruction *BoUpSLP::getRootEntryInstruction(
const TreeEntry &Entry)
const {
6547 if ((Entry.getOpcode() == Instruction::Store ||
6548 Entry.getOpcode() == Instruction::Load) &&
6549 Entry.State == TreeEntry::StridedVectorize &&
6550 !Entry.ReorderIndices.empty() &&
isReverseOrder(Entry.ReorderIndices))
6551 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6552 return dyn_cast<Instruction>(Entry.Scalars.front());
6559 for (
auto &TEPtr : VectorizableTree) {
6560 TreeEntry *Entry = TEPtr.get();
6563 if (Entry->isGather())
6567 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6568 Value *Scalar = Entry->Scalars[Lane];
6569 if (!isa<Instruction>(Scalar))
6572 auto It = ScalarToExtUses.
find(Scalar);
6573 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
6577 const auto ExtI = ExternallyUsedValues.
find(Scalar);
6578 if (ExtI != ExternallyUsedValues.
end()) {
6579 int FoundLane = Entry->findLaneForValue(Scalar);
6580 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
6581 << FoundLane <<
" from " << *Scalar <<
".\n");
6582 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6583 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
6586 for (
User *U : Scalar->users()) {
6594 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6599 !UseEntries.empty()) {
6603 if (
any_of(UseEntries, [&](TreeEntry *UseEntry) {
6604 return UseEntry->State == TreeEntry::ScatterVectorize ||
6606 Scalar, getRootEntryInstruction(*UseEntry), TLI,
6609 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
6612 [](TreeEntry *UseEntry) {
6613 return UseEntry->isGather();
6619 if (It != ScalarToExtUses.
end()) {
6620 ExternalUses[It->second].User =
nullptr;
6625 if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6627 int FoundLane = Entry->findLaneForValue(Scalar);
6629 <<
" from lane " << FoundLane <<
" from " << *Scalar
6631 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6632 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
6641BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
6645 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6646 Value *V = TE->Scalars[Lane];
6648 if (!isa<Instruction>(V))
6655 for (
User *U : V->users()) {
6656 auto *SI = dyn_cast<StoreInst>(U);
6659 if (SI ==
nullptr || !SI->isSimple() || SI->getFunction() !=
F ||
6668 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6669 SI->getValueOperand()->getType(),
Ptr}];
6672 if (StoresVec.size() > Lane)
6674 if (!StoresVec.empty()) {
6676 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6677 SI->getValueOperand()->getType(),
6678 StoresVec.front()->getPointerOperand(), *
DL, *SE,
6684 StoresVec.push_back(SI);
6689 for (
auto &
P : PtrToStoresMap) {
6690 Res[
I].swap(
P.second);
6697 OrdersType &ReorderIndices)
const {
6708 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
6710 std::optional<int> Diff =
6712 SI->getPointerOperand(), *
DL, *SE,
6718 if (StoreOffsetVec.
size() != StoresVec.
size())
6720 sort(StoreOffsetVec,
6721 [](
const std::pair<int, unsigned> &L,
6722 const std::pair<int, unsigned> &R) {
return L.first <
R.first; });
6725 for (
const auto &
P : StoreOffsetVec) {
6726 if (
Idx > 0 &&
P.first != PrevDist + 1)
6734 ReorderIndices.assign(StoresVec.
size(), 0);
6735 bool IsIdentity =
true;
6737 ReorderIndices[
P.second] =
I;
6738 IsIdentity &=
P.second ==
I;
6744 ReorderIndices.clear();
6751 for (
unsigned Idx : Order)
6758BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
6759 unsigned NumLanes =
TE->Scalars.size();
6772 if (StoresVec.
size() != NumLanes)
6777 if (!canFormVector(StoresVec, ReorderIndices))
6782 ExternalReorderIndices.
push_back(ReorderIndices);
6784 return ExternalReorderIndices;
6790 UserIgnoreList = &UserIgnoreLst;
6793 buildTree_rec(Roots, 0,
EdgeInfo());
6800 buildTree_rec(Roots, 0,
EdgeInfo());
6809 bool AddNew =
true) {
6817 for (
Value *V : VL) {
6818 auto *LI = dyn_cast<LoadInst>(V);
6821 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6823 bool IsFound =
false;
6824 for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
6825 assert(LI->getParent() ==
Data.front().first->getParent() &&
6826 LI->getType() ==
Data.front().first->getType() &&
6830 "Expected loads with the same type, same parent and same "
6831 "underlying pointer.");
6833 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
6834 Data.front().first->getPointerOperand(),
DL, SE,
6838 auto It = Map.find(*Dist);
6839 if (It != Map.end() && It->second != LI)
6841 if (It == Map.end()) {
6842 Data.emplace_back(LI, *Dist);
6843 Map.try_emplace(*Dist, LI);
6853 auto FindMatchingLoads =
6858 int &
Offset,
unsigned &Start) {
6860 return GatheredLoads.
end();
6870 std::optional<int> Dist =
6872 Data.front().first->getType(),
6873 Data.front().first->getPointerOperand(),
DL, SE,
6879 for (std::pair<LoadInst *, int>
P :
Data) {
6885 unsigned NumUniques = 0;
6886 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
6887 bool Used = DataLoads.
contains(Pair.first);
6888 if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
6892 Repeated.insert(Cnt);
6895 if (NumUniques > 0 &&
6896 (Loads.
size() == NumUniques ||
6897 (Loads.
size() - NumUniques >= 2 &&
6898 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
6904 return std::next(GatheredLoads.
begin(),
Idx);
6908 return GatheredLoads.
end();
6910 for (
ArrayRef<std::pair<LoadInst *, int>>
Data : ClusteredLoads) {
6914 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
6916 while (It != GatheredLoads.
end()) {
6917 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
6918 for (
unsigned Idx : LocalToAdd)
6920 ToAdd.
insert(LocalToAdd.begin(), LocalToAdd.end());
6921 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
6925 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6929 for (
unsigned Idx : seq<unsigned>(
Data.size())) {
6938 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6939 return PD.front().first->getParent() == LI->
getParent() &&
6940 PD.front().first->getType() == LI->
getType();
6942 while (It != GatheredLoads.
end()) {
6945 std::next(It), GatheredLoads.
end(),
6946 [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6947 return PD.front().first->getParent() == LI->getParent() &&
6948 PD.front().first->getType() == LI->getType();
6952 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
6953 AddNewLoads(GatheredLoads.emplace_back());
6958void BoUpSLP::tryToVectorizeGatheredLoads(
6961 8> &GatheredLoads) {
6962 GatheredLoadsEntriesFirst = VectorizableTree.size();
6965 LoadEntriesToVectorize.
size());
6966 for (
auto [
Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6967 Set.insert(VectorizableTree[
Idx]->Scalars.begin(),
6968 VectorizableTree[
Idx]->Scalars.end());
6971 auto LoadSorter = [](
const std::pair<LoadInst *, int> &L1,
6972 const std::pair<LoadInst *, int> &L2) {
6973 return L1.second > L2.second;
6979 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6980 auto *Ty =
getWidenedType(Loads.front()->getType(), Loads.size());
6988 bool Final,
unsigned MaxVF) {
6990 unsigned StartIdx = 0;
6995 *
TTI, Loads.
front()->getType(), MaxVF);
6997 *
TTI, Loads.
front()->getType(), NumElts - 1)) {
7003 if (Final && CandidateVFs.
empty())
7006 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
7007 for (
unsigned NumElts : CandidateVFs) {
7008 if (Final && NumElts > BestVF)
7011 for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
7015 if (VectorizedLoads.count(Slice.
front()) ||
7016 VectorizedLoads.count(Slice.
back()) ||
7022 bool AllowToVectorize =
false;
7030 if (LI->hasOneUse())
7036 if (
static_cast<unsigned int>(std::distance(
7037 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7039 if (!IsLegalBroadcastLoad)
7043 for (
User *U : LI->users()) {
7044 if (
auto *UI = dyn_cast<Instruction>(U); UI &&
isDeleted(UI))
7046 for (
const TreeEntry *UTE : getTreeEntries(U)) {
7047 for (
int I : seq<int>(UTE->getNumOperands())) {
7049 return V == LI || isa<PoisonValue>(V);
7059 AllowToVectorize = CheckIfAllowed(Slice);
7063 any_of(ValueToGatherNodes.at(Slice.front()),
7064 [=](
const TreeEntry *TE) {
7065 return TE->Scalars.size() == 2 &&
7066 ((TE->Scalars.front() == Slice.front() &&
7067 TE->Scalars.back() == Slice.back()) ||
7068 (TE->Scalars.front() == Slice.back() &&
7069 TE->Scalars.back() == Slice.front()));
7074 if (AllowToVectorize) {
7079 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
7081 PointerOps, &BestVF);
7083 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7085 if (MaskedGatherVectorized.
empty() ||
7086 Cnt >= MaskedGatherVectorized.
back() + NumElts)
7091 Results.emplace_back(Values, LS);
7092 VectorizedLoads.insert(Slice.begin(), Slice.end());
7095 if (Cnt == StartIdx)
7096 StartIdx += NumElts;
7099 if (StartIdx >= Loads.
size())
7103 if (!MaskedGatherVectorized.
empty() &&
7104 Cnt < MaskedGatherVectorized.
back() + NumElts)
7110 if (!AllowToVectorize || BestVF == 0)
7114 for (
unsigned Cnt : MaskedGatherVectorized) {
7116 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
7120 VectorizedLoads.insert(Slice.
begin(), Slice.
end());
7122 if (Cnt == StartIdx)
7123 StartIdx += NumElts;
7127 if (!VectorizedLoads.contains(LI))
7128 NonVectorized.push_back(LI);
7132 auto ProcessGatheredLoads =
7135 bool Final =
false) {
7137 for (
ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7138 if (LoadsDists.size() <= 1) {
7139 NonVectorized.
push_back(LoadsDists.back().first);
7144 transform(LoadsDists, OriginalLoads.begin(),
7145 [](
const std::pair<LoadInst *, int> &L) ->
LoadInst * {
7150 unsigned MaxConsecutiveDistance = 0;
7151 unsigned CurrentConsecutiveDist = 1;
7152 int LastDist = LocalLoadsDists.
front().second;
7153 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7154 for (
const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7157 assert(LastDist >=
L.second &&
7158 "Expected first distance always not less than second");
7159 if (
static_cast<unsigned>(LastDist -
L.second) ==
7160 CurrentConsecutiveDist) {
7161 ++CurrentConsecutiveDist;
7162 MaxConsecutiveDistance =
7163 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7167 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7170 CurrentConsecutiveDist = 1;
7171 LastDist =
L.second;
7174 if (Loads.
size() <= 1)
7176 if (AllowMaskedGather)
7177 MaxConsecutiveDistance = Loads.
size();
7178 else if (MaxConsecutiveDistance < 2)
7183 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7184 Final, MaxConsecutiveDistance);
7186 OriginalLoads.size() == Loads.
size() &&
7187 MaxConsecutiveDistance == Loads.
size() &&
7192 VectorizedLoads.
clear();
7196 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7197 UnsortedNonVectorized, Final,
7198 OriginalLoads.size());
7199 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
7200 SortedNonVectorized.
swap(UnsortedNonVectorized);
7201 Results.swap(UnsortedResults);
7206 << Slice.
size() <<
")\n");
7208 for (
Value *L : Slice)
7210 SortedNonVectorized.
push_back(cast<LoadInst>(L));
7216 unsigned MaxVF = Slice.size();
7217 unsigned UserMaxVF = 0;
7218 unsigned InterleaveFactor = 0;
7223 std::optional<unsigned> InterleavedLoadsDistance = 0;
7225 std::optional<unsigned> CommonVF = 0;
7229 for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
7230 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
7233 UserMaxVF = std::max<unsigned>(UserMaxVF,
Idx - Pos + 1);
7235 if (*CommonVF == 0) {
7236 CommonVF =
E->Scalars.size();
7239 if (*CommonVF !=
E->Scalars.size())
7243 if (Pos !=
Idx && InterleavedLoadsDistance) {
7246 if (isa<Constant>(V))
7248 if (isVectorized(V))
7250 const auto &Nodes = ValueToGatherNodes.at(V);
7251 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7252 !is_contained(Slice, V);
7254 InterleavedLoadsDistance.reset();
7258 if (*InterleavedLoadsDistance == 0) {
7259 InterleavedLoadsDistance =
Idx - Pos;
7262 if ((
Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7263 (
Idx - Pos) / *InterleavedLoadsDistance < Order)
7264 InterleavedLoadsDistance.reset();
7265 Order = (
Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7269 DeinterleavedNodes.
clear();
7271 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7272 CommonVF.value_or(0) != 0) {
7273 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
7274 unsigned VF = *CommonVF;
7278 if (InterleaveFactor <= Slice.size() &&
7282 cast<LoadInst>(Slice.front())->getAlign(),
7283 cast<LoadInst>(Slice.front())
7287 UserMaxVF = InterleaveFactor * VF;
7289 InterleaveFactor = 0;
7294 unsigned ConsecutiveNodesSize = 0;
7295 if (!LoadEntriesToVectorize.
empty() && InterleaveFactor == 0 &&
7296 any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7297 [&, Slice = Slice](
const auto &
P) {
7299 return std::get<1>(
P).contains(V);
7301 if (It == Slice.end())
7304 VectorizableTree[std::get<0>(
P)]->Scalars;
7305 ConsecutiveNodesSize += VL.
size();
7306 unsigned Start = std::distance(Slice.begin(), It);
7307 unsigned Sz = Slice.size() - Start;
7308 return Sz < VL.
size() ||
7309 Slice.slice(std::distance(Slice.begin(), It),
7315 if (InterleaveFactor == 0 &&
7316 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7317 [&, Slice = Slice](
unsigned Idx) {
7319 SmallVector<Value *> PointerOps;
7320 return canVectorizeLoads(
7321 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7322 Slice[Idx * UserMaxVF], Order,
7324 LoadsState::ScatterVectorize;
7327 if (Slice.size() != ConsecutiveNodesSize)
7328 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7330 for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7331 bool IsVectorized =
true;
7332 for (
unsigned I = 0,
E = Slice.size();
I <
E;
I += VF) {
7334 Slice.
slice(
I, std::min(VF,
E -
I));
7339 if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7340 [&](
const auto &
P) {
7342 VectorizableTree[std::get<0>(
P)]
7347 unsigned Sz = VectorizableTree.size();
7348 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7349 if (Sz == VectorizableTree.size()) {
7350 IsVectorized =
false;
7353 if (InterleaveFactor > 0) {
7354 VF = 2 * (MaxVF / InterleaveFactor);
7355 InterleaveFactor = 0;
7364 NonVectorized.
append(SortedNonVectorized);
7366 return NonVectorized;
7368 for (
const auto &GLs : GatheredLoads) {
7369 const auto &
Ref = GLs.second;
7371 if (!
Ref.empty() && !NonVectorized.
empty() &&
7373 Ref.begin(),
Ref.end(), 0u,
7375 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->
unsigned {
7376 return S + LoadsDists.size();
7377 }) != NonVectorized.
size() &&
7378 IsMaskedGatherSupported(NonVectorized)) {
7380 for (
LoadInst *LI : NonVectorized) {
7388 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
7392 for (
unsigned Idx : LoadEntriesToVectorize) {
7393 const TreeEntry &
E = *VectorizableTree[
Idx];
7396 if (!
E.ReorderIndices.empty()) {
7403 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7407 if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7408 VectorizableTree.size())
7409 GatheredLoadsEntriesFirst.reset();
7416 Value *NeedsScheduling =
nullptr;
7417 for (
Value *V : VL) {
7420 if (!NeedsScheduling) {
7421 NeedsScheduling = V;
7426 return NeedsScheduling;
7437 bool AllowAlternate) {
7441 if (
auto *LI = dyn_cast<LoadInst>(V)) {
7444 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
7449 if (isa<ExtractElementInst, UndefValue>(V))
7451 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
7453 !isa<UndefValue>(EI->getIndexOperand()))
7456 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
7459 if ((isa<BinaryOperator, CastInst>(
I)) &&
7469 : cast<CastInst>(
I)->getOperand(0)->getType()));
7471 if (isa<CastInst>(
I)) {
7472 std::pair<size_t, size_t> OpVals =
7478 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
7480 if (CI->isCommutative())
7486 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
7500 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
7501 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7502 SubKey =
hash_value(Gep->getPointerOperand());
7506 !isa<ConstantInt>(
I->getOperand(1))) {
7514 return std::make_pair(Key, SubKey);
7524bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
7526 unsigned Opcode0 = S.getOpcode();
7527 unsigned Opcode1 = S.getAltOpcode();
7531 Opcode0, Opcode1, OpcodeMask))
7534 for (
unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7537 for (
Value *V : VL) {
7538 if (isa<PoisonValue>(V)) {
7543 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
7548 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7554 switch (Res.value_or(0)) {
7569 constexpr unsigned NumAltInsts = 3;
7570 unsigned NonInstCnt = 0;
7573 unsigned UndefCnt = 0;
7575 unsigned ExtraShuffleInsts = 0;
7584 return is_contained(Operands.back(), V);
7587 ++ExtraShuffleInsts;
7604 if (isa<Constant, ExtractElementInst>(V) ||
7606 if (isa<UndefValue>(V))
7612 if (!Res.second && Res.first->second == 1)
7613 ++ExtraShuffleInsts;
7614 ++Res.first->getSecond();
7615 if (
auto *
I = dyn_cast<Instruction>(V))
7616 UniqueOpcodes.
insert(
I->getOpcode());
7617 else if (Res.second)
7620 return none_of(Uniques, [&](
const auto &
P) {
7621 return P.first->hasNUsesOrMore(
P.second + 1) &&
7623 return isVectorized(U) || Uniques.contains(U);
7632 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7633 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
7634 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7637BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7639 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7642 "Expected instructions with same/alternate opcodes only.");
7644 unsigned ShuffleOrOp =
7645 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7647 switch (ShuffleOrOp) {
7648 case Instruction::PHI: {
7651 return TreeEntry::NeedToGather;
7653 for (
Value *V : VL) {
7654 auto *
PHI = dyn_cast<PHINode>(V);
7659 if (Term &&
Term->isTerminator()) {
7661 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7662 return TreeEntry::NeedToGather;
7667 return TreeEntry::Vectorize;
7669 case Instruction::ExtractValue:
7670 case Instruction::ExtractElement: {
7671 bool Reuse = canReuseExtract(VL, CurrentOrder);
7675 return TreeEntry::NeedToGather;
7676 if (Reuse || !CurrentOrder.empty())
7677 return TreeEntry::Vectorize;
7679 return TreeEntry::NeedToGather;
7681 case Instruction::InsertElement: {
7685 for (
Value *V : VL) {
7686 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7688 "Non-constant or undef index?");
7692 return !SourceVectors.contains(V);
7695 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7696 "different source vectors.\n");
7697 return TreeEntry::NeedToGather;
7702 return SourceVectors.contains(V) && !
V->hasOneUse();
7705 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7706 "multiple uses.\n");
7707 return TreeEntry::NeedToGather;
7710 return TreeEntry::Vectorize;
7712 case Instruction::Load: {
7721 return TreeEntry::Vectorize;
7723 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7725 LoadEntriesToVectorize.insert(VectorizableTree.size());
7726 return TreeEntry::NeedToGather;
7728 return TreeEntry::ScatterVectorize;
7730 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7732 LoadEntriesToVectorize.insert(VectorizableTree.size());
7733 return TreeEntry::NeedToGather;
7735 return TreeEntry::StridedVectorize;
7739 if (
DL->getTypeSizeInBits(ScalarTy) !=
7740 DL->getTypeAllocSizeInBits(ScalarTy))
7741 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7743 auto *LI = dyn_cast<LoadInst>(V);
7744 return !LI || !LI->isSimple();
7751 return TreeEntry::NeedToGather;
7755 case Instruction::ZExt:
7756 case Instruction::SExt:
7757 case Instruction::FPToUI:
7758 case Instruction::FPToSI:
7759 case Instruction::FPExt:
7760 case Instruction::PtrToInt:
7761 case Instruction::IntToPtr:
7762 case Instruction::SIToFP:
7763 case Instruction::UIToFP:
7764 case Instruction::Trunc:
7765 case Instruction::FPTrunc:
7766 case Instruction::BitCast: {
7768 for (
Value *V : VL) {
7769 if (isa<PoisonValue>(V))
7771 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7774 dbgs() <<
"SLP: Gathering casts with different src types.\n");
7775 return TreeEntry::NeedToGather;
7778 return TreeEntry::Vectorize;
7780 case Instruction::ICmp:
7781 case Instruction::FCmp: {
7786 for (
Value *V : VL) {
7787 if (isa<PoisonValue>(V))
7789 auto *
Cmp = cast<CmpInst>(V);
7790 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
7791 Cmp->getOperand(0)->getType() != ComparedTy) {
7792 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
7793 return TreeEntry::NeedToGather;
7796 return TreeEntry::Vectorize;
7798 case Instruction::Select:
7799 case Instruction::FNeg:
7800 case Instruction::Add:
7801 case Instruction::FAdd:
7802 case Instruction::Sub:
7803 case Instruction::FSub:
7804 case Instruction::Mul:
7805 case Instruction::FMul:
7806 case Instruction::UDiv:
7807 case Instruction::SDiv:
7808 case Instruction::FDiv:
7809 case Instruction::URem:
7810 case Instruction::SRem:
7811 case Instruction::FRem:
7812 case Instruction::Shl:
7813 case Instruction::LShr:
7814 case Instruction::AShr:
7815 case Instruction::And:
7816 case Instruction::Or:
7817 case Instruction::Xor:
7818 case Instruction::Freeze:
7819 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7821 auto *
I = dyn_cast<Instruction>(V);
7822 return I &&
I->isBinaryOp() && !
I->isFast();
7824 return TreeEntry::NeedToGather;
7825 return TreeEntry::Vectorize;
7826 case Instruction::GetElementPtr: {
7828 for (
Value *V : VL) {
7829 auto *
I = dyn_cast<GetElementPtrInst>(V);
7832 if (
I->getNumOperands() != 2) {
7833 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7834 return TreeEntry::NeedToGather;
7840 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7841 for (
Value *V : VL) {
7842 auto *
GEP = dyn_cast<GEPOperator>(V);
7845 Type *CurTy =
GEP->getSourceElementType();
7847 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7848 return TreeEntry::NeedToGather;
7854 for (
Value *V : VL) {
7855 auto *
I = dyn_cast<GetElementPtrInst>(V);
7858 auto *
Op =
I->getOperand(1);
7859 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7860 (
Op->getType() != Ty1 &&
7861 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7862 Op->getType()->getScalarSizeInBits() >
7863 DL->getIndexSizeInBits(
7864 V->getType()->getPointerAddressSpace())))) {
7866 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7867 return TreeEntry::NeedToGather;
7871 return TreeEntry::Vectorize;
7873 case Instruction::Store: {
7875 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7878 if (
DL->getTypeSizeInBits(ScalarTy) !=
7879 DL->getTypeAllocSizeInBits(ScalarTy)) {
7880 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
7881 return TreeEntry::NeedToGather;
7885 for (
Value *V : VL) {
7886 auto *
SI = cast<StoreInst>(V);
7887 if (!
SI->isSimple()) {
7889 return TreeEntry::NeedToGather;
7898 if (CurrentOrder.empty()) {
7899 Ptr0 = PointerOps.
front();
7900 PtrN = PointerOps.
back();
7902 Ptr0 = PointerOps[CurrentOrder.front()];
7903 PtrN = PointerOps[CurrentOrder.back()];
7905 std::optional<int> Dist =
7908 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
7909 return TreeEntry::Vectorize;
7913 return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and these need to be the same,
      // in the whole bundle.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
namespace {
/// Allows to correctly handle operands of the phi nodes based on the \p Main
/// phi node block and its incoming blocks.
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues()))
      Blocks.try_emplace(Main->getIncomingBlock(I))
          .first->second.push_back(I);
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    // Duplicate predecessors must carry identical operand vectors; copy the
    // vector built for the first occurrence into the duplicates.
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndices.emplace_back(
            isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    bool IsFullVectors = hasFullVectorsOrPowerOf2(
        *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
    if (NumUniqueScalarValues == VL.size() &&
        (VectorizeNonPowerOf2 || IsFullVectors)) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not supported yet for nodes with
      // padding.
      if ((UserTreeIdx.UserTE &&
           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
          !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
             return isa<UndefValue>(V) || !isConstant(V);
           }))) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
          // Find the number of elements, which forms full vectors.
          unsigned PWSz = getFullVectorNumberOfElements(
              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            // Pad the unique values with poison up to a full vector.
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(
                PWSz - UniqueValues.size(),
                PoisonValue::get(UniqueValues.front()->getType()));
            // Check that extended with poisons operations are still valid
            // for vectorization (div/rem are not valid).
            if (!getSameOpcode(NonUniqueValueVL, *TLI)) {
              LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
              newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
              return false;
            }
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);
  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch; there is no place to
  // insert a shuffle if we need one.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        // Record the reuse of the tree node.
        E->UserTreeIndices.push_back(UserTreeIdx);
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return;
      }
      SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end());
      if (all_of(VL, [&](Value *V) {
            return isa<PoisonValue>(V) || Values.contains(V);
          })) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }
  // Check depth, but allow deeper trees for a bundle of loads (or extended
  // loads) that may later form a single wide load.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }
  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors (unless REVEC is enabled).
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }
  // If all of the operands are identical or constant we have a simple solution.
  // If we deal with insert/extract instructions, they all must have constant
  // indices, otherwise we should gather them, not try to vectorize.
  // If alternate op node with 2 elements with gathered operands - do not
  // vectorize.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S && allSameBlock(VL);
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                       SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }
  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }
  // Check that none of the instructions in the bundle are already in the tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (isVectorized(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }
  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }
  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Don't go into unreachable blocks. They may contain instructions with
  // dependency cycles which confuse the final scheduling.
  Instruction *VL0 = S.getMainOp();
  BB = VL0->getParent();
  if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
      !DT->isReachableFromEntry(BB)) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }
  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;

  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);

    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    // Keeps the reordered operands to avoid code duplication.
    PHIHandler Handler(*DT, PH, VL);
    Handler.buildOperands();
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      TE->setOperand(I, Handler.getOperands(I));
    SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      Operands[I] = Handler.getOperands(I);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTree_rec() towards the operands.
    TE->setOperand(*this);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8 bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load intrinsic.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    TE->setOperand(*this);
    if (State == TreeEntry::ScatterVectorize)
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperand(*this);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
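  // E.g., for {sitofp i32 %x to float} where %x is known to have at least
  // 16 sign bits, the operand node is recorded in ExtraBitWidthNodes so the
  // minimal-bitwidth analysis can later try narrowing the conversion source
  // to i16.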
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    ValueList Left, Right;
    VLOperands Ops(VL, S, *this);
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Left = Ops.getVL(0);
      Right = Ops.getVL(1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (Cmp->getPredicate() != P0)
          std::swap(LHS, RHS);
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
    }
    TE->setOperand(0, Left);
    TE->setOperand(1, Right);
    buildTree_rec(Left, Depth + 1, {TE, 0});
    buildTree_rec(Right, Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    SmallVector<ValueList, 2> Operands(2);
    // Prepare the operand vector for pointer operands.
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      if (!GEP) {
        Operands.front().push_back(V);
        continue;
      }
      Operands.front().push_back(GEP->getPointerOperand());
    }
    // Need to cast all indices to the same type before vectorization to
    // avoid crash.
    const int IndexIdx = 1;
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
    Type *Ty = all_of(VL,
                      [VL0Ty, IndexIdx](Value *V) {
                        auto *GEP = dyn_cast<GetElementPtrInst>(V);
                        if (!GEP)
                          return true;
                        return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                      })
                   ? VL0Ty
                   : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                          ->getPointerOperandType()
                                          ->getScalarType());
    // Prepare the operand vector.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I) {
        Operands.back().push_back(
            ConstantInt::get(Ty, 0, /*isSigned=*/false));
        continue;
      }
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
      if (!CI)
        Operands.back().push_back(Op);
      else
        Operands.back().push_back(ConstantFoldIntegerCast(
            CI, Ty, CI->getValue().isSignBitSet(), *DL));
    }
    TE->setOperand(IndexIdx, Operands.back());
    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    TE->setOperand(*this, isCommutative(VL0));
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
        continue;
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (CI && any_of(VL, [](Value *V) {
          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      ValueList Left, Right;
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        }
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }

    TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
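// E.g., extracts {x[1], x[0], x[3], x[2]} from a 4-element vector %x cover
// every lane exactly once, so CurrentOrder becomes {1, 0, 3, 2} and the
// extract sequence can be replaced by a single shuffle of %x.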
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the vector intrinsic call.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  // If a vector library call is available for this shape, also calculate its
  // cost so the caller can pick the cheaper of the two.
  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  auto LibCost = IntrinsicCost;
  if (!CI->isNoBuiltin() && VecFunc)
    LibCost = TTI->getCallInstrCost(nullptr, VecTy, ArgTys,
                                    TTI::TCK_RecipThroughput);
  return {IntrinsicCost, LibCost};
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    if (isCmpSameOrSwapped(MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
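// E.g., for an add/sub alternate node {a+b, c-d, e+f, g-h} with Sz == 4,
// buildAltOpShuffleMask produces {0, 4+1, 2, 4+3} = {0, 5, 2, 7}: even lanes
// come from the vector add, odd lanes from the vector sub.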
static TTI::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops) {
  // If all operands are exactly the same ConstantInt, set the operand kind to
  // OK_UniformConstantValue. If not all operands are constants, use
  // OK_AnyValue. If all operands are constants but not the same, use
  // OK_NonUniformConstantValue.
  const auto *Op0 = Ops.front();
  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here.
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
/// The base class for shuffle instruction emission and shuffle cost
/// estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size if the ratio of the masks sizes is not a power of 2.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
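  // E.g., combining an inner Mask {1, 0, 3, 2} with an outer ExtMask
  // {2, 3, 0, 1} yields {Mask[2], Mask[3], Mask[0], Mask[1]} = {3, 2, 1, 0}:
  // applying ExtMask after Mask is equivalent to a single shuffle with the
  // combined mask.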
  /// Looks through shuffles trying to reduce final number of shuffles in the
  /// code. Walks through emitted shuffle instructions and properly marks
  /// indices in the mask as undef.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so later we did not lose
          // this info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this
      // Op and Mask will be used in the final shuffle.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(0),
                            buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(1),
                            buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
                            .all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elements.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
      Mask = NewMask;
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                   V2, buildUseMask(VF, Mask, UseMask::SecondArg))
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    SmallVector<int> FinalMask(Mask);
    bool IsIdentity = peekThroughShuffles(V1, FinalMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, FinalMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Ptrs are the arguments of loads or stores being vectorized as plain
    // wide loads/stores. Scalar cost is estimated as a set of pointers with
    // known relationship between them.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it's not a
      // GEP instruction or has uses outside of these scalar loads/stores.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have any
      // savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Ptrs represent a regular vectorization tree node (pointer arguments of
    // scattered loads); all scalar GEPs will be removed as a result of
    // vectorization.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  MapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it small (just 2 elements), all-constant or all
  // instructions have same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2) {
          hash_code SubKey =
              hash_value(LIt->second.back()->getPointerOperand());
          return SubKey;
        }
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra vectorized
  // nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        !isDeleted(Inst) && !isVectorized(V)) {
      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
                                             /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider it as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                             Idx, getWidenedType(ScalarTy, Sz));
  }
  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    // If ScalarTy is FixedVectorType, insert whole subvectors instead of
    // single elements.
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
      if (DemandedElts[I])
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, I * ScalarTyNumElements, FTy);
  } else {
    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                          /*Extract=*/false, CostKind);
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
  ReorderMask.assign(Sz, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
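// For illustration: a gather of {load a, const 0, load b, const 1} can be
// reordered to {load a, load b, const 0, const 1}; the two loads then form a
// subvector that may be vectorized on its own while the constants are
// inserted separately. The cost check above keeps the reordering only if it
// beats building the original order directly.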
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant, or
  // 2. Splats, or
  // 3. Results of instructions with constant operands, or
  // 4. Results of operations that are vectorized already.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // The tree may grow here, so iterate only up to the original size.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      // Try to find vectorizable sequences and transform them into a series of
      // insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          // Reuse the existing node, if it fully matches the slice.
          if (any_of(Slice, [&](Value *V) { return isVectorized(V); }) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          // Constant already handled effectively - skip.
          if (allConstant(Slice))
            continue;
          // Do not try to vectorize small splats (less than vector register
          // and only with the single non-undef element).
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            unsigned NumRegs2VF = ::getNumberOfParts(
                *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              count(Slice, Slice.front()) ==
                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                                   : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load &&
                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are
              // vectorized. For list of reduced values - only with the single
              // register.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  TTI->getNumberOfParts(getWidenedType(
                      Slice.front()->getType(), VF)) != 1 &&
                  !all_of(Slice, [&](Value *V) {
                    if (isa<PoisonValue>(V))
                      return true;
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                LoadsState Res =
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
                // Do not vectorize gathers.
                if (Res == LoadsState::ScatterVectorize ||
                    Res == LoadsState::Gather) {
                  if (Res == LoadsState::Gather) {
                    registerNonVectorizableLoads(Slice);
                    // If reductions and the scalars from the root node are
                    // analyzed - mark as non-vectorizable reduction.
                    if (UserIgnoreList && E.Idx == 0)
                      analyzedReductionVals(Slice);
                  }
                  continue;
                }
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(reverse(Slice),
                                                         IsaPred<Instruction>)),
                              S))) {
                // Do not vectorize extractelements (handled effectively
                // already). Do not vectorize non-profitable instructions (with
                // low cost and non-vectorizable operands).
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          // If any instruction is vectorized already - do not try again.
          if (TreeEntry *SE = getSameValuesTreeEntry(Slice.front(), Slice,
                                                     /*SameVF=*/true)) {
            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
            AddCombinedNode(SE->Idx, Cnt, Sz);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore ordering, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
    }
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorizations, do not reorder.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
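  // E.g., loads of a[3], a[2], a[1], a[0] are a consecutive access in reverse
  // order; instead of a wide load of a[0..3] plus a reverse shuffle, the node
  // can become one strided load with stride -1 starting at &a[3] when the
  // target reports that form as cheaper.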
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }
  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
/// the actual shuffle cost is accounted only when actually required.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, still trying to estimate the cost for the same nodes and we
  /// can delay actual cost estimation (virtual shuffle instruction emission).
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(
              IntegerType::get(Ty->getContext(),
                               DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, std::distance(VL.begin(), It),
                                    cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, 0, PoisonValue::get(VecTy),
                                    *It) +
             ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, ShuffleMask,
                              CostKind);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };
  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallDenseSet<int, 4> RegIndices;
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost += ::getShuffleCost(TTI, *RegShuffleKind,
                                 getWidenedType(ScalarTy, EltsPerVector),
                                 SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
                                 Idx, getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Estimates the cost of permuting the shuffled nodes, merging repeated
  /// requests for the same node pair into the common mask where possible.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before,
      // no need to estimate another cost with the sub-Mask; instead include
      // this sub-Mask into the CommonMask to estimate it later and avoid
      // double cost estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // ones and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // If the entry node is reordered, apply the inverse reordering to the
      // common mask.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          any_of(CommonMask,
                 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
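  // Note: the cost variant never emits real IR. V1/V2 are replaced above by
  // null/all-ones placeholder constants of the common width so the shared
  // BaseShuffleAnalysis::createShuffle logic can be reused purely for cost
  // modeling.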
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // extractelement as dead and remove its cost from the final cost of
        // the vectorized tree.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(),
                EE->getVectorOperandType(), Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that the gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling. May come only for
  /// shuffling of 2 vectors with extractelements, already handled in
  /// adjustExtracts.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF,
          cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
              ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        unsigned VecTyNumElements = VecTy->getNumElements();
        SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
        for (auto [I, V] : enumerate(Vals)) {
          Type *ScalarTy = V->getType()->getScalarType();
          Constant *NewVal;
          if (isa<PoisonValue>(V))
            NewVal = PoisonValue::get(ScalarTy);
          else if (isa<UndefValue>(V))
            NewVal = UndefValue::get(ScalarTy);
          else
            NewVal = Constant::getNullValue(ScalarTy);
          std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
                      NewVal);
        }
        Vals.swap(NewVals);
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
        ArgTys.push_back(Arg->getType());
        continue;
      }
      if (MinBW > 0) {
        ArgTys.push_back(
            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
        continue;
      }
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}

InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  if (E->isGather()) {
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
                                     !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) && is_contained(
            getTreeEntries(UniqueValues[I]), E))
      continue;
    UsedScalars.set(I);
  }
11148 auto GetCastContextHint = [&](
Value *
V) {
11150 return getCastContextHint(*OpTEs.front());
11151 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
11152 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11153 !SrcState.isAltShuffle())
11162 if (isa<CastInst, CallInst>(VL0)) {
11166 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11168 for (
unsigned I = 0;
I < Sz; ++
I) {
11169 if (UsedScalars.test(
I))
11171 ScalarCost += ScalarEltCost(
I);
11180 (E->getOpcode() != Instruction::Load ||
11181 !E->UserTreeIndices.empty())) {
11182 const EdgeInfo &EI =
11183 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
11184 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11186 if (EI.UserTE->getOpcode() != Instruction::Select ||
11188 auto UserBWIt = MinBWs.
find(EI.UserTE);
11189 Type *UserScalarTy =
11190 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11191 if (UserBWIt != MinBWs.
end())
11193 UserBWIt->second.first);
11194 if (ScalarTy != UserScalarTy) {
11195 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11196 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
11197 unsigned VecOpcode;
11198 auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
11199 if (BWSz > SrcBWSz)
11200 VecOpcode = Instruction::Trunc;
11203 It->second.second ? Instruction::SExt : Instruction::ZExt;
11210 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11211 ScalarCost,
"Calculated costs for Tree"));
11212 return VecCost - ScalarCost;
11217 assert((E->State == TreeEntry::Vectorize ||
11218 E->State == TreeEntry::StridedVectorize) &&
11219 "Entry state expected to be Vectorize or StridedVectorize here.");
11223 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11224 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11225 "Calculated GEPs cost for Tree"));
11227 return VecCost - ScalarCost;
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead and
    // their cost can be subtracted.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
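  // Per-opcode cost computation. Each case below builds a GetScalarCost /
  // GetVectorCost pair and returns their difference through GetCostDiff, so a
  // negative result means vectorization of this node is profitable.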
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
        if (const TreeEntry *OpTE = /* ... */)
          if (CountedOps.insert(OpTE).second &&
              !OpTE->ReuseShuffleIndices.empty())
            ScalarCost +=
                TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                  OpTE->Scalars.size());
      }
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        // ...
      }
      if (I->hasOneUse()) {
        // ...
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // ...
              Ext->getOpcode(), Ext->getType(), I->getType(),
          // ...
        }
      }
      // ...
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    // ...
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    // ...
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        /* ... */,
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
                                  InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (/* ... */ &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      // ...
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // A bitcast introduced only by minimum-bitwidth demotion is a noop.
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 /* ... */},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // ...
    if (match(VL0, MatchCmp))
      // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if ((/* ... */ && !match(VI, MatchCmp)) ||
          /* ... */)
        // ...
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      // ...
        ScalarCost = IntrinsicCost;
      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred,
          CostKind, getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        // ...
        unsigned CondNumElements = CondType->getNumElements();
        // ...
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // A masked And is free if the mask constant has all ones in the
      // demanded (minimum-bitwidth) bits.
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          if (all_of(E->getOperand(I), [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    // A masked gather load is not a terminal node; its address operand cost is
    // estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return GetCostDiff(GetScalarCost, GetVectorCost);
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, std::nullopt,
              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // ...
          It != MinBWs.end() ? It->second.first : 0, TTI);
      // ...
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // ...
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        // ...
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      // ...
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost CommonCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // Same main/alternate vector ops are reused; only shuffling differs,
        // so no new vector cost is added here.
      } else if (/* ... */) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        TTI::CastContextHint::None, CostKind);
          LLVM_DEBUG({
            dbgs() << "SLP: alternate extension, which should be truncated.\n";
            E->dump();
          });
          return VecCost;
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      // ...
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            // ...
          },
          Mask);
      // ...
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // ...
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost CommonCost) -> InstructionCost {
            // If the group uses the mask in order, the shufflevector is
            // eliminated by instcombine; the cost is then just CommonCost.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              // ...
                assert(isa<ShuffleVectorInst>(V) &&
                       "Not supported shufflevector usage.");
                auto *SV = cast<ShuffleVectorInst>(V);
                int Index;
                [[maybe_unused]] bool IsExtractSubvectorMask =
                    SV->isExtractSubvectorMask(Index);
                assert(IsExtractSubvectorMask &&
                       "Not supported shufflevector usage.");
                if (NextIndex != Index)
                  return CommonCost;
                NextIndex += SV->getShuffleMask().size();
              // ...
            }
            return ::getShuffleCost(
                /* ... */);
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
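/// Small-tree heuristic: a one- or two-node tree is only worth vectorizing if
/// the non-gather node is a real vectorizable operation and the gather side
/// is cheap (extractelements, loads, or fewer scalars than the given limit).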
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable .\n");
  // Only trees of heights 1 and 2 are handled.
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/* ... */ ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             /* ... */) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;
  if (VectorizableTree.size() != 2)
    return false;
  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it is not a load.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;
  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;
  return true;
}
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value, following operand 0 of any
  // 'or' and peeking through optional shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      /* ... */)
    return false;
  // Require that the total load bit width is a legal integer type.
  Type *SrcTy = Load->getType();
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}
  // ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
  // ...
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /* ... */)))
    return true;
  // ...
  constexpr int Limit = 4;
  if (/* ... */
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;
  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // ...
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /* ... */);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      /* ... */ &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      /* ... */
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
      /* ... */)
    // ...

  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      // ...
      any_of(VectorizableTree,
             [](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather() && TE->hasState() &&
                      TE->getOpcode() == Instruction::Load &&
                      /* ... */;
             }))
    return true;
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
    if (E.hasState() && E.getOpcode() != Instruction::Load)
      return false;
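/// Spill-cost estimation: walk the vectorized scalar instructions bottom-up
/// in dominance (DFS) order, track which tree values are live across each
/// region, and charge a spill/reload cost for every live value that crosses a
/// call that will not be lowered to a cheap intrinsic.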
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  // ...
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    // ...
  }
  // ...
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    // ...
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (/* ... */)
        LiveValues.insert(cast<Instruction>(&*J));
    }
    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });
    // Count the calls between PrevInst and Inst.
    unsigned NumCalls = 0;
    // ...
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }
      auto NoCallIntrinsic = [this](Instruction *I) {
        auto *II = dyn_cast<IntrinsicInst>(I);
        if (!II)
          return false;
        if (II->isAssumeLikeIntrinsic())
          return true;
        // ...
        for (auto &ArgOp : II->args())
          Tys.push_back(ArgOp->getType());
        if (auto *FPMO = dyn_cast<FPMathOperator>(II))
          FMF = FPMO->getFastMathFlags();
        // ...
        return IntrCost < CallCost;
      };
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;
      ++PrevInstIt;
    }
    if (NumCalls) {
      SmallVector<Type *, 4> V;
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        V.push_back(getWidenedType(ScalarTy, BundleWidth));
      }
      // ...
    }
    PrevInst = Inst;
  }
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        /* ... */)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        /* ... */)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
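/// The ValueSelect helper below lets performExtractsShuffleAction run the
/// same template logic over either real IR values (codegen) or placeholder
/// cost values (cost modeling): it forwards Value pointers and
/// default-constructs anything else.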
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector IsBaseUndef =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef: combine it with the first shuffled vector.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    // ...
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      // ...
      Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only one vector is shuffled: act on the single
    // vector, unless the mask is the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    // ...
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least two vectors are shuffled: combine them
    // pairwise, step by step.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes: resize, then reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffles, if any, are performed on the resized vectors.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    // ...
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      // ...
             "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      // ...
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // ...
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.isGather() && TE.hasState()) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs() << /* ... */
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(dbgs() << /* ... */
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    // Uses by ephemeral values are free: the ephemeral value is removed prior
    // to code generation, and so is the extraction.
    if (EphValues.count(EU.User))
      continue;
    // ...
    const BasicBlock *UserParent =
        EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
    if (UserParent &&
        isa_and_present<UnreachableInst>(UserParent->getTerminator()))
      continue;
    // No extract cost for a vector "scalar" that is part of an insert
    // subvector.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // ...
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;
    // If the found user is an insertelement, do not calculate the extract
    // cost; try to detect it as a final shuffled/identity match instead.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        /* ... */) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // ...
        const TreeEntry *ScalarTE = &EU.E;
        auto *It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // ...
              Value *Op0 = II->getOperand(0);
              // ...
            });
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          // ...
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            // ...
                    getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                   FTy->getNumElements()),
            // ...
            LLVM_DEBUG(dbgs() << /* ... */
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            // ...
          }
        } else {
          // ...
          It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        // ...
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    // ...
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
                            ? Instruction::ZExt
                            : Instruction::SExt;
      // ...
    } else {
      // ...
          EU.Lane, EU.Scalar, ScalarUserAndIdx);
    }
12619 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12620 Entry->getOpcode() == Instruction::Load) {
12622 auto IsPhiInLoop = [&](
const ExternalUser &U) {
12623 if (
auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12624 auto *
I = cast<Instruction>(U.Scalar);
12625 const Loop *L = LI->getLoopFor(Phi->getParent());
12626 return L && (Phi->getParent() ==
I->getParent() ||
12627 L == LI->getLoopFor(
I->getParent()));
12631 if (!ValueToExtUses) {
12632 ValueToExtUses.emplace();
12635 if (IsPhiInLoop(
P.value()))
12638 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
12643 auto *Inst = cast<Instruction>(EU.Scalar);
12645 auto OperandIsScalar = [&](
Value *V) {
12650 if (
auto *EE = dyn_cast<ExtractElementInst>(V))
12651 return !EE->hasOneUse() || !MustGather.contains(EE);
12654 return ValueToExtUses->contains(V);
12656 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
12657 bool CanBeUsedAsScalarCast =
false;
12658 if (
auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12659 if (
auto *
Op = dyn_cast<Instruction>(CI->
getOperand(0));
12660 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
12665 if (ScalarCost + OpCost <= ExtraCost) {
12666 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
12667 ScalarCost += OpCost;
12671 if (CanBeUsedAsScalar) {
12672 bool KeepScalar = ScalarCost <= ExtraCost;
12676 bool IsProfitablePHIUser =
12678 VectorizableTree.front()->Scalars.size() > 2)) &&
12679 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12683 auto *PHIUser = dyn_cast<PHINode>(U);
12684 return (!PHIUser ||
12685 PHIUser->getParent() !=
12687 VectorizableTree.front()->getMainOp())
12692 return ValueToExtUses->contains(V);
12694 if (IsProfitablePHIUser) {
12698 (!GatheredLoadsEntriesFirst.has_value() ||
12699 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12700 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
12701 return ValueToExtUses->contains(V);
12703 auto It = ExtractsCount.
find(Entry);
12704 if (It != ExtractsCount.
end()) {
12705 assert(ScalarUsesCount >= It->getSecond().size() &&
12706 "Expected total number of external uses not less than "
12707 "number of scalar uses.");
12708 ScalarUsesCount -= It->getSecond().size();
12713 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
12716 ExternalUsesAsOriginalScalar.
insert(EU.Scalar);
12718 auto It = ValueToExtUses->find(V);
12719 if (It != ValueToExtUses->end()) {
12721 ExternalUses[It->second].User = nullptr;
12724 ExtraCost = ScalarCost;
12725 if (!IsPhiInLoop(EU))
12726 ExtractsCount[Entry].
insert(Inst);
12727 if (CanBeUsedAsScalarCast) {
12728 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
12731 if (
auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12733 auto It = ValueToExtUses->find(V);
12734 if (It != ValueToExtUses->end()) {
12736 ExternalUses[It->second].User = nullptr;
12745 ExtractCost += ExtraCost;
12749 for (
Value *V : ScalarOpsFromCasts) {
12750 ExternalUsesAsOriginalScalar.
insert(V);
12752 ExternalUses.emplace_back(V,
nullptr, *TEs.front(),
12753 TEs.front()->findLaneForValue(V));
  // Add the cost of resizing the reduced value, if it was demoted.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      // ...
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          // ...
        }
        // ...
      }
    }
  }
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         /* ... */)) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      // ...
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n");
      // ...
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        unsigned VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          // ...
          LLVM_DEBUG(dbgs() << /* ... */
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          // ...
        }
      } else {
        unsigned VF = 0;
        if (/* ... */
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG(dbgs() << /* ... */
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        // ...
      }
      // ...
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost of resizing the reduced value, if required.
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   /* ... */},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled via the extended cost.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        // ...
        LLVM_DEBUG(dbgs() << /* ... */
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }
12925 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
12926 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
12927 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
12931 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
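/// Gathers built from extractelements: try to serve a single-register gather
/// with a shuffle of at most two source vectors, preferring the one or two
/// vector operands that cover the most lanes; extracts from undef vectors are
/// folded in for free.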
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    // ...
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check whether a shuffle of two vectors or of a single vector is better.
  // ...
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a shuffle
  // of the one/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // Restore the original VL if the attempt was not successful.
    // ...
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some extractelements were not
  // selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        /* ... */)
      continue;
    // ...
  }
  return Res;
}
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
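/// Checks whether a gather node can instead be materialized as a shuffle of
/// one or two already-vectorized tree entries. Candidate entries must
/// dominate the gather's insertion point (checked via dominator-tree DFS
/// numbers and instruction order within a block).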
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // ...
  const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
                                : TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of a PHI entry keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  // ...
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // ...
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather node users.
    if (TEInsertPt->getParent() != InsertBlock &&
        /* ... */)
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        /* ... */)
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values.
  for (Value *V : VL) {
    // ...
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If two gathers are operands of the same entry, compare operand
        // indices and use the earlier one as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction appears in different vectorized nodes, make
        // the choice depend on the node index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }
      // Check whether the user node of TE comes after the user node of TEPtr;
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // ...
    }
    // ...
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      // ...
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration: just insert the list of nodes.
      // ...
    } else {
      // ...
      if (!VToTEs.empty()) {
        // ...
        VToTEs = SavedVToTEs;
        // ...
      }
      // ...
      if (UsedTEs.size() == 2)
        continue;
      UsedTEs.push_back(SavedVToTEs);
      // ...
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find a perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        // ...
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match; just shuffle, using the first tree node.
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // ...
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      // ...
    }
    // Keep the order of tree nodes to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        // ...
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // This pair of tree entries may be used for a single scalar only;
    // otherwise it is a permutation of two different vectors with different
    // vector factors.
    if (Entries.empty()) {
      Entries.push_back(*min_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks whether two PHIs have a high chance of being vectorized together:
  // all incoming values must be compatible (constants, or same/alternate
  // opcodes from the same basic block).
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      // ...
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
      if (cast<Instruction>(In)->getParent() !=
          /* ... */)
        return false;
    }
    return true;
  };
  // A value may be ignored during analysis if it does not form a splat, is
  // not vectorized or an extractelement, and may form a vector node later.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && /* ... */ &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // The neighbor may form a full vector node with V if it has a
  // same/alternate opcode in the same basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    // ...
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           /* ... */
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build the shuffle mask for better cost estimation and vector emission.
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not shuffle scalars that are constants or may be vectorized later as
    // part of a full vector node.
    if (/* ... */ ||
        (MightBeIgnored(V) &&
         ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
          (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    // ...
  }
  // Select the entries usable for the final shuffle.
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar: first entry gets 0, the
    // second gets 1 (at most two nodes are selected). These indices serve as
    // vector offsets in the final shuffle mask.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    // ...
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // The number of scalars equals the number of entries, and VL differs from
    // TE->Scalars, so shuffles already exist; cut off the unprofitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask and check for an identity shuffle.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset into the vector; Pair.second is the scalar's
  // index in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if the shuffle beats a buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), /* ... */((MaxElement % VF) -
                             (MinElement % VF) + 1));
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
      // ...
    }
    // ...
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&, /* ... */](ArrayRef<int> Mask,
                                         ArrayRef<const TreeEntry *> Entries,
                                         VectorType *VecTy) {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          /* ... */(Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(*TTI,
                              Entries.size() == 1 ? TTI::SK_PermuteSingleSrc
                                                  : TTI::SK_PermuteTwoSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    // ...
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      // ...
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      // ...
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: gathering for non-power-of-2 (non-whole-register) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert(/* ... */
           (TE->hasState() &&
            TE->getOpcode() == Instruction::ExtractElement) ||
           /* ... */
           "Expected splat or extractelements only node.");
    return {};
  }
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    // ...
    if (SubEntries.size() == 1 && /* ... */ &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      // ...
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      // ...
      return /* ... */;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
                                       Type *ScalarTy) const {
  // ...
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector; duplicated
  // elements are counted as shuffle candidates.
  // ...
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      // ...
    }
    // ...
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      // ...
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    // ...
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "Only supported by REVEC.");
      // With REVEC, whole subvectors are inserted instead of single elements.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += TTI->getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      // ...
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
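/// The insertion point for a bundle is the last (by dominance/program order)
/// scalar instruction of the entry; for scheduled bundles the last
/// instruction of the scheduling bundle is taken instead, and gathered loads
/// use the first load of the bundle.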
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // must be in this block, except for extractelement-like instructions with
  // constant indices and gathered loads.
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /* ... */ ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /* ... */) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    return *Res;
  }

  // ...
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars,
              [](Value *V) {
                return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
              })) ||
      all_of(E->Scalars,
             [](Value *V) {
               return isa<PoisonValue>(V) ||
                      (!isVectorLikeInstWithConstOps(V) &&
                       isUsedOutsideBlock(V));
             }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();
  // ...

  // Find the last instruction. The common case is that BB has been scheduled
  // and the last instruction is the tail of the bundle's schedule data.
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    // ...
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }

  // ...
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // ...
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || /* ... */) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
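/// Emits an insertelement sequence for the gathered scalars. Inserts whose
/// source instruction lives in the current block's predecessors or inside a
/// loop containing the insert point are postponed and emitted after the main
/// sequence, so loop-invariant inserts can still be hoisted.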
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // Instructions/lanes from the current block and/or blocks that are part of
  // the current loop body.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  // ...
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           /* ... */ ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        // ...
        if (auto *IOp = dyn_cast<Instruction>(Op);
            /* ... */)
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, /* ... */);
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // ...
      auto *II = dyn_cast<IntrinsicInst>(Vec);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    // Add to the 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        // ...
        if (auto *SI = dyn_cast<Instruction>(Scalar))
          UserOp = SI;
        // ...
        unsigned FoundLane = Entries.front()->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
      }
    }
    return Vec;
  };

  // ...
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    // ...
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions that are/may be part of the loop at the end, so
  // non-loop-based instructions can be hoisted.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
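/// IR counterpart of the cost estimator above: ShuffleIRBuilder creates the
/// actual shufflevector/cast instructions, resizing mismatched operands and
/// recording every emitted instruction for later CSE.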
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  IRBuilderBase &Builder;
  BoUpSLP &R;
  /// Input vectors accumulated for the current shuffle.
  SmallVector<Value *, 2> InVectors;
  /// Common mask over the accumulated input vectors.
  SmallVector<int> CommonMask;
  /// True if the last shuffle was finalized.
  bool IsFinalized = false;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates permutation of the single vector operand with the given mask,
    /// if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match, if they are not equal yet. The
    /// smaller vector is widened to the size of the larger vector.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, ShuffleBuilder, ScalarTy);
  }
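  // Illustrative sketch (not part of the pass, kept under #if 0, hypothetical
  // names): widening a shorter vector to a larger element count with an
  // identity-prefix mask, as resizeToMatch does above. Poison lanes are
  // modeled as -1.
#if 0
#include <cstdio>
#include <numeric>
#include <vector>

constexpr int PoisonMaskElem = -1;

std::vector<int> makeResizeMask(int MinVF, int VF) {
  std::vector<int> IdentityMask(VF, PoisonMaskElem);
  // Keep the first MinVF lanes, leave the tail as poison.
  std::iota(IdentityMask.begin(), IdentityMask.begin() + MinVF, 0);
  return IdentityMask;
}

int main() {
  for (int M : makeResizeMask(/*MinVF=*/2, /*VF=*/5))
    std::printf("%d ", M); // prints: 0 1 -1 -1 -1
  std::printf("\n");
  return 0;
}
#endif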
  /// Cast value \p V to the vector type with the same number of elements, but
  /// with the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If its only use is vectorized, the extractelement itself can be
      // erased.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE ==
                                                     UTEs.front();
                                            }) &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // Perform a multi-register shuffle of the vectors the extracts come from,
    // combining up to 2 bases per part.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert(Part == 0 &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }

  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(TEs,
                        [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the process
    // to keep the correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }

  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in form of a tree entry) and the mask for
  /// its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + VF
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required, if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                                     if (isa<PoisonValue>(V))
                                       return false;
                                     return !isKnownNonNegative(
                                         V, SimplifyQuery(*R.DL));
                                   }));
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return any_of(VE->UserTreeIndices,
                  [E, NodeIdx](const EdgeInfo &EI) {
                    return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                  }) ||
           any_of(VectorizableTree,
                  [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->isOperandGatherNode(
                               {const_cast<TreeEntry *>(E), NodeIdx}) &&
                           VE->isSame(TE->Scalars);
                  });
  };
  TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
  if (VE && CheckSameVE(VE))
    return VE;
  return nullptr;
}
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      // V may be affected by MinBWs, so take the element type from the
      // vectorized value, not from the node.
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // Reshuffle to get only unique values.
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Need to update the operand gather node, if the operand is actually not
    // a vectorized node but a buildvector/gather node that matches one of the
    // vectorized nodes.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  // Allows the tree/graph transformations to be more accurate; checks the
  // correctness of the transformations in many cases.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                   }) != TE->UserTreeIndices.end();
          });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(TEs.begin(), TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep the correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfectly matched entry found, vectorize directly.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars, E->Idx) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        // Full matched entry found, no need to insert subvectors.
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast; this
      // is safe if the broadcasted value is guaranteed non-poisonous, or if
      // the incoming scalar is frozen first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by poison; in the mask it is already replaced
          // by the non-poisoned scalar.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by poisons, emit broadcast and then emit freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as a shuffle of a
      // single/two vectors the scalars are extracted from. Find the input
      // vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values; GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
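// Illustrative sketch (plain C++, not part of the pass, kept under #if 0,
// hypothetical names): deduplicating gathered scalars into unique positions
// plus a reuse mask, the core idea behind TryPackScalars above. Repeated
// values are gathered once and re-fetched through the mask.
#if 0
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> Scalars = {"x", "y", "x", "x"};
  std::vector<int> ReuseMask(Scalars.size(), -1);
  std::unordered_map<std::string, int> UniquePositions;
  for (size_t I = 0; I < Scalars.size(); ++I) {
    auto [It, Inserted] =
        UniquePositions.try_emplace(Scalars[I], static_cast<int>(I));
    ReuseMask[I] = It->second;   // every copy points at the first slot
    if (!Inserted)
      Scalars[I] = "<poison>";   // duplicate slot no longer needs a value
  }
  for (int M : ReuseMask)
    std::printf("%d ", M); // prints: 0 1 0 0
  std::printf("\n");
  return 0;
}
#endif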
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
/// \returns \p Inst after propagating metadata from \p VL only for
/// instructions in \p VL.
static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      Value *V = NewPhi;

      // Adjust insertion point once all PHIs have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);
    // PHI nodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
      V = TEs.front()->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create an InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
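  // Illustrative sketch (plain C++, not part of the pass, kept under #if 0,
  // hypothetical names): choosing the vector cast opcode from the original
  // and minimized bit widths, as the MinBWs handling above does. Equal widths
  // become a bitcast, narrowing becomes a trunc, and widening picks sext/zext
  // from the recorded signedness.
#if 0
#include <cstdio>

enum class CastOp { BitCast, Trunc, SExt, ZExt };

CastOp pickCastOpcode(unsigned DstBits, unsigned SrcBits, bool SrcIsSigned) {
  if (DstBits == SrcBits)
    return CastOp::BitCast; // same width: no value change needed
  if (DstBits < SrcBits)
    return CastOp::Trunc;   // demoted operand is wider than the result
  return SrcIsSigned ? CastOp::SExt : CastOp::ZExt; // widen per signedness
}

int main() {
  std::printf("%d %d %d\n",
              static_cast<int>(pickCastOpcode(16, 16, false)),  // BitCast
              static_cast<int>(pickCastOpcode(8, 16, false)),   // Trunc
              static_cast<int>(pickCastOpcode(32, 16, true)));  // SExt
  return 0;
}
#endif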
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is a fixed vector type, we
      // need to duplicate the condition value.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        OrdersType Order;
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
                              &*Builder.GetInsertPoint());
        Value *NewStride =
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            NewStride,
            ConstantInt::get(
                StrideTy,
                (IsReverseOrder ? -1 : 1) *
                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (isa<FixedVectorType>(ScalarTy)) {
        // CreateMaskedGather expects VecTy and VecPtr to have the same size;
        // expand VecPtr if ScalarTy is itself a vector type.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
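  // Illustrative sketch (plain C++, not part of the pass, kept under #if 0,
  // hypothetical names): deriving the constant stride for a strided load from
  // the pointer distance between the first and last element, as the
  // StridedVectorize path above does; reverse order negates it.
#if 0
#include <cassert>
#include <cstdio>

long strideInBytes(long DiffInElems, int NumScalars, long ElemSize,
                   bool IsReverseOrder) {
  assert(NumScalars > 1);
  long Stride = DiffInElems / (NumScalars - 1); // elements between lanes
  return (IsReverseOrder ? -1 : 1) * Stride * ElemSize;
}

int main() {
  // 8 loads, 28 elements between the first and last pointer, 4-byte elements.
  std::printf("%ld\n", strideInBytes(28, 8, 4, false)); // prints: 16
  std::printf("%ld\n", strideInBytes(28, 8, 4, true));  // prints: -16
  return 0;
}
#endif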
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // Add the return type if the intrinsic is overloaded on it.
    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
      TysForDecl.push_back(VecTy);
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. These arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
               "Not supported shufflevector usage.");
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        RHS = vectorizeOperand(E, 1, PostponedPHIs);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
      }
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle to take the alternate operations from the vector.
      // Also gather up main and alt scalar ops to propagate IR flags to each
      // vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
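// Illustrative sketch (plain C++, not part of the pass, kept under #if 0,
// hypothetical names): blending two alternate operations with a shuffle mask,
// the pattern emitted for alt-shuffle nodes above. Lanes using the main
// opcode pick from V0; alternate lanes pick from V1 (offset by the vector
// factor).
#if 0
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::vector<int> V0(4), V1(4);
  for (int I = 0; I < 4; ++I) {
    V0[I] = A[I] + B[I]; // main opcode: add
    V1[I] = A[I] - B[I]; // alternate opcode: sub
  }
  // Even lanes take the add result, odd lanes the sub result.
  std::vector<int> Mask = {0, 1 + 4, 2, 3 + 4};
  for (int M : Mask)
    std::printf("%d ", M < 4 ? V0[M] : V1[M - 4]); // prints: 11 -18 33 -36
  std::printf("\n");
  return 0;
}
#endif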
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Cache the last instructions for the entries before scheduling may
  // invalidate them.
  EntryToLastInstruction.clear();
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHI operands to avoid cyclic-dependency issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary stub instructions with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getSameValuesTreeEntry(
            TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
                                     TE->UserTreeIndices.front().EdgeIdx));
        VecTE && VecTE->isSame(TE->Scalars))
      // Found a gather node which is absolutely the same as one of the
      // vectorized nodes. It may happen after reordering.
      continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator; adjust the insertion point so the stub's
    // replacement dominates all its uses.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (isVectorized(V)) {
          for (const TreeEntry *MNTE : getTreeEntries(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through the buildvector/gather nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(),
                                    SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (!IsSigned.value_or(false)) {
        // Final attempt: check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already replaced. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it is a cheap extract and all
            // operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // When REVEC is enabled, we need to extract a whole subvector.
            unsigned VecTyNumElements = VecTy->getNumElements();
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend to the larger type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        // This branch may also produce constants, since operand 0 might be a
        // constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in vectorized
    // instructions or is used as an extra arg. Generate an ExtractElement
    // instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(
               Scalar->users(),
               [&](llvm::User *U) {
                 if (ExternalUsesAsOriginalScalar.contains(U))
                   return true;
                 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                 return !UseEntries.empty() &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize) &&
                        any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                          return (UseEntry->State == TreeEntry::Vectorize ||
                                  UseEntry->State ==
                                      TreeEntry::StridedVectorize) &&
                                 doesInTreeUserNeedToExtract(
                                     Scalar,
                                     getRootEntryInstruction(*UseEntry), TLI,
                                     TTI);
                        });
               })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        // If the user is a PHI node, insert extracts in the predecessors.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
  }
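  // Illustrative sketch (plain C++, not part of the pass, kept under #if 0,
  // hypothetical names): caching a single extract per (scalar,
  // insertion-block) pair so repeated external uses reuse one extractelement,
  // mirroring the ScalarToEEs map logic above.
#if 0
#include <cstdio>
#include <map>
#include <string>
#include <utility>

int main() {
  // Key: (scalar id, block id). Value: name of the cached extract.
  std::map<std::pair<int, int>, std::string> ScalarToEEs;
  auto GetOrCreateExtract = [&](int Scalar, int Block) {
    auto [It, Inserted] = ScalarToEEs.try_emplace(
        {Scalar, Block}, "ext." + std::to_string(Scalar));
    std::printf("%s extract %s in block %d\n",
                Inserted ? "created" : "reused", It->second.c_str(), Block);
    return It->second;
  };
  GetOrCreateExtract(7, 0); // created
  GetOrCreateExtract(7, 0); // reused
  GetOrCreateExtract(7, 1); // created again in another block
  return 0;
}
#endif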
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and last instructions in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<int> Mask, ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      Inserts.push_back(II);
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
16817 for (
auto &TEPtr : VectorizableTree) {
16818 TreeEntry *
Entry = TEPtr.get();
16821 if (
Entry->isGather())
16824 assert(
Entry->VectorizedValue &&
"Can't find vectorizable value");
16827 for (
int Lane = 0, LE =
Entry->Scalars.size(); Lane != LE; ++Lane) {
16830 if (
Entry->getOpcode() == Instruction::GetElementPtr &&
16831 !isa<GetElementPtrInst>(Scalar))
16833 if (
auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16834 EE && IgnoredExtracts.contains(EE))
16836 if (isa<PoisonValue>(Scalar))
16845 assert((isVectorized(U) ||
16846 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16847 (isa_and_nonnull<Instruction>(U) &&
16848 isDeleted(cast<Instruction>(U)))) &&
16849 "Deleting out-of-tree value");
16853 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
16854 auto *
I = cast<Instruction>(Scalar);
16861 if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16862 V->mergeDIAssignID(RemovedInsts);
  if (UserIgnoreList) {
    // For reduction roots, completely remove the uses of the ignored
    // operations (they are replaced by the vectorized reduction).
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() &&
            !IE->UserTreeIndices.empty() &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) {
               return EI.UserTE == VectorizableTree.front().get() &&
                      EI.EdgeIdx == UINT_MAX;
             }))) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() /* ... */))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition of a poison-safe logical op
        // (select-based logical and/or).
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // ...
    }
  }
  // ...
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    // Cast the reduction root to the expected reduction bit width.
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If any operand is defined inside the loop, the instruction cannot be
    // hoisted.
    if (any_of(I->operands(), [&](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }
  // Sort blocks by domination, so a block is visited only after all blocks
  // dominating it.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Check if I2 computes the same value as I1 on every non-poison lane, i.e.
  // I2 is identical to or "less defined" than I1.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the trailing undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions. TODO: We can further optimize this scan if we
  // split the instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*(*I)->getBlock())) {
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          // ...
          V->replaceAllUsesWith(&In);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
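// buildBundle links the ScheduleData of all schedulable values in VL into one
// scheduling entity: the first member becomes FirstInBundle for every member
// and NextInBundle forms the chain.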
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to recalculate
    // all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" there are no cyclic
    // dependencies and it can be scheduled.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region could not be extended, the dependencies may be
      // inconsistent; recalculate them before giving up.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. Get rid of the existing
    // schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}
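// ScheduleData objects are allocated in fixed-size chunks to amortize
// allocation cost; a new chunk is created only when the current one is full.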
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
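// extendSchedulingRegion grows the per-block scheduling window so that it
// covers instruction V, searching up and down simultaneously from the current
// region and giving up once ScheduleRegionSizeLimit is exceeded.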
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // assume-like intrinsics so they don't count against the region size budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
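// calculateDependencies computes def-use, control and memory dependencies for
// all members of the bundle SD, pushing dependent bundles onto a worklist so
// their dependencies are (re)computed as well.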
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction that isn't safe to speculate at the beginning of the
      // block is control-dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      }

      if (RegionHasStackSave) {
        // An inalloca alloca needs to be scheduled after any preceding
        // stacksave; likewise, keep allocas from reordering above a preceding
        // stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;
            if (!isa<AllocaInst>(I))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // Keep allocas and loads/stores from moving below a stacksave or a
        // stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
        // ...
        if (/* ... */
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (/* ... */
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // ...
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }
        // ...
      }
    }
  }

  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
    // ...
  }
}
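// resetSchedule clears the scheduled state of the whole region so scheduling
// can be redone from scratch after the region has been extended.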
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
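// scheduleBlock performs the final, "real" scheduling of the block using a
// ready list ordered by original instruction position, then moves the picked
// instructions into their final order.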
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // ...
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (!SDTEs.empty() &&
                   !doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      // ...
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
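// getVectorElementSize returns the expected element width for vectorizing V,
// derived from the stores/loads feeding the expression rather than from V's
// own type, and caches the result in InstrElementSize.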
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type.
  // ...
  if (auto *I = dyn_cast<Instruction>(V)) {
    // Seed the worklist with I (elided bookkeeping).
    // ...
  }

  // Traverse the expression tree in bottom-up order looking for loads.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    // ...
    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load/extract, update Width to reflect
    // the width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise visit the operands of the instruction.
    // ...
    for (Use &U : I->operands()) {
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (Visited.insert(J).second &&
            (isa<PHINode>(I) || J->getParent() == Parent)) {
          // ...
          if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
            FirstNonBool = U.get();
        }
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  if (auto *I = dyn_cast<Instruction>(V))
    InstrElementSize[I] = Width;

  return Width;
}
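// collectValuesToDemote recursively checks whether the values in tree entry E
// can be represented in a smaller integer bit width, recording demotable
// entries in ToDemote.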
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
    }
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    // ...
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        // ...
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    // ...
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instruction bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          /* ... same register footprint after demotion ... */)
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        // ...
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // ...
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // Several vectorized uses? Check if we can truncate it, otherwise - exit.
    auto ShlChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate to a smaller lshr iff we know the bits we would
    // otherwise shift in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate to a smaller ashr iff the truncated bits are all sign
    // bits of the original value.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations (elided):
    //   ... if (Cost < BestCost) { BestCost = Cost; ... } ...
    // ...
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up demoting the values.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
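// computeMinimumValueSizes runs the MinBWs analysis over the whole tree:
// starting from the root (store/insertelement/reduction), it computes the
// minimal bit width each node can be evaluated in and records it in MinBWs.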
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx > NodeIdx;
                              })))
    return;

  // If the first value node is a sext/zext/trunc, skip it: it is only a part
  // of the final users.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (/* limited number of uses, elided */
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // ...

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type
    // rather than sign-extended: the sign bit of all the roots is known to be
    // zero.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If the sign bit is not provably zero, add one to the maximum bit
      // width to preserve it, so the root can be sign-extended back.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, make sure the smaller type still fits the
    // same number of vector registers.
    if (NumParts > 1 &&
        /* ... same number of parts after demotion ... */)
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (/* not a min/max reduction kind, elided */ true) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(
              VectorizableTree[NodeIdx]->UserTreeIndices,
              [&](const EdgeInfo &EI) {
                return (EI.UserTE->hasState() &&
                        EI.UserTE->getOpcode() == Instruction::ICmp) &&
                       any_of(EI.UserTE->Scalars, [&](Value *V) {
                         auto *IC = dyn_cast<ICmpInst>(V);
                         return IC &&
                                (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                       });
              });
    }

    // If the computed maximum bit width is not less than the width of the
    // roots' type, do nothing for this node.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // ...
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  // ...
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  // ...
  DL = &F.getDataLayout();
  // ...
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom-up SLP vectorizer to construct chains that start with store
  // instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    // ...
  }
  return Changed;
}
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  // ...
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  // ...
  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) /* ... */;
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  // ...
  if (Cost < -SLPCostThreshold) {
    // ...
    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));
    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
}
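// vectorizeStores groups the collected stores into chains at constant
// distance from each other and repeatedly tries chains of decreasing vector
// factors, tracking already-attempted (non-schedulable) slices to avoid
// rework.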
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. Mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of the store in the Stores array ref, distance of
  // the store address relative to the base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      // Build the candidate vector factors, largest first.
      SmallVector<unsigned> CandidateVFs;
      unsigned Size = MinVF;
      for (/* each power-of-2 step from MinVF to MaxVF, elided */;;) {
        unsigned VF = Size > MaxVF ? NonPowerOf2VF : Size;
        // ...
        Size *= 2;
        break; // placeholder for the elided loop structure
      }
      // ...
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return Size > (First ? P.first : P.second);
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                // Not schedulable: remember the failing slice size.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  // ...
                  break;
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // For very big VFs, check that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or if there is no need for the last
        // attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        // ...
        CandidateVFs.clear();
        if (/* ... */ true)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the maximal number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
  // SortedStores keeps, per base store, the sorted set of {index, dist} pairs
  // for stores at a constant distance from that base.
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  // Inserts store SI with index Idx into the matching set; if a store at the
  // same distance already exists, vectorize the found set first.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      unsigned ItIdx = It->first;
      int ItDist = It->second;
      StoreIndexToDistSet PrevSet;
      copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
              [&](const std::pair<unsigned, int> &Pair) {
                return Pair.first > ItIdx;
              });
      Set.second.clear();
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match to try to vectorize
      // them with this store.
      unsigned StartIdx = ItIdx + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found duplicate store (they store to the
      // same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences we already tried.
        if (VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - ItDist;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };

  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
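// collectSeedInstructions scans a basic block and buckets simple stores and
// single-index GEPs by the underlying object of their pointer operand; these
// buckets seed the store-chain and GEP vectorization attempts above.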
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type; we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal llvm type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  // ...
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF /= 2) {
    // ...
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        // ...
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First, collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
/// Model horizontal reductions.
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  // ...
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
  // ...
  /// Checks if the reduction is a min/max idiom: a select fed by a compare.
  static bool isCmpSelMinMax(Instruction *I) {
    return isa<SelectInst>(I) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  /// Checks if the instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;
    // ...
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not have to
      // rule out -0.0 here because the intrinsic semantics do not specify a
      // fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the true-value operand.
    if (isBoolLogicOp(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates a reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      // Poison-safe logical or via select, when requested.
      // ...
      break;
    }
    case RecurKind::And: {
      // Poison-safe logical and via select, when requested.
      // ...
      break;
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      // Integer min/max via intrinsic or cmp+select (elided).
      // ...
      break;
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum: {
      // FP min/max via intrinsics (elided).
      // ...
      break;
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
    // ...
  }

  /// Creates a reduction operation with the current opcode and the IR flags
  /// from \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        // Propagate the IR flags of the cmp and the select separately
        // (elided).
        // ...
        return Op;
      }
    }
    // ...
    return Op;
  }

  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max patterns based on select and compare
      // (elided setup of LHS/RHS and the matched compare operands L1/L2).
      // ...
      if (!isa<ExtractElementInst>(RHS) ||
          !L2->isIdenticalTo(cast<Instruction>(RHS)))
        return RecurKind::None;
      if (!isa<ExtractElementInst>(LHS) ||
          !L1->isIdenticalTo(cast<Instruction>(LHS)))
        return RecurKind::None;
      if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
        return RecurKind::None;
      // ...
      if (Pred == CmpInst::ICMP_SGT)
        return RecurKind::SMax;
      if (Pred == CmpInst::ICMP_SLT)
        return RecurKind::SMin;
      if (Pred == CmpInst::ICMP_UGT)
        return RecurKind::UMax;
      if (Pred == CmpInst::ICMP_ULT)
        return RecurKind::UMin;
    }
    return RecurKind::None;
  }
  /// Return the first index of an operand that is part of the reduction.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB. For a cmp+sel min/max
  /// reduction, check that both the compare and the select are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && /* ... */ true);
  }
public:
  HorizontalReduction() = default;

  /// Try to find a reduction tree rooted at \p Root.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // ...

    // Though the ultimate reduction may have multiple uses, its condition must
    // have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is a different reduction
        // opcode or has too many uses - treat it as a reduced value.
        if (!EdgeInst || /* ... */
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to reduce
    // them. Values are grouped by their value ids, loads - by loaded pointers.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            // Consecutive with an already-seen load: reuse its subkey
            // (elided).
            // ...
          }
          for (LoadInst *RLI : LIt->second) {
            // ...
          }
          if (LIt->second.size() > 2) {
            // ...
            return hash_value(LIt->second.back()->getPointerOperand());
          }
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        // ... (key/subkey generation elided)
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced-value sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by the number of same/alternate opcodes and/or
    // pointer operands.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
19815 constexpr unsigned RegMaxNumber = 4;
19816 constexpr unsigned RedValsMaxNumber = 128;
19820 if (
unsigned NumReducedVals = std::accumulate(
19821 ReducedVals.
begin(), ReducedVals.
end(), 0,
19823 if (!isGoodForReduction(Vals))
19825 return Num + Vals.size();
19827 NumReducedVals < ReductionLimit &&
19831 for (ReductionOpsType &RdxOps : ReductionOps)
19832 for (
Value *RdxOp : RdxOps)
19833 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19844 ReducedVals.
front().size());
19848 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19849 assert(isa<SelectInst>(RdxRootInst) &&
19850 "Expected min/max reduction to have select root instruction");
19851 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19852 assert(isa<Instruction>(ScalarCond) &&
19853 "Expected min/max reduction to have compare condition");
19854 return cast<Instruction>(ScalarCond);
19857 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19858 return isBoolLogicOp(cast<Instruction>(V));
19861 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19862 if (VectorizedTree) {
19865 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19866 if (AnyBoolLogicOp) {
19867 auto It = ReducedValsToOps.
find(VectorizedTree);
19868 auto It1 = ReducedValsToOps.
find(Res);
19869 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19871 (It != ReducedValsToOps.
end() &&
19873 return isBoolLogicOp(I) &&
19874 getRdxOperand(I, 0) == VectorizedTree;
19878 (It1 != ReducedValsToOps.
end() &&
19880 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19884 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19888 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19895 ReductionOps.front().size());
19896 for (ReductionOpsType &RdxOps : ReductionOps)
19897 for (
Value *RdxOp : RdxOps) {
19900 IgnoreList.insert(RdxOp);
19905 for (
Value *U : IgnoreList)
19906 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19907 RdxFMF &= FPMO->getFastMathFlags();
19908 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19913 for (
Value *V : Candidates)
19914 TrackedVals.try_emplace(V, V);
19917 Value *
V) ->
unsigned & {
19918 auto *It = MV.
find(V);
19919 assert(It != MV.
end() &&
"Unable to find given key.");
19928 bool CheckForReusedReductionOps =
false;
19933 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19935 InstructionsState S = States[
I];
19939 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19940 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19945 auto *Inst = dyn_cast<Instruction>(RdxVal);
19947 (!S || !S.isOpcodeOrAlt(Inst))) ||
19951 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19953 bool ShuffledExtracts =
false;
19955 if (S && S.getOpcode() == Instruction::ExtractElement &&
19956 !S.isAltShuffle() &&
I + 1 <
E) {
19958 for (
Value *RV : ReducedVals[
I + 1]) {
19959 Value *RdxVal = TrackedVals.at(RV);
19963 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19966 CommonCandidates.push_back(RdxVal);
19967 TrackedToOrig.try_emplace(RdxVal, RV);
19972 Candidates.
swap(CommonCandidates);
19973 ShuffledExtracts =
true;
19980 Value *OrigV = TrackedToOrig.at(Candidates.
front());
19981 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19983 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
19984 Value *OrigV = TrackedToOrig.at(VC);
19985 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19986 if (
auto *ResI = dyn_cast<Instruction>(Res))
19987 V.analyzedReductionRoot(ResI);
19989 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19993 unsigned NumReducedVals = Candidates.
size();
19994 if (NumReducedVals < ReductionLimit &&
19995 (NumReducedVals < 2 || !
isSplat(Candidates)))
20000 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20001 RdxKind != RecurKind::FMul &&
20002 RdxKind != RecurKind::FMulAdd;
20005 if (IsSupportedHorRdxIdentityOp)
20006 for (
Value *V : Candidates) {
20007 Value *OrigV = TrackedToOrig.at(V);
20008 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20020     bool SameScaleFactor = false;
20021     bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20022                             SameValuesCounter.size() != Candidates.size();
20024     if (OptReusedScalars) {
20026           (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20027            RdxKind == RecurKind::Xor) &&
20029                  [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20030                    return P.second == SameValuesCounter.front().second;
20032       Candidates.resize(SameValuesCounter.size());
20033       transform(SameValuesCounter, Candidates.begin(),
20034                 [&](const auto &P) { return TrackedVals.at(P.first); });
20035       NumReducedVals = Candidates.size();
20037       if (NumReducedVals == 1) {
20038         Value *OrigV = TrackedToOrig.at(Candidates.front());
20039         unsigned Cnt = At(SameValuesCounter, OrigV);
20041             emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20042         VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20043         VectorizedVals.try_emplace(OrigV, Cnt);
20044         ExternallyUsedValues.insert(OrigV);
20049     unsigned MaxVecRegSize = V.getMaxVecRegSize();
20050     unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20051     const unsigned MaxElts = std::clamp<unsigned>(
20053         RegMaxNumber * RedValsMaxNumber);
20055     unsigned ReduxWidth = NumReducedVals;
20056     auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20057       unsigned NumParts, NumRegs;
20058       Type *ScalarTy = Candidates.front()->getType();
20065       while (NumParts > NumRegs) {
20066         assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
20067         ReduxWidth = bit_floor(ReduxWidth - 1);
20073       if (NumParts > NumRegs / 2)
20078     ReduxWidth = GetVectorFactor(ReduxWidth);
20079     ReduxWidth = std::min(ReduxWidth, MaxElts);
20081     unsigned Start = 0;
20082     unsigned Pos = Start;
20084     unsigned PrevReduxWidth = ReduxWidth;
20085     bool CheckForReusedReductionOpsLocal = false;
20086     auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20087       bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20088       if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20091       CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20094       if (Pos < NumReducedVals - ReduxWidth + 1)
20095         return IsAnyRedOpGathered;
20098       if (ReduxWidth > 1)
20099         ReduxWidth = GetVectorFactor(ReduxWidth);
20100       return IsAnyRedOpGathered;
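    // AdjustReducedVals advances the scan position while a window of
    // ReduxWidth candidates still fits; once it no longer fits, the width
    // is narrowed through GetVectorFactor so smaller subranges can still be
    // tried before the reduction gives up.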
20102     bool AnyVectorized = false;
20104     while (Pos < NumReducedVals - ReduxWidth + 1 &&
20105            ReduxWidth >= ReductionLimit) {
20108       if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20110         CheckForReusedReductionOps = true;
20113       PrevReduxWidth = ReduxWidth;
20116       if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20119               std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20121               std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20123           V.areAnalyzedReductionVals(VL)) {
20124         (void)AdjustReducedVals(true);
20130         auto *RedValI = dyn_cast<Instruction>(RedVal);
20133         return V.isDeleted(RedValI);
20136       V.buildTree(VL, IgnoreList);
20137       if (V.isTreeTinyAndNotFullyVectorizable(true)) {
20138         if (!AdjustReducedVals())
20139           V.analyzedReductionVals(VL);
20142       if (V.isLoadCombineReductionCandidate(RdxKind)) {
20143         if (!AdjustReducedVals())
20144           V.analyzedReductionVals(VL);
20147       V.reorderTopToBottom();
20149       V.reorderBottomToTop(true);
20153                                              ExternallyUsedValues);
20157       LocalExternallyUsedValues.insert(ReductionRoot);
20158       for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20159         if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20161         for (Value *V : ReducedVals[Cnt])
20162           if (isa<Instruction>(V))
20163             LocalExternallyUsedValues.insert(TrackedVals[V]);
20165       if (!IsSupportedHorRdxIdentityOp) {
20168                "Reused values counter map is not empty");
20169         for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20170           if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20172           Value *V = Candidates[Cnt];
20173           Value *OrigV = TrackedToOrig.at(V);
20174           ++SameValuesCounter.try_emplace(OrigV).first->second;
20177       V.transformNodes();
20181       for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20182         if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20184         Value *RdxVal = Candidates[Cnt];
20185         if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20186           RdxVal = It->second;
20187         if (!Visited.insert(RdxVal).second)
20191         if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20192           LocalExternallyUsedValues.insert(RdxVal);
20195         Value *OrigV = TrackedToOrig.at(RdxVal);
20197             VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20198         if (NumOps != ReducedValsToOps.at(OrigV).size())
20199           LocalExternallyUsedValues.insert(RdxVal);
20202       if (!IsSupportedHorRdxIdentityOp)
20203         SameValuesCounter.clear();
20204       for (Value *RdxVal : VL)
20205         if (RequiredExtract.contains(RdxVal))
20206           LocalExternallyUsedValues.insert(RdxVal);
20207       V.buildExternalUses(LocalExternallyUsedValues);
20209       V.computeMinimumValueSizes();
20214           getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20217                         << " for reduction\n");
20221         V.getORE()->emit([&]() {
20223                                           ReducedValsToOps.at(VL[0]).front())
20224                  << "Vectorizing horizontal reduction is possible "
20225                  << "but not beneficial with cost " << ore::NV("Cost", Cost)
20226                  << " and threshold "
20229         if (!AdjustReducedVals()) {
20230           V.analyzedReductionVals(VL);
20231           unsigned Offset = Pos == Start ? Pos : Pos - 1;
20232           if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20235                      *TTI, VL.front()->getType(), ReduxWidth - 1);
20236                  VF >= ReductionLimit;
20238                      *TTI, VL.front()->getType(), VF - 1)) {
20240                   V.getCanonicalGraphSize() != V.getTreeSize())
20242               for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20250       LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20251                         << Cost << ". (HorRdx)\n");
20252       V.getORE()->emit([&]() {
20254                                   ReducedValsToOps.at(VL[0]).front())
20255                << "Vectorized horizontal reduction with cost "
20256                << ore::NV("Cost", Cost) << " and with tree size "
20257                << ore::NV("TreeSize", V.getTreeSize());
20264       Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20266       if (IsCmpSelMinMax)
20267         InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20270       Value *VectorizedRoot =
20271           V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20274       for (Value *RdxVal : Candidates) {
20275         Value *OrigVal = TrackedToOrig.at(RdxVal);
20276         Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20277         if (TransformedRdxVal != RdxVal)
20278           TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20287         VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20290       if (OptReusedScalars && !SameScaleFactor) {
20291         VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20292                                        SameValuesCounter, TrackedToOrig);
20295       Value *ReducedSubTree;
20296       Type *ScalarTy = VL.front()->getType();
20297       if (isa<FixedVectorType>(ScalarTy)) {
20302         for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20320               emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20323         ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20326       if (ReducedSubTree->getType() != VL.front()->getType()) {
20327         assert(ReducedSubTree->getType() != VL.front()->getType() &&
20328                "Expected different reduction type.");
20330             Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20331                                   V.isSignedMinBitwidthRootNode());
20337       if (OptReusedScalars && SameScaleFactor)
20338         ReducedSubTree = emitScaleForReusedOps(
20339             ReducedSubTree, Builder, SameValuesCounter.front().second);
20341       VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20343       for (Value *RdxVal : VL) {
20344         Value *OrigV = TrackedToOrig.at(RdxVal);
20345         if (IsSupportedHorRdxIdentityOp) {
20346           VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20349         ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20350         if (!V.isVectorized(RdxVal))
20351           RequiredExtract.insert(RdxVal);
20355       ReduxWidth = NumReducedVals - Pos;
20356       if (ReduxWidth > 1)
20357         ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20358       AnyVectorized = true;
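      // Each successful step vectorizes one window of ReduxWidth candidates,
      // reduces it to a single value, folds that value into VectorizedTree,
      // and then recomputes the width for whatever tail of candidates
      // remains after Pos.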
20360     if (OptReusedScalars && !AnyVectorized) {
20361       for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20362         Value *RdxVal = TrackedVals.at(P.first);
20363         Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20364         VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20365         VectorizedVals.try_emplace(P.first, P.second);
20370   if (VectorizedTree) {
20391       if (!AnyBoolLogicOp)
20393       if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20394                                     getRdxOperand(RedOp1, 0) == LHS ||
20397       if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20398                                     getRdxOperand(RedOp2, 0) == RHS ||
20403       if (LHS != VectorizedTree)
20414       unsigned Sz = InstVals.size();
20417       for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20420         Value *RdxVal1 = InstVals[I].second;
20421         Value *StableRdxVal1 = RdxVal1;
20422         auto It1 = TrackedVals.find(RdxVal1);
20423         if (It1 != TrackedVals.end())
20424           StableRdxVal1 = It1->second;
20425         Value *RdxVal2 = InstVals[I + 1].second;
20426         Value *StableRdxVal2 = RdxVal2;
20427         auto It2 = TrackedVals.find(RdxVal2);
20428         if (It2 != TrackedVals.end())
20429           StableRdxVal2 = It2->second;
20433         FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20435         Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20436                                    StableRdxVal2, "op.rdx", ReductionOps);
20437         ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20440         ExtraReds[Sz / 2] = InstVals.back();
20444     ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20448     for (Value *RdxVal : Candidates) {
20449       if (!Visited.insert(RdxVal).second)
20451       unsigned NumOps = VectorizedVals.lookup(RdxVal);
20458     bool InitStep = true;
20459     while (ExtraReductions.size() > 1) {
20461           FinalGen(ExtraReductions, InitStep);
20462       ExtraReductions.swap(NewReds);
20465     VectorizedTree = ExtraReductions.front().second;
20467     ReductionRoot->replaceAllUsesWith(VectorizedTree);
20476       IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20483         for (auto *U : Ignore->users()) {
20485                  "All users must be either in the reduction ops list.");
20488         if (!Ignore->use_empty()) {
20490           Ignore->replaceAllUsesWith(P);
20493     V.removeInstructionsAndOperands(RdxOps);
20495   } else if (!CheckForReusedReductionOps) {
20496     for (ReductionOpsType &RdxOps : ReductionOps)
20497       for (Value *RdxOp : RdxOps)
20498         V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20500   return VectorizedTree;
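  // On success the remaining scalar tails are pairwise-combined by FinalGen
  // until a single value is left; that value replaces all uses of the
  // original reduction root, and the now-dead scalar reduction ops are
  // erased. On failure the roots are merely marked as analyzed so the same
  // reduction is not retried.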
20510   Type *ScalarTy = ReducedVals.front()->getType();
20511   unsigned ReduxWidth = ReducedVals.size();
20520   int Cnt = ReducedVals.size();
20521   for (Value *RdxVal : ReducedVals) {
20526       Cost += GenCostFn();
20531       auto *RdxOp = cast<Instruction>(U);
20532       if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20540         Cost += ScalarCost;
20542       Cost += GenCostFn();
20547   case RecurKind::Add:
20548   case RecurKind::Mul:
20549   case RecurKind::Or:
20550   case RecurKind::And:
20551   case RecurKind::Xor:
20552   case RecurKind::FAdd:
20553   case RecurKind::FMul: {
20556     if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20559       for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20571       auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20572           std::make_pair(RedTy, true));
20573       if (RType == RedTy) {
20583     ScalarCost = EvaluateScalarCost([&]() {
20588   case RecurKind::FMax:
20589   case RecurKind::FMin:
20590   case RecurKind::FMaximum:
20591   case RecurKind::FMinimum:
20592   case RecurKind::SMax:
20593   case RecurKind::SMin:
20594   case RecurKind::UMax:
20595   case RecurKind::UMin: {
20599     ScalarCost = EvaluateScalarCost([&]() {
20609   LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20611                     << " (It is a splitting reduction)\n");
20612   return VectorCost - ScalarCost;
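  // The returned value is a delta: the cost of the single vector reduction
  // minus the cost of the scalar op chain it replaces. Roughly, reducing
  // 8 values scalar-wise takes 7 adds, so any vector reduction costing less
  // than 7 yields a negative (profitable) delta, which is then weighed
  // against the -slp-threshold option.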
20618   assert(VectorizedValue && "Need to have a vectorized tree node");
20619   assert(RdxKind != RecurKind::FMulAdd &&
20620          "A call to the llvm.fmuladd intrinsic is not handled yet");
20622   auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20623   if (FTy->getScalarType() == Builder.getInt1Ty() &&
20624       RdxKind == RecurKind::Add &&
20629         VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20630     ++NumVectorInstructions;
20633   ++NumVectorInstructions;
20640   assert(IsSupportedHorRdxIdentityOp &&
20641          "The optimization of matched scalar identity horizontal reductions "
20642          "must be supported.");
20644     return VectorizedValue;
20646   case RecurKind::Add: {
20648     Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20650                       << VectorizedValue << ". (HorRdx)\n");
20651     return Builder.CreateMul(VectorizedValue, Scale);
20653   case RecurKind::Xor: {
20655     LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20656                       << ". (HorRdx)\n");
20659     return VectorizedValue;
20661   case RecurKind::FAdd: {
20663     Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20665                       << VectorizedValue << ". (HorRdx)\n");
20666     return Builder.CreateFMul(VectorizedValue, Scale);
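  // emitScaleForReusedOps turns Cnt repeats of the same scalar into a single
  // multiply: e.g. x + x + x + x becomes x * 4 (x * 4.0 for fadd), while for
  // xor an even Cnt cancels to 0 (x ^ x == 0) and an odd Cnt leaves the
  // value unchanged.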
20668 case RecurKind::And:
20669 case RecurKind::Or:
20670 case RecurKind::SMax:
20671 case RecurKind::SMin:
20672 case RecurKind::UMax:
20673 case RecurKind::UMin:
20674 case RecurKind::FMax:
20675 case RecurKind::FMin:
20676 case RecurKind::FMaximum:
20677 case RecurKind::FMinimum:
20679 return VectorizedValue;
20680 case RecurKind::Mul:
20681 case RecurKind::FMul:
20682 case RecurKind::FMulAdd:
20683 case RecurKind::IAnyOf:
20684 case RecurKind::FAnyOf:
20685 case RecurKind::IFindLastIV:
20686 case RecurKind::FFindLastIV:
20687 case RecurKind::None:
20699   assert(IsSupportedHorRdxIdentityOp &&
20700          "The optimization of matched scalar identity horizontal reductions "
20701          "must be supported.");
20703   auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20704   if (VTy->getElementType() != VL.front()->getType()) {
20708                                        R.isSignedMinBitwidthRootNode());
20711   case RecurKind::Add: {
20714     for (Value *V : VL) {
20715       unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20716       Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
20720                       << VectorizedValue << ". (HorRdx)\n");
20721     return Builder.CreateMul(VectorizedValue, Scale);
20723 case RecurKind::And:
20724 case RecurKind::Or:
20727                       << ". (HorRdx)\n");
20728 return VectorizedValue;
20729 case RecurKind::SMax:
20730 case RecurKind::SMin:
20731 case RecurKind::UMax:
20732 case RecurKind::UMin:
20733 case RecurKind::FMax:
20734 case RecurKind::FMin:
20735 case RecurKind::FMaximum:
20736 case RecurKind::FMinimum:
20739                       << ". (HorRdx)\n");
20740 return VectorizedValue;
20741 case RecurKind::Xor: {
20747         cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20749     std::iota(Mask.begin(), Mask.end(), 0);
20750     bool NeedShuffle = false;
20751     for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20753       unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20754       if (Cnt % 2 == 0) {
20756         NeedShuffle = true;
20762                dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20766           ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20767     return VectorizedValue;
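    // For xor, lanes whose scalar repeats an even number of times contribute
    // the identity (x ^ x == 0), so those mask slots are redirected into the
    // zero vector by the shuffle; odd-count lanes pass through untouched.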
20769 case RecurKind::FAdd: {
20772     for (Value *V : VL) {
20773       unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20774       Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20777     return Builder.CreateFMul(VectorizedValue, Scale);
20779 case RecurKind::Mul:
20780 case RecurKind::FMul:
20781 case RecurKind::FMulAdd:
20782 case RecurKind::IAnyOf:
20783 case RecurKind::FAnyOf:
20784 case RecurKind::IFindLastIV:
20785 case RecurKind::FFindLastIV:
20786 case RecurKind::None:
20796 return HorizontalReduction::getRdxKind(V);
20799   if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20800     return cast<FixedVectorType>(IE->getType())->getNumElements();
20802   unsigned AggregateSize = 1;
20803   auto *IV = cast<InsertValueInst>(InsertInst);
20804   Type *CurrentType = IV->getType();
20806     if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20807       for (auto *Elt : ST->elements())
20808         if (Elt != ST->getElementType(0))
20809           return std::nullopt;
20810       AggregateSize *= ST->getNumElements();
20811       CurrentType = ST->getElementType(0);
20812     } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20813       AggregateSize *= AT->getNumElements();
20814       CurrentType = AT->getElementType();
20815     } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20816       AggregateSize *= VT->getNumElements();
20817       return AggregateSize;
20819       return AggregateSize;
20821   return std::nullopt;
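  // The loop multiplies element counts down the type tree: e.g. an
  // insertvalue chain building { [2 x <2 x float>], [2 x <2 x float>] }
  // yields 2 * 2 * 2 = 8 scalar slots. A struct with differing element
  // types gives std::nullopt; other leaves just return the count
  // accumulated so far.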
20830                                  unsigned OperandOffset, const BoUpSLP &R) {
20833     std::optional<unsigned> OperandIndex =
20835     if (!OperandIndex || R.isDeleted(LastInsertInst))
20837     if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20839                             BuildVectorOpds, InsertElts, *OperandIndex, R);
20842       BuildVectorOpds[*OperandIndex] = InsertedOperand;
20843       InsertElts[*OperandIndex] = LastInsertInst;
20845     LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20846   } while (LastInsertInst != nullptr &&
20847            isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20871   assert((isa<InsertElementInst>(LastInsertInst) ||
20872           isa<InsertValueInst>(LastInsertInst)) &&
20873          "Expected insertelement or insertvalue instruction!");
20876          "Expected empty result vectors!");
20879   if (!AggregateSize)
20881   BuildVectorOpds.resize(*AggregateSize);
20882   InsertElts.resize(*AggregateSize);
20888   if (BuildVectorOpds.size() >= 2)
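  // findBuildAggregate walks the insertelement/insertvalue chain from the
  // last insert upward, recording one operand per aggregate slot; only
  // chains that fill at least two slots are worth handing to the
  // vectorizer.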
20906   auto DominatedReduxValue = [&](Value *R) {
20907     return isa<Instruction>(R) &&
20908            DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20914   if (P->getIncomingBlock(0) == ParentBB) {
20915     Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20916   } else if (P->getIncomingBlock(1) == ParentBB) {
20917     Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20920   if (Rdx && DominatedReduxValue(Rdx))
20933   if (P->getIncomingBlock(0) == BBLatch) {
20934     Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20935   } else if (P->getIncomingBlock(1) == BBLatch) {
20936     Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20939   if (Rdx && DominatedReduxValue(Rdx))
20973   assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20974           isa<IntrinsicInst>(Root)) &&
20975          "Expected binop, select, or intrinsic for reduction matching");
20977       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20979       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20981     return dyn_cast<Instruction>(RHS);
20983     return dyn_cast<Instruction>(LHS);
20990   Value *Op0 = nullptr;
20991   Value *Op1 = nullptr;
20994   return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21000   Value *B0 = nullptr, *B1 = nullptr;
21005 bool SLPVectorizerPass::vectorizeHorReduction(
21010   bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21012   if (Root->getParent() != BB || isa<PHINode>(Root))
21016   auto SelectRoot = [&]() {
21035   std::queue<std::pair<Instruction *, unsigned>> Stack;
21036   Stack.emplace(SelectRoot(), 0);
21040     if (R.isAnalyzedReductionRoot(Inst))
21045     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21047     return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21049   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21050     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21057     if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21062   while (!Stack.empty()) {
21065     std::tie(Inst, Level) = Stack.front();
21070     if (R.isDeleted(Inst))
21072     if (Value *VectorizedV = TryToReduce(Inst)) {
21074       if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21076         Stack.emplace(I, Level);
21079       if (R.isDeleted(Inst))
21083       if (!TryAppendToPostponedInsts(Inst)) {
21094       if (VisitedInstrs.insert(Op).second)
21095         if (auto *I = dyn_cast<Instruction>(Op))
21098           if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21099               !R.isDeleted(I) && I->getParent() == BB)
21100             Stack.emplace(I, Level);
21108   bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21109   Res |= tryToVectorize(PostponedInsts, R);
21116   for (Value *V : Insts)
21117     if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21118       Res |= tryToVectorize(Inst, R);
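  // The BFS over Stack tries to match a horizontal reduction rooted at each
  // visited instruction; roots that fail to reduce but still look like
  // vectorizable seeds (cmps and insert instructions) are postponed and
  // retried with plain tryToVectorize once the reduction scan is done.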
21122 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21125   if (!R.canMapToVector(IVI->getType()))
21133   if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21134     R.getORE()->emit([&]() {
21136              << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21137                 "trying reduction first.";
21141   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21143   return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21153       (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21157   if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21158     R.getORE()->emit([&]() {
21160              << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21161                 "trying reduction first.";
21165   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21166   return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21169 template <typename T>
21174                                   bool MaxVFOnly, BoUpSLP &R) {
21175   bool Changed = false;
21186     auto *I = dyn_cast<Instruction>(*IncIt);
21187     if (!I || R.isDeleted(I)) {
21191     auto *SameTypeIt = IncIt;
21192     while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21193                                R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21194                                AreCompatible(*SameTypeIt, *IncIt))) {
21195       auto *I = dyn_cast<Instruction>(*SameTypeIt);
21197       if (I && !R.isDeleted(I))
21202     unsigned NumElts = VL.size();
21203     LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21204                       << NumElts << ")\n");
21214     if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21217       VL.swap(Candidates);
21218       Candidates.clear();
21220         if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21226       auto GetMinNumElements = [&R](Value *V) {
21227         unsigned EltSize = R.getVectorElementSize(V);
21228         return std::max(2U, R.getMaxVecRegSize() / EltSize);
21230       if (NumElts < GetMinNumElements(*IncIt) &&
21231           (Candidates.empty() ||
21232            Candidates.front()->getType() == (*IncIt)->getType())) {
21234         if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21240     if (Candidates.size() > 1 &&
21241         (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21242       if (TryToVectorizeHelper(Candidates, false)) {
21245       } else if (MaxVFOnly) {
21248         for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21250           auto *I = dyn_cast<Instruction>(*It);
21251           if (!I || R.isDeleted(I)) {
21255           auto *SameTypeIt = It;
21256           while (SameTypeIt != End &&
21257                  (!isa<Instruction>(*SameTypeIt) ||
21258                   R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21259                   AreCompatible(*SameTypeIt, *It))) {
21260             auto *I = dyn_cast<Instruction>(*SameTypeIt);
21262             if (I && !R.isDeleted(I))
21265           unsigned NumElts = VL.size();
21266           if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21272         Candidates.clear();
21276     IncIt = SameTypeIt;
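  // tryToVectorizeSequence scans Incoming for maximal runs of mutually
  // compatible instructions (per AreCompatible), tries each run at the
  // maximum VF first, and stashes leftovers in Candidates to retry either
  // without the max-VF restriction or split back into compatible subruns.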
21288 template <bool IsCompatibility>
21293          "Expected valid element types only.");
21295     return IsCompatibility;
21296   auto *CI1 = cast<CmpInst>(V);
21297   auto *CI2 = cast<CmpInst>(V2);
21298   if (CI1->getOperand(0)->getType()->getTypeID() <
21300     return !IsCompatibility;
21301   if (CI1->getOperand(0)->getType()->getTypeID() >
21304   if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21306     return !IsCompatibility;
21307   if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21316   if (BasePred1 < BasePred2)
21317     return !IsCompatibility;
21318   if (BasePred1 > BasePred2)
21321   bool CI1Preds = Pred1 == BasePred1;
21322   bool CI2Preds = Pred2 == BasePred1;
21323   for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21324     auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21325     auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21329       return !IsCompatibility;
21332     if (auto *I1 = dyn_cast<Instruction>(Op1))
21333       if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21334         if (IsCompatibility) {
21335           if (I1->getParent() != I2->getParent())
21342             return NodeI2 != nullptr;
21345           assert((NodeI1 == NodeI2) ==
21347                  "Different nodes should have different DFS numbers");
21348           if (NodeI1 != NodeI2)
21352         if (S && (IsCompatibility || !S.isAltShuffle()))
21354         if (IsCompatibility)
21356         if (I1->getOpcode() != I2->getOpcode())
21357           return I1->getOpcode() < I2->getOpcode();
21360   return IsCompatibility;
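// One template serves two purposes: with IsCompatibility=true it answers
// "may these two compares be vectorized together?", and with
// IsCompatibility=false it acts as a strict-weak-ordering comparator for
// sorting, which is why mismatches return !IsCompatibility.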
21363 template <typename ItT>
21366   bool Changed = false;
21369     if (R.isDeleted(I))
21372       if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21373         Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21374         if (R.isDeleted(I))
21380     if (R.isDeleted(I))
21382     Changed |= tryToVectorize(I, R);
21389     return compareCmp<false>(V, V2, *TLI, *DT);
21392   auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21395     return compareCmp<true>(V1, V2, *TLI, *DT);
21402   if (Vals.size() <= 1)
21404   Changed |= tryToVectorizeSequence<Value>(
21405       Vals, CompareSorter, AreCompatibleCompares,
21408         bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21410             auto *Select = dyn_cast<SelectInst>(U);
21412                    Select->getParent() != cast<Instruction>(V)->getParent();
21415         if (ArePossiblyReducedInOtherBlock)
21417         return tryToVectorizeList(Candidates, R, MaxVFOnly);
21423 bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21425   assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21426          "This function only accepts Insert instructions");
21427   bool OpsChanged = false;
21429   for (auto *I : reverse(Instructions)) {
21431     if (R.isDeleted(I) || isa<CmpInst>(I))
21433     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21435           vectorizeInsertValueInst(LastInsertValue, BB, R, true);
21436     } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21438           vectorizeInsertElementInst(LastInsertElem, BB, R, true);
21441     if (R.isDeleted(I))
21443     OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21444     if (R.isDeleted(I) || isa<CmpInst>(I))
21447     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21449           vectorizeInsertValueInst(LastInsertValue, BB, R, false);
21450     } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21451       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21456   OpsChanged |= tryToVectorize(PostponedInsts, R);
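  // Two rounds per instruction: the insert chain is first tried at the
  // maximal vector factor only, then as a horizontal-reduction seed, and
  // finally retried with smaller vector factors allowed.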
21463   bool Changed = false;
21470   auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21473            "Expected vectorizable types only.");
21481         V2->getType()->getScalarSizeInBits())
21484         V2->getType()->getScalarSizeInBits())
21488     if (Opcodes1.size() < Opcodes2.size())
21490     if (Opcodes1.size() > Opcodes2.size())
21492     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21495       auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21496       auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21501           return NodeI2 != nullptr;
21504         assert((NodeI1 == NodeI2) ==
21506                "Different nodes should have different DFS numbers");
21507         if (NodeI1 != NodeI2)
21510         if (S && !S.isAltShuffle())
21512           return I1->getOpcode() < I2->getOpcode();
21521       bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21522       bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21530       bool U1 = isa<UndefValue>(Opcodes1[I]);
21531       bool U2 = isa<UndefValue>(Opcodes2[I]);
21535       auto ValID1 = Opcodes1[I]->getValueID();
21536       auto ValID2 = Opcodes2[I]->getValueID();
21537       if (ValID1 == ValID2)
21539       if (ValID1 < ValID2)
21541       if (ValID1 > ValID2)
21550       assert(U1 && U2 && "The only thing left should be undef & undef.");
21554   auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21557     if (V1->getType() != V2->getType())
21561     if (Opcodes1.size() != Opcodes2.size())
21563     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21565       if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21567       if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21568         if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21569           if (R.isDeleted(I1) || R.isDeleted(I2))
21571           if (I1->getParent() != I2->getParent())
21577       if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21579       if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21585   bool HaveVectorizedPhiNodes = false;
21590       auto *P = dyn_cast<PHINode>(&I);
21596       if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21609       if (!Opcodes.empty())
21613       while (!Nodes.empty()) {
21614         auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21617         for (Value *V : PHI->incoming_values()) {
21618           if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21619             Nodes.push_back(PHI1);
21627     HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21628         Incoming, PHICompare, AreCompatiblePHIs,
21630           return tryToVectorizeList(Candidates, R, MaxVFOnly);
21633     Changed |= HaveVectorizedPhiNodes;
21634     if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21635           auto *PHI = dyn_cast<PHINode>(P.first);
21636           return !PHI || R.isDeleted(PHI);
21638       PHIToOpcodes.clear();
21640   } while (HaveVectorizedPhiNodes);
21642   VisitedInstrs.clear();
21644   InstSetVector PostProcessInserts;
21648   auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21649     bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21650     if (VectorizeCmps) {
21651       Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21652       PostProcessCmps.clear();
21654     PostProcessInserts.clear();
21659     if (auto *Cmp = dyn_cast<CmpInst>(I))
21660       return PostProcessCmps.contains(Cmp);
21661     return isa<InsertElementInst, InsertValueInst>(I) &&
21662            PostProcessInserts.contains(I);
21668     return I->use_empty() &&
21669            (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21674     if (isa<ScalableVectorType>(It->getType()))
21678     if (R.isDeleted(&*It))
21681     if (!VisitedInstrs.insert(&*It).second) {
21682       if (HasNoUsers(&*It) &&
21683           VectorizeInsertsAndCmps(It->isTerminator())) {
21693     if (isa<DbgInfoIntrinsic>(It))
21697     if (PHINode *P = dyn_cast<PHINode>(It)) {
21699       if (P->getNumIncomingValues() == 2) {
21702         if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21711       for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21716         if (BB == P->getIncomingBlock(I) ||
21722         if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21723             PI && !IsInPostProcessInstrs(PI)) {
21725               vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21727           if (Res && R.isDeleted(P)) {
21737     if (HasNoUsers(&*It)) {
21738       bool OpsChanged = false;
21739       auto *SI = dyn_cast<StoreInst>(It);
21749       TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21750                             SI->getValueOperand()->hasOneUse();
21752       if (TryToVectorizeRoot) {
21753         for (auto *V : It->operand_values()) {
21756           if (auto *VI = dyn_cast<Instruction>(V);
21757               VI && !IsInPostProcessInstrs(VI))
21759             OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21766           VectorizeInsertsAndCmps(It->isTerminator());
21777     if (isa<InsertElementInst, InsertValueInst>(It))
21778       PostProcessInserts.insert(&*It);
21779     else if (isa<CmpInst>(It))
21780       PostProcessCmps.insert(cast<CmpInst>(&*It));
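  // Inserts and cmps are not vectorized where they are first seen; they are
  // queued in PostProcessInserts/PostProcessCmps and flushed in batches
  // (cmps only when the scan reaches a terminator), so compatible siblings
  // discovered later in the block can be vectorized together.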
21787   auto Changed = false;
21788   for (auto &Entry : GEPs) {
21791     if (Entry.second.size() < 2)
21794     LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21795                       << Entry.second.size() << ".\n");
21803       return !R.isDeleted(GEP);
21805     if (It == Entry.second.end())
21807     unsigned MaxVecRegSize = R.getMaxVecRegSize();
21808     unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21809     if (MaxVecRegSize < EltSize)
21812     unsigned MaxElts = MaxVecRegSize / EltSize;
21813     for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21814       auto Len = std::min<unsigned>(BE - BI, MaxElts);
21827       Candidates.remove_if([&R](Value *I) {
21828         return R.isDeleted(cast<Instruction>(I)) ||
21829                isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21837       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21838         auto *GEPI = GEPList[I];
21839         if (!Candidates.count(GEPI))
21842         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21843           auto *GEPJ = GEPList[J];
21845           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21846             Candidates.remove(GEPI);
21847             Candidates.remove(GEPJ);
21848           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21849             Candidates.remove(GEPJ);
21856       if (Candidates.size() < 2)
21863       auto BundleIndex = 0u;
21864       for (auto *V : Candidates) {
21865         auto *GEP = cast<GetElementPtrInst>(V);
21866         auto *GEPIdx = GEP->idx_begin()->get();
21867         assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21868         Bundle[BundleIndex++] = GEPIdx;
21880       Changed |= tryToVectorizeList(Bundle, R);
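      // Pairs of GEPs whose addresses differ by a SCEV constant are dropped:
      // their index computations fold into constant offsets, so vectorizing
      // the indices buys nothing. What gets bundled and vectorized here is
      // the list of non-constant index operands, not the GEPs themselves.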
21886 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21887   bool Changed = false;
21892     if (V->getValueOperand()->getType()->getTypeID() <
21893         V2->getValueOperand()->getType()->getTypeID())
21895     if (V->getValueOperand()->getType()->getTypeID() >
21896         V2->getValueOperand()->getType()->getTypeID())
21898     if (V->getPointerOperandType()->getTypeID() <
21899         V2->getPointerOperandType()->getTypeID())
21901     if (V->getPointerOperandType()->getTypeID() >
21902         V2->getPointerOperandType()->getTypeID())
21904     if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21905         V2->getValueOperand()->getType()->getScalarSizeInBits())
21907     if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21908         V2->getValueOperand()->getType()->getScalarSizeInBits())
21911     if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21912       if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21916             DT->getNode(I2->getParent());
21917         assert(NodeI1 && "Should only process reachable instructions");
21918         assert(NodeI2 && "Should only process reachable instructions");
21919         assert((NodeI1 == NodeI2) ==
21921                "Different nodes should have different DFS numbers");
21922         if (NodeI1 != NodeI2)
21924         return I1->getOpcode() < I2->getOpcode();
21926     return V->getValueOperand()->getValueID() <
21927            V2->getValueOperand()->getValueID();
21939         isa<UndefValue>(V2->getValueOperand()))
21942     if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21943       if (I1->getParent() != I2->getParent())
21948         isa<Constant>(V2->getValueOperand()))
21951            V2->getValueOperand()->getValueID();
21956   for (auto &Pair : Stores) {
21957     if (Pair.second.size() < 2)
21961                       << Pair.second.size() << ".\n");
21970                                            Pair.second.rend());
21971     Changed |= tryToVectorizeSequence<StoreInst>(
21972         ReversedStores, StoreSorter, AreCompatibleStores,
21974           return vectorizeStores(Candidates, R, Attempted);
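  // Stores are grouped in the Stores map by their base pointer; each group
  // is sorted by StoreSorter (value-type ID, pointer-type ID, bit width,
  // then dominator-DFS order of the stored values) and fed to
  // vectorizeStores in reverse discovery order through the same
  // run-splitting tryToVectorizeSequence helper used for compares and PHIs.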
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes in between queries.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one of them as this one.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
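These predicate helpers support canonicalizing compares across lanes, e.g. treating a < b in one lane and b > a in another as the same operation. A hedged sketch (this checks predicates only, not operands; the helper name is hypothetical):
  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  // True if the two predicates describe the same relation once the operands
  // of one compare are swapped (e.g. ICMP_SLT vs. ICMP_SGT).
  bool predicatesMatchModuloSwap(CmpInst::Predicate A, CmpInst::Predicate B) {
    return A == B || A == CmpInst::getSwappedPredicate(B);
  }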
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
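DenseMap is the workhorse container behind the tree-entry and scheduling maps in this file. A quick sketch of the operations listed above (standalone, illustrative values):
  #include "llvm/ADT/DenseMap.h"
  using namespace llvm;

  unsigned denseMapSketch() {
    DenseMap<int, unsigned> Count;
    auto [It, Inserted] = Count.try_emplace(42, 0); // default slot if new
    ++It->second;
    // lookup() returns a default-constructed value for missing keys;
    // at() aborts if the key is absent; count()/contains() test membership.
    return Count.lookup(42) + Count.count(7); // 1 + 0
  }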
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
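The IRBuilder entry points above are the code-emission surface used once a tree is vectorized. A minimal sketch of inserting a scalar, splatting it, and permuting the result (illustrative only; assumes the builder already has an insertion point):
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  Value *splatAndReverse(IRBuilderBase &Builder, Value *Scalar) {
    auto *VecTy = FixedVectorType::get(Scalar->getType(), 4);
    Value *Vec = PoisonValue::get(VecTy);
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(0));
    Value *Splat = Builder.CreateShuffleVector(Vec, {0, 0, 0, 0}); // broadcast
    return Builder.CreateShuffleVector(Splat, {3, 2, 1, 0});       // reverse
  }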
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value, or which may disconnect it from a def-use chain linking it to a loop.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0..VF) is used exactly once in each submask of size VF.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor, i.e. <Index, Index+Factor, Index+2*Factor, ...>.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
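The static mask classifiers above operate on plain int masks, so they can be exercised without creating any IR. A small sketch:
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instructions.h"
  #include <cassert>
  using namespace llvm;

  void maskClassifierSketch() {
    SmallVector<int, 4> Mask = {4, 5, 6, 7};
    int Index = 0;
    // {4,5,6,7} extracts the upper half of an 8-element source vector.
    assert(ShuffleVectorInst::isExtractSubvectorMask(Mask, /*NumSrcElts=*/8,
                                                     Index) &&
           Index == 4);
    assert(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, /*NumSrcElts=*/4));
    assert(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, /*NumSrcElts=*/4));
  }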
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
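SmallBitVector backs small lane/group trackers such as the ExpectedIndex set in the extract-subvector check earlier in this file. A sketch of the listed operations:
  #include "llvm/ADT/SmallBitVector.h"
  #include <cassert>
  using namespace llvm;

  void bitVectorSketch() {
    SmallBitVector Seen(4); // four groups, none seen yet
    assert(Seen.none() && !Seen.any());
    Seen.set(0);
    Seen.set(2);
    // Iterate set bits: find_first()/find_next() yield 0, then 2.
    for (int I = Seen.find_first(); I != -1; I = Seen.find_next(I))
      assert(I == 0 || I == 2);
    assert(Seen.count() == 2 && Seen.test(2) && !Seen.all());
  }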
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and, if so, returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking for deletion any trivially dead operands.
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance of forming the best SLP tree.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
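Taken together, the BoUpSLP members above describe the per-bundle driver sequence the pass runs. A hedged outline that mirrors the shape of the driver code in this pass (tryToVectorizeList and friends), not a verbatim excerpt; R is a constructed BoUpSLP, VL the candidate scalars, and UserIgnoreList/SLPCostThreshold stand in for the ignore list and cost threshold the pass defines elsewhere:
  static bool tryVectorizeBundle(slpvectorizer::BoUpSLP &R,
                                 ArrayRef<Value *> VL,
                                 const SmallDenseSet<Value *> &UserIgnoreList,
                                 int SLPCostThreshold) {
    // Build the vectorizable tree rooted at VL, then decide by cost.
    R.buildTree(VL, UserIgnoreList);
    if (R.isTreeTinyAndNotFullyVectorizable())
      return false;
    R.reorderTopToBottom();
    R.reorderBottomToTop();
    R.transformNodes();
    R.buildExternalUses();
    R.computeMinimumValueSizes();
    InstructionCost Cost = R.getTreeCost();
    if (Cost >= -SLPCostThreshold)
      return false; // not profitable beyond the threshold
    R.vectorizeTree(); // emit vector code and the needed extracts
    return true;
  }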
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
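The m_* combinators above drive the ad-hoc peepholes in the reduction-matching code. A self-contained example of the match() idiom (the helper name is hypothetical):
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace PatternMatch;

  // Bind Base and the shift amount if V is (shl Base, C) with constant C.
  bool matchShlByConstant(Value *V, Value *&Base, const APInt *&ShAmt) {
    return match(V, m_Shl(m_Value(Base), m_APInt(ShAmt)));
  }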
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address calls from the specified value, returning the original object being addressed.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
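createStrideMask and inversePermutation are the two mask constructors behind the reorderings above. A sketch of what they produce (inversePermutation is file-local, so its effect is shown in a comment):
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void maskConstructorSketch() {
    // Stride-2 gather starting at element 0 over VF=4: {0, 2, 4, 6}.
    SmallVector<int, 16> Strided =
        createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
    (void)Strided;
    // inversePermutation maps an order to the shuffle mask realizing it:
    // Mask[Indices[I]] = I, e.g. Indices {2, 0, 1} yields Mask {1, 2, 0}.
  }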
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
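Several of the integer helpers above (divideCeil, alignDown, bit_ceil, bit_floor, Log2_32, has_single_bit) encode the VF and register-size arithmetic used throughout the cost code. Worked values in a standalone sketch:
  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  using namespace llvm;

  void mathHelperSketch() {
    static_assert(divideCeil(10, 4) == 3, "ceil(10/4)");
    static_assert(alignDown(10, 4) == 8, "largest multiple of 4 <= 10");
    assert(bit_ceil(5u) == 8u);   // smallest power of two >= 5
    assert(bit_floor(5u) == 4u);  // largest power of two <= 5
    assert(Log2_32(32) == 5);     // floor log base 2
    assert(has_single_bit(64u));  // 64 is a power of two
  }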
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range Range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
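The range wrappers above (all_of, any_of, none_of, count_if, find_if, is_contained) take the container directly instead of begin/end pairs and appear on nearly every page of this file. A sketch with hypothetical values:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  bool rangeWrapperSketch() {
    SmallVector<int, 4> VL = {2, 4, 6, 7};
    bool AllEven = all_of(VL, [](int V) { return V % 2 == 0; });   // false
    auto NumEven = count_if(VL, [](int V) { return V % 2 == 0; }); // 3
    auto OddIt = find_if(VL, [](int V) { return V % 2 != 0; });    // -> 7
    return !AllEven && NumEven == 3 && is_contained(VL, *OddIt);
  }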
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
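hash_combine and hash_combine_range are used to key caches of gather sequences and tree entries. A sketch (the key function itself is hypothetical):
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/Hashing.h"
  using namespace llvm;

  hash_code shuffleKey(unsigned Opcode, ArrayRef<int> Mask) {
    // Fold the scalar and the whole mask sequence into one hash_code.
    return hash_combine(Opcode, hash_combine_range(Mask.begin(), Mask.end()));
  }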
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether DestTy is wider or narrower than C's type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx, or on the return type if OpdIdx is -1.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; the incoming register Reg and incoming block Block are taken from the machine instruction.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector.
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.