#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
203 cl::desc(
"Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
208 cl::desc(
"Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
213 cl::desc(
"Display the SLP trees with Graphviz"));
217 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
222 cl::desc(
"Try to replace values with the idempotent instructions for "
223 "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();
                                               Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)

  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
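// The division above rounds Sz down to a whole multiple of the per-register
// vector factor RegVF; e.g. Sz = 7 scalars with RegVF = 4 lanes per register
// yields (7 / 4) * 4 = 4 elements.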
                                    I * VecTyNumElements, VecTyNumElements)))
                  : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
  if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
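// A non-commutative sub/fsub is still reported as commutative above when
// every user only observes the magnitude or zero-equality of the result,
// e.g. %s = sub i64 0, %x whose only users are @llvm.abs(%s, ...) or
// icmp eq/ne %s, 0: swapping the sub operands cannot change what those
// users compute.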
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
  if (CI->getValue().uge(VT->getNumElements()))
  Index *= VT->getNumElements();
  Index += CI->getZExtValue();
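// E.g. with a running offset Index = 2 and a constant lane CI = 3 in a
// <4 x ...> vector, the flattened position is 2 * 4 + 3 = 11.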
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
      if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
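// The classification mirrors the TTI shuffle kinds: while every extracted
// lane I reads position I of one of at most two sources, the whole VL is a
// select of Vec1/Vec2; as soon as some lane reads a different position
// (Mask[I] % Size != I) it degrades to a general permute.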
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
bool isValidForAlternation(unsigned Opcode) {

class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
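      // Candidate opcodes are tested in a fixed preference order (the main
      // instruction's own opcode first, then Shl, AShr, Mul, Add, Sub, And,
      // Or, Xor), so the opcode chosen for a bundle is the highest-priority
      // one that every seen instruction can be rewritten into.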
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
    unsigned FromOpcode = I->getOpcode();
    if (FromOpcode == ToOpcode)
    auto [CI, Pos] = isBinOpWithConstantInt(I);
    const APInt &FromCIValue = CI->getValue();
    unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
    switch (FromOpcode) {
    case Instruction::Shl:
      if (ToOpcode == Instruction::Mul) {
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      ToCIValue = ToOpcode == Instruction::And
                      : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Mul:
      if (ToOpcode == Instruction::Shl) {
        ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
      assert(FromCIValue.isOne() && "Cannot convert the instruction.");
      ToCIValue = ToOpcode == Instruction::And
                      : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Add:
    case Instruction::Sub:
      if (FromCIValue.isZero()) {
             "Cannot convert the instruction.");
      ToCIValue = FromCIValue;
    case Instruction::And:
      ToCIValue = ToOpcode == Instruction::Mul
                      : APInt::getZero(FromCIValueBitWidth);
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
    Value *LHS = I->getOperand(1 - Pos);
        ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
        FromOpcode == Instruction::Xor) &&
       ToOpcode == Instruction::Sub))
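    // E.g. converting "shl i32 %x, 1" to its interchangeable multiply form
    // rewrites the constant as 1 << 1, i.e. "mul i32 %x, 2", which lets it
    // share a single vector Mul with an instruction like "mul i32 %y, 5".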
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
      InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
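    // Identity-like constants make an instruction interchangeable with every
    // supported opcode: "mul i32 %x, 1", "shl i32 %x, 0" and "add i32 %x, 0"
    // all compute %x, so their mask is CanBeAll and they can adopt whatever
    // opcode the rest of the bundle needs.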
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
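  // A "copyable" element is, roughly, a scalar that does not match the
  // bundle's main opcode but can be modelled as if it did, e.g. a lone value
  // %x inside an Add bundle treated as the idempotent "add i32 %x, 0"; such
  // elements are materialized separately instead of being scheduled as part
  // of the bundle.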
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)
      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 isValidForAlternation(InstOpcode) &&
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
        }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(MainOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });

  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;

    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                     bool ExtendingManyInputs = false) {
  if (SubMask.empty())
          (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
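  // Mask composition selects through the existing mask: NewMask[I] =
  // Mask[SubMask[I]]. E.g. Mask = [3,2,1,0] combined with SubMask =
  // [1,0,3,2] yields [2,3,0,1].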
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                                  unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
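// inversePermutation turns an ordering into the shuffle mask that applies
// it: element I of the result comes from position Indices[I] of the input,
// so Indices = [2,0,1] produces Mask = [1,2,0].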
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];

    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
    auto *IU = dyn_cast<Instruction>(U);
    return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;
                               bool IgnoreReorder);
  std::optional<OrdersType>

    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;

                            Align Alignment, const int64_t Diff, Value *Ptr0,
                            Value *PtrN, StridedPtrInfo &SPtrInfo) const;
                      Align CommonAlignment,
                      StridedPtrInfo &SPtrInfo) const;
                  StridedPtrInfo &SPtrInfo, unsigned *BestVF = nullptr,
                  bool TryRecursiveCheck = true) const;

    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));

    OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
       << " EdgeIdx:" << EdgeIdx << "}";

      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}
    auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
      auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return U == U1 || U == U2 || R.isVectorized(U);
      return AllUsersVectorized(V1) && AllUsersVectorized(V2);
    if (R.TTI->isLegalBroadcastLoad(V1->getType(),
        ((int)V1->getNumUses() == NumLanes ||
         AllUsersAreInternal(V1, V2)))
    auto CheckSameEntryOrFail = [&]() {
          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
        return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)
      Value *EV2 = nullptr;
      int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
          S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();

    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
          ? I2->getNumOperands()
          : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
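    // This is a greedy bipartite matching between the operands of I1 and I2:
    // for each operand of I1 the best-scoring not-yet-used operand of I2 is
    // chosen and its score accumulated, recursing one level deeper until the
    // configured maximum look-ahead depth is reached.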
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;
  enum class ReorderingMode {
  unsigned ArgSize = 0;
  const Loop *L = nullptr;
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;
  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
    Value *IdxLaneV = getData(Idx, Lane).V;
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
               ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                    ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                    : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
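  // The splat score rewards picks that keep the number of distinct values in
  // an operand small (cheap to materialize as a broadcast); the bit_ceil /
  // bit_floor terms mean the score only moves when the unique-value count
  // crosses a power-of-two boundary.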
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    return R.areAllUsersVectorized(IdxLaneI)
  static const int ScoreScaleFactor = 10;
                       int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
          (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Splat:
        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;
  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
      const OperandData &OpData = getData(OpIdx, Lane);
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          NumOpsWithSameOpcodeParent = 1;
        Parent = I->getParent();
        --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                      const InstructionsState &S) {
    return VL.size() == getNumLanes();
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);
      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {
      assert(I && "Expected instruction");
      auto [SelectedOp, Ops] = convertTo(I, S);
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
  unsigned getNumOperands() const { return ArgSize; }
  unsigned getNumLanes() const { return OpsVec[0].size(); }
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  bool empty() const { return OpsVec.empty(); }
  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
            L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;
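    // Broadcasting is only preferred when the value occurs (or can be
    // substituted) in enough other lanes: with two lanes a single extra
    // candidate suffices, otherwise at least two candidate lanes are
    // required before the Splat reordering mode is chosen for this operand.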
3222 "Op is expected to be getValue(OpIdx, Lane).");
3223 bool OpAPO = getData(
OpIdx, Lane).APO;
3224 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3228 const OperandData &
Data = getData(OpI, Ln);
3229 if (
Data.APO != OpAPO ||
Data.IsUsed)
3231 Value *OpILn = getValue(OpI, Ln);
3232 return (L && L->isLoopInvariant(OpILn)) ||
3244 const InstructionsState &S,
const BoUpSLP &R)
3245 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3246 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3248 appendOperands(RootVL,
Operands, S);
3256 "Expected same num of lanes across all operands");
3257 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3258 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3266 unsigned NumOperands = getNumOperands();
3267 unsigned NumLanes = getNumLanes();
3287 unsigned FirstLane = getBestLaneToStartReordering();
3296 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3297 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3298 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3300 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3302 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3304 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3307 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
      return UniqueValues.size() != 2 &&
                            UniqueValues.size());
    if (SkipReordering())
    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
        UsedLanes.set(Lane);
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);
            swap(OpIdx, *BestIdx, Lane);
            StrategyFailed = true;
          OperandData &AltOp = getData(OpIdx, Lane);
          InstructionsState OpS =
          if (OpS && OpS.isAltShuffle())
    if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
  int BestScore = Limit;
  std::optional<int> Index;
  for (int I : seq<int>(0, Candidates.size())) {
        Candidates[I].second,
    if (Score > BestScore) {
    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
            cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);

    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
                             const InstructionsState &LocalState,
                             unsigned InterleaveFactor = 0);
                  bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                                  unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
                Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == PoisonMaskElem) ||
                                 (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;
    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
      setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
                       StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);

                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
           all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();

                              bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,

  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }

  ScalarsVectorizationLegality
      bool TryCopyableElementsVectorization) const;
  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
  SmallDenseMap<Value *, unsigned> InstrElementSize;
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
  SetVector<const TreeEntry *> PostponedGathers;
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  SetVector<unsigned> LoadEntriesToVectorize;
  bool IsGraphTransformMode = false;
  std::optional<unsigned> GatheredLoadsEntriesFirst;
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}
    Value *Scalar = nullptr;
    llvm::User *User = nullptr;
  using UserList = SmallVector<ExternalUser, 16>;

  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    Res.first->getSecond() = Aliased;
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  SmallDenseMap<AliasCacheKey, bool> AliasCache;
  BatchAAResults BatchAA;
4636 DenseSet<Instruction *> DeletedInstructions;
4639 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4642 DenseSet<size_t> AnalyzedReductionVals;
4646 DenseSet<Value *> AnalyzedMinBWVals;
4652 UserList ExternalUses;
4656 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4660 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4663 SmallPtrSet<const Value *, 32> EphValues;
4667 SetVector<Instruction *> GatherShuffleExtractSeq;
4670 DenseSet<BasicBlock *> CSEBlocks;
4673 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4680 class ScheduleEntity {
4681 friend class ScheduleBundle;
4682 friend class ScheduleData;
4683 friend class ScheduleCopyableData;
4686 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4687 Kind getKind()
const {
return K; }
4688 ScheduleEntity(Kind K) : K(K) {}
4692 int SchedulingPriority = 0;
4695 bool IsScheduled =
false;
4697 const Kind K = Kind::ScheduleData;
4700 ScheduleEntity() =
delete;
4702 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4703 int getSchedulingPriority()
const {
return SchedulingPriority; }
4704 bool isReady()
const {
4706 return SD->isReady();
4708 return CD->isReady();
4714 bool hasValidDependencies()
const {
4716 return SD->hasValidDependencies();
4718 return CD->hasValidDependencies();
4722 int getUnscheduledDeps()
const {
4724 return SD->getUnscheduledDeps();
4726 return CD->getUnscheduledDeps();
4730 int incrementUnscheduledDeps(
int Incr) {
4732 return SD->incrementUnscheduledDeps(Incr);
4736 int getDependencies()
const {
4738 return SD->getDependencies();
4744 return SD->getInst();
4749 bool isScheduled()
const {
return IsScheduled; }
4750 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4752 static bool classof(
const ScheduleEntity *) {
return true; }
4754#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4755 void dump(raw_ostream &OS)
const {
4757 return SD->dump(OS);
4759 return CD->dump(OS);
4770#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4772 const BoUpSLP::ScheduleEntity &SE) {
4782 class ScheduleData final :
public ScheduleEntity {
4786 enum { InvalidDeps = -1 };
4788 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4789 static bool classof(
const ScheduleEntity *Entity) {
4790 return Entity->getKind() == Kind::ScheduleData;
4793 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4794 NextLoadStore =
nullptr;
4795 IsScheduled =
false;
4796 SchedulingRegionID = BlockSchedulingRegionID;
4797 clearDependencies();
4803 if (hasValidDependencies()) {
4804 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4806 assert(UnscheduledDeps == Dependencies &&
"invariant");
4810 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4811 "unexpected scheduled state");
4818 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4822 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4827 int incrementUnscheduledDeps(
int Incr) {
4828 assert(hasValidDependencies() &&
4829 "increment of unscheduled deps would be meaningless");
4830 UnscheduledDeps += Incr;
4831 return UnscheduledDeps;
4836 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4839 void clearDependencies() {
4840 clearDirectDependencies();
4841 MemoryDependencies.clear();
4842 ControlDependencies.clear();
4849 void clearDirectDependencies() {
4850 Dependencies = InvalidDeps;
4851 resetUnscheduledDeps();
4852 IsScheduled =
false;
4856 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4858 int getDependencies()
const {
return Dependencies; }
4860 void initDependencies() { Dependencies = 0; }
4862 void incDependencies() { Dependencies++; }
4865 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4872 return MemoryDependencies;
4875 void addMemoryDependency(ScheduleData *Dep) {
4876 MemoryDependencies.push_back(Dep);
4880 return ControlDependencies;
4883 void addControlDependency(ScheduleData *Dep) {
4884 ControlDependencies.push_back(Dep);
4887 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4888 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4890 void dump(raw_ostream &OS)
const { OS << *Inst; }
4902 ScheduleData *NextLoadStore =
nullptr;
4906 SmallVector<ScheduleData *> MemoryDependencies;
4912 SmallVector<ScheduleData *> ControlDependencies;
4916 int SchedulingRegionID = 0;
4922 int Dependencies = InvalidDeps;
4928 int UnscheduledDeps = InvalidDeps;
4933 const BoUpSLP::ScheduleData &SD) {
4939 class ScheduleBundle final :
public ScheduleEntity {
4943 bool IsValid =
true;
4945 TreeEntry *TE =
nullptr;
4946 ScheduleBundle(
bool IsValid)
4947 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4950 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4951 static bool classof(
const ScheduleEntity *Entity) {
4952 return Entity->getKind() == Kind::ScheduleBundle;
4957 for (
const ScheduleEntity *SD : Bundle) {
4958 if (SD->hasValidDependencies()) {
4959 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4962 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4966 if (isScheduled()) {
4967 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4968 "unexpected scheduled state");
4974 int unscheduledDepsInBundle()
const {
4975 assert(*
this &&
"bundle must not be empty");
4977 for (
const ScheduleEntity *BundleMember : Bundle) {
4978 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4979 return ScheduleData::InvalidDeps;
4980 Sum += BundleMember->getUnscheduledDeps();
4988 bool hasValidDependencies()
const {
4989 return all_of(Bundle, [](
const ScheduleEntity *SD) {
4990 return SD->hasValidDependencies();
4996 bool isReady()
const {
4997 assert(*
this &&
"bundle must not be empty");
4998 return unscheduledDepsInBundle() == 0 && !isScheduled();
5006 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5009 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5010 TreeEntry *getTreeEntry()
const {
return TE; }
5012 static ScheduleBundle invalid() {
return {
false}; }
5014 operator bool()
const {
return IsValid; }
5017 void dump(raw_ostream &OS)
const {
5026 OS << *SD->getInst();
5040 const BoUpSLP::ScheduleBundle &Bundle) {
5051 class ScheduleCopyableData final :
public ScheduleEntity {
5058 int SchedulingRegionID = 0;
5060 ScheduleBundle &Bundle;
5063 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5064 const EdgeInfo &EI, ScheduleBundle &Bundle)
5065 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5066 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5067 static bool classof(
const ScheduleEntity *Entity) {
5068 return Entity->getKind() == Kind::ScheduleCopyableData;
5073 if (hasValidDependencies()) {
5074 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5076 assert(UnscheduledDeps == Dependencies &&
"invariant");
5080 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5081 "unexpected scheduled state");
5088 bool hasValidDependencies()
const {
5089 return Dependencies != ScheduleData::InvalidDeps;
5094 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5099 int incrementUnscheduledDeps(
int Incr) {
5100 assert(hasValidDependencies() &&
5101 "increment of unscheduled deps would be meaningless");
5102 UnscheduledDeps += Incr;
5103 assert(UnscheduledDeps >= 0 &&
"invariant");
5104 return UnscheduledDeps;
5109 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5112 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5114 int getDependencies()
const {
return Dependencies; }
5116 void initDependencies() { Dependencies = 0; }
5118 void incDependencies() { Dependencies++; }
5121 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5127 void clearDependencies() {
5128 Dependencies = ScheduleData::InvalidDeps;
5129 UnscheduledDeps = ScheduleData::InvalidDeps;
5130 IsScheduled =
false;
5134 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5137 ScheduleBundle &getBundle() {
return Bundle; }
5138 const ScheduleBundle &getBundle()
const {
return Bundle; }
5140#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5141 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5152 int Dependencies = ScheduleData::InvalidDeps;
5158 int UnscheduledDeps = ScheduleData::InvalidDeps;
5188 struct BlockScheduling {
5190 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5193 ScheduledBundles.clear();
5194 ScheduledBundlesList.
clear();
5195 ScheduleCopyableDataMap.clear();
5196 ScheduleCopyableDataMapByInst.clear();
5197 ScheduleCopyableDataMapByInstUser.clear();
5198 ScheduleCopyableDataMapByUsers.clear();
5200 ScheduleStart =
nullptr;
5201 ScheduleEnd =
nullptr;
5202 FirstLoadStoreInRegion =
nullptr;
5203 LastLoadStoreInRegion =
nullptr;
5204 RegionHasStackSave =
false;
5208 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5211 ScheduleRegionSize = 0;
5215 ++SchedulingRegionID;
5221 if (BB !=
I->getParent())
5224 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5225 if (SD && isInSchedulingRegion(*SD))
5230 ScheduleData *getScheduleData(
Value *V) {
5236 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5237 const Value *V)
const {
5238 if (ScheduleCopyableDataMap.empty())
5240 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5241 if (It == ScheduleCopyableDataMap.end())
5243 ScheduleCopyableData *SD = It->getSecond().get();
5244 if (!isInSchedulingRegion(*SD))
5252 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5254 if (ScheduleCopyableDataMapByInstUser.empty())
5256 const auto It = ScheduleCopyableDataMapByInstUser.find(
5257 std::make_pair(std::make_pair(User, OperandIdx), V));
5258 if (It == ScheduleCopyableDataMapByInstUser.end())
5261 for (ScheduleCopyableData *SD : It->getSecond()) {
5262 if (isInSchedulingRegion(*SD))
5276 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5280 if (ScheduleCopyableDataMap.empty())
5282 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5283 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5284 for (
const Use &U :
User->operands()) {
5288 if (Entries.
empty())
5292 for (TreeEntry *TE : Entries) {
5298 bool IsCommutativeUser =
5301 EdgeInfo EI(TE,
U.getOperandNo());
5304 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5305 if (!getScheduleCopyableData(EI,
Op) && OpCnt <
NumOps)
5311 ++PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5312 .first->getSecond();
5316 if (!PotentiallyReorderedEntriesCount.
empty()) {
5317 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5318 auto *It =
find(
P.first->Scalars, User);
5319 assert(It !=
P.first->Scalars.end() &&
5320 "User is not in the tree entry");
5321 int Lane = std::distance(
P.first->Scalars.begin(), It);
5322 assert(Lane >= 0 &&
"Lane is not found");
5324 Lane =
P.first->ReorderIndices[Lane];
5325 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5326 "Couldn't find extract lane");
5327 SmallVector<unsigned> OpIndices;
5328 for (
unsigned OpIdx :
5330 P.first->getMainOp()))) {
5331 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5332 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5336 return all_of(PotentiallyReorderedEntriesCount,
5337 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5338 return P.second ==
NumOps - 1;
5345 getScheduleCopyableData(
const Instruction *
I)
const {
5346 if (ScheduleCopyableDataMapByInst.empty())
5348 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5349 if (It == ScheduleCopyableDataMapByInst.end())
5352 for (ScheduleCopyableData *SD : It->getSecond()) {
5353 if (isInSchedulingRegion(*SD))
5360 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5361 if (ScheduleCopyableDataMapByUsers.empty())
5363 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5364 if (It == ScheduleCopyableDataMapByUsers.end())
5367 for (ScheduleCopyableData *SD : It->getSecond()) {
5368 if (isInSchedulingRegion(*SD))
5374 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5376 int SchedulingRegionID,
5377 ScheduleBundle &Bundle) {
5378 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5379 ScheduleCopyableData *CD =
5380 ScheduleCopyableDataMap
5381 .try_emplace(std::make_pair(EI,
I),
5382 std::make_unique<ScheduleCopyableData>(
5383 SchedulingRegionID,
I, EI, Bundle))
5386 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5390 assert(It !=
Op.end() &&
"Lane not set");
5391 SmallPtrSet<Instruction *, 4> Visited;
5393 int Lane = std::distance(
Op.begin(), It);
5394 assert(Lane >= 0 &&
"Lane not set");
5396 !EI.UserTE->ReorderIndices.empty())
5397 Lane = EI.UserTE->ReorderIndices[Lane];
5398 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5399 "Couldn't find extract lane");
5401 if (!Visited.
insert(In).second) {
5405 ScheduleCopyableDataMapByInstUser
5406 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5409 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5416 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5417 if (ScheduleCopyableData *UserCD =
5418 getScheduleCopyableData(UserEI, In))
5419 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5422 }
while (It !=
Op.end());
5424 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5434 auto It = ScheduledBundles.find(
I);
5435 if (It == ScheduledBundles.end())
5437 return It->getSecond();
5441 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5443 return Data->getSchedulingRegionID() == SchedulingRegionID;
5445 return CD->getSchedulingRegionID() == SchedulingRegionID;
5447 [&](
const ScheduleEntity *BundleMember) {
5448 return isInSchedulingRegion(*BundleMember);
5454 template <
typename ReadyListType>
5455 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5456 const EdgeInfo &EI, ScheduleEntity *
Data,
5457 ReadyListType &ReadyList) {
5458 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5463 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5464 if ((IsControl ||
Data->hasValidDependencies()) &&
5465 Data->incrementUnscheduledDeps(-1) == 0) {
5472 CopyableBundle.
push_back(&CD->getBundle());
5473 Bundles = CopyableBundle;
5475 Bundles = getScheduleBundles(
Data->getInst());
5477 if (!Bundles.
empty()) {
5478 for (ScheduleBundle *Bundle : Bundles) {
5479 if (Bundle->unscheduledDepsInBundle() == 0) {
5480 assert(!Bundle->isScheduled() &&
5481 "already scheduled bundle gets ready");
5482 ReadyList.insert(Bundle);
5484 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5490 "already scheduled bundle gets ready");
5492 "Expected non-copyable data");
5493 ReadyList.insert(
Data);
5500 if (!ScheduleCopyableDataMap.empty()) {
5502 getScheduleCopyableData(User,
OpIdx,
I);
5503 for (ScheduleCopyableData *CD : CopyableData)
5504 DecrUnsched(CD,
false);
5505 if (!CopyableData.empty())
5508 if (ScheduleData *OpSD = getScheduleData(
I))
5509 DecrUnsched(OpSD,
false);
5515 if (!Bundles.empty()) {
5516 auto *
In = BundleMember->getInst();
5518 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5519 unsigned TotalOpCount = 0;
5522 TotalOpCount = OperandsUses[
In] = 1;
5524 for (
const Use &U :
In->operands()) {
5527 ++Res.first->getSecond();
5534 auto DecrUnschedForInst = [&](
Instruction *
I, TreeEntry *UserTE,
5536 if (!ScheduleCopyableDataMap.empty()) {
5537 const EdgeInfo EI = {UserTE,
OpIdx};
5538 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI,
I)) {
5539 DecrUnsched(CD,
false);
5543 auto It = OperandsUses.
find(
I);
5544 assert(It != OperandsUses.
end() &&
"Operand not found");
5545 if (It->second > 0) {
5547 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5549 if (ScheduleData *OpSD = getScheduleData(
I))
5550 DecrUnsched(OpSD,
false);
5554 for (ScheduleBundle *Bundle : Bundles) {
5555 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5559 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5560 find(Bundle->getTreeEntry()->Scalars, In));
5561 assert(Lane >= 0 &&
"Lane not set");
5563 !Bundle->getTreeEntry()->ReorderIndices.empty())
5564 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5565 assert(Lane <
static_cast<int>(
5566 Bundle->getTreeEntry()->Scalars.size()) &&
5567 "Couldn't find extract lane");
5577 In->getNumOperands() ==
5578 Bundle->getTreeEntry()->getNumOperands() ||
5579 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5580 "Missed TreeEntry operands?");
5582 for (
unsigned OpIdx :
5585 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5588 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx);
5594 for (Use &U : BundleMember->getInst()->operands()) {
5597 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5598 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5606 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5607 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5608 if (!VisitedMemory.
insert(MemoryDep).second)
5613 << *MemoryDep <<
"\n");
5614 DecrUnsched(MemoryDep);
5617 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5618 for (ScheduleData *Dep : SD->getControlDependencies()) {
5619 if (!VisitedControl.
insert(Dep).second)
5624 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5625 DecrUnsched(Dep,
true);
5629 SD->setScheduled(
true);
5634 if (
R.isVectorized(In)) {
5636 for (TreeEntry *TE : Entries) {
5638 In->getNumOperands() !=
TE->getNumOperands())
5641 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5642 BundlePtr->setTreeEntry(TE);
5647 ProcessBundleMember(SD, Bundles);
5650 Bundle.setScheduled(
true);
5652 auto AreAllBundlesScheduled =
5653 [&](
const ScheduleEntity *SD,
5657 return !SDBundles.empty() &&
5658 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5659 return SDBundle->isScheduled();
5662 for (ScheduleEntity *SD : Bundle.getBundle()) {
5665 SDBundles = getScheduleBundles(SD->getInst());
5666 if (AreAllBundlesScheduled(SD, SDBundles)) {
5667 SD->setScheduled(
true);
5680 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5681 ScheduleStart->comesBefore(ScheduleEnd) &&
5682 "Not a valid scheduling region?");
5684 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5686 if (!Bundles.
empty()) {
5687 for (ScheduleBundle *Bundle : Bundles) {
5688 assert(isInSchedulingRegion(*Bundle) &&
5689 "primary schedule data not in window?");
5694 auto *SD = getScheduleData(
I);
5697 assert(isInSchedulingRegion(*SD) &&
5698 "primary schedule data not in window?");
5703 [](
const ScheduleEntity *Bundle) {
5704 return Bundle->isReady();
5706 "item in ready list not ready?");
5710 template <
typename ReadyListType>
5711 void initialFillReadyList(ReadyListType &ReadyList) {
5712 SmallPtrSet<ScheduleBundle *, 16> Visited;
5713 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5714 ScheduleData *SD = getScheduleData(
I);
5715 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5718 for (ScheduleBundle *Bundle : Bundles) {
5719 if (!Visited.
insert(Bundle).second)
5721 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5722 ReadyList.insert(Bundle);
5724 << *Bundle <<
"\n");
5729 ReadyList.insert(SD);
5731 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5742 const InstructionsState &S,
const EdgeInfo &EI);
5749 std::optional<ScheduleBundle *>
5751 const InstructionsState &S,
const EdgeInfo &EI);
5754 ScheduleData *allocateScheduleDataChunks();
5758 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5762 void initScheduleData(Instruction *FromI, Instruction *ToI,
5763 ScheduleData *PrevLoadStore,
5764 ScheduleData *NextLoadStore);
5768 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5773 void resetSchedule();
5790 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5794 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5795 std::unique_ptr<ScheduleCopyableData>>
5796 ScheduleCopyableDataMap;
5802 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5803 ScheduleCopyableDataMapByInst;
5809 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5811 ScheduleCopyableDataMapByInstUser;
5831 SmallSetVector<ScheduleCopyableData *, 4>>
5832 ScheduleCopyableDataMapByUsers;
5835 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5841 SetVector<ScheduleEntity *> ReadyInsts;
5851 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5855 ScheduleData *LastLoadStoreInRegion =
nullptr;
5860 bool RegionHasStackSave =
false;
5863 int ScheduleRegionSize = 0;
5872 int SchedulingRegionID = 1;
5876 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5880 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
5883 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
5887 struct OrdersTypeDenseMapInfo {
5900 static unsigned getHashValue(
const OrdersType &V) {
5911 ScalarEvolution *SE;
5912 TargetTransformInfo *TTI;
5913 TargetLibraryInfo *TLI;
5916 AssumptionCache *AC;
5918 const DataLayout *DL;
5919 OptimizationRemarkEmitter *ORE;
5921 unsigned MaxVecRegSize;
5922 unsigned MinVecRegSize;
5925 IRBuilder<TargetFolder> Builder;
5932 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5937 unsigned ReductionBitWidth = 0;
5940 unsigned BaseGraphSize = 1;
5944 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5948 DenseSet<unsigned> ExtraBitWidthNodes;
5958 SecondInfo::getEmptyKey());
5963 SecondInfo::getTombstoneKey());
5968 SecondInfo::getHashValue(Val.
EdgeIdx));
5989 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6000 return R.VectorizableTree[0].get();
6004 return {&
N->UserTreeIndex,
N->Container};
6008 return {&
N->UserTreeIndex + 1,
N->Container};
6035 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6046 OS << Entry->Idx <<
".\n";
6049 for (
auto *V : Entry->Scalars) {
6051 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6052 return EU.Scalar == V;
6062 if (Entry->isGather())
6064 if (Entry->State == TreeEntry::ScatterVectorize ||
6065 Entry->State == TreeEntry::StridedVectorize ||
6066 Entry->State == TreeEntry::CompressVectorize)
6067 return "color=blue";
6076 for (
auto *
I : DeletedInstructions) {
6077 if (!
I->getParent()) {
6082 I->insertBefore(F->getEntryBlock(),
6083 F->getEntryBlock().getFirstNonPHIIt());
6085 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6088 for (
Use &U :
I->operands()) {
6090 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6094 I->dropAllReferences();
6096 for (
auto *
I : DeletedInstructions) {
6098 "trying to erase instruction with users.");
6099 I->eraseFromParent();
6105#ifdef EXPENSIVE_CHECKS
6116 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6117 "Expected non-empty mask.");
6120 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6122 Reuses[Mask[
I]] = Prev[
I];
6130 bool BottomOrder =
false) {
6131 assert(!Mask.empty() &&
"Expected non-empty mask.");
6132 unsigned Sz = Mask.size();
6135 if (Order.
empty()) {
6137 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6139 PrevOrder.
swap(Order);
6142 for (
unsigned I = 0;
I < Sz; ++
I)
6144 Order[
I] = PrevOrder[Mask[
I]];
6146 return Data.value() == Sz ||
Data.index() ==
Data.value();
6155 if (Order.
empty()) {
6157 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6167 for (
unsigned I = 0;
I < Sz; ++
I)
6169 Order[MaskOrder[
I]] =
I;
6173std::optional<BoUpSLP::OrdersType>
6175 bool TopToBottom,
bool IgnoreReorder) {
6176 assert(TE.isGather() &&
"Expected gather node only.");
6180 Type *ScalarTy = GatheredScalars.
front()->getType();
6181 size_t NumScalars = GatheredScalars.
size();
6183 return std::nullopt;
6190 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6192 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6195 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6196 return std::nullopt;
6197 OrdersType CurrentOrder(NumScalars, NumScalars);
6198 if (GatherShuffles.
size() == 1 &&
6200 Entries.
front().front()->isSame(TE.Scalars)) {
6204 return std::nullopt;
6206 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6207 TE.UserTreeIndex.UserTE)
6208 return std::nullopt;
6211 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6212 return std::nullopt;
6215 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6216 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6219 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6221 return std::nullopt;
6225 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6226 return CurrentOrder;
6230 return all_of(Mask, [&](
int I) {
6237 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6238 (Entries.
size() != 1 ||
6239 Entries.
front().front()->ReorderIndices.empty())) ||
6240 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6241 return std::nullopt;
6247 if (ShuffledSubMasks.
test(
I))
6249 const int VF = GetVF(
I);
6255 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6257 ShuffledSubMasks.
set(
I);
6261 int FirstMin = INT_MAX;
6262 int SecondVecFound =
false;
6264 int Idx = Mask[
I * PartSz + K];
6266 Value *V = GatheredScalars[
I * PartSz + K];
6268 SecondVecFound =
true;
6277 SecondVecFound =
true;
6281 FirstMin = (FirstMin / PartSz) * PartSz;
6283 if (SecondVecFound) {
6285 ShuffledSubMasks.
set(
I);
6289 int Idx = Mask[
I * PartSz + K];
6293 if (Idx >= PartSz) {
6294 SecondVecFound =
true;
6297 if (CurrentOrder[
I * PartSz + Idx] >
6298 static_cast<unsigned>(
I * PartSz + K) &&
6299 CurrentOrder[
I * PartSz + Idx] !=
6300 static_cast<unsigned>(
I * PartSz + Idx))
6301 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6304 if (SecondVecFound) {
6306 ShuffledSubMasks.
set(
I);
6312 if (!ExtractShuffles.
empty())
6313 TransformMaskToOrder(
6314 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6315 if (!ExtractShuffles[
I])
6318 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6320 int K =
I * PartSz + Idx;
6323 if (!TE.ReuseShuffleIndices.empty())
6324 K = TE.ReuseShuffleIndices[K];
6327 if (!TE.ReorderIndices.empty())
6328 K = std::distance(TE.ReorderIndices.begin(),
6329 find(TE.ReorderIndices, K));
6335 .getKnownMinValue());
6340 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6341 if (ShuffledSubMasks.
any())
6342 return std::nullopt;
6343 PartSz = NumScalars;
6346 if (!Entries.
empty())
6347 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6348 if (!GatherShuffles[
I])
6350 return std::max(Entries[
I].front()->getVectorFactor(),
6351 Entries[
I].back()->getVectorFactor());
6353 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6354 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6355 return std::nullopt;
6356 return std::move(CurrentOrder);
6361 bool CompareOpcodes =
true) {
6367 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6368 (!GEP2 || GEP2->getNumOperands() == 2) &&
6369 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6370 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6373 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6377template <
typename T>
6382 return CommonAlignment;
6388 "Order is empty. Please check it before using isReverseOrder.");
6389 unsigned Sz = Order.
size();
6391 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6402 const SCEV *PtrSCEVLowest =
nullptr;
6403 const SCEV *PtrSCEVHighest =
nullptr;
6411 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6412 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6419 PtrSCEVLowest = PtrSCEV;
6426 PtrSCEVHighest = PtrSCEV;
6434 int Size =
DL.getTypeStoreSize(ElemTy);
6435 auto TryGetStride = [&](
const SCEV *Dist,
6436 const SCEV *Multiplier) ->
const SCEV * {
6438 if (M->getOperand(0) == Multiplier)
6439 return M->getOperand(1);
6440 if (M->getOperand(1) == Multiplier)
6441 return M->getOperand(0);
6444 if (Multiplier == Dist)
6449 const SCEV *Stride =
nullptr;
6450 if (
Size != 1 || SCEVs.
size() > 2) {
6452 Stride = TryGetStride(Dist, Sz);
6460 using DistOrdPair = std::pair<int64_t, int>;
6462 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6464 bool IsConsecutive =
true;
6465 for (
const SCEV *PtrSCEV : SCEVs) {
6467 if (PtrSCEV != PtrSCEVLowest) {
6469 const SCEV *Coeff = TryGetStride(Diff, Stride);
6479 Dist = SC->getAPInt().getZExtValue();
6484 auto Res = Offsets.emplace(Dist, Cnt);
6488 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6491 if (Offsets.size() != SCEVs.
size())
6493 SortedIndices.
clear();
6494 if (!IsConsecutive) {
6498 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6499 SortedIndices[Cnt] = Pair.second;
6506static std::pair<InstructionCost, InstructionCost>
6509 Type *ScalarTy, VectorType *VecTy);
6527 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6530 Mask, NumSrcElts, NumSubElts, Index)) {
6531 if (Index + NumSubElts > NumSrcElts &&
6532 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6549 "ScalableVectorType is not supported.");
6552 "Incorrect usage.");
6557 unsigned ScalarTyNumElements = VecTy->getNumElements();
6560 if (!DemandedElts[
I])
6564 I * ScalarTyNumElements, VecTy);
6567 I * ScalarTyNumElements, VecTy);
6580 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6581 if (Opcode == Instruction::ExtractElement) {
6587 Index * VecTy->getNumElements(), VecTy);
6590 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6603 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6605 Index * ScalarTy->getNumElements(), SubTp) +
6609 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6625 auto *Begin = std::next(
Mask.begin(), Index);
6626 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6630 std::iota(
Mask.begin(),
Mask.end(), 0);
6631 std::iota(std::next(
Mask.begin(), Index),
6632 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6634 return Generator(Vec, V, Mask);
6637 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6645 unsigned SubVecVF,
unsigned Index) {
6647 std::iota(Mask.begin(), Mask.end(), Index);
6648 return Builder.CreateShuffleVector(Vec, Mask);
6658 const unsigned Sz = PointerOps.
size();
6661 CompressMask[0] = 0;
6663 std::optional<unsigned> Stride = 0;
6667 std::optional<int64_t> OptPos =
6669 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6671 unsigned Pos =
static_cast<unsigned>(*OptPos);
6672 CompressMask[
I] = Pos;
6679 if (Pos != *Stride *
I)
6682 return Stride.has_value();
6695 InterleaveFactor = 0;
6697 const size_t Sz = VL.
size();
6705 if (AreAllUsersVectorized(V))
6708 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6709 Mask.empty() ?
I : Mask[
I]);
6712 if (ExtractCost <= ScalarCost)
6717 if (Order.
empty()) {
6718 Ptr0 = PointerOps.
front();
6719 PtrN = PointerOps.
back();
6721 Ptr0 = PointerOps[Order.
front()];
6722 PtrN = PointerOps[Order.
back()];
6724 std::optional<int64_t> Diff =
6728 const size_t MaxRegSize =
6732 if (*Diff / Sz >= MaxRegSize / 8)
6736 Align CommonAlignment = LI->getAlign();
6738 Ptr0, LoadVecTy, CommonAlignment,
DL,
6741 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6742 LI->getPointerAddressSpace()))
6748 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6752 auto [ScalarGEPCost, VectorGEPCost] =
6754 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6772 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6773 LI->getPointerAddressSpace(),
CostKind);
6776 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6777 LI->getPointerAddressSpace(),
CostKind);
6779 if (IsStrided && !IsMasked && Order.
empty()) {
6786 AlignedLoadVecTy = LoadVecTy;
6787 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6789 LI->getPointerAddressSpace())) {
6791 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6792 Instruction::Load, AlignedLoadVecTy,
6793 CompressMask[1], {}, CommonAlignment,
6794 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6795 if (InterleavedCost < GatherCost) {
6796 InterleaveFactor = CompressMask[1];
6797 LoadVecTy = AlignedLoadVecTy;
6804 if (!Order.
empty()) {
6807 NewMask[
I] = CompressMask[Mask[
I]];
6809 CompressMask.
swap(NewMask);
6811 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6812 return TotalVecCost < GatherCost;
6825 unsigned InterleaveFactor;
6829 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6830 CompressMask, LoadVecTy);
6847 Align Alignment,
const int64_t Diff,
Value *Ptr0,
6848 Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
6849 const size_t Sz = PointerOps.
size();
6850 if (Diff % (Sz - 1) != 0)
6854 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
6856 return !isVectorized(U) && !MustGather.contains(U);
6860 const uint64_t AbsoluteDiff = std::abs(Diff);
6862 if (IsAnyPointerUsedOutGraph ||
6863 (AbsoluteDiff > Sz &&
6866 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6867 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
6868 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6869 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
6871 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6881 else if (
Ptr != Ptr0)
6885 if (((Dist / Stride) * Stride) != Dist || !Dists.
insert(Dist).second)
6888 if (Dists.
size() == Sz) {
6889 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
6890 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6901 StridedPtrInfo &SPtrInfo)
const {
6902 const unsigned Sz = PointerOps.
size();
6904 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6905 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6907 if (
const SCEV *Stride =
6910 SPtrInfo.StrideSCEV = Stride;
6919 unsigned *BestVF,
bool TryRecursiveCheck)
const {
6932 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6938 const size_t Sz = VL.
size();
6940 auto *POIter = PointerOps.
begin();
6941 for (
Value *V : VL) {
6943 if (!L || !L->isSimple())
6945 *POIter = L->getPointerOperand();
6951 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6960 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6961 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6972 if (Order.
empty()) {
6973 Ptr0 = PointerOps.
front();
6974 PtrN = PointerOps.
back();
6976 Ptr0 = PointerOps[Order.
front()];
6977 PtrN = PointerOps[Order.
back()];
6979 std::optional<int64_t> Diff =
6982 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
6985 *TLI, [&](
Value *V) {
6986 return areAllUsersVectorized(
6993 if (
isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
6997 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6998 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7003 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7005 bool ProfitableGatherPointers) {
7010 auto [ScalarGEPCost, VectorGEPCost] =
7012 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7016 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7018 if (
static_cast<unsigned>(
count_if(
7037 return C + TTI.getInstructionCost(
7043 TTI.getGatherScatterOpCost(
7045 false, CommonAlignment,
CostKind) +
7046 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7054 constexpr unsigned ListLimit = 4;
7055 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7064 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7074 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7079 PointerOps, SPtrInfo, BestVF,
7087 DemandedElts.
setBits(Cnt, Cnt + VF);
7103 if (!DemandedElts.
isZero()) {
7109 if (DemandedElts[Idx])
7120 LI0->getPointerOperand(),
7121 Instruction::GetElementPtr,
CostKind, ScalarTy,
7125 if (
static_cast<unsigned>(
7127 PointerOps.
size() - 1 ||
7146 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7147 LI0->getPointerAddressSpace(),
CostKind,
7152 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7153 LI0->getPointerOperand(),
7159 VecLdCost += TTI.getMaskedMemoryOpCost(
7160 Instruction::Load, SubVecTy, CommonAlignment,
7161 LI0->getPointerAddressSpace(),
CostKind) +
7167 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7168 LI0->getPointerOperand(),
7179 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7188 if (MaskedGatherCost >= VecLdCost &&
7201 bool ProfitableGatherPointers =
7202 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7203 return L->isLoopInvariant(V);
7205 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7208 (
GEP &&
GEP->getNumOperands() == 2 &&
7216 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7217 ProfitableGatherPointers))
7229 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7230 "Expected list of pointer operands.");
7235 std::pair<BasicBlock *, Value *>,
7239 .try_emplace(std::make_pair(
7243 SortedIndices.
clear();
7245 auto Key = std::make_pair(BBs[Cnt + 1],
7247 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7249 std::optional<int64_t> Diff =
7250 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7251 ElemTy, Ptr, DL, SE,
7256 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7262 if (Bases.size() > VL.
size() / 2 - 1)
7266 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7270 if (Bases.size() == VL.
size())
7273 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7274 Bases.front().second.size() == VL.
size()))
7279 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7288 FirstPointers.
insert(P1);
7289 SecondPointers.
insert(P2);
7295 "Unable to find matching root.");
7298 for (
auto &
Base : Bases) {
7299 for (
auto &Vec :
Base.second) {
7300 if (Vec.size() > 1) {
7302 int64_t InitialOffset = std::get<1>(Vec[0]);
7303 bool AnyConsecutive =
7305 return std::get<1>(
P.value()) ==
7306 int64_t(
P.index()) + InitialOffset;
7310 if (!AnyConsecutive)
7315 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7319 for (
auto &
T : Bases)
7320 for (
const auto &Vec :
T.second)
7321 for (
const auto &
P : Vec)
7325 "Expected SortedIndices to be the size of VL");
7329std::optional<BoUpSLP::OrdersType>
7331 assert(TE.isGather() &&
"Expected gather node only.");
7332 Type *ScalarTy = TE.Scalars[0]->getType();
7335 Ptrs.
reserve(TE.Scalars.size());
7337 BBs.
reserve(TE.Scalars.size());
7338 for (
Value *V : TE.Scalars) {
7340 if (!L || !L->isSimple())
7341 return std::nullopt;
7347 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7349 return std::move(Order);
7350 return std::nullopt;
7361 if (VU->
getType() != V->getType())
7364 if (!VU->
hasOneUse() && !V->hasOneUse())
7370 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7377 bool IsReusedIdx =
false;
7379 if (IE2 == VU && !IE1)
7381 if (IE1 == V && !IE2)
7382 return V->hasOneUse();
7383 if (IE1 && IE1 != V) {
7385 IsReusedIdx |= ReusedIdx.
test(Idx1);
7386 ReusedIdx.
set(Idx1);
7387 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7392 if (IE2 && IE2 != VU) {
7394 IsReusedIdx |= ReusedIdx.
test(Idx2);
7395 ReusedIdx.
set(Idx2);
7396 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7401 }
while (!IsReusedIdx && (IE1 || IE2));
7409 const TargetLibraryInfo &TLI);
7411std::optional<BoUpSLP::OrdersType>
7413 bool IgnoreReorder) {
7416 if (!TE.ReuseShuffleIndices.empty()) {
7418 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7419 "Reshuffling scalars not yet supported for nodes with padding");
7422 return std::nullopt;
7430 unsigned Sz = TE.Scalars.size();
7431 if (TE.isGather()) {
7432 if (std::optional<OrdersType> CurrentOrder =
7437 ::addMask(Mask, TE.ReuseShuffleIndices);
7438 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7439 unsigned Sz = TE.Scalars.size();
7440 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7443 Res[Idx + K * Sz] =
I + K * Sz;
7445 return std::move(Res);
7448 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7450 2 * TE.getVectorFactor())) == 1)
7451 return std::nullopt;
7452 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7453 return std::nullopt;
7457 if (TE.ReorderIndices.empty())
7458 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7461 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7462 unsigned VF = ReorderMask.
size();
7466 for (
unsigned I = 0;
I < VF;
I += Sz) {
7468 unsigned UndefCnt = 0;
7469 unsigned Limit = std::min(Sz, VF -
I);
7478 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7480 return std::nullopt;
7482 for (
unsigned K = 0; K < NumParts; ++K) {
7483 unsigned Idx = Val + Sz * K;
7484 if (Idx < VF &&
I + K < VF)
7485 ResOrder[Idx] =
I + K;
7488 return std::move(ResOrder);
7490 unsigned VF = TE.getVectorFactor();
7493 TE.ReuseShuffleIndices.end());
7494 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7496 if (isa<PoisonValue>(V))
7498 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7499 return Idx && *Idx < Sz;
7501 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7502 "by BinaryOperator and CastInst.");
7504 if (TE.ReorderIndices.empty())
7505 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7508 for (
unsigned I = 0;
I < VF; ++
I) {
7509 int &Idx = ReusedMask[
I];
7512 Value *V = TE.Scalars[ReorderMask[Idx]];
7514 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7520 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7521 auto *It = ResOrder.
begin();
7522 for (
unsigned K = 0; K < VF; K += Sz) {
7526 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7528 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7529 std::advance(It, Sz);
7532 return Data.index() ==
Data.value();
7534 return std::nullopt;
7535 return std::move(ResOrder);
7537 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7538 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7540 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7541 return std::nullopt;
7542 if (TE.State == TreeEntry::SplitVectorize ||
7543 ((TE.State == TreeEntry::Vectorize ||
7544 TE.State == TreeEntry::StridedVectorize ||
7545 TE.State == TreeEntry::CompressVectorize) &&
7548 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7549 "Alternate instructions are only supported by "
7550 "BinaryOperator and CastInst.");
7551 return TE.ReorderIndices;
7553 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7554 TE.isAltShuffle()) {
7555 assert(TE.ReuseShuffleIndices.empty() &&
7556 "ReuseShuffleIndices should be "
7557 "empty for alternate instructions.");
7559 TE.buildAltOpShuffleMask(
7561 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7562 "Unexpected main/alternate opcode");
7566 const int VF = TE.getVectorFactor();
7571 ResOrder[Mask[
I] % VF] =
I;
7573 return std::move(ResOrder);
7575 if (!TE.ReorderIndices.empty())
7576 return TE.ReorderIndices;
7577 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7578 if (!TE.ReorderIndices.empty())
7579 return TE.ReorderIndices;
7582 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7590 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7598 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7599 if (!DT->isReachableFromEntry(BB1))
7601 if (!DT->isReachableFromEntry(BB2))
7603 auto *NodeA = DT->getNode(BB1);
7604 auto *NodeB = DT->getNode(BB2);
7605 assert(NodeA &&
"Should only process reachable instructions");
7606 assert(NodeB &&
"Should only process reachable instructions");
7607 assert((NodeA == NodeB) ==
7608 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7609 "Different nodes should have different DFS numbers");
7610 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7612 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7613 Value *V1 = TE.Scalars[I1];
7614 Value *V2 = TE.Scalars[I2];
7627 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7628 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7629 FirstUserOfPhi2->getParent());
7639 if (UserBVHead[I1] && !UserBVHead[I2])
7641 if (!UserBVHead[I1])
7643 if (UserBVHead[I1] == UserBVHead[I2])
7646 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7648 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7661 if (EE1->getOperand(0) == EE2->getOperand(0))
7663 if (!Inst1 && Inst2)
7665 if (Inst1 && Inst2) {
7673 "Expected either instructions or arguments vector operands.");
7674 return P1->getArgNo() < P2->getArgNo();
7679 std::iota(Phis.
begin(), Phis.
end(), 0);
7682 return std::nullopt;
7683 return std::move(Phis);
7685 if (TE.isGather() &&
7686 (!TE.hasState() || !TE.isAltShuffle() ||
7687 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7691 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7695 auto *EE = dyn_cast<ExtractElementInst>(V);
7696 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7702 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7703 if (Reuse || !CurrentOrder.
empty())
7704 return std::move(CurrentOrder);
7712 int Sz = TE.Scalars.size();
7716 if (It == TE.Scalars.begin())
7719 if (It != TE.Scalars.end()) {
7721 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7736 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7739 return std::move(Order);
7744 return std::nullopt;
7745 if (TE.Scalars.size() >= 3)
7750 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7752 StridedPtrInfo SPtrInfo;
7755 CurrentOrder, PointerOps, SPtrInfo);
7758 return std::move(CurrentOrder);
7763 if (std::optional<OrdersType> CurrentOrder =
7765 return CurrentOrder;
7767 return std::nullopt;
7777 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7779 if (Cluster != FirstCluster)
7785void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask)
const {
7788 const unsigned Sz =
TE.Scalars.size();
7790 if (!
TE.isGather() ||
7795 SmallVector<int> NewMask;
7797 addMask(NewMask,
TE.ReuseShuffleIndices);
7799 TE.ReorderIndices.clear();
7801 ArrayRef<int> Slice =
ArrayRef(NewMask).slice(0, Sz);
7802 SmallVector<unsigned> NewOrder(Slice);
7806 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
7807 *End =
TE.ReuseShuffleIndices.end();
7808 It != End; std::advance(It, Sz))
7809 std::iota(It, std::next(It, Sz), 0);
7815 "Expected same size of orders");
7816 size_t Sz = Order.
size();
7819 if (Order[Idx] != Sz)
7820 UsedIndices.
set(Order[Idx]);
7822 if (SecondaryOrder.
empty()) {
7824 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7828 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7829 !UsedIndices.
test(SecondaryOrder[Idx]))
7830 Order[Idx] = SecondaryOrder[Idx];
7838 constexpr unsigned TinyVF = 2;
7839 constexpr unsigned TinyTree = 10;
7840 constexpr unsigned PhiOpsLimit = 12;
7841 constexpr unsigned GatherLoadsLimit = 2;
7842 if (VectorizableTree.size() <= TinyTree)
7844 if (VectorizableTree.front()->hasState() &&
7845 !VectorizableTree.front()->isGather() &&
7846 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7847 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7848 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7849 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7850 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7851 VectorizableTree.front()->ReorderIndices.empty()) {
7855 if (VectorizableTree.front()->hasState() &&
7856 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7857 VectorizableTree.front()->Scalars.size() == TinyVF &&
7858 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7861 if (VectorizableTree.front()->hasState() &&
7862 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7863 VectorizableTree.front()->ReorderIndices.empty()) {
7864 const unsigned ReorderedSplitsCnt =
7865 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7866 return TE->State == TreeEntry::SplitVectorize &&
7867 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7868 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7871 if (ReorderedSplitsCnt <= 1 &&
7873 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7874 return ((!TE->isGather() &&
7875 (TE->ReorderIndices.empty() ||
7876 (TE->UserTreeIndex.UserTE &&
7877 TE->UserTreeIndex.UserTE->State ==
7878 TreeEntry::Vectorize &&
7879 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7881 (TE->isGather() && TE->ReorderIndices.empty() &&
7882 (!TE->hasState() || TE->isAltShuffle() ||
7883 TE->getOpcode() == Instruction::Load ||
7884 TE->getOpcode() == Instruction::ZExt ||
7885 TE->getOpcode() == Instruction::SExt))) &&
7886 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7887 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
7888 return !isConstant(V) && isVectorized(V);
7890 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7893 bool HasPhis =
false;
7894 bool HasLoad =
true;
7895 unsigned GatherLoads = 0;
7896 for (
const std::unique_ptr<TreeEntry> &TE :
7897 ArrayRef(VectorizableTree).drop_front()) {
7898 if (TE->State == TreeEntry::SplitVectorize)
7900 if (!TE->hasState()) {
7904 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7909 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7910 if (!TE->isGather()) {
7917 if (GatherLoads >= GatherLoadsLimit)
7920 if (TE->getOpcode() == Instruction::GetElementPtr ||
7923 if (TE->getOpcode() != Instruction::PHI &&
7924 (!TE->hasCopyableElements() ||
7926 TE->Scalars.size() / 2))
7928 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7929 TE->getNumOperands() > PhiOpsLimit)
7938void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
7940 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
7943 std::iota(NewMask.
begin(), NewMask.
end(), 0);
7944 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7947 copy(MaskOrder, NewMaskOrder.begin());
7949 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
7950 unsigned Offset = CombinedEntriesWithIndices.
back().second;
7959 ReorderIndices.clear();
7978 ExternalUserReorderMap;
7982 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7983 const std::unique_ptr<TreeEntry> &TE) {
7986 findExternalStoreUsersReorderIndices(TE.get());
7987 if (!ExternalUserReorderIndices.
empty()) {
7988 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
7990 std::move(ExternalUserReorderIndices));
7996 if (TE->hasState() && TE->isAltShuffle() &&
7997 TE->State != TreeEntry::SplitVectorize) {
7998 Type *ScalarTy = TE->Scalars[0]->getType();
8000 unsigned Opcode0 = TE->getOpcode();
8001 unsigned Opcode1 = TE->getAltOpcode();
8005 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8006 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8012 bool IgnoreReorder =
8013 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8014 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
8015 VectorizableTree.front()->getOpcode() == Instruction::Store);
8016 if (std::optional<OrdersType> CurrentOrder =
8026 const TreeEntry *UserTE = TE.get();
8028 if (!UserTE->UserTreeIndex)
8030 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8031 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8032 UserTE->UserTreeIndex.UserTE->Idx != 0)
8034 UserTE = UserTE->UserTreeIndex.UserTE;
8037 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8038 if (!(TE->State == TreeEntry::Vectorize ||
8039 TE->State == TreeEntry::StridedVectorize ||
8040 TE->State == TreeEntry::SplitVectorize ||
8041 TE->State == TreeEntry::CompressVectorize) ||
8042 !TE->ReuseShuffleIndices.empty())
8043 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8044 if (TE->State == TreeEntry::Vectorize &&
8045 TE->getOpcode() == Instruction::PHI)
8046 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8051 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8052 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8053 auto It = VFToOrderedEntries.
find(VF);
8054 if (It == VFToOrderedEntries.
end())
8068 for (
const TreeEntry *OpTE : OrderedEntries) {
8071 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8072 OpTE->State != TreeEntry::SplitVectorize)
8075 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8077 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8078 auto It = GathersToOrders.find(OpTE);
8079 if (It != GathersToOrders.end())
8082 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8083 auto It = AltShufflesToOrders.find(OpTE);
8084 if (It != AltShufflesToOrders.end())
8087 if (OpTE->State == TreeEntry::Vectorize &&
8088 OpTE->getOpcode() == Instruction::PHI) {
8089 auto It = PhisToOrders.
find(OpTE);
8090 if (It != PhisToOrders.
end())
8093 return OpTE->ReorderIndices;
8096 auto It = ExternalUserReorderMap.
find(OpTE);
8097 if (It != ExternalUserReorderMap.
end()) {
8098 const auto &ExternalUserReorderIndices = It->second;
8102 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8103 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8104 ExternalUserReorderIndices.size();
8106 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8107 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8114 if (OpTE->State == TreeEntry::Vectorize &&
8115 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8116 assert(!OpTE->isAltShuffle() &&
8117 "Alternate instructions are only supported by BinaryOperator "
8121 unsigned E = Order.
size();
8124 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8127 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8129 ++OrdersUses.try_emplace(Order, 0).first->second;
8132 if (OrdersUses.empty())
8135 unsigned IdentityCnt = 0;
8136 unsigned FilledIdentityCnt = 0;
8138 for (
auto &Pair : OrdersUses) {
8140 if (!Pair.first.empty())
8141 FilledIdentityCnt += Pair.second;
8142 IdentityCnt += Pair.second;
8147 unsigned Cnt = IdentityCnt;
8148 for (
auto &Pair : OrdersUses) {
8152 if (Cnt < Pair.second ||
8153 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8154 Cnt == Pair.second && !BestOrder.
empty() &&
8157 BestOrder = Pair.first;
8170 unsigned E = BestOrder.
size();
8172 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8175 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8177 if (TE->Scalars.size() != VF) {
8178 if (TE->ReuseShuffleIndices.size() == VF) {
8179 assert(TE->State != TreeEntry::SplitVectorize &&
8180 "Split vectorized not expected.");
8185 (!TE->UserTreeIndex ||
8186 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8187 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8188 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8189 "All users must be of VF size.");
8196 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8202 reorderNodeWithReuses(*TE, Mask);
8204 if (TE->UserTreeIndex &&
8205 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8206 TE->UserTreeIndex.UserTE->reorderSplitNode(
8207 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8211 if ((TE->State == TreeEntry::SplitVectorize &&
8212 TE->ReuseShuffleIndices.empty()) ||
8213 ((TE->State == TreeEntry::Vectorize ||
8214 TE->State == TreeEntry::StridedVectorize ||
8215 TE->State == TreeEntry::CompressVectorize) &&
8220 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8221 TE->ReuseShuffleIndices.empty())) &&
8222 "Alternate instructions are only supported by BinaryOperator "
8228 TE->reorderOperands(Mask);
8231 TE->reorderOperands(Mask);
8232 assert(TE->ReorderIndices.empty() &&
8233 "Expected empty reorder sequence.");
8236 if (!TE->ReuseShuffleIndices.empty()) {
8243 addMask(NewReuses, TE->ReuseShuffleIndices);
8244 TE->ReuseShuffleIndices.swap(NewReuses);
8245 }
else if (TE->UserTreeIndex &&
8246 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8248 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8254void BoUpSLP::buildReorderableOperands(
8255 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8259 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
8260 return OpData.first ==
I &&
8261 (OpData.second->State == TreeEntry::Vectorize ||
8262 OpData.second->State == TreeEntry::StridedVectorize ||
8263 OpData.second->State == TreeEntry::CompressVectorize ||
8264 OpData.second->State == TreeEntry::SplitVectorize);
8268 if (UserTE->hasState()) {
8269 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8270 UserTE->getOpcode() == Instruction::ExtractValue)
8272 if (UserTE->getOpcode() == Instruction::InsertElement &&
I == 0)
8274 if (UserTE->getOpcode() == Instruction::Store &&
8275 UserTE->State == TreeEntry::Vectorize &&
I == 1)
8277 if (UserTE->getOpcode() == Instruction::Load &&
8278 (UserTE->State == TreeEntry::Vectorize ||
8279 UserTE->State == TreeEntry::StridedVectorize ||
8280 UserTE->State == TreeEntry::CompressVectorize))
8283 TreeEntry *TE = getOperandEntry(UserTE,
I);
8284 assert(TE &&
"Expected operand entry.");
8285 if (!TE->isGather()) {
8288 Edges.emplace_back(
I, TE);
8294 if (TE->State == TreeEntry::ScatterVectorize &&
8295 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8299 if (ReorderableGathers.
contains(TE))
8305 struct TreeEntryCompare {
8306 bool operator()(
const TreeEntry *LHS,
const TreeEntry *RHS)
const {
8307 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8308 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8309 return LHS->Idx < RHS->Idx;
8318 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8319 if (TE->State != TreeEntry::Vectorize &&
8320 TE->State != TreeEntry::StridedVectorize &&
8321 TE->State != TreeEntry::CompressVectorize &&
8322 TE->State != TreeEntry::SplitVectorize)
8323 NonVectorized.
insert(TE.get());
8324 if (std::optional<OrdersType> CurrentOrder =
8326 Queue.push(TE.get());
8327 if (!(TE->State == TreeEntry::Vectorize ||
8328 TE->State == TreeEntry::StridedVectorize ||
8329 TE->State == TreeEntry::CompressVectorize ||
8330 TE->State == TreeEntry::SplitVectorize) ||
8331 !TE->ReuseShuffleIndices.empty())
8332 GathersToOrders.
insert(TE.get());
8341 while (!Queue.empty()) {
8343 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
Users;
8344 TreeEntry *TE = Queue.top();
8345 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8348 while (!Queue.empty()) {
8350 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8355 for (TreeEntry *TE : OrderedOps) {
8356 if (!(TE->State == TreeEntry::Vectorize ||
8357 TE->State == TreeEntry::StridedVectorize ||
8358 TE->State == TreeEntry::CompressVectorize ||
8359 TE->State == TreeEntry::SplitVectorize ||
8360 (TE->isGather() && GathersToOrders.
contains(TE))) ||
8361 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8362 !Visited.
insert(TE).second)
8366 Users.first = TE->UserTreeIndex.UserTE;
8367 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8371 if (
Data.first->State == TreeEntry::SplitVectorize) {
8373 Data.second.size() <= 2 &&
8374 "Expected not greater than 2 operands for split vectorize node.");
8376 [](
const auto &
Op) { return !Op.second->UserTreeIndex; }))
8379 assert(
Data.first->CombinedEntriesWithIndices.size() == 2 &&
8380 "Expected exactly 2 entries.");
8381 for (
const auto &
P :
Data.first->CombinedEntriesWithIndices) {
8382 TreeEntry &OpTE = *VectorizableTree[
P.first];
8384 if (Order.
empty() || !OpTE.ReuseShuffleIndices.empty()) {
8385 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8387 const auto BestOrder =
8396 const unsigned E = Order.
size();
8399 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8401 Data.first->reorderSplitNode(
P.second ? 1 : 0, Mask, MaskOrder);
8403 if (!OpTE.ReorderIndices.empty()) {
8404 OpTE.ReorderIndices.clear();
8405 }
else if (!OpTE.ReuseShuffleIndices.empty()) {
8408 assert(OpTE.isGather() &&
"Expected only gather/buildvector node.");
8412 if (
Data.first->ReuseShuffleIndices.empty() &&
8413 !
Data.first->ReorderIndices.empty()) {
8416 Queue.push(
Data.first);
8422 buildReorderableOperands(
Data.first,
Data.second, NonVectorized,
8434 for (
const auto &
Op :
Data.second) {
8435 TreeEntry *OpTE =
Op.second;
8436 if (!VisitedOps.
insert(OpTE).second)
8438 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
8440 const auto Order = [&]() ->
const OrdersType {
8441 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8445 return OpTE->ReorderIndices;
8449 if (Order.
size() == 1)
8455 Value *Root = OpTE->hasState()
8458 auto GetSameNodesUsers = [&](
Value *Root) {
8460 for (
const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8461 if (TE != OpTE && TE->UserTreeIndex &&
8462 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8463 TE->Scalars.size() == OpTE->Scalars.size() &&
8464 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8465 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8466 Res.
insert(TE->UserTreeIndex.UserTE);
8468 for (
const TreeEntry *TE : getTreeEntries(Root)) {
8469 if (TE != OpTE && TE->UserTreeIndex &&
8470 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8471 TE->Scalars.size() == OpTE->Scalars.size() &&
8472 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8473 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8474 Res.
insert(TE->UserTreeIndex.UserTE);
8478 auto GetNumOperands = [](
const TreeEntry *TE) {
8479 if (TE->State == TreeEntry::SplitVectorize)
8480 return TE->getNumOperands();
8482 return CI->arg_size();
8483 return TE->getNumOperands();
8485 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8486 const TreeEntry *TE) {
8494 const TreeEntry *
Op = getOperandEntry(TE, Idx);
8495 if (
Op->isGather() &&
Op->hasState()) {
8496 const TreeEntry *VecOp =
8497 getSameValuesTreeEntry(
Op->getMainOp(),
Op->Scalars);
8501 if (
Op->ReorderIndices.empty() &&
Op->ReuseShuffleIndices.empty())
8508 if (!RevisitedOps.
insert(UTE).second)
8510 return UTE ==
Data.first || !UTE->ReorderIndices.empty() ||
8511 !UTE->ReuseShuffleIndices.empty() ||
8512 (UTE->UserTreeIndex &&
8513 UTE->UserTreeIndex.UserTE ==
Data.first) ||
8514 (
Data.first->UserTreeIndex &&
8515 Data.first->UserTreeIndex.UserTE == UTE) ||
8516 (IgnoreReorder && UTE->UserTreeIndex &&
8517 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8518 NodeShouldBeReorderedWithOperands(UTE);
8521 for (TreeEntry *UTE :
Users) {
8529 const TreeEntry *
Op = getOperandEntry(UTE, Idx);
8531 Queue.push(
const_cast<TreeEntry *
>(
Op));
8536 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
8537 return P.second == OpTE;
8540 if (OpTE->State == TreeEntry::Vectorize &&
8541 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8542 assert(!OpTE->isAltShuffle() &&
8543 "Alternate instructions are only supported by BinaryOperator "
8547 unsigned E = Order.
size();
8550 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8553 OrdersUses.try_emplace(CurrentOrder, 0).first->second +=
NumOps;
8555 OrdersUses.try_emplace(Order, 0).first->second +=
NumOps;
8557 auto Res = OrdersUses.try_emplace(
OrdersType(), 0);
8558 const auto AllowsReordering = [&](
const TreeEntry *TE) {
8559 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8560 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8561 (IgnoreReorder && TE->Idx == 0))
8563 if (TE->isGather()) {
8573 if (OpTE->UserTreeIndex) {
8574 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8575 if (!VisitedUsers.
insert(UserTE).second)
8580 if (AllowsReordering(UserTE))
8588 if (
static_cast<unsigned>(
count_if(
8589 Ops, [UserTE, &AllowsReordering](
8590 const std::pair<unsigned, TreeEntry *> &
Op) {
8591 return AllowsReordering(
Op.second) &&
8592 Op.second->UserTreeIndex.UserTE == UserTE;
8593 })) <=
Ops.size() / 2)
8594 ++Res.first->second;
8597 if (OrdersUses.empty()) {
8602 unsigned IdentityCnt = 0;
8603 unsigned VF =
Data.second.front().second->getVectorFactor();
8605 for (
auto &Pair : OrdersUses) {
8607 IdentityCnt += Pair.second;
8612 unsigned Cnt = IdentityCnt;
8613 for (
auto &Pair : OrdersUses) {
8617 if (Cnt < Pair.second) {
8619 BestOrder = Pair.first;
8636 unsigned E = BestOrder.
size();
8638 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8640 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
8641 TreeEntry *TE =
Op.second;
8642 if (!VisitedOps.
insert(TE).second)
8644 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
8645 reorderNodeWithReuses(*TE, Mask);
8649 if (TE->State != TreeEntry::Vectorize &&
8650 TE->State != TreeEntry::StridedVectorize &&
8651 TE->State != TreeEntry::CompressVectorize &&
8652 TE->State != TreeEntry::SplitVectorize &&
8653 (TE->State != TreeEntry::ScatterVectorize ||
8654 TE->ReorderIndices.empty()))
8656 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
8657 TE->ReorderIndices.empty()) &&
8658 "Non-matching sizes of user/operand entries.");
8660 if (IgnoreReorder && TE == VectorizableTree.front().get())
8661 IgnoreReorder =
false;
8664 for (TreeEntry *
Gather : GatherOps) {
8666 "Unexpected reordering of gathers.");
8667 if (!
Gather->ReuseShuffleIndices.empty()) {
8677 auto IsNotProfitableAltCodeNode = [](
const TreeEntry &TE) {
8678 return TE.isAltShuffle() &&
8679 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8680 TE.ReorderIndices.empty());
8682 if (
Data.first->State != TreeEntry::Vectorize ||
8684 Data.first->getMainOp()) ||
8685 IsNotProfitableAltCodeNode(*
Data.first))
8686 Data.first->reorderOperands(Mask);
8688 IsNotProfitableAltCodeNode(*
Data.first) ||
8689 Data.first->State == TreeEntry::StridedVectorize ||
8690 Data.first->State == TreeEntry::CompressVectorize) {
8694 if (
Data.first->ReuseShuffleIndices.empty() &&
8695 !
Data.first->ReorderIndices.empty() &&
8696 !IsNotProfitableAltCodeNode(*
Data.first)) {
8699 Queue.push(
Data.first);
8707 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8708 VectorizableTree.front()->ReuseShuffleIndices.empty())
8709 VectorizableTree.front()->ReorderIndices.
clear();
8712Instruction *BoUpSLP::getRootEntryInstruction(
const TreeEntry &Entry)
const {
8713 if (Entry.hasState() &&
8714 (Entry.getOpcode() == Instruction::Store ||
8715 Entry.getOpcode() == Instruction::Load) &&
8716 Entry.State == TreeEntry::StridedVectorize &&
8717 !Entry.ReorderIndices.empty() &&
isReverseOrder(Entry.ReorderIndices))
8724 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8728 for (
auto &TEPtr : VectorizableTree) {
8729 TreeEntry *Entry = TEPtr.get();
8732 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8736 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8737 Value *Scalar = Entry->Scalars[Lane];
8742 auto It = ScalarToExtUses.
find(Scalar);
8743 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
8746 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8747 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8748 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract from lane " << FoundLane
8749 <<
" from " << *Scalar <<
"for many users.\n");
8750 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
8751 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
8752 ExternalUsesWithNonUsers.insert(Scalar);
8757 const auto ExtI = ExternallyUsedValues.
find(Scalar);
8758 if (ExtI != ExternallyUsedValues.
end()) {
8759 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8760 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
8761 << FoundLane <<
" from " << *Scalar <<
".\n");
8762 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
8763 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
8774 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8779 !UseEntries.
empty()) {
8783 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8786 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8787 return UseEntry->State == TreeEntry::ScatterVectorize ||
8789 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8792 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
8795 [](TreeEntry *UseEntry) {
8796 return UseEntry->isGather();
8802 if (It != ScalarToExtUses.
end()) {
8803 ExternalUses[It->second].User =
nullptr;
8808 if (U && Scalar->hasNUsesOrMore(
UsesLimit))
8810 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8812 <<
" from lane " << FoundLane <<
" from " << *Scalar
8814 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
8815 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8816 ExternalUsesWithNonUsers.insert(Scalar);
8825BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
8829 for (
unsigned Lane :
seq<unsigned>(0, TE->Scalars.size())) {
8830 Value *V = TE->Scalars[Lane];
8843 if (
SI ==
nullptr || !
SI->isSimple() ||
SI->getFunction() !=
F ||
8852 auto &StoresVec = PtrToStoresMap[{
SI->getParent(),
8853 SI->getValueOperand()->getType(),
Ptr}];
8856 if (StoresVec.size() > Lane)
8858 if (!StoresVec.empty()) {
8860 SI->getValueOperand()->getType(),
SI->getPointerOperand(),
8861 SI->getValueOperand()->getType(),
8862 StoresVec.front()->getPointerOperand(), *
DL, *SE,
8868 StoresVec.push_back(
SI);
8873 for (
auto &
P : PtrToStoresMap) {
8888 StoreInst *S0 = StoresVec[0];
8893 StoreInst *
SI = StoresVec[Idx];
8894 std::optional<int64_t> Diff =
8896 SI->getPointerOperand(), *DL, *SE,
8902 if (StoreOffsetVec.
size() != StoresVec.
size())
8904 sort(StoreOffsetVec, llvm::less_first());
8906 int64_t PrevDist = 0;
8907 for (
const auto &
P : StoreOffsetVec) {
8908 if (Idx > 0 &&
P.first != PrevDist + 1)
8916 ReorderIndices.assign(StoresVec.
size(), 0);
8917 bool IsIdentity =
true;
8919 ReorderIndices[
P.second] =
I;
8920 IsIdentity &=
P.second ==
I;
8926 ReorderIndices.clear();
8933 for (
unsigned Idx : Order)
8934 dbgs() << Idx <<
", ";
8940BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
8941 unsigned NumLanes =
TE->Scalars.size();
8954 if (StoresVec.
size() != NumLanes)
8959 if (!canFormVector(StoresVec, ReorderIndices))
8964 ExternalReorderIndices.
push_back(ReorderIndices);
8966 return ExternalReorderIndices;
8972 assert(TreeEntryToStridedPtrInfoMap.empty() &&
8973 "TreeEntryToStridedPtrInfoMap is not cleared");
8974 UserIgnoreList = &UserIgnoreLst;
8977 buildTreeRec(Roots, 0,
EdgeInfo());
8982 assert(TreeEntryToStridedPtrInfoMap.empty() &&
8983 "TreeEntryToStridedPtrInfoMap is not cleared");
8986 buildTreeRec(Roots, 0,
EdgeInfo());
8995 bool AddNew =
true) {
9003 for (
Value *V : VL) {
9007 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9009 bool IsFound =
false;
9010 for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
9011 assert(LI->getParent() ==
Data.front().first->getParent() &&
9012 LI->getType() ==
Data.front().first->getType() &&
9016 "Expected loads with the same type, same parent and same "
9017 "underlying pointer.");
9019 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
9020 Data.front().first->getPointerOperand(),
DL, SE,
9024 auto It = Map.find(*Dist);
9025 if (It != Map.end() && It->second != LI)
9027 if (It == Map.end()) {
9028 Data.emplace_back(LI, *Dist);
9029 Map.try_emplace(*Dist, LI);
9039 auto FindMatchingLoads =
9044 int64_t &
Offset,
unsigned &Start) {
9046 return GatheredLoads.
end();
9055 std::optional<int64_t> Dist =
9057 Data.front().first->getType(),
9058 Data.front().first->getPointerOperand(),
DL, SE,
9064 for (std::pair<LoadInst *, int64_t>
P :
Data) {
9070 unsigned NumUniques = 0;
9071 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
9072 bool Used = DataLoads.
contains(Pair.first);
9073 if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
9077 Repeated.insert(Cnt);
9080 if (NumUniques > 0 &&
9081 (Loads.
size() == NumUniques ||
9082 (Loads.
size() - NumUniques >= 2 &&
9083 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
9089 return std::next(GatheredLoads.
begin(), Idx);
9093 return GatheredLoads.
end();
9095 for (
ArrayRef<std::pair<LoadInst *, int64_t>>
Data : ClusteredLoads) {
9099 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
9101 while (It != GatheredLoads.
end()) {
9102 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
9103 for (
unsigned Idx : LocalToAdd)
9106 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
9110 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9117 Loads.push_back(
Data[Idx]);
9123 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9124 return PD.front().first->getParent() == LI->
getParent() &&
9125 PD.front().first->getType() == LI->
getType();
9127 while (It != GatheredLoads.
end()) {
9130 std::next(It), GatheredLoads.
end(),
9131 [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9132 return PD.front().first->getParent() == LI->getParent() &&
9133 PD.front().first->getType() == LI->getType();
9137 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
9138 AddNewLoads(GatheredLoads.emplace_back());
9143void BoUpSLP::tryToVectorizeGatheredLoads(
9144 const SmallMapVector<
9145 std::tuple<BasicBlock *, Value *, Type *>,
9148 GatheredLoadsEntriesFirst = VectorizableTree.
size();
9151 LoadEntriesToVectorize.size());
9152 for (
auto [Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9153 Set.insert_range(VectorizableTree[Idx]->Scalars);
9156 auto LoadSorter = [](
const std::pair<LoadInst *, int64_t> &L1,
9157 const std::pair<LoadInst *, int64_t> &L2) {
9158 return L1.second > L2.second;
9165 auto *Ty =
getWidenedType(Loads.front()->getType(), Loads.size());
9166 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9167 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9172 SmallVectorImpl<LoadInst *> &NonVectorized,
9173 bool Final,
unsigned MaxVF) {
9175 unsigned StartIdx = 0;
9176 SmallVector<int> CandidateVFs;
9180 *TTI, Loads.
front()->getType(), MaxVF);
9182 *TTI, Loads.
front()->getType(), NumElts - 1)) {
9188 if (Final && CandidateVFs.
empty())
9191 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
9192 for (
unsigned NumElts : CandidateVFs) {
9193 if (Final && NumElts > BestVF)
9195 SmallVector<unsigned> MaskedGatherVectorized;
9196 for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
9200 if (VectorizedLoads.count(Slice.
front()) ||
9201 VectorizedLoads.count(Slice.
back()) ||
9207 bool AllowToVectorize =
false;
9210 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9213 for (LoadInst *LI : Slice) {
9215 if (LI->hasOneUse())
9221 if (
static_cast<unsigned int>(std::distance(
9222 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9224 if (!IsLegalBroadcastLoad)
9228 for (User *U : LI->users()) {
9231 for (
const TreeEntry *UTE : getTreeEntries(U)) {
9232 for (
int I :
seq<int>(UTE->getNumOperands())) {
9234 return V == LI || isa<PoisonValue>(V);
9244 AllowToVectorize = CheckIfAllowed(Slice);
9248 any_of(ValueToGatherNodes.at(Slice.front()),
9249 [=](
const TreeEntry *TE) {
9250 return TE->Scalars.size() == 2 &&
9251 ((TE->Scalars.front() == Slice.front() &&
9252 TE->Scalars.back() == Slice.back()) ||
9253 (TE->Scalars.front() == Slice.back() &&
9254 TE->Scalars.back() == Slice.front()));
9259 if (AllowToVectorize) {
9264 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
9265 StridedPtrInfo SPtrInfo;
9267 PointerOps, SPtrInfo, &BestVF);
9269 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9271 if (MaskedGatherVectorized.
empty() ||
9272 Cnt >= MaskedGatherVectorized.
back() + NumElts)
9277 Results.emplace_back(Values, LS);
9278 VectorizedLoads.insert_range(Slice);
9281 if (Cnt == StartIdx)
9282 StartIdx += NumElts;
9285 if (StartIdx >= Loads.
size())
9289 if (!MaskedGatherVectorized.
empty() &&
9290 Cnt < MaskedGatherVectorized.
back() + NumElts)
9296 if (!AllowToVectorize || BestVF == 0)
9300 for (
unsigned Cnt : MaskedGatherVectorized) {
9302 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
9306 VectorizedLoads.insert_range(Slice);
9308 if (Cnt == StartIdx)
9309 StartIdx += NumElts;
9312 for (LoadInst *LI : Loads) {
9313 if (!VectorizedLoads.contains(LI))
9314 NonVectorized.push_back(LI);
9318 auto ProcessGatheredLoads =
9321 bool Final =
false) {
9323 for (
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9325 if (LoadsDists.size() <= 1) {
9326 NonVectorized.
push_back(LoadsDists.back().first);
9334 unsigned MaxConsecutiveDistance = 0;
9335 unsigned CurrentConsecutiveDist = 1;
9336 int64_t LastDist = LocalLoadsDists.front().second;
9337 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9338 for (
const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9341 assert(LastDist >=
L.second &&
9342 "Expected first distance always not less than second");
9343 if (
static_cast<uint64_t
>(LastDist -
L.second) ==
9344 CurrentConsecutiveDist) {
9345 ++CurrentConsecutiveDist;
9346 MaxConsecutiveDistance =
9347 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9351 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9354 CurrentConsecutiveDist = 1;
9355 LastDist =
L.second;
9358 if (Loads.
size() <= 1)
9360 if (AllowMaskedGather)
9361 MaxConsecutiveDistance = Loads.
size();
9362 else if (MaxConsecutiveDistance < 2)
9367 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9368 Final, MaxConsecutiveDistance);
9370 OriginalLoads.size() == Loads.
size() &&
9371 MaxConsecutiveDistance == Loads.
size() &&
9376 VectorizedLoads.
clear();
9380 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9381 UnsortedNonVectorized, Final,
9382 OriginalLoads.size());
9383 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
9384 SortedNonVectorized.
swap(UnsortedNonVectorized);
9385 Results.swap(UnsortedResults);
9390 << Slice.
size() <<
")\n");
9392 for (
Value *L : Slice)
9400 unsigned MaxVF = Slice.size();
9401 unsigned UserMaxVF = 0;
9402 unsigned InterleaveFactor = 0;
9407 std::optional<unsigned> InterleavedLoadsDistance = 0;
9409 std::optional<unsigned> CommonVF = 0;
9410 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9411 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9412 for (
auto [Idx, V] :
enumerate(Slice)) {
9413 for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
9414 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
9417 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9419 if (*CommonVF == 0) {
9420 CommonVF =
E->Scalars.size();
9423 if (*CommonVF !=
E->Scalars.size())
9427 if (Pos != Idx && InterleavedLoadsDistance) {
9430 if (isa<Constant>(V))
9432 if (isVectorized(V))
9434 const auto &Nodes = ValueToGatherNodes.at(V);
9435 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9436 !is_contained(Slice, V);
9438 InterleavedLoadsDistance.reset();
9442 if (*InterleavedLoadsDistance == 0) {
9443 InterleavedLoadsDistance = Idx - Pos;
9446 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9447 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9448 InterleavedLoadsDistance.reset();
9449 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9453 DeinterleavedNodes.
clear();
9455 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9456 CommonVF.value_or(0) != 0) {
9457 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
9458 unsigned VF = *CommonVF;
9461 StridedPtrInfo SPtrInfo;
9463 if (InterleaveFactor <= Slice.size() &&
9464 TTI.isLegalInterleavedAccessType(
9472 UserMaxVF = InterleaveFactor * VF;
9474 InterleaveFactor = 0;
9479 unsigned ConsecutiveNodesSize = 0;
9480 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9481 any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9482 [&, Slice = Slice](
const auto &
P) {
9484 return std::get<1>(
P).contains(V);
9486 if (It == Slice.end())
9488 const TreeEntry &
TE =
9489 *VectorizableTree[std::get<0>(
P)];
9493 StridedPtrInfo SPtrInfo;
9495 VL, VL.
front(), Order, PointerOps, SPtrInfo);
9499 ConsecutiveNodesSize += VL.
size();
9500 size_t Start = std::distance(Slice.begin(), It);
9501 size_t Sz = Slice.size() -
Start;
9502 return Sz < VL.
size() ||
9503 Slice.slice(Start, VL.
size()) != VL;
9508 if (InterleaveFactor == 0 &&
9510 [&, Slice = Slice](
unsigned Idx) {
9512 SmallVector<Value *> PointerOps;
9513 StridedPtrInfo SPtrInfo;
9514 return canVectorizeLoads(
9515 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9516 Slice[Idx * UserMaxVF], Order, PointerOps,
9517 SPtrInfo) == LoadsState::ScatterVectorize;
9520 if (Slice.size() != ConsecutiveNodesSize)
9521 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9523 for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9524 bool IsVectorized =
true;
9525 for (
unsigned I = 0,
E = Slice.size();
I <
E;
I += VF) {
9527 Slice.slice(
I, std::min(VF,
E -
I));
9532 if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9533 [&](
const auto &
P) {
9535 VectorizableTree[std::get<0>(
P)]
9540 unsigned Sz = VectorizableTree.size();
9541 buildTreeRec(SubSlice, 0,
EdgeInfo(), InterleaveFactor);
9542 if (Sz == VectorizableTree.size()) {
9543 IsVectorized =
false;
9546 if (InterleaveFactor > 0) {
9547 VF = 2 * (MaxVF / InterleaveFactor);
9548 InterleaveFactor = 0;
9557 NonVectorized.
append(SortedNonVectorized);
9559 return NonVectorized;
9561 for (
const auto &GLs : GatheredLoads) {
9562 const auto &
Ref = GLs.second;
9564 if (!
Ref.empty() && !NonVectorized.
empty() &&
9566 Ref.begin(),
Ref.end(), 0u,
9567 [](
unsigned S,
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9568 ->
unsigned { return S + LoadsDists.size(); }) !=
9569 NonVectorized.
size() &&
9570 IsMaskedGatherSupported(NonVectorized)) {
9573 for (LoadInst *LI : NonVectorized) {
9581 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
9585 for (
unsigned Idx : LoadEntriesToVectorize) {
9586 const TreeEntry &
E = *VectorizableTree[Idx];
9589 if (!
E.ReorderIndices.empty()) {
9592 SmallVector<int> ReorderMask;
9596 buildTreeRec(GatheredScalars, 0,
EdgeInfo());
9600 if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9601 VectorizableTree.size())
9602 GatheredLoadsEntriesFirst.reset();
9612 bool AllowAlternate) {
9635 isValidForAlternation(
I->getOpcode())) {
9647 std::pair<size_t, size_t> OpVals =
9655 if (CI->isCommutative())
9677 SubKey =
hash_value(Gep->getPointerOperand());
9689 return std::make_pair(
Key, SubKey);
9695 Instruction *AltOp,
const TargetLibraryInfo &TLI);
9697bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
9699 Type *ScalarTy = S.getMainOp()->getType();
9700 unsigned Opcode0 = S.getOpcode();
9701 unsigned Opcode1 = S.getAltOpcode();
9702 SmallBitVector OpcodeMask(
getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9705 Opcode1, OpcodeMask))
9708 for (
unsigned I :
seq<unsigned>(S.getMainOp()->getNumOperands())) {
9711 for (
Value *V : VL) {
9728 switch (Res.value_or(0)) {
9742 DenseSet<unsigned> UniqueOpcodes;
9743 constexpr unsigned NumAltInsts = 3;
9744 unsigned NonInstCnt = 0;
9747 unsigned UndefCnt = 0;
9749 unsigned ExtraShuffleInsts = 0;
9758 return is_contained(Operands.back(), V);
9761 ++ExtraShuffleInsts;
9764 const Loop *
L = LI->getLoopFor(S.getMainOp()->getParent());
9776 DenseMap<Value *, unsigned> Uniques;
9786 if (!Res.second && Res.first->second == 1)
9787 ++ExtraShuffleInsts;
9788 ++Res.first->getSecond();
9790 UniqueOpcodes.
insert(
I->getOpcode());
9791 else if (Res.second)
9794 return none_of(Uniques, [&](
const auto &
P) {
9795 return P.first->hasNUsesOrMore(
P.second + 1) &&
9796 none_of(
P.first->users(), [&](User *U) {
9797 return isVectorized(U) || Uniques.contains(U);
9806 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9807 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
9808 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9815 const unsigned VF,
unsigned MinBW,
9838static std::pair<InstructionCost, InstructionCost>
9858 FMF = FPCI->getFastMathFlags();
9861 LibCost.isValid() ? LibCost : ScalarLimit);
9871BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9873 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
9874 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9876 "Expected instructions with same/alternate opcodes only.");
9878 unsigned ShuffleOrOp =
9879 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
9881 switch (ShuffleOrOp) {
9882 case Instruction::PHI: {
9885 return TreeEntry::NeedToGather;
9887 for (
Value *V : VL) {
9891 for (
Value *Incoming :
PHI->incoming_values()) {
9893 if (Term &&
Term->isTerminator()) {
9895 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
9896 return TreeEntry::NeedToGather;
9901 return TreeEntry::Vectorize;
9903 case Instruction::ExtractElement:
9910 return TreeEntry::NeedToGather;
9912 case Instruction::ExtractValue: {
9913 bool Reuse = canReuseExtract(VL, CurrentOrder);
9917 return TreeEntry::NeedToGather;
9918 if (Reuse || !CurrentOrder.empty())
9919 return TreeEntry::Vectorize;
9921 return TreeEntry::NeedToGather;
9923 case Instruction::InsertElement: {
9927 for (
Value *V : VL) {
9929 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement/poison vector.\n");
9930 return TreeEntry::NeedToGather;
9934 "Non-constant or undef index?");
9938 return !SourceVectors.contains(V);
9941 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
9942 "different source vectors.\n");
9943 return TreeEntry::NeedToGather;
9948 return SourceVectors.contains(V) && !
V->hasOneUse();
9951 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
9952 "multiple uses.\n");
9953 return TreeEntry::NeedToGather;
9956 return TreeEntry::Vectorize;
9958 case Instruction::Load: {
9965 auto IsGatheredNode = [&]() {
9966 if (!GatheredLoadsEntriesFirst)
9971 return any_of(getTreeEntries(V), [&](
const TreeEntry *TE) {
9972 return TE->Idx >= *GatheredLoadsEntriesFirst;
9978 return TreeEntry::Vectorize;
9980 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9982 LoadEntriesToVectorize.insert(VectorizableTree.size());
9983 return TreeEntry::NeedToGather;
9985 return IsGatheredNode() ? TreeEntry::NeedToGather
9986 : TreeEntry::CompressVectorize;
9988 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9990 LoadEntriesToVectorize.insert(VectorizableTree.size());
9991 return TreeEntry::NeedToGather;
9993 return IsGatheredNode() ? TreeEntry::NeedToGather
9994 : TreeEntry::ScatterVectorize;
9996 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9998 LoadEntriesToVectorize.insert(VectorizableTree.size());
9999 return TreeEntry::NeedToGather;
10001 return IsGatheredNode() ? TreeEntry::NeedToGather
10002 : TreeEntry::StridedVectorize;
10006 if (DL->getTypeSizeInBits(ScalarTy) !=
10007 DL->getTypeAllocSizeInBits(ScalarTy))
10008 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
10011 return !LI || !LI->isSimple();
10015 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering non-consecutive loads.\n");
10018 return TreeEntry::NeedToGather;
10022 case Instruction::ZExt:
10023 case Instruction::SExt:
10024 case Instruction::FPToUI:
10025 case Instruction::FPToSI:
10026 case Instruction::FPExt:
10027 case Instruction::PtrToInt:
10028 case Instruction::IntToPtr:
10029 case Instruction::SIToFP:
10030 case Instruction::UIToFP:
10031 case Instruction::Trunc:
10032 case Instruction::FPTrunc:
10033 case Instruction::BitCast: {
10035 for (
Value *V : VL) {
10041 dbgs() <<
"SLP: Gathering casts with different src types.\n");
10042 return TreeEntry::NeedToGather;
10045 return TreeEntry::Vectorize;
10047 case Instruction::ICmp:
10048 case Instruction::FCmp: {
10053 for (
Value *V : VL) {
10057 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
10058 Cmp->getOperand(0)->getType() != ComparedTy) {
10059 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
10060 return TreeEntry::NeedToGather;
10063 return TreeEntry::Vectorize;
10065 case Instruction::Select:
10066 case Instruction::FNeg:
10067 case Instruction::Add:
10068 case Instruction::FAdd:
10069 case Instruction::Sub:
10070 case Instruction::FSub:
10071 case Instruction::Mul:
10072 case Instruction::FMul:
10073 case Instruction::UDiv:
10074 case Instruction::SDiv:
10075 case Instruction::FDiv:
10076 case Instruction::URem:
10077 case Instruction::SRem:
10078 case Instruction::FRem:
10079 case Instruction::Shl:
10080 case Instruction::LShr:
10081 case Instruction::AShr:
10082 case Instruction::And:
10083 case Instruction::Or:
10084 case Instruction::Xor:
10085 case Instruction::Freeze:
10086 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10087 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10089 return I &&
I->isBinaryOp() && !
I->isFast();
10091 return TreeEntry::NeedToGather;
10092 return TreeEntry::Vectorize;
10093 case Instruction::GetElementPtr: {
10095 for (
Value *V : VL) {
10099 if (
I->getNumOperands() != 2) {
10100 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
10101 return TreeEntry::NeedToGather;
10108 for (
Value *V : VL) {
10112 Type *CurTy =
GEP->getSourceElementType();
10113 if (Ty0 != CurTy) {
10114 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
10115 return TreeEntry::NeedToGather;
10121 for (
Value *V : VL) {
10125 auto *
Op =
I->getOperand(1);
10127 (
Op->getType() != Ty1 &&
10129 Op->getType()->getScalarSizeInBits() >
10130 DL->getIndexSizeInBits(
10131 V->getType()->getPointerAddressSpace())))) {
10133 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
10134 return TreeEntry::NeedToGather;
10138 return TreeEntry::Vectorize;
10140 case Instruction::Store: {
10142 llvm::Type *ScalarTy =
cast<StoreInst>(VL0)->getValueOperand()->getType();
10145 if (DL->getTypeSizeInBits(ScalarTy) !=
10146 DL->getTypeAllocSizeInBits(ScalarTy)) {
10147 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
10148 return TreeEntry::NeedToGather;
10152 for (
Value *V : VL) {
10154 if (!
SI->isSimple()) {
10156 return TreeEntry::NeedToGather;
10165 if (CurrentOrder.empty()) {
10166 Ptr0 = PointerOps.
front();
10167 PtrN = PointerOps.
back();
10169 Ptr0 = PointerOps[CurrentOrder.front()];
10170 PtrN = PointerOps[CurrentOrder.back()];
10172 std::optional<int64_t> Dist =
10175 if (
static_cast<uint64_t
>(*Dist) == VL.size() - 1)
10176 return TreeEntry::Vectorize;
10180 return TreeEntry::NeedToGather;
10182 case Instruction::Call: {
10183 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10184 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10186 return I && !
I->isFast();
10188 return TreeEntry::NeedToGather;
10198 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10202 return TreeEntry::NeedToGather;
10205 unsigned NumArgs = CI->
arg_size();
10207 for (
unsigned J = 0; J != NumArgs; ++J)
10210 for (
Value *V : VL) {
10215 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10217 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
10219 return TreeEntry::NeedToGather;
10223 for (
unsigned J = 0; J != NumArgs; ++J) {
10226 if (ScalarArgs[J] != A1J) {
10228 <<
"SLP: mismatched arguments in call:" << *CI
10229 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
10230 return TreeEntry::NeedToGather;
10239 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
10240 <<
"!=" << *V <<
'\n');
10241 return TreeEntry::NeedToGather;
10246 auto *VecTy =
getWidenedType(S.getMainOp()->getType(), VL.size());
10248 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10249 return TreeEntry::NeedToGather;
10251 return TreeEntry::Vectorize;
10253 case Instruction::ShuffleVector: {
10254 if (!S.isAltShuffle()) {
10257 return TreeEntry::Vectorize;
10260 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
10261 return TreeEntry::NeedToGather;
10266 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
10267 "the whole alt sequence is not profitable.\n");
10268 return TreeEntry::NeedToGather;
10271 return TreeEntry::Vectorize;
10275 return TreeEntry::NeedToGather;
10284 PHINode *Main =
nullptr;
10289 PHIHandler() =
delete;
10291 : DT(DT), Main(Main), Phis(Phis),
10292 Operands(Main->getNumIncomingValues(),
10294 void buildOperands() {
10295 constexpr unsigned FastLimit = 4;
10304 for (
auto [Idx, V] :
enumerate(Phis)) {
10308 "Expected isa instruction or poison value.");
10312 if (
P->getIncomingBlock(
I) == InBB)
10315 Operands[
I][Idx] =
P->getIncomingValueForBlock(InBB);
10320 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10330 for (
auto [Idx, V] :
enumerate(Phis)) {
10345 auto *It = Blocks.
find(InBB);
10346 if (It == Blocks.
end())
10348 Operands[It->second.front()][Idx] =
P->getIncomingValue(
I);
10351 for (
const auto &
P : Blocks) {
10352 ArrayRef<unsigned> IncomingValues =
P.second;
10353 if (IncomingValues.
size() <= 1)
10356 for (
unsigned I : IncomingValues) {
10358 [&](
const auto &
Data) {
10359 return !
Data.value() ||
10362 "Expected empty operands list.");
10376static std::pair<Instruction *, Instruction *>
10380 for (
Value *V : VL) {
10390 if (MainOp->
getOpcode() ==
I->getOpcode()) {
10409 "Expected different main and alt instructions.");
10410 return std::make_pair(MainOp, AltOp);
10423 const InstructionsState &S,
10425 bool TryPad =
false) {
10429 for (
Value *V : VL) {
10445 size_t NumUniqueScalarValues = UniqueValues.
size();
10448 if (NumUniqueScalarValues == VL.
size() &&
10450 ReuseShuffleIndices.
clear();
10455 if ((UserTreeIdx.
UserTE &&
10456 UserTreeIdx.
UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(
TTI)) ||
10458 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
10459 "for nodes with padding.\n");
10460 ReuseShuffleIndices.
clear();
10465 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10469 if (TryPad && UniquePositions.
size() > 1 && NumUniqueScalarValues > 1 &&
10470 S.getMainOp()->isSafeToRemove() &&
10471 (S.areInstructionsWithCopyableElements() ||
10475 TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
10476 PWSz = std::min<unsigned>(PWSz, VL.
size());
10477 if (PWSz == VL.
size()) {
10481 ReuseShuffleIndices.
clear();
10485 UniqueValues.
end());
10486 PaddedUniqueValues.
append(
10487 PWSz - UniqueValues.
size(),
10491 if (!S.areInstructionsWithCopyableElements() &&
10494 ReuseShuffleIndices.
clear();
10497 VL = std::move(PaddedUniqueValues);
10502 ReuseShuffleIndices.
clear();
10505 VL = std::move(UniqueValues);
10510 const InstructionsState &LocalState,
10511 SmallVectorImpl<Value *> &Op1,
10512 SmallVectorImpl<Value *> &Op2,
10514 constexpr unsigned SmallNodeSize = 4;
10515 if (VL.
size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10520 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *LocalState.getMainOp()
10522 for (TreeEntry *
E : getSplitTreeEntries(LocalState.getMainOp())) {
10523 if (
E->isSame(VL)) {
10525 << *LocalState.getMainOp() <<
".\n");
10537 ReorderIndices.assign(VL.
size(), VL.
size());
10538 SmallBitVector Op1Indices(VL.
size());
10543 Op1Indices.set(Idx);
10546 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10549 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10551 LocalState.getAltOp(), *TLI))) {
10553 Op1Indices.set(Idx);
10560 unsigned Opcode0 = LocalState.getOpcode();
10561 unsigned Opcode1 = LocalState.getAltOpcode();
10562 SmallBitVector OpcodeMask(
getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10567 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10568 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10573 unsigned Op1Cnt = 0, Op2Cnt = Op1.
size();
10575 if (Op1Indices.test(Idx)) {
10576 ReorderIndices[Op1Cnt] = Idx;
10579 ReorderIndices[Op2Cnt] = Idx;
10584 ReorderIndices.clear();
10585 SmallVector<int>
Mask;
10586 if (!ReorderIndices.empty())
10588 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10593 if (NumParts >= VL.
size())
10598 FixedVectorType *SubVecTy =
10602 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10603 (
Mask.empty() || InsertCost >= NewShuffleCost))
10605 if ((LocalState.getMainOp()->isBinaryOp() &&
10606 LocalState.getAltOp()->isBinaryOp() &&
10607 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10608 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10609 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10610 (LocalState.getMainOp()->isUnaryOp() &&
10611 LocalState.getAltOp()->isUnaryOp())) {
10613 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10614 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10619 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.
size());
10623 VecTy, OriginalMask, Kind);
10625 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10626 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10628 NewVecOpsCost + InsertCost +
10629 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10630 VectorizableTree.front()->getOpcode() == Instruction::Store
10634 if (NewCost >= OriginalCost)
10644class InstructionsCompatibilityAnalysis {
10646 const DataLayout &
DL;
10647 const TargetTransformInfo &
TTI;
10648 const TargetLibraryInfo &TLI;
10649 unsigned MainOpcode = 0;
10654 static bool isSupportedOpcode(
const unsigned Opcode) {
10655 return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10656 Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10657 Opcode == Instruction::UDiv;
10667 auto IsSupportedInstruction = [&](
Instruction *
I,
bool AnyUndef) {
10668 if (AnyUndef && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
10670 return I && isSupportedOpcode(
I->getOpcode()) &&
10675 SmallDenseSet<Value *, 8>
Operands;
10676 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10677 bool AnyUndef =
false;
10678 for (
Value *V : VL) {
10686 if (Candidates.
empty()) {
10687 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
10689 Operands.insert(
I->op_begin(),
I->op_end());
10692 if (Parent ==
I->getParent()) {
10693 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
10694 Operands.insert(
I->op_begin(),
I->op_end());
10697 auto *NodeA = DT.
getNode(Parent);
10698 auto *NodeB = DT.
getNode(
I->getParent());
10699 assert(NodeA &&
"Should only process reachable instructions");
10700 assert(NodeB &&
"Should only process reachable instructions");
10701 assert((NodeA == NodeB) ==
10702 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10703 "Different nodes should have different DFS numbers");
10704 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10705 Candidates.
clear();
10706 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
10709 Operands.insert(
I->op_begin(),
I->op_end());
10712 unsigned BestOpcodeNum = 0;
10714 for (
const auto &
P : Candidates) {
10715 if (
P.second.size() < BestOpcodeNum)
10717 for (Instruction *
I :
P.second) {
10718 if (IsSupportedInstruction(
I, AnyUndef) && !
Operands.contains(
I)) {
10720 BestOpcodeNum =
P.second.size();
10730 return I &&
I->getParent() == MainOp->
getParent() &&
10743 Value *selectBestIdempotentValue()
const {
10744 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
10755 if (!S.isCopyableElement(V))
10757 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
10758 return {
V, selectBestIdempotentValue()};
10764 SmallVectorImpl<BoUpSLP::ValueList> &
Operands)
const {
10766 unsigned ShuffleOrOp =
10767 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
10770 switch (ShuffleOrOp) {
10771 case Instruction::PHI: {
10775 PHIHandler Handler(DT, PH, VL);
10776 Handler.buildOperands();
10777 Operands.assign(PH->getNumOperands(), {});
10779 Operands[
I].assign(Handler.getOperands(
I).begin(),
10780 Handler.getOperands(
I).end());
10783 case Instruction::ExtractValue:
10784 case Instruction::ExtractElement:
10789 case Instruction::InsertElement:
10797 case Instruction::Load:
10805 Op = LI->getPointerOperand();
10808 case Instruction::ZExt:
10809 case Instruction::SExt:
10810 case Instruction::FPToUI:
10811 case Instruction::FPToSI:
10812 case Instruction::FPExt:
10813 case Instruction::PtrToInt:
10814 case Instruction::IntToPtr:
10815 case Instruction::SIToFP:
10816 case Instruction::UIToFP:
10817 case Instruction::Trunc:
10818 case Instruction::FPTrunc:
10819 case Instruction::BitCast:
10820 case Instruction::ICmp:
10821 case Instruction::FCmp:
10822 case Instruction::Select:
10823 case Instruction::FNeg:
10824 case Instruction::Add:
10825 case Instruction::FAdd:
10826 case Instruction::Sub:
10827 case Instruction::FSub:
10828 case Instruction::Mul:
10829 case Instruction::FMul:
10830 case Instruction::UDiv:
10831 case Instruction::SDiv:
10832 case Instruction::FDiv:
10833 case Instruction::URem:
10834 case Instruction::SRem:
10835 case Instruction::FRem:
10836 case Instruction::Shl:
10837 case Instruction::LShr:
10838 case Instruction::AShr:
10839 case Instruction::And:
10840 case Instruction::Or:
10841 case Instruction::Xor:
10842 case Instruction::Freeze:
10843 case Instruction::Store:
10844 case Instruction::ShuffleVector:
10853 auto [
Op, ConvertedOps] = convertTo(
I, S);
10858 case Instruction::GetElementPtr: {
10865 const unsigned IndexIdx = 1;
10871 return !
GEP || VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
10874 :
DL.getIndexType(
cast<GetElementPtrInst>(VL0)
10875 ->getPointerOperandType()
10876 ->getScalarType());
10881 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10885 auto *
Op =
GEP->getOperand(IndexIdx);
10888 CI, Ty, CI->getValue().isSignBitSet(),
DL)
10893 case Instruction::Call: {
10900 for (
Value *V : VL) {
10902 Ops.push_back(
I ?
I->getOperand(Idx)
10915 InstructionsCompatibilityAnalysis(DominatorTree &DT,
const DataLayout &
DL,
10916 const TargetTransformInfo &
TTI,
10917 const TargetLibraryInfo &TLI)
10922 bool TryCopyableElementsVectorization,
10923 bool WithProfitabilityCheck =
false,
10924 bool SkipSameCodeCheck =
false) {
10925 InstructionsState S = (SkipSameCodeCheck || !
allSameBlock(VL))
10926 ? InstructionsState::invalid()
10932 findAndSetMainInstruction(VL, R);
10934 return InstructionsState::invalid();
10935 S = InstructionsState(MainOp, MainOp,
true);
10936 if (!WithProfitabilityCheck)
10940 auto BuildCandidates =
10941 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
Value *V1,
10947 if (I1 && I2 &&
I1->getOpcode() == I2->getOpcode() &&
10948 I1->getParent() != I2->getParent())
10952 if (VL.
size() == 2) {
10957 bool Res = !Candidates1.
empty() && !Candidates2.
empty() &&
10958 R.findBestRootPair(Candidates1) &&
10959 R.findBestRootPair(Candidates2);
10961 Candidates1.
clear();
10962 Candidates2.
clear();
10965 Res = !Candidates1.
empty() && !Candidates2.
empty() &&
10966 R.findBestRootPair(Candidates1) &&
10967 R.findBestRootPair(Candidates2);
10970 return InstructionsState::invalid();
10974 FixedVectorType *VecTy =
10976 switch (MainOpcode) {
10977 case Instruction::Add:
10978 case Instruction::LShr:
10979 case Instruction::Shl:
10980 case Instruction::SDiv:
10981 case Instruction::UDiv:
10987 if (VectorCost > ScalarCost)
10988 return InstructionsState::invalid();
10991 assert(
Operands.size() == 2 &&
"Unexpected number of operands!");
10992 unsigned CopyableNum =
10993 count_if(VL, [&](
Value *V) {
return S.isCopyableElement(V); });
10994 if (CopyableNum < VL.
size() / 2)
10997 const unsigned Limit = VL.
size() / 24;
10998 if ((CopyableNum >= VL.
size() - Limit ||
10999 (CopyableNum >= VL.
size() - 1 && VL.
size() > 4) ||
11004 return InstructionsState::invalid();
11023 return InstructionsState::invalid();
11029 constexpr unsigned Limit = 4;
11030 if (
Operands.front().size() >= Limit) {
11031 SmallDenseMap<const Value *, unsigned>
Counters;
11039 return C.second == 1;
11045 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
11046 InstructionsState OpS =
Analysis.buildInstructionsState(
11048 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !
allSameBlock(
Ops)))
11050 unsigned CopyableNum =
11052 return CopyableNum <= VL.
size() / 2;
11054 if (!CheckOperand(
Operands.front()))
11055 return InstructionsState::invalid();
11062 assert(S &&
"Invalid state!");
11064 if (S.areInstructionsWithCopyableElements()) {
11065 MainOp = S.getMainOp();
11066 MainOpcode = S.getOpcode();
11071 for (
auto [OperandIdx, Operand] :
enumerate(OperandsForValue))
11072 Operands[OperandIdx][Idx] = Operand;
11075 buildOriginalOperands(S, VL,
Operands);
11082BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11084 bool TryCopyableElementsVectorization)
const {
11087 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
11088 InstructionsState S =
Analysis.buildInstructionsState(
11089 VL, *
this, TryCopyableElementsVectorization,
11090 true, TryCopyableElementsVectorization);
11098 return ScalarsVectorizationLegality(S,
false,
11104 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.getMainOp() <<
".\n");
11105 for (TreeEntry *
E : getTreeEntries(S.getMainOp())) {
11106 if (
E->isSame(VL)) {
11107 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
11109 return ScalarsVectorizationLegality(S,
false);
11114 (S.getOpcode() == Instruction::PHI &&
isa<PHINode>(V) &&
11115 LI->getLoopFor(S.getMainOp()->getParent()) &&
11119 return ScalarsVectorizationLegality(S,
false);
11128 !(S && !S.isAltShuffle() && VL.
size() >= 4 &&
11135 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
11136 return ScalarsVectorizationLegality(S,
false);
11140 if (S && S.getOpcode() == Instruction::ExtractElement &&
11143 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
11144 return ScalarsVectorizationLegality(S,
false);
11151 return ScalarsVectorizationLegality(S,
false,
11161 if (!S || !S.isAltShuffle() || VL.
size() > 2)
11169 SmallVector<unsigned, 8> InstsCount;
11170 for (
Value *V : VL) {
11173 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11176 bool IsCommutative =
11178 if ((IsCommutative &&
11179 std::accumulate(InstsCount.
begin(), InstsCount.
end(), 0) < 2) ||
11181 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
11183 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
11187 for (
int Op :
seq<int>(S.getMainOp()->getNumOperands()))
11189 I2->getOperand(
Op));
11190 if (
static_cast<unsigned>(
count_if(
11191 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
11193 })) >= S.getMainOp()->getNumOperands() / 2)
11195 if (S.getMainOp()->getNumOperands() > 2)
11197 if (IsCommutative) {
11199 Candidates.
clear();
11200 for (
int Op = 0,
E = S.getMainOp()->getNumOperands();
Op <
E; ++
Op)
11202 I2->getOperand((
Op + 1) %
E));
11204 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
11211 SmallVector<unsigned> SortedIndices;
11213 bool IsScatterVectorizeUserTE =
11214 UserTreeIdx.UserTE &&
11215 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11216 bool AreAllSameBlock = S.valid();
11217 bool AreScatterAllGEPSameBlock =
11230 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11232 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11238 NotProfitableForVectorization(VL)) {
11240 LLVM_DEBUG(
dbgs() <<
"SLP: Try split and if failed, gathering due to "
11241 "C,S,B,O, small shuffle. \n";
11245 return ScalarsVectorizationLegality(S,
false,
11249 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n";
11253 return ScalarsVectorizationLegality(S,
false);
11257 if (S && !EphValues.empty()) {
11258 for (
Value *V : VL) {
11259 if (EphValues.count(V)) {
11261 <<
") is ephemeral.\n");
11263 return ScalarsVectorizationLegality(S,
false,
11275 if (S && S.isAltShuffle()) {
11276 auto GetNumVectorizedExtracted = [&]() {
11282 all_of(
I->operands(), [&](
const Use &U) {
11283 return isa<ExtractElementInst>(U.get());
11288 else if (!
I->hasOneUser() && !areAllUsersVectorized(
I, UserIgnoreList))
11291 return std::make_pair(Vectorized, Extracted);
11293 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11295 bool PreferScalarize = !Vectorized.
isAllOnes() && VL.size() == 2;
11296 if (!Vectorized.
isAllOnes() && !PreferScalarize) {
11299 Type *ScalarTy = VL.front()->getType();
11304 false,
true, Kind);
11306 *TTI, ScalarTy, VecTy, Vectorized,
11307 true,
false, Kind,
false);
11308 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11310 if (PreferScalarize) {
11311 LLVM_DEBUG(
dbgs() <<
"SLP: The instructions are in tree and alternate "
11312 "node is not profitable.\n");
11313 return ScalarsVectorizationLegality(S,
false);
11318 if (UserIgnoreList && !UserIgnoreList->empty()) {
11319 for (
Value *V : VL) {
11320 if (UserIgnoreList->contains(V)) {
11321 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
11322 return ScalarsVectorizationLegality(S,
false);
11329 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11330 assert(VL.front()->getType()->isPointerTy() &&
11332 "Expected pointers only.");
11335 assert(It != VL.end() &&
"Expected at least one GEP.");
11346 !DT->isReachableFromEntry(BB))) {
11352 return ScalarsVectorizationLegality(S,
false);
11354 return ScalarsVectorizationLegality(S,
true);
11359 unsigned InterleaveFactor) {
11362 SmallVector<int> ReuseShuffleIndices;
11366 auto TrySplitNode = [&](
const InstructionsState &LocalState) {
11369 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11372 auto Invalid = ScheduleBundle::invalid();
11373 auto *
TE = newTreeEntry(VL, TreeEntry::SplitVectorize,
Invalid, LocalState,
11374 UserTreeIdx, {}, ReorderIndices);
11379 getSameValuesTreeEntry(S.getMainOp(),
Op,
true))) {
11381 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11382 Idx == 0 ? 0 : Op1.
size());
11383 (void)newTreeEntry(
Op, TreeEntry::NeedToGather,
Invalid, S, {
TE, Idx});
11385 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11386 Idx == 0 ? 0 : Op1.
size());
11396 bool AreConsts =
false;
11397 for (
Value *V : VL) {
11409 if (AreOnlyConstsWithPHIs(VL)) {
11410 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to all constants and PHIs.\n");
11411 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11415 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11416 VL,
Depth, UserTreeIdx,
false);
11417 InstructionsState S = Legality.getInstructionsState();
11418 if (!Legality.isLegal()) {
11419 if (Legality.trySplitVectorize()) {
11422 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11426 Legality = getScalarsVectorizationLegality(
11427 VL,
Depth, UserTreeIdx,
true);
11428 if (!Legality.isLegal()) {
11429 if (Legality.tryToFindDuplicates())
11433 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11436 S = Legality.getInstructionsState();
11440 if (S.isAltShuffle() && TrySplitNode(S))
11446 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11451 bool IsScatterVectorizeUserTE =
11452 UserTreeIdx.UserTE &&
11453 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11456 StridedPtrInfo SPtrInfo;
11457 TreeEntry::EntryState State = getScalarsVectorizationState(
11458 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11459 if (State == TreeEntry::NeedToGather) {
11460 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11466 auto &BSRef = BlocksSchedules[BB];
11468 BSRef = std::make_unique<BlockScheduling>(BB);
11470 BlockScheduling &BS = *BSRef;
11473 std::optional<ScheduleBundle *> BundlePtr =
11474 BS.tryScheduleBundle(UniqueValues.getArrayRef(),
this, S, UserTreeIdx);
11475#ifdef EXPENSIVE_CHECKS
11479 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11480 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
11482 if (S.isAltShuffle() && ReuseShuffleIndices.
empty() && TrySplitNode(S))
11484 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11485 NonScheduledFirst.insert(VL.front());
11486 if (S.getOpcode() == Instruction::Load &&
11487 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11491 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
11493 ScheduleBundle
Empty;
11494 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() :
Empty;
11495 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
11497 unsigned ShuffleOrOp =
11498 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
11499 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &
Operands) {
11501 SmallVector<unsigned> PHIOps;
11507 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11512 for (
unsigned I : PHIOps)
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      // ...
      dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
      // ...
      for (unsigned Idx : CurrentOrder)
        dbgs() << " " << Idx;
      // ...
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    // ... "(ExtractValueInst/ExtractElementInst).\n";
    // ...
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      // ...
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      // ...
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 // ...
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());
    // ...
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    // ...
  }
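  // Loads map onto one of several entry states: consecutive (Vectorize,
  // possibly jumbled), masked load + compress, strided, or gather/scatter.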
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    // ...
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      // ...
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ...
    }
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
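  // Casts additionally track the minimum/maximum bit widths seen so far
  // (CastMaxMinBWSizes) and record operands that may be narrowed later in
  // ExtraBitWidthNodes.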
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              // ...
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              // ...
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          // ...
      APInt Mask = DB->getDemandedBits(OpI);
      NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      // ...
      if (NumSignBits * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
           "Commutative Predicate mismatch");
    // ...
    if (Cmp->getPredicate() != P0)
      // ...
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          // ...
      if (NumSignBits0 * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          // ...
      if (NumSignBits1 * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    // ...
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    // ...
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    // ...
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    // ...
  }
  case Instruction::Call: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    // ...
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }
    // ...
           "Expected different main/alternate predicates.");
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    // ...
  }
  // ...
  }
}
  // ...
    for (const auto *Ty : ST->elements())
      if (Ty != *ST->element_begin())
        // ...
    N *= ST->getNumElements();
    EltTy = *ST->element_begin();
    // ...
    N *= AT->getNumElements();
    EltTy = AT->getElementType();
    // ...
    N *= VT->getNumElements();
    EltTy = VT->getElementType();
  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    // ...
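// Checks whether a bundle of extractelement/extractvalue instructions reads a
// single source vector in (near-)identity order, filling CurrentOrder with
// the required permutation when the extracts can be reused.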
                               bool ResizeAllowed) const {
  // ...
  assert(It != VL.end() && "Expected at least one extract instruction.");
  // ...
  Value *Vec = E0->getOperand(0);
  // ...
  CurrentOrder.clear();
  // ...
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
  }
  // ...
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // ...
  unsigned MinIdx = NElts, MaxIdx = 0;
  // ...
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  // ...
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;
  // ...
  bool ShouldKeepOrder = true;
  // ...
  for (unsigned I = 0; I < E; ++I) {
    // ...
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  // ...
  return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
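// Builds the shuffle mask that separates main-opcode lanes from
// alternate-opcode lanes of this tree entry, honoring its reorder and reuse
// indices.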
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  // ...
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      // ...
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size());
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

// ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
  assert(MainP != AltP && "Expected different main/alternate predicates.");
  // ...
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           // ...
    return MainP != P && MainP != SwappedP;
  // ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}

// ...
  const auto *Op0 = Ops.front();
  // ...
      return CI->getValue().isPowerOf2();
  // ...
      return CI->getValue().isNegatedPowerOf2();
  // ...
  if (IsConstant && IsUniform)
    // ...
  else if (IsConstant)
    // ...
  else if (IsUniform)
    // ...
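// Common machinery shared by the cost estimator and the code generator for
// reasoning about shuffles: vector factors, identity masks, and folding
// chains of shufflevector instructions.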
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  // ...
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    // ...
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    // ...
    unsigned VNumElements =
        // ...
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  // ...
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             // ...
    int Limit = Mask.size();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          // ...
        }))
      // ...
  }

  // ...
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    // ...
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      // ...
      int MaskedIdx = Mask[ExtMask[I] % VF];
      // ...
    }
    Mask.swap(NewMask);
  }

  // ...
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    // ...
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    // ...
      if (isIdentityMask(Mask, SVTy, false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, true) &&
             // ...
             IdentityMask.size()))) {
          // ...
          IdentityMask.assign(Mask);
        }
      }
      // ...
      if (SV->isZeroEltSplat()) {
        // ...
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      // ...
        LocalVF = SVOpTy->getNumElements();
      // ...
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          // ...
        ExtMask[Idx] = SV->getMaskValue(I);
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // ...
        for (int &I : Mask) {
          // ...
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              // ...
        }
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      // ...
        Op = SV->getOperand(0);
      // ...
        Op = SV->getOperand(1);
      // ...
          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
          // ...
             "Expected masks of same sizes.");
      // ...
      Mask.swap(IdentityMask);
      // ...
      return SinglePermute &&
             // ...
             (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
              Shuffle->isZeroEltSplat() &&
              // ...
              Shuffle->getShuffleMask()[P.index()] == 0;
    // ...
  }
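  // Builder-parameterized shuffle emission: peeks through existing shuffles
  // on both operands, combines the masks, and asks the builder for the
  // cheapest equivalent (identity, poison, or a single shufflevector).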
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    // ...
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ...
    }
    // ...
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      // ...
        VF = FTy->getNumElements();
      // ...
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
          CombinedMask1[I] = Mask[I];
        // ...
          CombinedMask2[I] = Mask[I] - VF;
      }
      // ...
      (void)peekThroughShuffles(Op1, CombinedMask1, false);
      (void)peekThroughShuffles(Op2, CombinedMask2, false);
      // ...
        for (auto [Idx, I] : enumerate(CombinedMask1)) {
          // ...
          ExtMask1[Idx] = SV1->getMaskValue(I);
        }
        // ...
                ->getNumElements(),
            ExtMask1, UseMask::SecondArg);
        SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
        for (auto [Idx, I] : enumerate(CombinedMask2)) {
          // ...
          ExtMask2[Idx] = SV2->getMaskValue(I);
        }
        // ...
                ->getNumElements(),
            ExtMask2, UseMask::SecondArg);
        if (SV1->getOperand(0)->getType() ==
                SV2->getOperand(0)->getType() &&
            SV1->getOperand(0)->getType() != SV1->getType() &&
            // ...
          Op1 = SV1->getOperand(0);
          Op2 = SV2->getOperand(0);
          SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
          int LocalVF = ShuffleMask1.size();
          if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
            LocalVF = FTy->getNumElements();
          combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
          CombinedMask1.swap(ShuffleMask1);
          SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
          LocalVF = ShuffleMask2.size();
          if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
            LocalVF = FTy->getNumElements();
          combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
          CombinedMask2.swap(ShuffleMask2);
        }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      // ...
              ->getElementCount()
              .getKnownMinValue(),
          // ...
              ->getElementCount()
              .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
               "Expected undefined mask element");
        CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
      }
      // ...
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          // ...
    // ...
      return Builder.createPoison(
          // ...
    bool IsIdentity = peekThroughShuffles(V1, NewMask, true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
  // ...
                       ArrayRef<int> Mask) {
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  // ...
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // ...
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    // ...
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        // ...
      }
      // ...
      if (!Ptr || !Ptr->hasOneUse())
        // ...
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // ...
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       // ...
  } else {
    // ...
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 // ...
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    // ...
    if (It != Ptrs.end())
      // ...
    VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                             BaseGEP->getPointerOperand(), Indices, VecTy,
                             CostKind);
  }
  return std::make_pair(ScalarCost, VecCost);
}
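// Reorders the scalars of a gather node so that equal or related values
// (e.g. loads from nearby pointers) become adjacent, when a cost comparison
// shows the resulting shuffle is cheaper than the original build vector.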
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // ...
  SmallSet<size_t, 2> LoadKeyUsed;
  // ...
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      // ...
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      // ...
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        // ...
            LI->getType(), LI->getPointerOperand(), *DL, *SE,
            // ...
      }
      for (LoadInst *RLI : LIt->second) {
        // ...
                LI->getPointerOperand(), *TLI)) {
          // ...
        }
      }
      if (LIt->second.size() > 2) {
        // ...
            hash_value(LIt->second.back()->getPointerOperand());
        // ...
      }
    }
    // ...
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // ...
    size_t Key = 1, Idx = 1;
    // ...
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      // ...
    auto &KTI = KeyToIndex[V];
    // ...
    Container[Idx].push_back(V);
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    // ...
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        // ...
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        // ...
            *TTI, TE.Scalars.front()->getType(), Sz);
        // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          // ...
        }
      }
    }
  }
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // ...
  auto *ScalarTy = TE.Scalars.front()->getType();
  // ...
  for (auto [Idx, Sz] : SubVectors) {
    // ...
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  // ...
      ReorderMask[I] = I + TE.ReorderIndices.size();
  // ...
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
      // ...
      VecTy, ReorderMask);
  // ...
    DemandedElts.clearBit(I);
    // ...
      ReorderMask[I] = I;
    // ...
      ReorderMask[I] = I + Sz;
  // ...
  if (!DemandedElts.isAllOnes())
    // ...
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    // ...
    TE.ReorderIndices.clear();
  }
}
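// Checks whether an fadd/fsub bundle whose multiply operands are contractable
// can be turned into a vector fmuladd: the fmul+fadd cost is compared against
// the FMA cost under the accumulated fast-math flags.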
// ...
                               const InstructionsState &S,
                               // ...
        return V->getType()->getScalarType()->isFloatingPointTy();
      // ...
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
  // ...
    for (Value *V : VL) {
      // ...
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        // ...
      FMF &= FPCI->getFastMathFlags();
    }
  // ...
  if (!CheckForContractable(VL))
    // ...
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    // ...
  if (!CheckForContractable(Operands.front()))
    // ...
  for (Value *V : VL) {
    // ...
    if (!S.isCopyableElement(I))
      // ...
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
    // ...
    if (S.isCopyableElement(V))
      // ...
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      // ...
      FMACost += TTI.getInstructionCost(OpI, CostKind);
    }
    // ...
    FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
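// transformNodes: post-build transformations of the graph (gather-node
// re-vectorization, strided/interleaved memory forms, combined min/max and
// fmuladd nodes). The RAII helper below marks the graph-transform mode for
// the duration of the pass.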
  BaseGraphSize = VectorizableTree.size();
  // ...
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    // ...
          I2->getOperand(Op));
    // ...
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          // ...
              [](const std::pair<Value *, Value *> &P) {
                // ...
  };
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
      reorderGatherNode(E);
  // ...
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      // ...
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
      // ...
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      // ...
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          // ...
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          // ...
    } else {
      // ...
      if (It != E.Scalars.end()) {
        // ...
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                  return is_contained(TEs, TE);
                });
              });
            // ...
      }
    }
    // ...
  };
  // ...
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      // ...
      unsigned MinVF = getMinVF(2 * Sz);
      // ...
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            // ...
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        // ...
      if (CheckForSameVectorNodes(E))
        // ...
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      // ...
          *TTI, VL.front()->getType(), VL.size() - 1);
      // ...
               *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        // ...
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          // ...
              !getSameValuesTreeEntry(Slice.front(), Slice, true))
            // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              // ...
              (S.getOpcode() == Instruction::Load &&
               // ...
              (S.getOpcode() != Instruction::Load &&
               // ...
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
                    // ...
              if (S.getOpcode() == Instruction::Load) {
                // ...
                StridedPtrInfo SPtrInfo;
                // ...
                    PointerOps, SPtrInfo);
                // ...
                if (UserIgnoreList && E.Idx == 0)
                  continue;
                // ...
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                          // ...
                          !CheckOperandsProfitability(
                              // ...
        }
        // ...
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          // ...
          const TreeEntry *SameTE = nullptr;
          // ...
              It != Slice.end()) {
            // ...
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              // ...
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              // ...
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            // ...
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      // ...
      E.ReorderIndices.clear();
    }
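    // Per-opcode refinements: loads/stores may be converted to strided or
    // interleaved accesses when TTI reports them legal and cheaper; selects
    // that match a min/max pattern and fadd/fsub fed by fmul are combined.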
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // ...
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      // ...
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 // ...
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            false, CommonAlignment, CostKind, BaseLI);
        // ...
                ->getPointerOperand()
                // ...
        StridedPtrInfo SPtrInfo;
        SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
        SPtrInfo.Ty = VecTy;
        TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
        E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      // ...
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 // ...
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // ...
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // ...
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          // ...
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            // ...
                  Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
              TTI.isLegalInterleavedAccessType(
                  VecTy, Factor, BaseSI->getAlign(),
                  BaseSI->getPointerAddressSpace()))
            // ...
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // ...
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // ...
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // ...
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    // ...
    }
  }
  if (LoadEntriesToVectorize.empty()) {
    // ...
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // ...
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;
    // ...
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        // ...
        [](const std::unique_ptr<TreeEntry> &TE) {
          return TE->isGather() && TE->hasState() &&
                 TE->getOpcode() == Instruction::Load &&
                 // ...
  }
  // ...
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 // ...
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  // ...
                                  return isa<LoadInst>(V) &&
                                         !isVectorized(V) &&
                                         !isDeleted(cast<Instruction>(V));
                                  // ...
      for (Value *V : E.Scalars) {
        // ...
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                // ...
      }
  }
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
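/// Estimates the cost of building a vector from scalars and shuffles. The
/// class mirrors the structure of the shuffle-instruction builder so that the
/// cost model and the generated code stay in sync.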
class ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  // ...
  bool SameNodesEstimated = true;
  // ...
    if (Ty->getScalarType()->isPointerTy()) {
      // ...
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
    }
  // ...
    assert(It != VL.end() && "Expected at least one non-undef value.");
    // ...
        count(VL, *It) > 1 &&
        // ...
    if (!NeedShuffle) {
      // ...
        return TTI.getShuffleCost(
            // ...
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
                                    // ...
    }
    // ...
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    // ...
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                               // ...
                               VecTy, ShuffleMask, CostKind,
                               // ...
    return GatherCost +
           // ...
           : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                             // ...
  }

  // ...
      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
      unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // ...
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          // ...
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          // ...
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // ...
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    // ...
                                      if (I == PoisonMaskElem)
                                        // ...
                                      return std::min(S, I);
                                    // ...
      int OffsetReg1 = OffsetReg0;
      // ...
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      // ...
        int Idx = I - OffsetReg0;
        // ...
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          // ...
          if (Indices.size() == 1) {
            // ...
                std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                [&](int S, int I) {
                  if (I == PoisonMaskElem)
                    // ...
                  int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                              ((I - OffsetReg0) % NumElts) / EltsPerVector;
                  if (RegId == FirstRegId)
                    // ...
                  return std::min(S, I);
                });
            // ...
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      // ...
      return ShuffleKind;
    };
    // ...
      if (!ShuffleKinds[Part])
        continue;
      // ...
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // ...
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          // ...
      }
      // ...
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // ...
        TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
    if (OriginalCost < Cost)
      Cost = OriginalCost;
    // ...
  }

  // ...
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                // ...
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // ...
      if ((InVectors.size() == 2 &&
           // ...
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        // ...
               "Expected all poisoned elements.");
        // ...
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        // ...
      }
      // ...
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            // ...
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      // ...
        VF = std::max(VF, getVF(V1));
      // ...
        VF = std::max(VF, E->getVectorFactor());
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      // ...
        VF = std::max(VF, E->getVectorFactor());
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    // ...
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      // ...
      return Mask.empty() ||
             (VF == Mask.size() &&
              // ...
    }

  public:
    // ...
    ~ShuffleCostBuilder() = default;
    // ...
      if (isEmptyOrIdentity(Mask, VF))
        // ...
      if (isEmptyOrIdentity(Mask, VF))
        // ...
    void resizeToMatch(Value *&, Value *&) const {}
  };
    ShuffleCostBuilder Builder(TTI);
    // ...
    unsigned CommonVF = Mask.size();
    // ...
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        // ...
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    // ...
      }
      // ...
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) {
      // ...
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        // ...
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            // ...
      }
      // ...
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
      CommonVF = std::max(VF, E2->getVectorFactor());
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // ...
        for (int &Idx : CommonMask) {
          // ...
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
            // ...
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                  // ...
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
             [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        // ...
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 // ...
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      // ...
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          // ...
          [](const auto &&P) {
            // ...
                   static_cast<unsigned>(P.value()) != P.index();
          // ...
    } else if (V1 && P2.isNull()) {
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      // ...
             [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // ...
      unsigned VF = getVF(V1);
      // ...
      CommonVF = std::max(VF, E2->getVectorFactor());
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        // ...
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // ...
      unsigned VF = getVF(V2);
      // ...
      CommonVF = std::max(VF, E1->getVectorFactor());
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
        }
        // ...
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      // ...
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    }
    InVectors.front() =
        // ...
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }

public:
  // ...
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
                                   E->ReorderIndices.end());
      // ...
    }
    // ...
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  // ...
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    // ...
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      // ...
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        // ...
        if (!CheckedExtracts.insert(V).second ||
            // ...
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
                                              // ...
        // ...
        unsigned Idx = *EEIdx;
        // ...
        if (EE->hasOneUse() || !PrevNodeFound) {
          // ...
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
                // ...
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                // ...
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                // ...
                .first->getSecond();
        DemandedElts.setBit(Idx);
    // ...
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      // ...
                                            DemandedElts, false,
                                            // ...
    // ...
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    // ...
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    // ...
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  // ...
           return Idx < static_cast<int>(E1.getVectorFactor());
           // ...
           "Expected single vector shuffle mask.");
  // ...
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  // ...
    assert(InVectors.size() == 1 &&
           // ...
               ->getOrdered(P.index()));
           return EI->getVectorOperand() == V1 ||
                  EI->getVectorOperand() == V2;
           // ...
           "Expected extractelement vectors.");
  // ...
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    // ...
        !CommonMask.empty() &&
        // ...
            ->getOrdered(P.index());
        // ...
        return P.value() == Mask[P.index()] ||
               // ...
          return EI->getVectorOperand() == V1;
        // ...
        "Expected only tree entry for extractelement vectors.");
    // ...
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      // ...
          ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      // ...
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  void gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
              Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    // ...
    unsigned VF = VL.size();
    if (MaskVF != 0)
      VF = std::min(VF, MaskVF);
    Type *VLScalarTy = VL.front()->getType();
    // ...
          getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
    // ...
  }
  InstructionCost finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask,
      // ...
    IsFinalized = true;
    // ...
    if (InVectors.size() == 2)
      Cost += createShuffle(Vec, InVectors.back(), CommonMask);
    else
      Cost += createShuffle(Vec, nullptr, CommonMask);
    transformMaskAfterShuffle(CommonMask, CommonMask);
    // ...
           "Expected vector length for the final value before action.");
    // ...
    Cost += createShuffle(V1, V2, Mask);
    // ...
    InVectors.front() = V;
    // ...
    if (!SubVectors.empty()) {
      // ...
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // ...
      if (!SubVectorsMask.empty()) {
        // ...
               "Expected same size of masks for subvectors and common mask.");
        // ...
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          // ...
            I1 = I2 + CommonMask.size();
        }
      }
      // ...
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          // ...
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              // ...
        }
        // ...
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    // ...
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        // ...
      } else {
        // ...
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          // ...
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      // ...
    }
    // ...
        createShuffle(InVectors.front(),
                      InVectors.size() == 2 ? InVectors.back() : nullptr,
                      // ...
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

// ...
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // ...
  }
  return TTI::CastContextHint::None;
}
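// getEntryCost: returns the difference between the vectorized and the scalar
// cost of a tree entry, dispatching on the entry's opcode/state.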
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  // ...
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    // ...
  }
  // ...
  unsigned EntryVF = E->getVectorFactor();
  // ...
  if (E->isGather()) {
    // ...
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    // ...
    if (E->ReorderIndices.empty()) {
      // ...
          E->CombinedEntriesWithIndices.back().second,
          // ...
          VectorizableTree[E->CombinedEntriesWithIndices.back().first]
              ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      // ...
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0,
                             "Calculated costs for Tree"));
    // ...
  }
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       // ...
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // ...
      NewMask.resize(E->ReorderIndices.size());
      // ...
    }
    // ...
  }
  if (!E->ReuseShuffleIndices.empty())
    // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  // ...
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         // ...
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    // ...
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      // ...
  };
  // ...
      ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
    // ...
      for (unsigned I = 0; I < Sz; ++I) {
        if (UsedScalars.test(I))
          continue;
        ScalarCost += ScalarEltCost(I);
      }
    // ...
        (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
      // ...
      if (!EI.UserTE->hasState() ||
          EI.UserTE->getOpcode() != Instruction::Select ||
          // ...
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            (EI.UserTE->isGather() ||
             EI.UserTE->State == TreeEntry::SplitVectorize)
                ? EI.UserTE->Scalars.front()->getType()
                : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
          // ...
              UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          // ...
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ...
          VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                           CostKind);
        }
      }
    }
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
  };
  // ...
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    // ...
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    // ...
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    // ...
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // ...
    if (VI && SelectOnly) {
      // ...
             "Expected only for scalar type.");
      // ...
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    // ...
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         // ...
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
      }
      if (const TreeEntry *OpTE =
              // ...
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        // ...
        SrcVecTy = EE->getVectorOperandType();
      } else {
        // ...
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        // ...
          NumElts = ATy->getNumElements();
        // ...
      }
      if (I->hasOneUse()) {
        // ...
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              // ...
      }
      // ...
    };
    // ...
      if (DemandedElts.isZero())
        // ...
      return CommonCost - (DemandedElts.isZero()
                               // ...
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, false,
                                     // ...
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
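  // InsertElement bundles are costed as a subvector insert: compute the
  // covered offset range, the insert-vector size, and the demanded elements,
  // then account for any surrounding blend when the subvector is partial.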
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    // ...
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    // ...
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    // ...
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        // ...
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
        InsertVecTy, Mask);
    // ...
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    // ...
    SmallBitVector InMask =
        // ...
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
        // ...
      for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
           // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    // ...
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          // ...
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 // ...
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
14714 case Instruction::FCmp:
14715 case Instruction::ICmp:
14716 case Instruction::Select: {
14717 CmpPredicate VecPred, SwappedVecPred;
14720 match(VL0, MatchCmp))
14726 auto GetScalarCost = [&](
unsigned Idx) {
14736 !
match(VI, MatchCmp)) ||
14744 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14745 CostKind, getOperandInfo(
VI->getOperand(0)),
14746 getOperandInfo(
VI->getOperand(1)), VI);
14757 TTI->getCmpSelInstrCost(
E->getOpcode(), VecTy, MaskTy, VecPred,
14758 CostKind, getOperandInfo(
E->getOperand(0)),
14759 getOperandInfo(
E->getOperand(1)), VL0);
14763 unsigned CondNumElements = CondType->getNumElements();
14765 assert(VecTyNumElements >= CondNumElements &&
14766 VecTyNumElements % CondNumElements == 0 &&
14767 "Cannot vectorize Instruction::Select");
14768 if (CondNumElements != VecTyNumElements) {
14777 return VecCost + CommonCost;
14779 return GetCostDiff(GetScalarCost, GetVectorCost);
14781 case TreeEntry::MinMax: {
14782 auto GetScalarCost = [&](
unsigned Idx) {
14783 return GetMinMaxCost(OrigScalarTy);
14787 return VecCost + CommonCost;
14789 return GetCostDiff(GetScalarCost, GetVectorCost);
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetFMulAddCost(E->getOperations(),
      for (Value *V : E->Scalars) {
          FMF &= FPCI->getFastMathFlags();
          FMF &= FPCIOp->getFastMathFlags();
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      return VecCost + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
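  // Unary/binary arithmetic: the scalar cost is queried per lane with that
  // lane's actual operands, so constant or uniform operands are costed as
  // such; the vector cost is one wide instruction with aggregated operand
  // info (Op1Info/Op2Info).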
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      Value *Op1 = E->getOperand(0)[Idx];
      SmallVector<const Value *, 2> Operands(1, Op1);
        Op2 = E->getOperand(1)[Idx];
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
          return CI && CI->getValue().countr_one() >= It->second.first;
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
                                         Op1Info, Op2Info, {}, nullptr, TLI) +
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
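  // Loads: the vector cost depends on how the node is lowered. Vectorize
  // uses a plain or interleaved wide load, StridedVectorize a strided load,
  // CompressVectorize a (possibly masked) load plus a compress shuffle, and
  // ScatterVectorize a gather; the GEP cost diff credits the scalar address
  // computations that disappear.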
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            false, CommonAlignment, CostKind);
      case TreeEntry::CompressVectorize: {
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
              LoadVecTy, CompressMask, CostKind);
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LoadVecTy, CompressMask, CostKind);
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            false, CommonAlignment, CostKind);
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
      return VecLdCost + CommonCost;
    if (E->State == TreeEntry::ScatterVectorize)
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
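  // Stores mirror the load handling: strided stores use the strided hook,
  // interleaved groups the interleaved hook, and consecutive stores a single
  // wide store, again with a GEP cost diff for the pointer operands.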
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            false, CommonAlignment, CostKind);
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
      return VecStCost + CommonCost;
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
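  // Calls: the scalar side is the per-lane intrinsic cost; the vector side
  // takes the cheaper of the two vector call lowerings returned in
  // VecCallCosts (vector intrinsic vs. vector library call).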
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
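  // Alternate-opcode (shuffled) nodes: cost both wide opcodes plus the
  // blending shuffle, reuse a "diamond match" node with equal operands when
  // one exists, and prefer a single legal alt instruction when the target
  // reports one via isLegalAltInstr.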
  case Instruction::ShuffleVector: {
           "Invalid Shuffle Vector Operand");
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
    auto GetScalarCost = [&](unsigned Idx) {
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
      if (TryFindNodeWithEqualOperands()) {
        dbgs() << "SLP: diamond match for alternate node found.\n";
          TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
          TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                 << "SLP: alternate extension, which should be truncated.\n";
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
    return GetCostDiff(
           "Not supported shufflevector usage.");
      unsigned SVNumElements =
          ->getNumElements();
      unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
      for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
             "Not supported shufflevector usage.");
        [[maybe_unused]] bool IsExtractSubvectorMask =
            SV->isExtractSubvectorMask(Index);
        assert(IsExtractSubvectorMask &&
               "Not supported shufflevector usage.");
        if (NextIndex != Index)
        NextIndex += SV->getShuffleMask().size();
      return ::getShuffleCost(
    return GetCostDiff(GetScalarCost, GetVectorCost);
  case Instruction::Freeze:
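// A tiny tree (one or two nodes) is considered fully vectorizable only when
// the root is a real vectorized node and any second node is a gather that is
// cheap to materialize (extracts, non-alternate loads, or few scalars).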
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable.\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE,
                                         unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
               [this](Value *V) { return EphValues.contains(V); }) &&
           TE->Scalars.size() < Limit ||
           (((TE->hasState() &&
              TE->getOpcode() == Instruction::ExtractElement) ||
             (TE->hasState() && TE->getOpcode() == Instruction::Load &&
              !TE->isAltShuffle()) ||
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
  if (VectorizableTree.size() != 2)
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
                                       bool MustMatchOrInst) {
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
             ShAmtC->urem(8) == 0))) {
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
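// isTreeTinyAndNotFullyVectorizable and friends: structural bail-outs that
// reject graphs which are effectively all gathers or buildvectors, or whose
// few store/load nodes cannot pay for the shuffles the rest of the tree needs.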
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
  if (VectorizableTree.size() == 2 &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
  constexpr int Limit = 4;
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                          return isa<PoisonValue>(V) || MustGather.contains(V);
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
               if (TE->isGather())
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                        TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2)));
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
          [&](const std::unique_ptr<TreeEntry> &TE) {
            return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                   TE->UserTreeIndex.UserTE->Idx == 0;
      VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
          ArrayRef(VectorizableTree).drop_front(2),
          [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
  if (isFullyVectorizableTinyTree(ForReduction))
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
          return isa<ExtractElementInst, Constant>(V) ||
                 (IsAllowedSingleBVNode &&
                  !V->hasNUsesOrMore(UsesLimit) &&
                  any_of(V->users(), IsaPred<InsertElementInst>));
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
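// Spill-cost estimation: for vector values live across a real call (cheap or
// assume-like intrinsics are exempt), charge the target's cost of keeping the
// vector live over the call and credit the scalars the vector replaces.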
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
    if (II->isAssumeLikeIntrinsic())
    return IntrCost < CallCost;
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
        return It->second.getInt() != 0;
        ++First->getIterator().getReverse(),
        Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
    for (const Instruction *LastInst : LastInstsInRange)
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
      ParentOpParentToPreds;
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
    for (const auto &KeyPair : ParentsPairsToAdd) {
             "Should not have been added before.");
    while (!Worklist.empty()) {
      if (BB == OpParent || !Visited.insert(BB).second)
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        ParentsPairsToAdd.insert(Pair);
      if (Budget > BudgetLimit)
  while (!LiveEntries.empty()) {
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
      if (!Op->isGather())
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
            ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        for (Value *V : Op->Scalars) {
        OpLastInst = EntriesToLastInstruction.at(Op);
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
  const auto *I1 = IE1;
  const auto *I2 = IE2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
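// Folds a chain of (vector, mask) pairs into one value: the first pair seeds
// the result, re-using the base vector when it is not fully undef, and each
// following mask is merged in with a two-source shuffle after both sides are
// resized (via ResizeAction) to a common number of elements.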
template <typename T>
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  auto VMIt = std::next(ShuffleMask.begin());
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
    Prev = Action(Mask, {ShuffleMask.begin()->first});
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = SecMask[I] + Vec1VF;
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, false);
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      Prev = Action(Mask, {Res1.first, Res2.first});
    VMIt = std::next(VMIt);
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, false);
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
             "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    Prev = Action(Mask, {Prev, Res.first});
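// getTreeCost: sum the per-node costs, then add the cost of extracting
// externally used scalars, the shuffles feeding external insertelement
// users, a final resize for narrowed reductions, and the spill estimate.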
template <typename T> struct ShuffledInsertData {
  MapVector<T, SmallVector<int>> ValueMasks;
                    << VectorizableTree.size() << ".\n");
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    if (TE.State == TreeEntry::CombinedVectorize) {
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
                   << "SLP: Current total cost = " << Cost << "\n");
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
               << "SLP: Current total cost = " << Cost << "\n");
      none_of(ExternalUses, [](const ExternalUser &EU) {
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
               else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
    if (EphValues.count(EU.User))
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      if (!UsedInserts.insert(VU).second)
      const TreeEntry *ScalarTE = &EU.E;
          [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
            Value *Op0 = II->getOperand(0);
      if (It == ShuffledInserts.end()) {
        Data.InsertElements.emplace_back(VU);
        VecId = ShuffledInserts.size() - 1;
        auto It = MinBWs.find(ScalarTE);
        if (It != MinBWs.end() &&
                .insert(std::make_pair(ScalarTE, FTy->getElementType()))
          unsigned BWSz = It->second.first;
          unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
          unsigned VecOpcode;
          if (DstBWSz < BWSz)
            VecOpcode = Instruction::Trunc;
              It->second.second ? Instruction::SExt : Instruction::ZExt;
                             FTy->getNumElements()),
                     << " for extending externally used vector with "
                        "non-equal minimum bitwidth.\n");
        It->InsertElements.front() = VU;
        VecId = std::distance(ShuffledInserts.begin(), It);
      int InIdx = *InsertIdx;
          ShuffledInserts[VecId].ValueMasks[ScalarTE];
      Mask[InIdx] = EU.Lane;
      DemandedElts[VecId].setBit(InIdx);
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
          ? Instruction::ZExt
          : Instruction::SExt;
                 << ExtraCost << "\n");
          CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      auto IsPhiInLoop = [&](const ExternalUser &U) {
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          if (IsPhiInLoop(P.value()))
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
      auto OperandIsScalar = [&](Value *V) {
          return !EE->hasOneUse() || !MustGather.contains(EE);
        return ValueToExtUses->contains(V);
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
          Op && all_of(Op->operands(), OperandIsScalar)) {
        if (ScalarCost + OpCost <= ExtraCost) {
          CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
          ScalarCost += OpCost;
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        bool IsProfitablePHIUser =
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          VectorizableTree.front()->getMainOp())
              return ValueToExtUses->contains(V);
        if (IsProfitablePHIUser) {
            (!GatheredLoadsEntriesFirst.has_value() ||
             Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              ExternalUses[It->second].User = nullptr;
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  ExternalUses[It->second].User = nullptr;
    ExtractCost += ExtraCost;
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
  Cost += ExtractCost;
                         bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
      if (HasLargeIndex) {
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        return std::make_pair(TE, true);
      if (!ForSingleMask) {
        for (unsigned I = 0; I < VF; ++I) {
            ResizeMask[Mask[I]] = Mask[I];
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
    return std::make_pair(TE, false);
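  // For every group of external insertelement users, cost the final shuffle
  // that builds the inserted vector directly from the tree entries and
  // subtract the cost of the insertelement sequence it replaces.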
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
                   (Data.index() < VF &&
                    static_cast<int>(Data.index()) == Data.value());
                   << " for final shuffle of insertelement "
                      "external users.\n";
            TEs.front()->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
                   << " for final shuffle of vector node and external "
                      "insertelement users.\n";
            if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
            ShuffledInserts[I].InsertElements.front()->getType()),
    Cost -= InsertCost;
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction)
              Instruction::BitCast;
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      if (Opcode != Instruction::BitCast) {
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                   << " for final resize for reduction from " << SrcVecTy
                   << " to " << DstVecTy << "\n";
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
  std::optional<InstructionCost> SpillCost;
    Cost += *SpillCost;
    OS << "SLP: Spill Cost = ";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
    ViewGraph(this, "SLP" + F->getName(), false, Str);
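// A gather whose scalars are extractelements from at most two source vectors
// can become a shuffle: group the scalars by vector operand, keep the one or
// two most-used sources, and record the extract indices in the mask.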
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (Idx >= VecTy->getNumElements()) {
      SmallBitVector ExtractMask(VecTy->getNumElements(), true);
      ExtractMask.reset(*Idx);
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  std::optional<TTI::ShuffleKind> Res =
    return std::nullopt;
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 &&
         "NumParts expected to be greater than or equal to 1.");
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
    ShufflesRes.clear();
  return ShufflesRes;
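// isGatherShuffledSingleRegisterEntry: try to serve a gather from already
// built tree entries. A candidate must dominate the gather's insertion
// point, so the checks below compare user entries, edge indices, and
// last-instruction ordering before an entry is accepted.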
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
  const EdgeInfo TEUseEI = GetUserEntry(TE);
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertBlock = TEInsertPt->getParent();
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
    if (TEInsertPt->getParent() == InsertBlock &&
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
    for (Value *V : VL) {
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
  for (Value *V : VL) {
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
              : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
        if (UseEI.UserTE->State != TreeEntry::Vectorize ||
            (UseEI.UserTE->hasState() &&
             UseEI.UserTE->getOpcode() == Instruction::PHI &&
             !UseEI.UserTE->isAltShuffle()) ||
            (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
      if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
          TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
          UseEI.UserTE->State == TreeEntry::Vectorize &&
          UseEI.UserTE->getOpcode() == Instruction::PHI &&
          TEUseEI.UserTE != UseEI.UserTE)
      if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
      if (TEUseEI.UserTE != UseEI.UserTE &&
          (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
           HasGatherUser(TEUseEI.UserTE)))
      if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
      if (CheckAndUseSameNode(TEPtr))
        VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
    if (It != VTEs.end()) {
      const TreeEntry *VTE = *It;
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
      if (CheckAndUseSameNode(VTE))
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        if (MIt == VTEs.end())
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
      if (CheckAndUseSameNode(VTE))
    if (VToTEs.empty())
    if (UsedTEs.empty()) {
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        if (!VToTEs.empty()) {
          VToTEs = SavedVToTEs;
      if (Idx == UsedTEs.size()) {
        if (UsedTEs.size() == 2)
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
  if (UsedTEs.empty()) {
    return std::nullopt;
  if (UsedTEs.size() == 1) {
                                            UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
        SmallVector<int> CommonMask = TE->getCommonMask();
    Entries.push_back(FirstEntries.front());
    for (auto &P : UsedValuesEntry)
    VF = FirstEntries.front()->getVectorFactor();
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
                                             UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
    if (Entries.empty()) {
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
      VF = Entries.front()->getVectorFactor();
  for (const TreeEntry *E : Entries)
  for (auto &P : UsedValuesEntry) {
    if (ValuesToEntries[Idx].contains(P.first)) {
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In1 = PHI1->getIncomingValue(I);
  auto MightBeIgnored = [=](Value *V) {
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
  SmallBitVector UsedIdxs(Entries.size());
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
        ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
         (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
    unsigned Idx = It->second;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
          .slice(Part * VL.size(),
                 std::min<int>(VL.size(), TE->Scalars.size())))) {
    return std::nullopt;
  bool IsIdentity = Entries.size() == 1;
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      if (EntryLanes.size() > 2 || VL.size() <= 2)
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        (MaxElement % VF) - (MinElement % VF) + 1));
    for (int &Idx : SubMask) {
      Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
            (Idx >= static_cast<int>(VF) ? NewVF : 0);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
              Mask, Entries.front()->getInterleaveFactor()))
      return ::getShuffleCost(TTI,
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
        IsIdentity &= static_cast<int>(I) == Idx;
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
        IsIdentity &= static_cast<int>(I) == Idx;
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    if (BuildVectorCost >= ShuffleCost) {
      Entries.push_back(BestEntry);
  std::fill(std::next(Mask.begin(), Part * VL.size()),
  return std::nullopt;
BoUpSLP::isGatherShuffledEntry(
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       [](const std::unique_ptr<TreeEntry> &TE) {
         return !TE->isGather();
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
      SubEntries.clear();
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      LocalSubEntries.swap(SubEntries);
      std::iota(Mask.begin(), Mask.end(), 0);
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
      Entries.emplace_back(1, LocalSubEntries.front());
      [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
                                    V->getType(),
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
      ConstantShuffleMask[I] = I + VF;
      EstimateInsertCost(I, V);
  bool IsAnyNonUndefConst =
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
                          ConstantShuffleMask);
  if (!DemandedElements.isZero())
      ForPoisonSrc && !IsAnyNonUndefConst, VL);
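// getLastInstructionInBundle: the insertion point for a bundle is the last
// scheduled member instruction; unscheduled or gathered bundles fall back to
// a dominator-tree DFS-number scan over the scalars (FindLastInst /
// FindFirstInst).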
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
         ((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
         "Expected gathered loads or GEPs or instructions from same basic "
  auto FindLastInst = [&]() {
    for (Value *V : E->Scalars) {
      if (E->isCopyableElement(I))
      if (LastInst->getParent() == I->getParent()) {
      assert(((Opcode == Instruction::GetElementPtr &&
              E->State == TreeEntry::SplitVectorize ||
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
      if (!DT->isReachableFromEntry(I->getParent()))
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
  auto FindFirstInst = [&]() {
    for (Value *V : E->Scalars) {
      if (E->isCopyableElement(I))
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
      assert(((Opcode == Instruction::GetElementPtr &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
      if (!DT->isReachableFromEntry(I->getParent()))
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    for (auto *E : Entries) {
      I = &getLastInstructionInBundle(E);
    EntryToLastInstruction.try_emplace(E, Res);
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
    for (Value *V : E->Scalars) {
      if (Bundles.empty())
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
           return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
           return isa<PoisonValue>(V) ||
                  (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                  E->isCopyableElement(V) ||
                  (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
      Res = FindLastInst();
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
  assert(!E->isGather() && "Gathered instructions should not be scheduled");
  Res = Bundle->getBundle().back()->getInst();
  EntryToLastInstruction.try_emplace(E, Res);
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
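// setInsertPointAfterBundle: place the IR builder just past the bundle's
// last instruction, skipping PHIs and landing pads, so the vectorized value
// is emitted after every scalar it replaces.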
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       E->doesNotNeedToSchedule()) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
    Builder.SetInsertPoint(
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
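// gather(): materialize a gathered bundle as an insertelement chain. Scalars
// whose defining instruction lives in the insertion block (or inside the
// enclosing loop) are postponed and inserted at the end so no insert
// precedes its operand; new inserts are registered for CSE and external-use
// tracking.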
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // Postpone insertion of scalars defined in the same block (or inside the
  // current loop) until after all other inserts, to improve CSE/hoisting.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
      // ...
    }
    // ...
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
      User *UserOp = nullptr;
      if (Scalar != V) {
        if (V->getType()->isVectorTy()) {
          if (auto *SV = dyn_cast<ShuffleVectorInst>(Scalar);
              SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
            // Find the shufflevector introduced by a resize of the operand.
            auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
              if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
                if (SV->getOperand(0) == V)
                  return SV;
                if (SV->getOperand(1) == V)
                  return SV;
              }
              return nullptr;
            };
            if (Instruction *User = FindOperand(SV->getOperand(0), V))
              UserOp = User;
            else if (Instruction *User = FindOperand(SV->getOperand(1), V))
              UserOp = User;
            assert(UserOp &&
                   "Failed to find shufflevector, caused by resize.");
          }
        }
        // ...
      } else {
        UserOp = InsElt;
      }
      if (UserOp) {
        unsigned FoundLane = Entries.front()->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
      }
    }
    return Vec;
  };
  // ...
  SmallVector<int> Mask(VL.size(), PoisonMaskElem);
  SmallVector<int> NonConsts;
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (Root) {
    // ...
    Vec = OriginalRoot;
    // ...
    Vec = CreateShuffle(Root, Vec, Mask);
    if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
        OI && OI->use_empty() &&
        none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->VectorizedValue == OI;
        }))
      eraseInstruction(OI);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to
  // make it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
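// Note (illustrative): when the gather is emitted in a loop preheader and one
// scalar is computed inside the loop, its insertelement is postponed, so the
// loop-invariant prefix of the chain stays hoistable and only the
// loop-dependent lane is inserted late.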
  /// Set to true once finalize() has combined the accumulated vectors.
  bool IsFinalized = false;

  /// IR builder used to emit shuffles; every created instruction is recorded
  /// in the CSE structures of the outer BoUpSLP instance.
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (cast<VectorType>(V2->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(
              V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
        else
          V1 = Builder.CreateIntCast(
              V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates permutation of the single vector operand with the given mask,
    /// if it is not identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes 2 input vectors to match their sizes, if they are not equal
    /// yet. The smaller vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }
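  // Note: createShuffle defers to BaseShuffleAnalysis::createShuffle, which
  // looks through existing shufflevectors in the operands and may fold the
  // requested mask into them instead of emitting a fresh instruction.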
  /// Cast value \p V to a vector with the same number of elements but with
  /// the base element type of \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

  /// Returns the vectorized value of \p E, cast to the element type of
  /// ScalarTy if needed; signedness is derived from the scalars.
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the extract is used outside of this gather or still has
      // non-vectorized users, it cannot be erased yet.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE == UTEs.front();
                            }) != 1;
          }) ||
          !is_contained(VL, EI))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // Perform multi-register shuffle of the vector operands, combining at
    // most 2 bases per part.
    Value *Vec = nullptr;
    // ...
    constexpr int MaxBases = 2;
    SmallVector<Value *, MaxBases> Bases(MaxBases);
    auto VLMask = zip(SubVL, SubMask);
    const unsigned VF = std::accumulate(
        VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
          if (std::get<1>(D) == PoisonMaskElem)
            return S;
          Value *VecOp =
              cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          assert(VecOp && "Expected vectorized value.");
          const unsigned Size =
              cast<FixedVectorType>(VecOp->getType())->getNumElements();
          return std::max(S, Size);
        });
    for (const auto [V, I] : VLMask) {
      // ...
      VecOp = TEs.front()->VectorizedValue;
      assert(VecOp && "Expected vectorized value.");
      VecOp = castToScalarTyElem(VecOp);
      Bases[I / VF] = VecOp;
    }
    // ...
    Value *SubVec;
    if (!Bases.front())
      SubVec = nullptr;
    else if (Bases.back()) {
      SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
      TransformToIdentity(SubMask);
    } else {
      SubVec = Bases.front();
    }
    // Combine each part's sub-vector into the accumulated result.
    ArrayRef<int> SubMask =
        Mask.slice(P * SliceSize, getNumElems(Mask.size(), SliceSize, P));
    // ...
    assert(all_of(SubMask, [](int Idx) { return Idx == PoisonMaskElem; }) &&
           "Expected first part or all previous parts masked.");
    copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
    // ...
    unsigned SubVecVF =
        cast<FixedVectorType>(SubVec->getType())->getNumElements();
    NewVF = std::max(NewVF, SubVecVF);
    // Adjust SubMask.
    for (int &Idx : SubMask)
      if (Idx != PoisonMaskElem)
        Idx += NewVF;
    copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
    Vec = createShuffle(Vec, SubVec, VecMask);
    TransformToIdentity(VecMask);
    // ...
    return Vec;
  }

  /// Checks if the specified entry \p E needs to be delayed because its
  /// dependency nodes are not vectorized yet.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }
  /// Reset the builder to correctly handle perfectly matched nodes.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    // ...
  }
  /// Adds 2 input entries and the mask for their shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }
  /// Adds a single input entry and the mask for its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another single input vector and the mask for its shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool /*ForExtracts*/ = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required at all.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  /// Finalize emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &, auto)> Action =
               {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(
                std::next(CommonMask.begin(), Idx),
                std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    // ...
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      // Mark the whole slice as referring to the reused splat lane.
      // ...
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
  bool Resized = false;
  // Check for gathered extracts.
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(TEs.begin(), TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
  }
  // Check for matching gathered entries only for loads with unordered
  // non-vectorized users or other special cases.
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        any_of(E->Scalars, IsaPred<LoadInst>)) &&
       any_of(E->Scalars,
              [this](Value *V) {
                return isa<LoadInst>(V) && isVectorized(V);
              })) ||
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      isSplat(E->Scalars) ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    GatherShuffles =
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (GatherShuffles.size() == 1 &&
        *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
        Entries.front().front()->isSame(E->Scalars)) {
      // Perfect match in the graph, will reuse the previously vectorized
      // node. Cost is 0.
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        << shortBundleName(E->Scalars, E->Idx) << ".\n");
      // Restore the mask for previous partially matched values.
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        for (auto [I, V] : enumerate(E->Scalars)) {
          if (isa<PoisonValue>(V)) {
            Mask[I] = PoisonMaskElem;
            continue;
          }
          Mask[I] = FrontTE->findLaneForValue(V);
        }
      }
      // Reset the builder to correctly handle perfect diamond matched nodes.
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      // Full matched entry found, no need to insert subvectors.
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      return Res;
    }
    if (!Resized) {
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          }))
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
    }
    // Remove shuffled elements from the list of gathers.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] != PoisonMaskElem)
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values; for repeated
    // values just reuse the mask lane.
    int NumNonConsts = 0;
    // ...
    for (auto [I, V] : enumerate(Scalars)) {
      // ...
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        Scalars.front() = UndefValue::get(OrigScalarTy);
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast; this
      // is legal if the broadcast value is guaranteed non-poisonous or is
      // already reused by the same user operation.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(E->UserTreeIndex.UserTE->Scalars,
                                       U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // ...
        }
      } else {
        // Replace undefs by poison, emit broadcast and then emit freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          // ...
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        // ...
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splat + two-source shuffle and compare
              // it with insertelement + single-source shuffle.
              constexpr TTI::TargetCostKind CostKind =
                  TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  Vec, V);
              // Shuffle required?
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // Insert the non-constant scalars directly.
              // ...
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // Build a splat of the single non-constant scalar and blend it
              // into the accumulated vector.
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
                BV = CreateShuffle(BV, nullptr, SplatMask);
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
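// Note: processBuildVector picks among three strategies: (1) reuse already
// vectorized values via extract/gather shuffles, (2) build a constant vector
// and blend the non-constant lanes in, or (3) emit a plain insertelement
// chain; a freeze is appended only when undef lanes were replaced by a
// broadcast value that may be poison.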
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder,
                                                                *this);
}
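// Note: the combined sub-entries are vectorized eagerly here so that
// processBuildVector can insert their vector values as subvectors of the
// gather result instead of re-gathering their scalars lane by lane.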
// ... (helper iterating `for (Value *V : VL)` elided) ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    // ...
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->VectorizedValue)
    return E->VectorizedValue;

  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    Value *Op1 = vectorizeTree(&OpTE1);
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(
        OpTE2.isSame(
            ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    Value *Op2 = vectorizeTree(&OpTE2);
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *R) {
          if (isa<PoisonValue>(R))
            return false;
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    if (cast<VectorType>(Op1->getType())->getElementType() != ScalarTy)
      Op1 = Builder.CreateIntCast(
          Op1, getWidenedType(ScalarTy, OpTE1.getVectorFactor()),
          GetOperandSignedness(&OpTE1));
    if (cast<VectorType>(Op2->getType())->getElementType() != ScalarTy)
      Op2 = Builder.CreateIntCast(
          Op2, getWidenedType(ScalarTy, OpTE2.getVectorFactor()),
          GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(),
                    E->CombinedEntriesWithIndices.back().second),
          0);
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      if (ScalarTyNumElements != 1) {
        // ...
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = createInsertVector(Builder, Vec, Op2,
                               E->CombinedEntriesWithIndices.back().second *
                                   ScalarTyNumElements);
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (getNumElements(Op1->getType()) != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE1.getVectorFactor()), 0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (getNumElements(Op2->getType()) != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE2.getVectorFactor()), 0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }
  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() ||
         E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Value *V = NewPhi;

    // Adjust insertion point once all PHI's have been generated.
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);
      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        break;
      }
      if (!VisitedBBs.insert(IBB).second) {
        Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
        NewPhi->addIncoming(VecOp, IBB);
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(ScalarTy,
                         cast<FixedVectorType>(V->getType())
                             ->getNumElements()),
          Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create shuffle to resize vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            }
            // ...
          }
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);
    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is a fixed vector type, we
      // need to duplicate the condition value.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0);
    Value *RHS = vectorizeOperand(E, 1);
    // An 'and' with a constant whose low bits cover the demoted bit width is
    // a no-op after minimal bit-width analysis; forward the other operand.
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for commutative sub patterns.
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      if (IsMasked) {
        // Build a constant i1 mask that selects exactly the compressed lanes.
        SmallVector<Constant *> MaskValues(
            getNumElements(LoadVecTy) / getNumElements(LI->getType()),
            ConstantInt::getFalse(VecTy->getContext()));
        for (int I : CompressMask)
          MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
        if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType()))
          MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        Constant *MaskValue = ConstantVector::get(MaskValues);
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         MaskValue);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
      unsigned StridedLoadEC =
          StridedLoadTy->getElementCount().getKnownMinValue();
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(
              StrideTy, (IsReverseOrder ? -1 : 1) *
                            static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal,
           Builder.getAllOnesMask(StridedLoadTy->getElementCount()),
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        // With REVEC, expand the per-vector pointers into per-element GEPs.
        unsigned ScalarTyNumElements = VecTy->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecPtr->getType())->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? NewLI
                   : ::propagateMetadata(NewLI, E->Scalars);
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
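  // Illustrative IR for the StridedVectorize path above (operand names are
  // hypothetical): a reversed strided load of 4 x i32 with byte stride -4
  // becomes
  //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
  //            ptr align 4 %pn, i64 -4, <4 x i1> splat (i1 true), i32 4)
  // i.e. the mask is all-ones, the explicit vector length is the bundle size,
  // and the stride is expressed in bytes.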
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
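  // Note: this strided-store path is only reached for reversed stores, hence
  // the constant negative byte stride. As with the strided load, the vp
  // intrinsic's mask is all-ones and the explicit vector length equals the
  // number of scalars in the bundle.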
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. Such arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If the argument is the condition of abs, override it with false
        // when the minimized bit width cannot overflow.
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        // ...
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      // ...
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      // ...
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for commutative sub patterns.
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
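// Illustrative result of the alternate-opcode path (value names are
// hypothetical): for a bundle {a0+b0, a1-b1, a2+b2, a3-b3} it emits both
//   %v0 = add <4 x i32> %a, %b
//   %v1 = sub <4 x i32> %a, %b
// over the full vectors and then blends the required lanes with
//   %v = shufflevector <4 x i32> %v0, <4 x i32> %v1,
//                      <4 x i32> <i32 0, i32 5, i32 2, i32 7>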
Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // Clean the Entry-to-LastInstruction table; it can be affected by
  // scheduling and needs to be rebuilt.
  EntryToLastInstruction.clear();
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache last instructions for the nodes to avoid side effects which may
  // appear during vectorization, like extra uses, etc.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Emit gathered loads first to emit better code for their users.
  SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      Instruction &LastInst =
          getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
      GatherEntries.emplace_back(TE.get(), &LastInst);
    }
  }
  for (auto &Entry : GatherEntries) {
    IRBuilderBase::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    (void)vectorizeTree(Entry.first);
  }
  // Vectorize gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get());
    }
  }
  // ...
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary stub instructions.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // (PrevVec is the stub value emitted earlier; UserI/UI/InsertPt locate
    // the vectorized user; their computation is elided here.)
    Value *PrevVec = TE->VectorizedValue;
    TE->VectorizedValue = nullptr;
    // ...
    if (UI->comesBefore(InsertPt))
      Builder.SetInsertPoint(InsertPt);
    else
      Builder.SetInsertPoint(PrevVec);
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        // Scan through the buildvector nodes as well.
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
          IsSigned =
              IsSigned.value_or(false) ||
              !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
          continue;
        }
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst =
            Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              // ...
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it is cheaper to extract and
            // all operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // When REVEC is enabled, we need to extract a whole subvector.
            unsigned VecTyNumElements = VecTy->getNumElements();
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot to the larger
          // type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      // ...
      return Vec;
    };
    // If User == nullptr, the Scalar remains a regular scalar in the
    // vectorized tree; just extract it in the end.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesWithNonUsers.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(
               Scalar->users(),
               [&](llvm::User *U) {
                 if (ExternalUsesAsOriginalScalar.contains(U))
                   return true;
                 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                 return !UseEntries.empty() &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize ||
                         E->State == TreeEntry::CompressVectorize) &&
                        any_of(UseEntries, [&,
                                            TTI = TTI](TreeEntry *UseEntry) {
                          return (UseEntry->State == TreeEntry::Vectorize ||
                                  UseEntry->State ==
                                      TreeEntry::StridedVectorize ||
                                  UseEntry->State ==
                                      TreeEntry::CompressVectorize) &&
                                 doesInTreeUserNeedToExtract(
                                     Scalar,
                                     getRootEntryInstruction(*UseEntry), TLI,
                                     TTI);
                        });
               })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // Need to use the original vector, if the root is truncated.
        auto BWIt = MinBWs.find(E);
        if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
          auto *ScalarTy = FTy->getElementType();
          auto Key = std::make_pair(Vec, ScalarTy);
          auto VecIt = VectorCasts.find(Key);
          if (VecIt == VectorCasts.end()) {
            IRBuilderBase::InsertPointGuard Guard(Builder);
            if (auto *IVec = dyn_cast<PHINode>(Vec)) {
              if (IVec->getParent()->isLandingPad())
                Builder.SetInsertPoint(IVec->getParent(),
                                       std::next(IVec->getParent()
                                                     ->getLandingPadInst()
                                                     ->getIterator()));
              else
                Builder.SetInsertPoint(
                    IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
            } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
              Builder.SetInsertPoint(IVec->getNextNode());
            }
            Vec = Builder.CreateIntCast(
                Vec,
                getWidenedType(
                    ScalarTy,
                    cast<FixedVectorType>(Vec->getType())->getNumElements()),
                BWIt->second.second);
            VectorCasts.try_emplace(Key, Vec);
          } else {
            Vec = VecIt->second;
          }
        }

        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          auto *It = find_if(
              ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                InsertElementInst *VecInsert = Data.InsertElements.front();
                return areTwoInsertFromSameBuildVector(
                    VU, VecInsert,
                    [](InsertElementInst *II) { return II->getOperand(0); });
              });
          unsigned Idx = *InsertIdx;
          if (It == ShuffledInserts.end()) {
            (void)ShuffledInserts.emplace_back();
            It = std::next(ShuffledInserts.begin(),
                           ShuffledInserts.size() - 1);
          }
          SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[Idx] = ExternalUse.Lane;
          It->InsertElements.push_back(cast<InsertElementInst>(User));
          continue;
        }
      }
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        // Find which incoming values we need to extract.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(
                  PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask,
                 [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }

    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto GetVF = [](Value *Vec) {
      return cast<VectorType>(Vec->getType())
          ->getElementCount()
          .getKnownMinValue();
    };
    // ...
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0), GetVF, ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the original value, if the vector was not fully resynthesized.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
      Inserts.push_back(II);
      ++It;
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      for (User *U : Scalar->users()) {
        // It is legal to delete users in the ignorelist.
        assert((isVectorized(U) ||
                (UserIgnoreList && UserIgnoreList->contains(U)) ||
                (isa_and_nonnull<Instruction>(U) &&
                 isDeleted(cast<Instruction>(U)))) &&
               "Deleting out-of-tree value");
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            IE->UserTreeIndex.UserTE == VectorizableTree.front().get()) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition of a poisoning logical op
        // (select-based logical and/or).
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with a non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(
            Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  // ...
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}
20667 LLVM_DEBUG(
dbgs() <<
"SLP: Optimizing " << GatherShuffleExtractSeq.size()
20668 <<
" gather sequences instructions.\n");
20675 Loop *L = LI->getLoopFor(
I->getParent());
20680 BasicBlock *PreHeader = L->getLoopPreheader();
20688 auto *OpI = dyn_cast<Instruction>(V);
20689 return OpI && L->contains(OpI);
20695 CSEBlocks.insert(PreHeader);
20700 CSEWorkList.
reserve(CSEBlocks.size());
20703 assert(DT->isReachableFromEntry(
N));
20710 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
20711 "Different nodes should have different DFS numbers");
20712 return A->getDFSNumIn() <
B->getDFSNumIn();
20720 auto &&IsIdenticalOrLessDefined = [TTI = TTI](
Instruction *I1,
20723 if (I1->getType() != I2->getType())
20728 return I1->isIdenticalTo(I2);
20729 if (SI1->isIdenticalTo(SI2))
20731 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
20732 if (SI1->getOperand(
I) != SI2->getOperand(
I))
20735 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20739 unsigned LastUndefsCnt = 0;
20740 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
20746 NewMask[
I] != SM1[
I])
20749 NewMask[
I] = SM1[
I];
20753 return SM1.
size() - LastUndefsCnt > 1 &&
20757 SM1.
size() - LastUndefsCnt));
20763 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
20765 (
I == CSEWorkList.
begin() || !DT->dominates(*
I, *std::prev(
I))) &&
20766 "Worklist not sorted properly!");
20773 !GatherShuffleExtractSeq.contains(&In))
20778 bool Replaced =
false;
20781 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20782 DT->dominates(V->getParent(), In.getParent())) {
20783 In.replaceAllUsesWith(V);
20786 if (!NewMask.
empty())
20787 SI->setShuffleMask(NewMask);
20792 GatherShuffleExtractSeq.contains(V) &&
20793 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20794 DT->dominates(In.getParent(), V->getParent())) {
20796 V->replaceAllUsesWith(&In);
20799 if (!NewMask.
empty())
20800 SI->setShuffleMask(NewMask);
20808 Visited.push_back(&In);
20813 GatherShuffleExtractSeq.clear();
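// Builds a ScheduleBundle for the scalars in VL: copyable elements get a
// fresh ScheduleCopyableData entry, all other members reuse the per-block
// ScheduleData, and the resulting bundle is recorded in ScheduledBundles.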
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
  ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    if (S.isCopyableElement(V)) {
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
  bool HasCopyables = S.areInstructionsWithCopyableElements();
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      if (!I || (HasCopyables && S.isCopyableElement(V)))
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
                .first->getSecond();
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
              ControlDependentMembers.push_back(OpSD);
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, true, SLP,
                            ControlDependentMembers);
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
          if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
              BundleMember && BundleMember->hasValidDependencies()) {
            BundleMember->clearDirectDependencies();
            if (RegionHasStackSave ||
                    BundleMember->getInst()))
              ControlDependentMembers.push_back(BundleMember);
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                ControlDependentMembers.push_back(OpSD);
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
      calculateDependencies(Bundle, !ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, !ReSchedule, SLP,
                            ControlDependentMembers);
      initialFillReadyList(ReadyInsts);
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    if (!extendSchedulingRegion(V, S)) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(false, Invalid);
      return std::nullopt;
  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
      ReadyInsts.remove(BundleMember);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                        << " was already scheduled\n");
  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      if (BD->isReady()) {
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
        for (ScheduleBundle *B : Bundles)
          ReadyInsts.insert(B);
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
      if (S.isCopyableElement(I)) {
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
                !EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            if (!Visited.insert(In).second) {
            ScheduleCopyableDataMapByInstUser
                [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
          } while (It != Op.end());
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
      if (ScheduleData *OpSD = getScheduleData(I);
          OpSD && OpSD->hasValidDependencies()) {
        OpSD->clearDirectDependencies();
        if (RegionHasStackSave ||
          ControlDependentMembers.push_back(OpSD);
      ScheduledBundles.find(I)->getSecond().pop_back();
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, false, SLP,
                            ControlDependentMembers);
    return std::nullopt;
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
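// Grows the scheduling region [ScheduleStart, ScheduleEnd) up or down within
// the basic block until it covers the instruction for V, skipping
// assume-like intrinsics and bailing out once the region would exceed
// ScheduleRegionSizeLimit.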
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
  if (!ScheduleStart) {
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
      ++ScheduleStart->getIterator().getReverse();
    return II->isAssumeLikeIntrinsic();
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);
    if (I->mayReadOrWriteMemory() &&
                                   Intrinsic::pseudoprobe))) {
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
      RegionHasStackSave = true;
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
    LastLoadStoreInRegion = CurrentLoadStore;
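// Computes def-use, control and memory dependencies for all members of a
// bundle (plus any extra control-dependent ScheduleData handed in), and
// pushes entities that become ready onto the ready list when requested.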
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
      if (CD->hasValidDependencies())
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          if (EI.UserTE->isCopyableElement(In)) {
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
        } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      if (BundleMember->hasValidDependencies())
      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
      BundleMember->initDependencies();
      BundleMember->resetUnscheduledDeps();
      SmallDenseMap<Value *, unsigned> UserToNumOps;
      for (User *U : BundleMember->getInst()->users()) {
        if (ScheduleData *UseSD = getScheduleData(U)) {
          if (areAllOperandsReplacedByCopyableData(
          BundleMember->incDependencies();
          if (!UseSD->isScheduled())
            BundleMember->incrementUnscheduledDeps(1);
          if (!UseSD->hasValidDependencies() ||
              (InsertInReadyList && UseSD->isReady()))
      for (ScheduleCopyableData *UseSD :
           getScheduleCopyableDataUsers(BundleMember->getInst())) {
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
      SmallPtrSet<const Instruction *, 4> Visited;
        if (!Visited.insert(I).second)
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->addControlDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          MakeControlDependent(I);
      if (RegionHasStackSave) {
          match(BundleMember->getInst(),
          for (Instruction *I = BundleMember->getInst()->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            MakeControlDependent(I);
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          MakeControlDependent(I);
      ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
      if (!NextLoadStore)
             "NextLoadStore list for non memory effecting bundle?");
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;
      bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
      for (ScheduleData *DepDest = NextLoadStore; DepDest;
           DepDest = DepDest->getNextLoadStore()) {
        assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
            ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
             SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
          DepDest->addMemoryDependency(BundleMember);
          BundleMember->incDependencies();
          if (!DepDest->isScheduled())
            BundleMember->incrementUnscheduledDeps(1);
          if (!DepDest->hasValidDependencies() ||
              (InsertInReadyList && DepDest->isReady()))
         "expected at least one instruction to schedule");
    WorkList.push_back(Bundle.getBundle().front());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
      Bundles = getScheduleBundles(SD->getInst());
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
        ReadyInsts.insert(Bundle);
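// Clears the scheduled flag and unscheduled-dependency counters on every
// ScheduleData, copyable data and bundle in the region so that the block can
// be rescheduled from scratch.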
void BoUpSLP::BlockScheduling::resetSchedule() {
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(false);
      SD->resetUnscheduledDeps();
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
  ReadyInsts.clear();
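// List-schedules the block: assigns scheduling priorities in source order,
// computes outstanding dependencies for each bundle, then repeatedly pops the
// highest-priority ready entity and moves its instructions directly before
// the last scheduled instruction.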
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  BS->resetSchedule();
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, false, this);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, false, this);
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
          SDTEs.front()->doesNotNeedToSchedule() ||
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
        ScheduleBundle Bundle;
        BS->calculateDependencies(Bundle, false, this);
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, false, this);
  BS->initialFillReadyList(ReadyInsts);
  Instruction *LastScheduledInst = BS->ScheduleEnd;
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
        if (PickedInst->getNextNode() != LastScheduledInst)
        LastScheduledInst = PickedInst;
        EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                           LastScheduledInst);
      if (PickedInst->getNextNode() != LastScheduledInst)
      LastScheduledInst = PickedInst;
    auto Invalid = InstructionsState::invalid();
#ifdef EXPENSIVE_CHECKS
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
           [](const ScheduleBundle *Bundle) {
             return Bundle->isScheduled();
           "must be scheduled at this point");
  BS->ScheduleStart = nullptr;
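// Computes the element width used for vectorizing V by walking its operand
// tree within the same basic block and taking the widest scalar type seen;
// pure i1 chains fall back to the width of the first non-bool operand.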
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto *Ty = I->getType();
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    for (Use &U : I->operands()) {
      if (Visited.insert(J).second &&
        FirstNonBool = U.get();
  if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
  Width = DL->getTypeSizeInBits(V->getType());
  InstrElementSize[I] = Width;
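// Checks whether the scalars of tree entry E can be demoted to a smaller
// integer bit width, recursing into operand entries and using known-bits and
// sign-bit information for the opcode-specific legality checks below.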
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (NodesToKeepBWs.contains(E.Idx))
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (getTreeEntries(V).size() > 1)
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
      BitWidth1 = std::min(BitWidth1, BitWidth2);
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
          UniqueBases.insert(EE->getVectorOperand());
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
  if (E.isGather() || !Visited.insert(&E).second ||
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
    return FinalAnalysis();
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
        if (!FinalAnalysis())
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
          if (Checker(BitWidth, OrigBitWidth))
          if (BestFailBitwidth == 0 && FinalAnalysis())
        if (BestFailBitwidth == 0) {
  auto TryProcessInstruction =
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        for (Value *V : E.Scalars)
          (void)IsPotentiallyTruncated(V, BitWidth);
          return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
        bool NeedToExit = false;
        if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
        if (!ProcessOperands(Operands, NeedToExit))
    return IsProfitableToDemote;
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
  switch (E.getOpcode()) {
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
        if (isa<PoisonValue>(V))
        if (E.isCopyableElement(V))
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  case Instruction::LShr: {
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::AShr: {
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::UDiv:
  case Instruction::URem: {
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
        [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
  case Instruction::Call: {
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        unsigned Op0SignBits =
        unsigned Op1SignBits =
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                                     SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
      CallChecker = AbsChecker;
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned VF = E.Scalars.size();
    auto Checker = [&](unsigned BitWidth, unsigned) {
      if (Cost < BestCost) {
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
  return FinalAnalysis();
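// computeMinimumValueSizes: starting from the tree roots (stores or
// insertelements, or a reduction seeded by UserIgnoreList), records in MinBWs
// the minimal bit width each vectorizable node can be emitted with, and
// derives the bit width of the reduction itself when one is present.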
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    IsProfitableToDemoteRoot = true;
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
      return MaxBitWidth;
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
        [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
    unsigned MaxBitWidth = 1u;
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
        std::min(DL->getTypeSizeInBits(
                     E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                 DL->getTypeSizeInBits(ScalarTy));
    for (Value *Root : E.Scalars) {
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!IsKnownPositive)
      MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    if (MaxBitWidth < 8 && MaxBitWidth > 1)
    if (NumParts > 1 &&
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;
    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  if (UserIgnoreList &&
    if (all_of(*UserIgnoreList,
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
      for (Value *V : *UserIgnoreList) {
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        unsigned BitWidth2 = BitWidth1;
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    IsTruncRoot = true;
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    unsigned Limit = 2;
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
    RootDemotes.clear();
    IsProfitableToDemoteRoot = true;
    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
      unsigned NewIdx = 0;
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
        NodeIdx < VectorizableTree.size() &&
        VectorizableTree[NodeIdx]->UserTreeIndex &&
        VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
            Instruction::Trunc &&
        !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
        NodeIdx < VectorizableTree.size() &&
        VectorizableTree[NodeIdx]->UserTreeIndex &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
            Instruction::ICmp &&
            VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
              auto *IC = dyn_cast<ICmpInst>(V);
              return IC && (IC->isSigned() ||
                            !isKnownNonNegative(IC->getOperand(0),
                                                SimplifyQuery(*DL)) ||
                            !isKnownNonNegative(IC->getOperand(1),
                                                SimplifyQuery(*DL)));
    if (MaxBitWidth == 0 ||
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
  for (unsigned Idx : ToDemote) {
    TreeEntry *TE = VectorizableTree[Idx].get();
    if (MinBWs.contains(TE))
      if (isa<PoisonValue>(R))
      return !isKnownNonNegative(R, SimplifyQuery(*DL));
    MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
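// Pass entry point: bails out early if the target has no vector registers or
// the function is marked no-implicit-float, then scans every block in
// post-order for store chains, GEP index chains and other vectorization
// seeds.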
  DL = &F.getDataLayout();
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  DT->updateDFSNumbers();
  for (auto *BB : post_order(&F.getEntryBlock())) {
    R.clearReductionData();
    collectSeedInstructions(BB);
    if (!Stores.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    Changed |= vectorizeChainsInBlock(BB, R);
    if (!GEPs.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
  R.optimizeGatherSequence();
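// Tries to vectorize one chain of consecutive stores at the given offset and
// minimum VF, returning the resulting tree size so the caller can compare
// profitability across different vectorization factors.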
    unsigned Idx, unsigned MinVF,
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();
      VF < 2 || VF < MinVF) {
  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
  for (Value *V : Chain)
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
  bool IsAllowedSize =
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
            return !isa<ExtractElementInst>(V) &&
                   (V->getNumUses() > Chain.size() ||
                    any_of(V->users(), [&](User *U) {
                      return !Stores.contains(U);
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
  if (R.isLoadCombineCandidate(Chain))
  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    using namespace ore;
    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned P = First ? Val.first : Val.second;
        return V + (P - Mean) * (P - Mean);
  return Dev * 96 / (Mean * Mean) == 0;
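// Groups stores that share a common base pointer, keyed by their constant
// pointer distance from a designated base store, so runs of consecutive
// stores can be found by walking the distance-sorted map.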
class RelatedStoreInsts {
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    insertOrLookup(NewBaseInstr, 0);
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  unsigned BaseInstrIdx;
  DistToInstMap Instrs;
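// Collects groups of pointer-related stores and repeatedly tries slices of
// decreasing vectorization factors, tracking per-store tree sizes so that
// ranges already proven unprofitable are not re-attempted on later passes.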
bool SLPVectorizerPass::vectorizeStores(
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        if (Idx != StoreSeq.size() - 1)
        Operands.push_back(Stores[InstIdx]);
              .insert({Operands.front(),
                       cast<StoreInst>(Operands.front())->getValueOperand(),
                       cast<StoreInst>(Operands.back())->getValueOperand(),
      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
        ValueTy = Trunc->getSrcTy();
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
      MinVF = std::max<unsigned>(2, MinVF);
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << "MinVF (" << MinVF << ")\n");
      unsigned NonPowerOf2VF = 0;
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
      unsigned MaxRegVF = MaxVF;
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << "MinVF (" << MinVF << ")\n");
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      bool RepeatChanged = false;
      bool AnyProfitableGraph = false;
      for (unsigned VF : CandidateVFs) {
        AnyProfitableGraph = false;
        unsigned FirstUnvecStore =
            std::distance(RangeSizes.begin(),
                          find_if(RangeSizes, std::bind(IsNotVectorized,
                                                        VF >= MaxRegVF, _1)));
        while (FirstUnvecStore < End) {
          unsigned FirstVecStore = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes.drop_front(FirstUnvecStore),
                      std::bind(IsVectorized, VF >= MaxRegVF, _1)));
          unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
          for (unsigned SliceStartIdx = FirstUnvecStore;
               SliceStartIdx + VF <= MaxSliceEnd;) {
                       ->getValueOperand()
                       ->getValueOperand()
                   "Expected all operands of same type.");
            if (!NonSchedulable.empty()) {
              auto [NonSchedSizeMax, NonSchedSizeMin] =
              if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                SliceStartIdx += NonSchedSizeMax;
            std::optional<bool> Res =
                vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
                  .first->getSecond()
              AnyProfitableGraph = RepeatChanged = Changed = true;
              for (std::pair<unsigned, unsigned> &P :
                   RangeSizes.slice(SliceStartIdx, VF))
                P.first = P.second = 0;
              if (SliceStartIdx < FirstUnvecStore + MinVF) {
                for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                         FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                  P.first = P.second = 0;
                FirstUnvecStore = SliceStartIdx + VF;
              if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx + VF,
                                      MaxSliceEnd - (SliceStartIdx + VF)))
                  P.first = P.second = 0;
                if (MaxSliceEnd == End)
                  End = SliceStartIdx;
                MaxSliceEnd = SliceStartIdx;
              SliceStartIdx += VF;
            if (VF > 2 && Res &&
                !all_of(RangeSizes.slice(SliceStartIdx, VF),
                        std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
              SliceStartIdx += VF;
            if (VF > MaxRegVF && TreeSize > 1 &&
                all_of(RangeSizes.slice(SliceStartIdx, VF),
                       std::bind(FirstSizeSame, TreeSize, _1))) {
              SliceStartIdx += VF;
              while (SliceStartIdx != MaxSliceEnd &&
                     RangeSizes[SliceStartIdx].first == TreeSize)
            if (TreeSize > 1) {
              for (std::pair<unsigned, unsigned> &P :
                   RangeSizes.slice(SliceStartIdx, VF)) {
                if (VF >= MaxRegVF)
                  P.second = std::max(P.second, TreeSize);
                  P.first = std::max(P.first, TreeSize);
            AnyProfitableGraph = true;
          if (FirstUnvecStore >= End)
          if (MaxSliceEnd - FirstUnvecStore < VF &&
              MaxSliceEnd - FirstUnvecStore >= MinVF)
            AnyProfitableGraph = true;
          FirstUnvecStore = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes.drop_front(MaxSliceEnd),
                      std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
        if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
      if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
            return P.first == 0 && P.second == 0;
      if (Repeat >= MaxAttempts ||
          (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
      constexpr unsigned StoresLimit = 64;
      const unsigned MaxTotalNum = std::min<unsigned>(
          static_cast<unsigned>(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
      unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
      CandidateVFs.clear();
        CandidateVFs.push_back(Limit);
      if (VF > MaxTotalNum || VF >= StoresLimit)
      for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          P.first = std::max(P.second, P.first);
        CandidateVFs.push_back(VF);
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
    if (RelatedStores == SortedStores.end()) {
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(*PrevInst + 1,
  Type *PrevValTy = nullptr;
    if (R.isDeleted(SI))
      PrevValTy = SI->getValueOperand()->getType();
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    FillStoresSet(I, SI);
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());
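// Scans the block for seed instructions: simple stores grouped by their
// underlying pointer object, and single-index, non-vector GEPs whose index
// may itself be vectorizable.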
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  for (Instruction &I : *BB) {
      if (!SI->isSimple())
      if (GEP->getNumIndices() != 1)
      Value *Idx = GEP->idx_begin()->get();
      if (GEP->getType()->isVectorTy())
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");
  for (Value *V : VL) {
    Type *Ty = V->getType();
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
  bool CandidateFound = false;
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
    if (TTI->getNumberOfParts(VecTy) == VF)
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      if (MaxVFOnly && ActualVF < MaxVF)
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
      for (Value *V : VL.drop_front(I)) {
          !Inst || !R.isDeleted(Inst)) {
          if (Idx == ActualVF)
      if (Idx != ActualVF)
      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
      if (R.isTreeTinyAndNotFullyVectorizable())
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
      R.transformNodes();
      R.buildExternalUses();
      R.computeMinimumValueSizes();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);
                        << " for VF=" << ActualVF << "\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
  /// The operation data of the reduction operations.
  using ReductionOpsType = SmallVector<Value *, 16>;
  /// List of the operation data (cmp + select pairs for min/max reductions).
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// Maps each reduced value to the reduction operations that consume it.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// Whether the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
  // ... (other members: RdxKind, ReducedVals, ReductionLimit,
  // VectorValuesAndScales)

  /// Checks if the instruction is a min/max reduction expressed as a
  /// cmp + select pair.
  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  /// Checks if the instruction is a poison-safe boolean logic operation, i.e.
  /// a select-based logical and/or.
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd(m_Value(), m_Value())) ||
            match(I, m_LogicalOr(m_Value(), m_Value())));
  }

  /// Checks if a reduction of kind \p Kind rooted at \p I can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;
    // ...
    // No need to check for associativity if there are only 2 reduced values.
    if (TwoElementReduction)
      return true;
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;
    return I->isAssociative();
  }

  /// Returns the \p Index-th reduction operand of \p I. Poison-safe 'or' has
  /// the form `select X, true, Y`, so the constant true operand is skipped.
  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates the scalar reduction operation \p Kind over \p LHS and \p RHS.
  /// When \p UseSelect is set, boolean and integer min/max reductions are
  /// emitted as cmp + select pairs instead of plain binops/intrinsics.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      if (UseSelect && isa<SelectInst>(LHS) /* ... */)
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp(Instruction::Or, LHS, RHS, Name);
    }
    case RecurKind::And: {
      if (UseSelect && isa<SelectInst>(LHS) /* ... */)
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp(Instruction::And, LHS, RHS, Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        // Emit cmp + select.
        // ...
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates a reduction operation with the current opcode, deriving the
  /// cmp + select form from the shape of \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    // ... (propagate IR flags from the original reduction operations)
    return Op;
  }

  /// Classifies \p V as a reduction operation, if possible.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;
    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Look through cmp + select min/max idioms, rejecting forms whose
      // operands do not line up with the compare.
      // ... (each non-matching shape returns RecurKind::None; matching
      // shapes return SMax/SMin/UMax/UMin)
    }
    return RecurKind::None;
  }

  /// Returns the index of the first operand of \p I that participates in the
  /// reduction (the compare condition of a min/max select is skipped).
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Returns the total number of operands of the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction (and, for cmp+select pairs, its compare) is in
  /// basic block \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select and its compare must form a single-use pair.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
    }
    return I->hasNUses(2);
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Adds a reduction operation to the list.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  /// Checks if the candidate list is worth being part of a reduction.
  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    // ...
    return Sz > 1 || isConstant(Data.front());
  }

  HorizontalReduction() = default;
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  /// Checks if the two-operand reduction constructed above can be matched.
  bool matchReductionForOperands() const {
    // Analysis is done only for the reduction created directly from a pair of
    // operands.
    assert(ReductionRoot && "Reduction root is not set!");
    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
                        /*TwoElementReduction=*/true))
      return false;
    ArrayRef<Value *> Ops = ReducedVals.front();
    return Ops.size() == 2;
  }
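
  /// The matcher below walks the use-def chain up from the reduction root,
  /// collecting further operations of the same reduction kind into
  /// ReductionOps and everything else into lists of possibly-reduced values.
  /// Candidates are grouped by hash key/subkey (for example, loads from the
  /// same base pointer share a subkey), so that similar values end up in the
  /// same list and can be vectorized together.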
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ... (only "regular" integer/FP scalar types are analyzed)
    // Do not duplicate work for a select whose condition has multiple uses.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // An edge is a reduced value if it is not an operation of the same
        // kind, has the wrong number of uses, or was already analyzed.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their hashes + extra subkey for
    // loads.
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        // Prefer grouping with a load that is a known constant distance away.
        for (LoadInst *RLI : LIt->second) {
          // ... (check SCEV pointer difference)
        }
        for (LoadInst *RLI : LIt->second) {
          // ... (check pointers-compatibility as a fallback)
        }
        if (LIt->second.size() > 2) {
          // ...
          return hash_value(LIt->second.back()->getPointerOperand());
        }
      }
      LoadKeyUsed.insert(Key);
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) =
            generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                              /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced-value sequence.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0) {
          // Start a new candidate list.
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          // Append short load-based groups only if they continue the previous
          // load chain.
          auto *LI = dyn_cast<LoadInst>(Data.front());
          auto *LastLI = dyn_cast<LoadInst>(ReducedVals[NewIdx].front());
          if (!LI || !LastLI ||
              getUnderlyingObject(LI->getPointerOperand()) !=
                  getUnderlyingObject(LastLI->getPointerOperand()))
            NewIdx = ReducedVals.size();
          // ...
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by the number of same/alternate opcodes and/or
    // pointer operands.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
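
  /// Attempts to vectorize the matched reduction: for each group of
  /// compatible reduced values it tries decreasing vector factors, builds an
  /// SLP tree, checks the cost, and, if profitable, emits a vector body whose
  /// lanes are later combined by emitReduction(). Scalars left over after all
  /// attempts are stitched back together with the original scalar reduction
  /// operations.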
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
                     DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce
    // to a nearby power-of-2. We can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
    unsigned NumReducedVals = std::accumulate(
        ReducedVals.begin(), ReducedVals.end(), 0,
        [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
          if (!isGoodForReduction(Vals))
            return Num;
          return Num + Vals.size();
        });
    if (NumReducedVals < ReductionLimit /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values: they may be replaced during vectorization of
    // subvectors.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // Return the new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          // For bool logic, make sure the left operand either cannot be
          // poison or matches the original evaluation order; otherwise
          // swap the operands or freeze the left one.
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree, AC) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // The order is preserved, nothing to do.
          } else if (isGuaranteedNotToBePoison(Res, AC) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };

    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Values externally used by anything other than the reduction operations.
    SmallDenseSet<Value *> ExternallyUsedValues;
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of values that were reduced in other trees as part of gather
    // nodes and thus require an extract if fully vectorized elsewhere.
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Precompute the per-group instruction states (common opcode analysis).
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    // Try to vectorize elements based on their type.
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // Check if the reduction value was overridden by an extractelement
        // instruction because of earlier vectorization and exclude it if it
        // is not compatible with the other values.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            (S && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements by merging with the next
      // candidate list.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
          if (!Inst)
            continue;
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (Candidates.size() > 1 && allConstant(Candidates)) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Used to check if the reduced values used the same number of times. In
      // this case the compiler may produce better code. E.g. if the reduced
      // values are aabbccdd (8 x values), then the first tree node holds
      // 4 x abcd plus a shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth =
            getFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = TTI.getNumberOfParts(Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
        ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts the vectorization attempt with a lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is worth
          // trying again with a smaller number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops - drop this attempt,
        // try again later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              return RedValI && V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all for boolean reductions.
        V.reorderBottomToTop(/*IgnoreReorder=*/
                             VL.front()->getType()->isIntOrIntVectorTy() ||
                             ReductionLimit > 2);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        SmallDenseSet<Value *> LocalExternallyUsedValues(ExternallyUsedValues);
        // The reduction root is used as the insertion point for new
        // instructions, so set it as externally used to prevent it from
        // being deleted.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // Do not need the list of reused scalars in the regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
        InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            unsigned Offset = Pos == Start ? Pos : Pos - 1;
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of analyzed values.
              for (unsigned VF = getFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });

        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a select (min/max idiom), the
        // insert point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize the tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update the TrackedToOrig mapping, since the tracked values might
        // have been updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        // Accumulate the partial result with its scale factor for the final
        // emitReduction().
        Type *ScalarTy = VL.front()->getType();
        VectorValuesAndScales.emplace_back(
            VectorizedRoot,
            OptReusedScalars && SameScaleFactor
                ? SameValuesCounter.front().second
                : 1,
            cast<VectorType>(VectorizedRoot->getType())->getElementType() !=
                    ScalarTy
                ? V.isSignedMinBitwidthRootNode()
                : true);

        // Count vectorized reduced values to exclude them from the final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (!VectorizedTree) {
      // Nothing was vectorized: mark the reduction as analyzed (unless some
      // reduction ops were gathered, in which case a retry may succeed).
      if (!CheckForReusedReductionOps) {
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      }
      return nullptr;
    }

    // Reorder the operands of a bool logical op into the natural order to
    // avoid poison propagation problems; if neither operand can be reordered
    // safely, emit an extra freeze for the LHS operand.
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    isGuaranteedNotToBePoison(LHS, AC)))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    isGuaranteedNotToBePoison(RHS, AC))) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: fold the remaining (not vectorized) reduced
    // values pairwise to avoid long dependency chains in the scalar tail.
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                               Sz % 2);
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        Instruction *RedOp = InstVals[I + 1].first;
        Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          RedOp, InitStep);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                 VectorizedTree);
    SmallPtrSet<Value *, 8> Visited;
    for (ArrayRef<Value *> Candidates : ReducedVals) {
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
    }
    // Iterate through all not-vectorized reduction values/extra arguments.
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction is expected to have no remaining uses
    // outside the reduction operations list.
    SmallPtrSet<Value *, 4> IgnoreSet;
    for (ReductionOpsType &RdxOps : ReductionOps)
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#ifndef NDEBUG
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *Ignore : RdxOps) {
        if (!Ignore)
          continue;
        for (auto *U : Ignore->users()) {
          assert(IgnoreSet.count(U) &&
                 "All users must be either in the reduction ops list.");
        }
      }
#endif
    for (ReductionOpsType &RdxOps : ReductionOps) {
      for (Value *Ignore : RdxOps) {
        if (!Ignore)
          continue;
        if (!Ignore->use_empty()) {
          Value *P = PoisonValue::get(Ignore->getType());
          Ignore->replaceAllUsesWith(P);
        }
      }
      V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    }
    return VectorizedTree;
  }
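
  /// Reduces a single vectorized value into a scalar (or, for revectorized
  /// types, a short vector) of the destination type, applying the reuse
  /// scale factor afterwards when the same scalar participated several times.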
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    Value *Rdx;
    if (auto *DestVecTy = dyn_cast<FixedVectorType>(DestTy)) {
      // Revectorized reduction: reduce each group of lanes separately and
      // reassemble the destination vector.
      unsigned VF =
          getNumElements(Vec->getType()) / DestVecTy->getNumElements();
      Rdx = PoisonValue::get(DestTy);
      for (unsigned I : seq<unsigned>(DestVecTy->getNumElements())) {
        Value *Lane = Builder.CreateExtractVector(
            getWidenedType(DestVecTy->getElementType(), VF), Vec,
            Builder.getInt64(I * VF));
        Rdx = Builder.CreateInsertElement(
            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
      }
    } else {
      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    }
    if (Rdx->getType() != DestTy)
      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
    // Improved analysis for add/fadd/xor reductions with the same scale
    // factor: apply the scale to the already-reduced scalar.
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
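
  /// Cost model for the reduction: compares the cost of the vector reduction
  /// (plus any extends/truncates needed for minimum-bit-width trees, and
  /// possible FMA fusion for fadd reductions) against the cost of keeping the
  /// N-1 scalar reduction operations.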
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // If the operand is an fmul, the fadd+fmul pair may be fused
              // into an fma: cost it as fma minus fmul.
              // ...
              // FMACost -= FMulCost;
              // ScalarCost += FMACost;
              // continue;
            }
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // A reduction operation is required only if no partial reductions were
    // emitted yet.
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
            // Revectorization: the reduction cost is paid per vector lane.
            unsigned ScalarTyNumElements = VecTy->getNumElements();
            (void)ScalarTyNumElements;
            // ...
          } else {
            Type *RedTy = VectorTy->getElementType();
            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
                std::make_pair(RedTy, true));
            if (RType == RedTy) {
              VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                           FMF, CostKind);
            } else {
              VectorCost = TTI->getExtendedReductionCost(
                  RdxOpcode, !IsSigned, RedTy,
                  getWidenedType(RType, ReduxWidth), FMF, CostKind);
            }
          }
        } else {
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          InstructionCost FMACost = InstructionCost::getInvalid();
          if (RdxKind == RecurKind::FAdd) {
            // If the reduced values are fmuls, the whole reduction may be
            // fused into a chain of fmas.
            FastMathFlags FMF;
            SmallVector<Value *> Ops;
            for (Value *RdxVal : ReducedVals) {
              if (!match(RdxVal, m_FMul(m_Value(), m_Value())))
                continue;
              Ops.push_back(RdxVal);
              if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
                FMF &= FPCI->getFastMathFlags();
            }
            if (!Ops.empty()) {
              IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                          {RVecTy, RVecTy, RVecTy}, FMF);
              FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
              InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                  Instruction::FMul, RVecTy, CostKind);
              LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost
                                << "\n");
              FMACost -= FMulCost;
            }
          }
          VectorCost +=
              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
          if (FMACost.isValid())
            VectorCost += FMACost;
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost =
              TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
        } else {
          // A previous partial reduction already exists; account for a series
          // of vector min/max operations plus a single final reduction.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
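
  /// Emits the final reduction over all accumulated (vector value, scale,
  /// signedness) triples: partial vectors are first combined element-wise
  /// (e.g. an add of a vector used Cnt times becomes a multiply by Cnt), and
  /// a single vector reduction is emitted at the end.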
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates a reduction and combines it with the previous reduction.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // Combine the partial vectors first and reduce only once at the end.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // Repeat the i1 lanes Cnt times via a shuffle; the final add
          // reduction of i1 lanes becomes a ctpop.
          unsigned VF = getNumElements(Vec->getType());
          LLVM_DEBUG(dbgs() << "SLP: Add (to-shuffle) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          SmallVector<int> Mask(Cnt * VF);
          for (unsigned I : seq<unsigned>(Cnt))
            std::iota(std::next(Mask.begin(), VF * I),
                      std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          Vec = Builder.CreateShuffleVector(Vec, Mask);
          break;
        }
        // res = mul vv, n
        if (Cnt == 1)
          break;
        Value *Scale = ConstantVector::getSplat(
            ElementCount::getFixed(getNumElements(Vec->getType())),
            ConstantInt::get(ScalarTy, Cnt));
        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        Vec = Builder.CreateMul(Vec, Scale);
        break;
      }
      case RecurKind::Xor: {
        // res = n % 2 ? 0 : vv
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        if (Cnt % 2 == 0)
          Vec = Constant::getNullValue(Vec->getType());
        break;
      }
      case RecurKind::FAdd: {
        // res = fmul v, n
        if (Cnt == 1)
          break;
        Value *Scale = ConstantVector::getSplat(
            ElementCount::getFixed(getNumElements(Vec->getType())),
            ConstantFP::get(ScalarTy, Cnt));
        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        Vec = Builder.CreateFMul(Vec, Scale);
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // res = vv
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
      }
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      ++NumVectorInstructions;
      // Widen/resize the accumulated vector and the new partial vector so the
      // element counts match before combining them.
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          VecRes->getType() != Vec->getType()) {
        SmallVector<int> Mask(getNumElements(Vec->getType()));
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
      }
      unsigned VecResVF = getNumElements(VecRes->getType());
      unsigned VecVF = getNumElements(Vec->getType());
      if (VecResVF < VecVF) {
        // ... (widen VecRes to match Vec)
      }
      if (VecResVF != VecVF) {
        SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        // ... (resize Vec to match VecRes)
      }
      Value *Op = VecRes;
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      // ... (shrink the result back if it was widened)
      VecRes = Op;
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
    return ReducedSubTree;
  }

  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
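
  /// Scalar identity optimization for a value reused Cnt times. For example,
  /// reducing <x, x, x> with add is emitted as `mul x, 3`, with fadd as
  /// `fmul x, 3.0`, and with xor as `0` when Cnt is even (x ^ x == 0) or `x`
  /// when Cnt is odd; and/or/min/max of the same value is just the value.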
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
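
  /// Vector-wide version of the same idea: when different lanes are reused a
  /// different number of times, the vectorized root is rescaled per lane,
  /// e.g. an add reduction multiplies by the constant vector of per-lane
  /// counts.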
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for repeated min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with an even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if the 4th and 6th elements have an even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
                                              : Mask) dbgs()
                                         << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
/// Exposes HorizontalReduction::getRdxKind as a file-local helper.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

/// Returns the total number of scalar elements in a homogeneous aggregate
/// (nested structs/arrays/vectors of a single element type), or std::nullopt
/// if the aggregate is not homogeneous.
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
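
/// Recognizes a chain of insertelement/insertvalue instructions building a
/// vector or homogeneous aggregate, e.g.
///   %v1 = insertelement <4 x float> poison, float %s0, i32 0
///   %v2 = insertelement <4 x float> %v1,    float %s1, i32 1
/// and collects the inserted scalars and the insert instructions per index.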
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  return BuildVectorOpds.size() >= 2;
}
/// Returns the reduced value feeding the reduction phi \p P, looking first at
/// the incoming value from \p ParentBB and then at the loop latch.
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
25440 "Expected binop, select, or intrinsic for reduction matching");
25442 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25444 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25455 Value *Op0 =
nullptr;
25456 Value *Op1 =
nullptr;
25465 Value *B0 =
nullptr, *B1 =
nullptr;
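
/// Root driver for reduction vectorization: starting from \p Root (possibly
/// re-rooted through the phi), performs a breadth-first walk over operands,
/// trying to match and vectorize a horizontal reduction at each step and
/// postponing non-reduction seeds for later list vectorization.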
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If a horizontal reduction
  // is found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and the currently
  // analyzed instruction is a binary operation, try to vectorize the
  // operands, using pre-order DFS traversal order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands. Continue analysis for instructions from the
    // same basic block only, to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands; this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
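
/// Tries to vectorize a two-operand node (binop/cmp): first as a tiny
/// two-element reduction if that is cheaper than the scalar form, otherwise
/// as a list of the two operands, considering adjacent operand pairs as
/// alternative candidates.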
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  // Skip fadd/fsub that together with an fmul operand can be fused into an
  // FMA; those are handled better by the backend.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA({I}, getSameOpcode({I}, *TLI), *DT, *DL, *TTI, *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First, collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of the operations.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    VectorType *VecTy = getWidenedType(Ty, Ops.size());
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(Inst, CostKind);
    FastMathFlags FMF;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
      FMF = FPCI->getFastMathFlags();
    InstructionCost RedCost = TTI.getArithmeticReductionCost(
        Inst->getOpcode(), VecTy, FMF, CostKind);
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in a vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
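
/// Generic driver that sorts a worklist with \p Comparator, groups runs of
/// compatible instructions with \p AreCompatible, and feeds each group to the
/// vectorization callback, first with the maximal register VF only and then
/// with smaller vectors as a fallback.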
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-stage attempt:
    // 1. Vectorize instructions with the same/alternate opcodes with the size
    //    of the maximal register at first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    //    possible. This may give better results than vectorizing only
    //    same/alternate-opcode runs.
    // 3. Final attempt to vectorize all instructions with same/alternate ops
    //    only; this may yield some extra final vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
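
/// Compare two cmp instructions. If IsCompatibility is true, returns whether
/// the two cmps are semantically compatible (same base predicate and operand
/// kinds); otherwise implements a strict weak ordering over predicates and
/// operand types, used for sorting before grouping.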
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
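
/// Vectorizes compare instructions: first tries reductions rooted at their
/// operands, then individual compares, and finally sorted/grouped lists of
/// compatible compares.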
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands()) {
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
    }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    return compareCmp<true>(VL.back(), V1, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
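
/// Post-processes collected insertelement/insertvalue instructions in
/// reverse order: buildvector matching with the maximal VF, then reductions,
/// then buildvector matching with any VF, then the postponed seeds.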
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence with any VF.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
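
/// Per-block driver: vectorizes phi chains, reductions rooted at phis,
/// stores, and terminator-adjacent compares/inserts, restarting the scan
/// whenever vectorization deletes instructions and invalidates iterators.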
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Used to identify the chains that can be vectorized most profitably.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    if (V1 == V2)
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() &&
              I1->getOpcode() == I2->getOpcode()) {
            // Fully matching opcodes: fall back to the position of the phis
            // themselves.
            DomTreeNodeBase<BasicBlock> *NodeV1 =
                DT->getNode(cast<Instruction>(V1)->getParent());
            DomTreeNodeBase<BasicBlock> *NodeV2 =
                DT->getNode(cast<Instruction>(V2)->getParent());
            if (!NodeV1)
              return NodeV2 != nullptr;
            if (!NodeV2)
              return false;
            assert((NodeV1 == NodeV2) ==
                       (NodeV1->getDFSNumIn() == NodeV2->getDFSNumIn()) &&
                   "Different nodes should have different DFS numbers");
            if (NodeV1 != NodeV2)
              return NodeV1->getDFSNumIn() < NodeV2->getDFSNumIn();
            return cast<Instruction>(V1)->comesBefore(cast<Instruction>(V2));
          }
          // Compare called intrinsics by ID when both are calls.
          // ... (if both have intrinsic IDs Id1/Id2: return *Id1 < *Id2;)
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized, and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction has no users; such an instruction can
  // only serve as the root of a reduction chain.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip anything already checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions were deleted
        // and the iterator may have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionValue(DT, P, BB, LI) != nullptr
                                ? dyn_cast<Instruction>(
                                      getReductionValue(DT, P, BB, LI))
                                : nullptr;
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here; delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize a chain rooted at the store if this is the only
        // store to the address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here; delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions from
      // the top-tree instructions, to vectorize as many as possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions were deleted
        // and the iterator may have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
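
/// Vectorizes the index computations of getelementptrs with a common base,
/// targeting gather-like patterns such as
///   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
/// where the index arithmetic itself can be done in vector registers.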
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done.
    // We are trying to vectorize the index computations, so the maximum
    // number of elements is based on the size of the index expression rather
    // than the size of the GEP itself (the target's pointer size).
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant.
      // If so, they are marked as deleted, so remove them from the set of
      // candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase, since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
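
/// Vectorizes the collected store chains: stores are sorted by value/pointer
/// types and compatibility of the stored values, reversed for bottom-up
/// analysis, and handed to vectorizeStores per group.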
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode and the same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // Order stores with instruction value operands by the DFS numbers of
    // their parent blocks, then by opcode.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto AreCompatibleStores = [this](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    bool SameParent = true;
    StoreInst *V2 = VL.back();
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    if (SameParent) {
      // Check that the whole run of stored values forms a single
      // InstructionsState (same/alternate opcodes).
      SmallVector<Value *> NewVL(VL.size() + 1);
      for (auto [SI, V] : zip(VL, NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(NewVL, R);
      if (S)
        return true;
    }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to do bottom-to-top analysis. This is important if
    // the same address is stored to several times; in that case the last
    // store in the sequence should be used.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
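As a rough illustration, the criterion compares the root-mean-square deviation of the candidate tree sizes against 90% of their mean. The following stand-alone sketch is hypothetical and uses plain sizes instead of the (first-store, tree-size) pairs and First flag the real function takes:

#include <cmath>
#include <vector>

// Hypothetical sketch of the "quadratic mean deviation < 90% of mean" test.
static bool deviationIsSmall(const std::vector<unsigned> &Sizes) {
  double Mean = 0.0;
  for (unsigned S : Sizes)
    Mean += S;
  Mean /= Sizes.size();
  double Dev = 0.0;
  for (unsigned S : Sizes)
    Dev += (S - Mean) * (S - Mean);
  Dev = std::sqrt(Dev / Sizes.size()); // quadratic (RMS) mean deviation
  return Dev < 0.9 * Mean;
}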
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds a compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try to get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates the minimal alignment as the common alignment of all values.
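For a bundle of loads this reduces to taking the smallest member alignment; a minimal sketch of one way to compute it, assuming VL holds only LoadInsts:

#include <algorithm>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical sketch: the common alignment of a load bundle is the
// smallest alignment of any member.
static Align commonLoadAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<LoadInst>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
  return CommonAlignment;
}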
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
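A minimal sketch of the composition (hypothetical composeMask helper, ignoring the ExtendingManyInputs mode), assuming each SubMask lane indexes into the existing Mask:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Hypothetical sketch: lane I of the result reads lane SubMask[I] of
// the existing Mask; -1 denotes a poison lane.
static void composeMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
  if (Mask.empty()) {
    Mask.assign(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), -1);
  for (unsigned I = 0, E = SubMask.size(); I != E; ++I)
    if (SubMask[I] != -1 && Mask[SubMask[I]] != -1)
      NewMask[I] = Mask[SubMask[I]];
  Mask.swap(NewMask);
}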
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
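A minimal sketch of one way to perform the fix-up (hypothetical helper; assumes the special value is exactly Order.size()):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallBitVector.h"

using namespace llvm;

// Hypothetical sketch: placeholder entries receive the smallest
// indices not used by the other entries.
static void fixupOrderSketch(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector Used(Sz);
  for (unsigned Idx : Order)
    if (Idx != Sz)
      Used.set(Idx);
  unsigned Next = 0;
  for (unsigned &Idx : Order) {
    if (Idx != Sz)
      continue;
    while (Used.test(Next)) // advance to the next unused index
      ++Next;
    Idx = Next++;
  }
}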
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using Generator or a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
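For example, {3, 2, 1, 0} reverses four lanes. A minimal sketch of the check (hypothetical helper; the in-tree version may also tolerate placeholder entries):

#include "llvm/ADT/ArrayRef.h"

// Hypothetical sketch: lane I must map to lane Size - 1 - I.
static bool isReverseOrderSketch(llvm::ArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I)
    if (Order[I] != Sz - 1 - I)
      return false;
  return true;
}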
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
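A short self-contained example of the APInt operations listed above (the demo function is illustrative only):

#include "llvm/ADT/APInt.h"
#include <cassert>

using llvm::APInt;

void demoAPInt() {
  APInt Demanded = APInt::getZero(8); // 8-bit value, all bits clear
  Demanded.setBits(2, 5);             // bits 2..4 -> 0b00011100 == 28
  assert(Demanded.getZExtValue() == 28);
  Demanded.clearBit(3);               // 0b00010100 == 20
  assert(Demanded.getZExtValue() == 20 && !Demanded.isZero());

  APInt AllOnes = APInt::getAllOnes(8);
  assert(AllOnes.isAllOnes() && AllOnes.getBitWidth() == 8);

  APInt Bit4 = APInt::getOneBitSet(8, 4); // only bit 4 set: 16
  assert(Bit4.isPowerOf2() && Bit4.logBase2() == 4);
  assert(Bit4.ult(AllOnes) && AllOnes.urem(Bit4) == 15); // 255 % 16
}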
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
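A short example of the ArrayRef accessors above, over an arbitrary local array:

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

void demoArrayRef() {
  int Vals[] = {10, 20, 30, 40, 50};
  llvm::ArrayRef<int> VL(Vals);

  assert(VL.size() == 5 && !VL.empty());
  assert(VL.front() == 10 && VL.back() == 50);
  assert(VL.slice(1, 3).equals({20, 30, 40})); // drop 1, keep 3
  assert(VL.take_front(2).equals({10, 20}));
  assert(VL.drop_front(4).front() == 50);
}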
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
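A short example combining these constant factories (the demo function and its context parameter are illustrative only):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

Constant *demoConstants(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *Zero = Constant::getNullValue(I32);    // i32 0
  Constant *Ones = Constant::getAllOnesValue(I32); // i32 -1
  ConstantInt *True = ConstantInt::getTrue(Ctx);   // i1 true
  (void)Zero;
  (void)True;
  // Splat the scalar constant into <4 x i32>.
  return ConstantVector::getSplat(ElementCount::getFixed(4), Ones);
}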
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
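A short example of these IRBuilder calls: build a two-lane vector from scalars A and B, then reverse it with a shuffle (the demo function and its parameters are illustrative only):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

Value *demoBuilder(BasicBlock *BB, Value *A, Value *B) {
  IRBuilder<> Builder(BB); // insert at the end of BB
  Type *VecTy = FixedVectorType::get(A->getType(), 2);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, A, Builder.getInt32(0));
  Vec = Builder.CreateInsertElement(Vec, B, Builder.getInt32(1));
  // Mask <1, 0> swaps the two lanes of the single input vector.
  return Builder.CreateShuffleVector(Vec, PoisonValue::get(VecTy), {1, 0});
}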
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
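A short example of the PHINode API above, merging two incoming values in a join block (all names are illustrative):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include <cassert>

using namespace llvm;

PHINode *demoPhi(BasicBlock *Join, BasicBlock *Left, BasicBlock *Right,
                 Value *VLeft, Value *VRight) {
  IRBuilder<> Builder(Join, Join->begin()); // PHIs go at the block start
  PHINode *PN = Builder.CreatePHI(VLeft->getType(), /*NumReservedValues=*/2);
  PN->addIncoming(VLeft, Left);
  PN->addIncoming(VRight, Right);
  assert(PN->getNumIncomingValues() == 2);
  assert(PN->getIncomingValueForBlock(Left) == VLeft);
  return PN;
}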
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
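A short example applying these static mask predicates to hand-written masks:

#include "llvm/IR/Instructions.h"
#include <cassert>

using llvm::ShuffleVectorInst;

void demoMasks() {
  // <3,2,1,0> reverses one 4-element source.
  assert(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, /*NumSrcElts=*/4));

  // <4,5> extracts the 2-element subvector starting at index 4 of an
  // 8-element source.
  int Index = 0;
  assert(ShuffleVectorInst::isExtractSubvectorMask({4, 5}, /*NumSrcElts=*/8,
                                                   Index) &&
         Index == 4);

  // <0,0,0,0> splats element 0 of one source.
  assert(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, /*NumSrcElts=*/4));
}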
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
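A short self-contained example of the SmallBitVector queries above:

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>

void demoBits() {
  llvm::SmallBitVector Used(8); // 8 bits, all clear
  assert(Used.none() && !Used.any());

  Used.set(1);
  Used.set(5);
  assert(Used.test(1) && Used.count() == 2);
  assert(Used.find_first() == 1 && Used.find_next(1) == 5);
  assert(Used.find_next(5) == -1); // no further bits set
  assert(!Used.all() && Used.size() == 8);
}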
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from the same vector and consecutive indices.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from the same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is known to be not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed for being a possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
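A short example of these PatternMatch combinators: recognize a single-use (X << C) | Y rooted at V (the helper name is illustrative; note that m_Or here matches the operands in the written order only):

#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Illustrative helper: true iff V is (X << C) | Y where the shift has
// a single use and C is a constant (possibly splatted) integer.
bool matchShlOr(Value *V, Value *&X, Value *&Y) {
  const APInt *ShAmt = nullptr;
  return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(ShAmt))),
                       m_Value(Y)));
}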
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
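These range wrappers keep predicates readable at the call site; a small illustrative sketch (the helper and its predicate are hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Range form: no begin()/end() boilerplate at the call site.
static bool allNonNegative(ArrayRef<int> Vals) {
  return all_of(Vals, [](int V) { return V >= 0; });
}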
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
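A hedged sketch combining the two (the dump routine is hypothetical): zip walks several ranges in lockstep, stopping at the shortest, while enumerate attaches a running index.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void dumpPairs(ArrayRef<int> A, ArrayRef<int> B) {
  for (auto [Idx, Pair] : enumerate(zip(A, B))) {
    auto [X, Y] = Pair; // one element from A, one from B
    errs() << Idx << ": (" << X << ", " << Y << ")\n";
  }
}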
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
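The canonical use is erase-while-iterating; a minimal sketch pairing it with isInstructionTriviallyDead (the cleanup routine itself is hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Early-inc iteration advances past I before the body runs, so erasing
// I cannot invalidate the traversal.
static void eraseTriviallyDead(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}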
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
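A few identities these bit-math helpers satisfy, with values chosen only for illustration:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

static void bitMathIdentities() {
  assert(bit_ceil(5u) == 8u);        // smallest power of two >= 5
  assert(PowerOf2Ceil(5) == 8);      // same idea, uint64_t flavor
  assert(bit_floor(5u) == 4u);       // largest power of two <= 5
  assert(has_single_bit(8u));        // 8 is a power of two
  assert(alignDown(13u, 4u) == 12u); // round down to a multiple of 4
  assert(divideCeil(13u, 4u) == 4u); // integer ceil(13 / 4)
}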
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
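No brief survives for inversePermutation above. Assuming the conventional semantics in this pass, Mask[Indices[I]] = I with unused slots left poison, a sketch:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h" // PoisonMaskElem
using namespace llvm;

// E.g. Indices = {2, 0, 1} yields Mask = {1, 2, 0}.
static void inversePermutationSketch(ArrayRef<unsigned> Indices,
                                     SmallVectorImpl<int> &Mask) {
  Mask.assign(Indices.size(), PoisonMaskElem);
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
}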
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
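Both mask builders are easiest to grasp by example (values are illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static void maskExamples() {
  // Start=0, Stride=2, VF=4 -> <0, 2, 4, 6>.
  SmallVector<int, 16> Strided = createStrideMask(0, 2, 4);
  // ReplicationFactor=2, VF=3 -> <0, 0, 1, 1, 2, 2>.
  SmallVector<int, 16> Replicated = createReplicatedMask(2, 3);
  (void)Strided;
  (void)Replicated;
}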
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
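A hedged sketch of the typical query (the helper is hypothetical): two loads of the same element type are consecutive when their pointers differ by exactly one element.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

static bool areConsecutiveLoads(LoadInst *A, LoadInst *B,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int64_t> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                      B->getPointerOperand(), DL, SE);
  return Diff && *Diff == 1; // distance is in elements of the given type
}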
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
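A hedged sketch in the spirit of min-bitwidth analysis (the helper is hypothetical): if the top N bits are all copies of the sign bit, the value fits in Width - N + 1 signed bits.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;

static unsigned minSignedBits(const Value *V, const DataLayout &DL) {
  unsigned Width = V->getType()->getScalarSizeInBits();
  return Width - ComputeNumSignBits(V, DL) + 1;
}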
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
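Combining the two is the usual pattern for hashing a node together with a sequence payload (the helper is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
using namespace llvm;

static hash_code hashOpcodeAndMask(unsigned Opcode, ArrayRef<int> Mask) {
  // Fold a scalar and a whole sequence into a single hash_code.
  return hash_combine(Opcode, hash_combine_range(Mask.begin(), Mask.end()));
}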
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
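With GraphTraits and the DOTGraphTraits specialization above in place, generic graph utilities apply to the SLP tree unmodified. A sketch of the kind of debugging hook this enables; BoUpSLP is internal to this file, so the snippet can only live inside the pass, and the function name is illustrative:

#include "llvm/Support/GraphWriter.h"

static void viewSLPTree(BoUpSLP &R) {
  // Emits a .dot file via the traits above, runs 'dot', then a viewer.
  ViewGraph(&R, "slp-tree");
}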
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const
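The EdgeInfo DenseMapInfo above follows the standard recipe for composite keys: delegate to the traits of each field and fold the pieces with combineHashValue. A generic sketch of the same recipe for a hypothetical pair-like key:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"

struct NodeUse { // hypothetical key: (node, operand index)
  void *Node;
  unsigned OperandIdx;
};

template <> struct llvm::DenseMapInfo<NodeUse> {
  using PtrInfo = DenseMapInfo<void *>;
  using IdxInfo = DenseMapInfo<unsigned>;
  static NodeUse getEmptyKey() { return {PtrInfo::getEmptyKey(), 0}; }
  static NodeUse getTombstoneKey() { return {PtrInfo::getTombstoneKey(), 0}; }
  static unsigned getHashValue(const NodeUse &V) {
    return llvm::detail::combineHashValue(PtrInfo::getHashValue(V.Node),
                                          IdxInfo::getHashValue(V.OperandIdx));
  }
  static bool isEqual(const NodeUse &L, const NodeUse &R) {
    return L.Node == R.Node && L.OperandIdx == R.OperandIdx;
  }
};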