#ifdef EXPENSIVE_CHECKS
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
             "Controls which SLP graphs should be vectorized.");
    cl::desc("Run the SLP vectorization passes"));
    cl::desc("Enable vectorization for wider vector utilization"));
    cl::desc("Only vectorize if you gain more than this "
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));
    cl::desc("Attempt to vectorize horizontal reductions"));
        "Attempt to vectorize horizontal reductions feeding into a store"));
    cl::desc("Improve the code quality by splitting alternate instructions"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Limit the recursion depth when building a vectorizable tree"));
    cl::desc("Only vectorize small trees if they are fully vectorizable"));
    cl::desc("The maximum look-ahead depth for operand reordering scores"));
    cl::desc("The maximum look-ahead depth for searching best rooting option"));
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));
    cl::desc("The maximum stride, considered to be profitable."));
    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));
    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));
    cl::desc("Display the SLP trees with Graphviz"));
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();
    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();
         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();
                                          Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
                           I * VecTyNumElements, VecTyNumElements)))
          : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                          : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
      if (BB != II->getParent())
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
                    if (match(U.getUser(),
                              m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                        (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return match(U.getUser(),
                                 m_Intrinsic<Intrinsic::abs>(
                                     m_Specific(U.get()), m_ConstantInt(Flag))) &&
                           (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
                    return match(U.getUser(),
                                 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
    if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
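    // Shufflevector mask semantics relied on here: a mask element M with
    // M < VF selects lane M of the first operand, while M >= VF selects lane
    // M - VF of the second operand, hence the two branches above.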
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
      if (Idx->getValue().uge(Size))
      unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
    if (EI->getNumIndices() != 1)
    return *EI->idx_begin();
bool isValidForAlternation(unsigned Opcode) {
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
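  // The idea behind "interchangeable" binary ops: with a suitable constant
  // operand one opcode can stand in for another, e.g.
  //   %y = shl i32 %x, 3   is equivalent to   %y = mul i32 %x, 8
  // and "add %x, 0", "or %x, 0", "xor %x, 0" and "sub %x, 0" are all just %x,
  // so a mixed bundle can be normalized to a single main opcode.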
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
    unsigned FromOpcode = I->getOpcode();
    if (FromOpcode == ToOpcode)
    auto [CI, Pos] = isBinOpWithConstantInt(I);
    const APInt &FromCIValue = CI->getValue();
    unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
    switch (FromOpcode) {
    case Instruction::Shl:
      if (ToOpcode == Instruction::Mul) {
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      ToCIValue = ToOpcode == Instruction::And
                      ? APInt::getAllOnes(FromCIValueBitWidth)
                      : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Mul:
      if (ToOpcode == Instruction::Shl) {
        ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
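        // E.g. "mul i32 %x, 8" is rewritten here as "shl i32 %x, 3", since
        // 8 == 1 << 3 and logBase2(8) == 3.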
      assert(FromCIValue.isOne() && "Cannot convert the instruction.");
      ToCIValue = ToOpcode == Instruction::And
                      ? APInt::getAllOnes(FromCIValueBitWidth)
                      : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Add:
    case Instruction::Sub:
      if (FromCIValue.isZero()) {
               "Cannot convert the instruction.");
        ToCIValue = FromCIValue;
    case Instruction::And:
      ToCIValue = ToOpcode == Instruction::Mul
                      ? APInt(FromCIValueBitWidth, 1)
                      : APInt::getZero(FromCIValueBitWidth);
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
    Value *LHS = I->getOperand(1 - Pos);
        ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
        (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
          InterchangeableMask = CanBeAll;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
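  // E.g. for a bundle {add, sub, add, sub}, MainOp is the add, AltOp is the
  // sub, and isAltShuffle() is true: the node is emitted as two vector ops
  // plus a shufflevector picking each lane from the add or the sub result.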
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)
      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 isValidForAlternation(InstOpcode) &&
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
      return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1654 "Incorrect implementation of allSameOpcode.");
1655 InstructionsState S(MainOp, AltOp);
1661 "Invalid InstructionsState.");
1669 return all_of(VL, [&](
Value *V) {
return V->getType() == Ty; });
1679 unsigned Opcode = UserInst->
getOpcode();
1681 case Instruction::Load: {
1685 case Instruction::Store: {
1687 return (
SI->getPointerOperand() == Scalar);
1689 case Instruction::Call: {
1693 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1694 Arg.value().get() == Scalar;
1714 return LI->isSimple();
1716 return SI->isSimple();
1718 return !
MI->isVolatile();
1726 bool ExtendingManyInputs =
false) {
1727 if (SubMask.
empty())
1730 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1733 "SubMask with many inputs support must be larger than the mask.");
1735 Mask.append(SubMask.
begin(), SubMask.
end());
1739 int TermValue = std::min(Mask.size(), SubMask.
size());
1740 for (
int I = 0,
E = SubMask.
size();
I <
E; ++
I) {
1742 (!ExtendingManyInputs &&
1743 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1745 NewMask[
I] = Mask[SubMask[
I]];
1761 const size_t Sz = Order.
size();
1764 for (
unsigned I = 0;
I < Sz; ++
I) {
1766 UnusedIndices.
reset(Order[
I]);
1768 MaskedIndices.
set(
I);
1770 if (MaskedIndices.
none())
1773 "Non-synced masked/available indices.");
1777 assert(Idx >= 0 &&
"Indices must be synced.");
1787 unsigned Opcode0,
unsigned Opcode1) {
1794 OpcodeMask.
set(Lane * ScalarTyNumElements,
1795 Lane * ScalarTyNumElements + ScalarTyNumElements);
1804 "Expected scalar constants.");
1807 std::fill_n(NewVal.begin() +
I * VF, VF, V);
1816 const unsigned E = Indices.
size();
1818 for (
unsigned I = 0;
I < E; ++
I)
1819 Mask[Indices[
I]] =
I;
1825 assert(!Mask.empty() &&
"Expected non-empty mask.");
1829 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
1831 Scalars[Mask[
I]] = Prev[
I];
1844 auto *IO = dyn_cast<Instruction>(V);
1847 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1860 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1862 auto *IU = dyn_cast<Instruction>(U);
1865 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1881 return !VL.
empty() &&
1897 return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1906 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1907 unsigned NumParts =
TTI.getNumberOfParts(VecTy);
1908 if (NumParts == 0 || NumParts >= Limit)
1911 if (NumParts >= Sz || Sz % NumParts != 0 ||
1922 class ScheduleEntity;
1924 class ScheduleCopyableData;
1925 class ScheduleBundle;
1935 struct StridedPtrInfo {
1936 Value *StrideVal =
nullptr;
1937 const SCEV *StrideSCEV =
nullptr;
1963 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1964 AC(AC), DB(DB), DL(DL), ORE(ORE),
1983 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1996 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2017 assert(!VectorizableTree.empty() &&
"No graph to get the first node from");
2018 return VectorizableTree.front()->Scalars;
2024 const TreeEntry &Root = *VectorizableTree.front();
2025 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2026 !Root.Scalars.
front()->getType()->isIntegerTy())
2027 return std::nullopt;
2028 auto It = MinBWs.find(&Root);
2029 if (It != MinBWs.end())
2033 if (Root.getOpcode() == Instruction::ZExt ||
2034 Root.getOpcode() == Instruction::SExt)
2035 return std::make_pair(
cast<CastInst>(Root.getMainOp())->getSrcTy(),
2036 Root.getOpcode() == Instruction::SExt);
2037 return std::nullopt;
2043 return MinBWs.at(VectorizableTree.front().get()).second;
2048 if (ReductionBitWidth == 0 ||
2049 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2050 ReductionBitWidth >=
2051 DL->getTypeSizeInBits(
2052 VectorizableTree.front()->Scalars.front()->getType()))
2054 VectorizableTree.front()->Scalars.front()->getType(),
2055 VectorizableTree.front()->getVectorFactor());
2058 VectorizableTree.front()->Scalars.front()->getContext(),
2060 VectorizableTree.front()->getVectorFactor());
2075 VectorizableTree.clear();
2076 ScalarToTreeEntries.clear();
2077 OperandsToTreeEntry.clear();
2078 ScalarsInSplitNodes.clear();
2080 NonScheduledFirst.clear();
2081 EntryToLastInstruction.clear();
2082 LoadEntriesToVectorize.clear();
2083 IsGraphTransformMode =
false;
2084 GatheredLoadsEntriesFirst.reset();
2085 CompressEntryToData.clear();
2086 ExternalUses.clear();
2087 ExternalUsesAsOriginalScalar.clear();
2088 ExternalUsesWithNonUsers.clear();
2089 for (
auto &Iter : BlocksSchedules) {
2090 BlockScheduling *BS = Iter.second.get();
2094 ReductionBitWidth = 0;
2096 CastMaxMinBWSizes.reset();
2097 ExtraBitWidthNodes.clear();
2098 InstrElementSize.clear();
2099 UserIgnoreList =
nullptr;
2100 PostponedGathers.clear();
2101 ValueToGatherNodes.clear();
2117 assert(!Order.
empty() &&
"expected non-empty order");
2118 const unsigned Sz = Order.
size();
2120 return P.value() ==
P.index() ||
P.value() == Sz;
2133 bool IgnoreReorder);
2146 std::optional<OrdersType>
2184 return MaxVecRegSize;
2189 return MinVecRegSize;
2197 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
2198 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2199 return MaxVF ? MaxVF : UINT_MAX;
2240 const bool IsAnyPointerUsedOutGraph,
const int64_t Diff,
2241 StridedPtrInfo &SPtrInfo)
const;
2256 StridedPtrInfo &SPtrInfo,
2257 unsigned *BestVF =
nullptr,
2258 bool TryRecursiveCheck =
true)
const;
2262 ListOfKnonwnNonVectorizableLoads.insert(
hash_value(VL));
2266 template <
typename T>
2268 return ListOfKnonwnNonVectorizableLoads.contains(
hash_value(VL));
2293 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
2294 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
2319 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2320 MaxLevel(MaxLevel) {}
2376 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
2381 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
2383 return U == U1 || U == U2 || R.isVectorized(U);
2386 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2389 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2391 ((
int)V1->getNumUses() == NumLanes ||
2392 AllUsersAreInternal(V1, V2)))
2398 auto CheckSameEntryOrFail = [&]() {
2403 any_of(TEs2, [&](TreeEntry *E) {
return Set.contains(E); }))
2412 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2414 return CheckSameEntryOrFail();
2417 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2418 LI2->getPointerOperand(), DL, SE,
true);
2419 if (!Dist || *Dist == 0) {
2422 R.TTI->isLegalMaskedGather(
2425 return CheckSameEntryOrFail();
2429 if (std::abs(*Dist) > NumLanes / 2)
2462 Value *EV2 =
nullptr;
2475 int Dist = Idx2 - Idx1;
2478 if (std::abs(Dist) == 0)
2480 if (std::abs(Dist) > NumLanes / 2)
2487 return CheckSameEntryOrFail();
2493 if (I1->getParent() != I2->getParent())
2494 return CheckSameEntryOrFail();
2502 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
2503 !S.isAltShuffle()) &&
2507 S.getMainOp()->getNumOperands();
2519 return CheckSameEntryOrFail();
2553 int ShallowScoreAtThisLevel =
2564 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2567 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2569 ShallowScoreAtThisLevel))
2570 return ShallowScoreAtThisLevel;
2571 assert(I1 && I2 &&
"Should have early exited.");
2578 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2579 OpIdx1 != NumOperands1; ++OpIdx1) {
2581 int MaxTmpScore = 0;
2582 unsigned MaxOpIdx2 = 0;
2583 bool FoundBest =
false;
2587 ? I2->getNumOperands()
2588 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2589 assert(FromIdx <= ToIdx &&
"Bad index");
2590 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2592 if (Op2Used.
count(OpIdx2))
2597 I1, I2, CurrLevel + 1, {});
2600 TmpScore > MaxTmpScore) {
2601 MaxTmpScore = TmpScore;
2608 Op2Used.
insert(MaxOpIdx2);
2609 ShallowScoreAtThisLevel += MaxTmpScore;
2612 return ShallowScoreAtThisLevel;
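    // The recursion above rewards operand pairs whose own operands also pair
    // up well: when deciding how to reorder the operands of (a + b, c + d),
    // the candidate pairing whose sub-operands are, say, consecutive loads
    // scores higher at the next level and therefore wins here.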
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      bool IsUsed = false;
    enum class ReorderingMode {
    unsigned ArgSize = 0;
    const Loop *L = nullptr;
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
          OpsVec[OpIdx][Lane].IsUsed = false;
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
      Value *IdxLaneV = getData(Idx, Lane).V;
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                      ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                           ? UniquesCntWithIdxLaneV -
                                 bit_floor(UniquesCntWithIdxLaneV)
                           : bit_ceil(UniquesCntWithIdxLaneV) -
                                 UniquesCntWithIdxLaneV);
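      // Numeric illustration: with UniquesCntWithOpIdxLaneV == 5,
      // bit_ceil(5) == 8 and bit_floor(5) == 4, so "8 - 5 == 3" is the
      // distance up to the next power of two and "5 - 4 == 1" the distance
      // down to the previous one; the score prefers counts close to a power
      // of two.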
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      return R.areAllUsersVectorized(IdxLaneI)
    static const int ScoreScaleFactor = 10;
                          int Lane, unsigned OpIdx, unsigned Idx,
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
      unsigned NumOperands = getNumOperands();
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        OperandData &OpData = getData(Idx, Lane);
        bool OpAPO = OpData.APO;
        if (OpAPO != OpIdxAPO)
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        case ReorderingMode::Constant:
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Splat:
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Failed:
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return std::nullopt;
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
    struct OperandsOrderData {
      unsigned NumOfAPOs = UINT_MAX;
      unsigned NumOpsWithSameOpcodeParent = 0;
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
        const OperandData &OpData = getData(OpIdx, Lane);
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
          ++NumOpsWithSameOpcodeParent;
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                        const InstructionsState &S) {
        return VL.size() == getNumLanes();
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      OpsVec.resize(ArgSize);
      unsigned NumLanes = VL.size();
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);
        bool IsInverseOperation = false;
        if (S.isCopyableElement(VL[Lane])) {
          assert(I && "Expected instruction");
          auto [SelectedOp, Ops] = convertTo(I, S);
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
    unsigned getNumOperands() const { return ArgSize; }
    unsigned getNumLanes() const { return OpsVec[0].size(); }
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    bool empty() const { return OpsVec.empty(); }
    void clear() { OpsVec.clear(); }
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
          Value *OpILane = getValue(OpI, Lane);
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
        if (!FoundCandidate)
      return getNumLanes() == 2 || Cnt > 1;
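      // Intuition for the result: broadcasting pays off once the same value
      // would otherwise be needed in more than one lane (or in both lanes of
      // a 2-lane node), since a single splat shuffle then feeds every lane.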
3197 "Op is expected to be getValue(OpIdx, Lane).");
3198 bool OpAPO = getData(
OpIdx, Lane).APO;
3199 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3203 const OperandData &
Data = getData(OpI, Ln);
3204 if (
Data.APO != OpAPO ||
Data.IsUsed)
3206 Value *OpILn = getValue(OpI, Ln);
3207 return (L && L->isLoopInvariant(OpILn)) ||
3219 const InstructionsState &S,
const BoUpSLP &R)
3220 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3221 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3223 appendOperands(RootVL,
Operands, S);
3231 "Expected same num of lanes across all operands");
3232 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3233 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3241 unsigned NumOperands = getNumOperands();
3242 unsigned NumLanes = getNumLanes();
3262 unsigned FirstLane = getBestLaneToStartReordering();
3271 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3272 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3273 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3275 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3277 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3279 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3282 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3292 auto &&SkipReordering = [
this]() {
3295 for (
const OperandData &
Data : Op0)
3298 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3299 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3306 return UniqueValues.
size() != 2 &&
3308 UniqueValues.
size());
3320 if (SkipReordering())
3323 bool StrategyFailed =
false;
3331 for (
unsigned I = 0;
I < NumOperands; ++
I)
3332 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3335 UsedLanes.
set(FirstLane);
3336 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3338 for (
int Direction : {+1, -1}) {
3339 int Lane = FirstLane + Direction * Distance;
3340 if (Lane < 0 || Lane >= (
int)NumLanes)
3342 UsedLanes.
set(Lane);
3343 int LastLane = Lane - Direction;
3344 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3349 std::optional<unsigned> BestIdx =
3350 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3351 MainAltOps[
OpIdx], UsedLanes);
3358 swap(
OpIdx, *BestIdx, Lane);
3361 StrategyFailed =
true;
3365 OperandData &AltOp = getData(
OpIdx, Lane);
3366 InstructionsState OpS =
3368 if (OpS && OpS.isAltShuffle())
3375 if (!StrategyFailed)
3380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3383 case ReorderingMode::Load:
3385 case ReorderingMode::Opcode:
3387 case ReorderingMode::Constant:
3389 case ReorderingMode::Splat:
3391 case ReorderingMode::Failed:
3412 const unsigned Indent = 2;
3414 for (
const OperandDataVec &OpDataVec : OpsVec) {
3415 OS <<
"Operand " << Cnt++ <<
"\n";
3416 for (
const OperandData &OpData : OpDataVec) {
3417 OS.
indent(Indent) <<
"{";
3418 if (
Value *V = OpData.V)
3422 OS <<
", APO:" << OpData.APO <<
"}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
              cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
                             const InstructionsState &LocalState,
                             unsigned InterleaveFactor = 0);
                      bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                                     unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
                               Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
          std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                    [Scalars](Value *V, int Idx) {
                      return (isa<UndefValue>(V) &&
                              Idx == PoisonMaskElem) ||
                             (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;
    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
              StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;
4425 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4426 VectorizableTree[
Id]->dump();
4434 assert(V &&
"V cannot be nullptr.");
4435 auto It = ScalarToTreeEntries.find(V);
4436 if (It == ScalarToTreeEntries.end())
4438 return It->getSecond();
4443 assert(V &&
"V cannot be nullptr.");
4444 auto It = ScalarsInSplitNodes.find(V);
4445 if (It == ScalarsInSplitNodes.end())
4447 return It->getSecond();
4452 bool SameVF =
false)
const {
4453 assert(V &&
"V cannot be nullptr.");
4454 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4455 if ((!SameVF ||
TE->getVectorFactor() == VL.
size()) &&
TE->isSame(VL))
4466 bool areAltOperandsProfitable(
const InstructionsState &S,
4471 class ScalarsVectorizationLegality {
4472 InstructionsState S;
4474 bool TryToFindDuplicates;
4475 bool TrySplitVectorize;
4478 ScalarsVectorizationLegality(InstructionsState S,
bool IsLegal,
4479 bool TryToFindDuplicates =
true,
4480 bool TrySplitVectorize =
false)
4481 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4482 TrySplitVectorize(TrySplitVectorize) {
4483 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4484 "Inconsistent state");
4486 const InstructionsState &getInstructionsState()
const {
return S; };
4487 bool isLegal()
const {
return IsLegal; }
4488 bool tryToFindDuplicates()
const {
return TryToFindDuplicates; }
4489 bool trySplitVectorize()
const {
return TrySplitVectorize; }
4494 ScalarsVectorizationLegality
4497 bool TryCopyableElementsVectorization)
const;
4501 TreeEntry::EntryState getScalarsVectorizationState(
4503 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
4504 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4507 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4510 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4511 OperandsToTreeEntry;
4514 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4517 SmallDenseMap<Value *, unsigned> InstrElementSize;
4531 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4536 SetVector<const TreeEntry *> PostponedGathers;
4538 using ValueToGatherNodesMap =
4539 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4540 ValueToGatherNodesMap ValueToGatherNodes;
4545 SetVector<unsigned> LoadEntriesToVectorize;
4548 bool IsGraphTransformMode =
false;
4551 std::optional<unsigned> GatheredLoadsEntriesFirst;
4554 SmallDenseMap<
const TreeEntry *,
4555 std::tuple<SmallVector<int>,
VectorType *, unsigned,
bool>>
4556 CompressEntryToData;
4559 struct ExternalUser {
4560 ExternalUser(
Value *S, llvm::User *U,
const TreeEntry &E,
unsigned L)
4561 : Scalar(S), User(
U), E(E), Lane(
L) {}
4564 Value *Scalar =
nullptr;
4567 llvm::User *User =
nullptr;
4575 using UserList = SmallVector<ExternalUser, 16>;
  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  SmallDenseMap<AliasCacheKey, bool> AliasCache;
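  // The (Inst1, Inst2) -> bool map above memoizes BatchAA queries, which
  // dominate dependency-calculation time on large scheduling regions. A
  // minimal sketch of the same memoization idiom (hypothetical helper, not
  // part of the original source):
  //   auto [It, Inserted] = Cache.try_emplace(Key);
  //   if (Inserted)
  //     It->second = /* expensive alias query */;
  //   return It->second;
  // try_emplace performs a single hash lookup for both the hit and miss path.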
  BatchAAResults BatchAA;

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  DenseSet<Instruction *> DeletedInstructions;

  /// Set of the instructions, already analyzed for reductions.
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  /// Set of hashes for the list of reduction values already being analyzed.
  DenseSet<size_t> AnalyzedReductionVals;

  /// Values, already analyzed for minimal bitwidth and found non-profitable.
  DenseSet<Value *> AnalyzedMinBWVals;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, it means that this Internal Scalar will be used later,
  /// after vectorization.
  UserList ExternalUses;

  /// A list of values which can be replaced by their original scalars instead
  /// of extractelement instructions.
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  /// A list of scalars with external uses that are not actual users
  /// (e.g. used only as an insertion point).
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered, shuffle instructions and
  /// extractelements.
  SetVector<Instruction *> GatherShuffleExtractSeq;

  /// A list of blocks that we are going to CSE.
  DenseSet<BasicBlock *> CSEBlocks;

  /// List of hashes of vectors of loads, which are known to be non
  /// vectorizable.
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
  /// Base class for the scheduling entities (single instructions, bundles and
  /// copyable elements).
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled.
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      const auto *CD = cast<ScheduleCopyableData>(this);
      return CD->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->hasValidDependencies();
      const auto *CD = cast<ScheduleCopyableData>(this);
      return CD->hasValidDependencies();
    }
    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getUnscheduledDeps();
      const auto *CD = cast<ScheduleCopyableData>(this);
      return CD->getUnscheduledDeps();
    }
    /// Increments the number of unscheduled dependencies.
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
    }
    /// Gets the total number of dependencies.
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(this)->getDependencies();
    }
    /// Gets the associated instruction.
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(this)->getInst();
    }
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      return cast<ScheduleCopyableData>(this)->dump(OS);
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
    SE.dump(OS);
    return OS;
  }
#endif
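  // ScheduleEntity uses LLVM-style RTTI: a Kind tag plus classof() enables
  // isa<>/dyn_cast<>/cast<> without C++ RTTI. A minimal sketch of the idiom
  // (names are illustrative only, not part of the original source):
  //   if (auto *SD = dyn_cast<ScheduleData>(Entity))
  //     use(SD);                            // classof() checked the Kind tag
  //   else
  //     use(cast<ScheduleBundle>(Entity));  // cast<> asserts on mismatch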
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that depenendency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears all direct (def-use) dependency information only.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    Instruction *getInst() const { return Inst; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);
    }
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);
    }
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << *Inst; }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    Instruction *Inst = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions. This list is derived on demand.
    SmallVector<ScheduleData *> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on. Allowing such nodes to be scheduled below this one would increase
    /// the insertion point and is therefore not allowed.
    SmallVector<ScheduleData *> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// The number of dependencies: the number of users of the instruction
    /// plus the number of dependent memory instructions (if any). Calculated
    /// on demand.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the instruction
    /// gets ready for scheduling.
    int UnscheduledDeps = InvalidDeps;
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
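  // Worked example (hypothetical numbers, not from the original source): an
  // instruction with 3 in-region users and 1 memory dependence gets
  // Dependencies = UnscheduledDeps = 4 after calculateDependencies(). Each
  // time one of those dependent instructions is scheduled,
  // incrementUnscheduledDeps(-1) runs; when the counter reaches 0 the node
  // becomes isReady() and enters the ready list. InvalidDeps (-1) means the
  // dependencies have not been computed yet, so both counters stay equal
  // until then (see verify()).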
  /// A group of instructions (a bundle) scheduled as a single unit.
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The TreeEntry that this bundle corresponds to.
    TreeEntry *TE = nullptr;
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties of the bundle.
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }
        if (isScheduled()) {
          assert(SD->hasValidDependencies() &&
                 SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated for all
    /// members of the bundle.
    bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the bundle of scheduling data, associated with the current
    /// instruction.
    ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
    /// Adds an instruction to the bundle.
    void add(ScheduleEntity *SD) { Bundle.push_back(SD); }

    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    static ScheduleBundle invalid() { return {false}; }

    operator bool() const { return IsValid; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      OS << '[';
      ListSeparator LS;
      for (const ScheduleEntity *SD : Bundle)
        OS << LS << *SD->getInst();
      OS << ']';
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &
  operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(OS);
    return OS;
  }
#endif
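  // Note (summary, not from the original source): a bundle is ready only when
  // the *sum* of unscheduled dependencies over all members is zero, i.e.
  // every scalar lane can legally move to the common insertion point. A
  // single member with InvalidDeps poisons the sum, so an uncomputed lane can
  // never make the whole bundle look ready.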
  /// Contains all scheduling relevant data for a copyable element of a
  /// bundle, identified by the (user entry, operand index) edge.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The source instruction.
    Instruction *Inst = nullptr;
    /// The edge in the tree, where this data is located.
    const EdgeInfo EI;
    /// This ScheduleCopyableData is in the current scheduling region if this
    /// matches the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;
    /// The bundle this copyable data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies and returns the
    /// remaining count.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    Instruction *getInst() const { return Inst; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    const EdgeInfo &getEdgeInfo() const { return EI; }

    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    /// The number of dependencies.
    int Dependencies = ScheduleData::InvalidDeps;

    /// The number of unscheduled dependencies.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
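  // Note (summary, not from the original source): a "copyable" element is a
  // scalar that is semantically duplicated into a bundle (e.g. reused as an
  // idempotent operand) rather than moved, so each copy keeps its own
  // dependency counters. The copies are keyed by the (user entry, operand
  // index) EdgeInfo rather than by the instruction alone, because the same
  // instruction may be copyable on one edge and a regular bundle member on
  // another.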
  /// Contains all scheduling relevant data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleDataMap.clear();
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(*SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }
    /// Returns the copyable schedule data for the given edge and value, if it
    /// is in the current scheduling region.
    ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
                                                  const Value *V) const {
      if (ScheduleCopyableDataMap.empty())
        return nullptr;
      auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
      if (It == ScheduleCopyableDataMap.end())
        return nullptr;
      ScheduleCopyableData *SD = It->getSecond().get();
      if (!isInSchedulingRegion(*SD))
        return nullptr;
      return SD;
    }

    /// Returns all in-region copyable schedule data for the given
    /// (user, operand index) pair and value.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Value *User, unsigned OperandIdx,
                            const Value *V) const {
      if (ScheduleCopyableDataMapByInstUser.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInstUser.find(
          std::make_pair(std::make_pair(User, OperandIdx), V));
      if (It == ScheduleCopyableDataMapByInstUser.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Checks if all uses of \p Op as an operand of \p User are modeled by
    /// copyable schedule data in the corresponding tree entries. (Parts of
    /// the original control flow are elided in this listing.)
    bool areAllOperandsReplacedByCopyableData(Instruction *User,
                                              Instruction *Op, BoUpSLP &R,
                                              unsigned NumOps) const {
      if (ScheduleCopyableDataMap.empty())
        return false;
      SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
      for (const Use &U : User->operands()) {
        if (U.get() != Op)
          continue;
        ArrayRef<TreeEntry *> Entries = R.getTreeEntries(User);
        if (Entries.empty())
          return false;
        for (TreeEntry *TE : Entries) {
          // For commutative users the operand may be moved to another index
          // after reordering, so count such entries separately and check them
          // against the actual per-lane operands below.
          bool IsCommutativeUser = isCommutative(User);
          EdgeInfo EI(TE, U.getOperandNo());
          if (!IsCommutativeUser) {
            unsigned &OpCnt =
                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
            if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
              return false;
            ++OpCnt;
          } else {
            ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
                  .first->getSecond();
          }
        }
      }
      if (!PotentiallyReorderedEntriesCount.empty()) {
        for (auto &P : PotentiallyReorderedEntriesCount) {
          auto *It = find(P.first->Scalars, User);
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(P.first->Scalars.begin(), It);
          assert(Lane >= 0 && "Lane is not found");
          if (!P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          // Check the actual operands of the user in this lane.
          // (The original iterates the commutative operand indices of
          // P.first->getMainOp(); simplified here to all operand indices.)
          for (unsigned OpIdx : seq<unsigned>(P.first->getNumOperands())) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
              --P.second;
          }
        }
        return all_of(PotentiallyReorderedEntriesCount,
                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
                        return P.second == NumOps - 1;
                      });
      }
      return true;
    }
    /// Returns all in-region copyable schedule data for the given
    /// instruction.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Instruction *I) const {
      if (ScheduleCopyableDataMapByInst.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInst.find(I);
      if (It == ScheduleCopyableDataMapByInst.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }

    /// Returns all in-region copyable schedule data whose user is \p User.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableDataUsers(const Instruction *User) const {
      if (ScheduleCopyableDataMapByUsers.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByUsers.find(User);
      if (It == ScheduleCopyableDataMapByUsers.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Registers copyable schedule data for the given edge and instruction
    /// and updates the reverse maps. (Some bookkeeping of the original is
    /// elided in this listing.)
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(std::make_pair(EI, I),
                           std::make_unique<ScheduleCopyableData>(
                               SchedulingRegionID, I, EI, Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(CD);
      // Find every lane of the user entry where I is the operand and register
      // the (user instruction, operand index) -> copyable data mapping.
      ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
      const auto *It = find(Op, I);
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (EI.UserTE->State != TreeEntry::SplitVectorize &&
            !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
        if (Visited.insert(In).second) {
          ScheduleCopyableDataMapByInstUser
              .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
              .first->getSecond()
              .push_back(CD);
          // If the user itself is modeled as copyable data on its own user
          // edge, it does not count as a regular user.
          EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
          if (ScheduleCopyableData *UserCD =
                  getScheduleCopyableData(UserEI, In))
            ScheduleCopyableDataMapByUsers[I].remove(UserCD);
        }
        It = std::find(std::next(It), Op.end(), I);
      } while (It != Op.end());
      ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
          CD);
      return *CD;
    }

    /// Returns the bundles associated with the given instruction.
    ArrayRef<ScheduleBundle *> getScheduleBundles(Instruction *I) const {
      auto It = ScheduledBundles.find(I);
      if (It == ScheduledBundles.end())
        return {};
      return It->getSecond();
    }
    /// Returns true if the given schedule entity belongs to the current
    /// scheduling region.
    bool isInSchedulingRegion(const ScheduleEntity &SD) const {
      if (const auto *Data = dyn_cast<ScheduleData>(&SD))
        return Data->getSchedulingRegionID() == SchedulingRegionID;
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
        return CD->getSchedulingRegionID() == SchedulingRegionID;
      return all_of(cast<ScheduleBundle>(SD).getBundle(),
                    [&](const ScheduleEntity *BundleMember) {
                      return isInSchedulingRegion(*BundleMember);
                    });
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list. (Parts of the original control flow
    /// are elided in this listing.)
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ArrayRef<ScheduleBundle *> Bundles) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if
        // ready.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            SmallVector<ScheduleBundle *> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              CopyableBundle.push_back(&CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP:    gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(isa<ScheduleData>(Data) && "Expected non-copyable data");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP:    gets ready: " << *Data << "\n");
          }
        };

        // Decrement for an operand that may be modeled by copyable data on
        // the (user, operand index) edge.
        auto DecrUnschedForOp = [&](Instruction *User, unsigned OpIdx,
                                    Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OpIdx, I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(), so fetch them via the TreeEntry.
        if (!Bundles.empty()) {
          auto *In = BundleMember->getInst();
          // Count uses of each operand instruction to decrement exactly the
          // matching number of operand dependencies.
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (isa<ScheduleCopyableData>(BundleMember)) {
            // Copyable data is modeled as its own single operand.
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              if (auto *I = dyn_cast<Instruction>(U.get())) {
                auto Res = OperandsUses.try_emplace(I, 0);
                ++Res.first->getSecond();
                ++TotalOpCount;
              }
            }
          }

          auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
                                        unsigned OpIdx) {
            if (!ScheduleCopyableDataMap.empty()) {
              const EdgeInfo EI = {UserTE, OpIdx};
              if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
                DecrUnsched(CD, /*IsControl=*/false);
                return;
              }
            }
            auto It = OperandsUses.find(I);
            assert(It != OperandsUses.end() && "Operand not found");
            if (It->second > 0) {
              --It->getSecond();
              assert(TotalOpCount > 0 && "No more operands to decrement");
              --TotalOpCount;
              if (ScheduleData *OpSD = getScheduleData(I))
                DecrUnsched(OpSD, /*IsControl=*/false);
            }
          };

          for (ScheduleBundle *Bundle : Bundles) {
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            // Need to search for the lane since the tree entry can be
            // reordered.
            int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
                                     find(Bundle->getTreeEntry()->Scalars, In));
            assert(Lane >= 0 && "Lane not set");
            if (Bundle->getTreeEntry()->State != TreeEntry::SplitVectorize &&
                !Bundle->getTreeEntry()->ReorderIndices.empty())
              Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(
                              Bundle->getTreeEntry()->Scalars.size()) &&
                   "Couldn't find extract lane");
            assert((In->getNumOperands() ==
                        Bundle->getTreeEntry()->getNumOperands() ||
                    Bundle->getTreeEntry()->isCopyableElement(In)) &&
                   "Missed TreeEntry operands?");
            for (unsigned OpIdx :
                 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
              if (auto *I = dyn_cast<Instruction>(
                      Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): "
                                  << *I << "\n");
                DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
              }
          }
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands())
            if (auto *I = dyn_cast<Instruction>(U.get())) {
              LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): " << *I
                                << "\n");
              DecrUnschedForOp(BundleMember->getInst(), U.getOperandNo(), I);
            }
        }
        // Handle the memory and control dependencies.
        if (auto *SD = dyn_cast<ScheduleData>(BundleMember)) {
          SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
          for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
            if (!VisitedMemory.insert(MemoryDep).second)
              continue;
            LLVM_DEBUG(dbgs() << "SLP:   check for readiness (mem): "
                              << *MemoryDep << "\n");
            DecrUnsched(MemoryDep);
          }
          SmallPtrSet<const ScheduleData *, 4> VisitedControl;
          for (ScheduleData *Dep : SD->getControlDependencies()) {
            if (!VisitedControl.insert(Dep).second)
              continue;
            LLVM_DEBUG(dbgs() << "SLP:   check for readiness (ctrl): " << *Dep
                              << "\n");
            DecrUnsched(Dep, /*IsControl=*/true);
          }
        }
      };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
        // If the instruction is also a copyable element of some vectorized
        // node, process it as part of pseudo-bundles built for those nodes.
        SmallVector<ScheduleBundle *> Bundles;
        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
        auto *In = SD->getInst();
        if (R.isVectorized(In)) {
          for (TreeEntry *TE : R.getTreeEntries(In)) {
            if (!TE->isCopyableElement(In) ||
                In->getNumOperands() != TE->getNumOperands())
              continue;
            std::unique_ptr<ScheduleBundle> &BundlePtr =
                PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
            BundlePtr->setTreeEntry(TE);
            BundlePtr->add(SD);
            Bundles.push_back(BundlePtr.get());
          }
        }
        ProcessBundleMember(SD, Bundles);
      } else {
        auto &Bundle = *cast<ScheduleBundle>(Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << Bundle << "\n");
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          SmallVector<ScheduleBundle *> CopyableBundle;
          ArrayRef<ScheduleBundle *> SDBundles;
          if (isa<ScheduleCopyableData>(SD)) {
            CopyableBundle.push_back(&Bundle);
            SDBundles = CopyableBundle;
          } else {
            SDBundles = getScheduleBundles(SD->getInst());
          }
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, SDBundles);
          }
        }
      }
    }
    /// Verify basic self consistency properties of all schedule data in the
    /// region.
    void verify() {
      if (!ScheduleStart)
        return;
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
            Bundle->verify();
          }
          continue;
        }
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(*SD) &&
               "primary schedule data not in window?");
        SD->verify();
      }

      assert(all_of(ReadyInsts,
                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
                    }) &&
             "item in ready list not ready?");
    }
    /// Put all instructions into the ReadyList which are ready for
    /// scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
          if (!Bundles.empty()) {
            for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
                continue;
              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
                ReadyList.insert(Bundle);
                LLVM_DEBUG(dbgs() << "SLP:    initially in ready list: "
                                  << *Bundle << "\n");
              }
            }
            continue;
          }
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs() << "SLP:    initially in ready list: " << *SD
                            << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
                                const InstructionsState &S,
                                const EdgeInfo &EI);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleBundle *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S, const EdgeInfo &EI);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();
    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;
    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to an Instruction.
    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleCopyableData to an (EdgeInfo, Value) pair.
    SmallDenseMap<std::pair<EdgeInfo, const Value *>,
                  std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    /// Maps instructions to all their related copyable schedule data.
    SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    /// Maps (user, operand index, operand) triples to the related copyable
    /// schedule data.
    SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
                  SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInstUser;

    /// Maps instructions to the copyable schedule data of their users.
    SmallDenseMap<const Instruction *,
                  SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    /// Attaches ScheduleBundles to an Instruction.
    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
        ScheduledBundles;
    /// The list of all created schedule bundles.
    SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleEntity *> ReadyInsts;
    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;
    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration it
    /// is incremented, which "removes" all ScheduleData from the region.
    int SchedulingRegionID = 1;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need
  /// extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
  /// A DenseMapInfo implementation for using DenseMaps/DenseSets of sorted
  /// SmallVectors of unsigned (OrdersType).
  struct OrdersTypeDenseMapInfo {
    static OrdersType getEmptyKey() {
      OrdersType V;
      V.push_back(~1U);
      return V;
    }

    static OrdersType getTombstoneKey() {
      OrdersType V;
      V.push_back(~2U);
      return V;
    }

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  // Analyses and the function being vectorized.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  LoopInfo *LI;
  DominatorTree *DT;
  AssumptionCache *AC;
  DemandedBits *DB;
  const DataLayout *DL;
  OptimizationRemarkEmitter *ORE;

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented. The values map to (width, signedness) pairs,
  /// where "signed" is true if the value must be sign-extended, rather than
  /// zero-extended, back to its original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains the max-min
  /// pair of the type sizes used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes which are the roots of a new bitwidth
  /// analysis attempt, like trunc, IToFP or ICmp nodes.
  DenseSet<unsigned> ExtraBitWidthNodes;
};

template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
  using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
  using SecondInfo = DenseMapInfo<unsigned>;
  static BoUpSLP::EdgeInfo getEmptyKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
                             SecondInfo::getEmptyKey());
  }
  static BoUpSLP::EdgeInfo getTombstoneKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
                             SecondInfo::getTombstoneKey());
  }
  static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
    return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
                                    SecondInfo::getHashValue(Val.EdgeIdx));
  }
  static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
                      const BoUpSLP::EdgeInfo &RHS) {
    return LHS.UserTE == RHS.UserTE && LHS.EdgeIdx == RHS.EdgeIdx;
  }
};
template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;
  /// NodeRef has to be a pointer per the GraphWriter API.
  using NodeRef = TreeEntry *;
  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Wraps the edge iterator so it dereferences to TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  // (nodes_iterator boilerplate elided in this listing)

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize ||
        Entry->State == TreeEntry::CompressVectorize)
      return "color=blue";
    return "";
  }
};
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back to be able to erase them from
      // the parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms the provided reuse mask by the given mask, so that
/// the resulting mask indexes into the final order of the scalars.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting order is just
/// an identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find a previously vectorized node whose scalars this gather can be
  // shuffled from and derive an order from that match. (Parts of the original
  // are elided in this listing.)
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      ExtractShuffles =
          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // (an additional exclusion check is elided in this listing)
    // Exclude nodes with the same parent node, they are handled separately.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // If the tree root node is not reorderable - exclude it.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // Exclude 2-element nodes reshuffled into an "odd-even" permutation, it
    // is handled by the shuffle itself.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(enumerate(Mask), [](const auto &P) {
          return P.value() % 2 != static_cast<int>(P.index()) % 2;
        }))
      return std::nullopt;
    // Perfect match in the graph, will reuse the previously vectorized node.
    // Cost is 0.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts,
                         [&](unsigned I) {
                           if (!GatherShuffles[I])
                             return 0U;
                           return std::max(
                               Entries[I].front()->getVectorFactor(),
                               Entries[I].back()->getVectorFactor());
                         });
  unsigned NumUndefs = count(CurrentOrder, NumScalars);
  if (ShuffledSubMasks.all() ||
      (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
/// Checks if the provided pointers are compatible: same address space,
/// underlying object, and (for GEPs) only a single, comparable index operand.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}

/// Calculates the minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
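// Example (not from the original source): for Sz == 4 the order {3,2,1,0} is
// "reverse"; entries equal to Sz act as wildcards, so {4,2,1,0} is also
// accepted because every fixed element already satisfies
// Order[I] == Sz - I - 1.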
/// Checks if the provided pointers represent runtime-strided accesses of type
/// \p ElemTy. \returns the stride SCEV on success, nullptr otherwise, and
/// fills \p SortedIndices when the accesses are not already consecutive.
/// (Parts of the original are elided in this listing.)
static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps,
                                     Type *ElemTy, const DataLayout &DL,
                                     ScalarEvolution &SE,
                                     SmallVectorImpl<unsigned> &SortedIndices) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find the lowest and the highest pointer SCEVs among PointerOps.
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return nullptr;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return nullptr;
    if (SE.isKnownNegative(Diff))
      PtrSCEVLowest = PtrSCEV;
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return nullptr;
    if (SE.isKnownNegative(Diff1))
      PtrSCEVHighest = PtrSCEV;
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return nullptr;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / (Size * (num_pointers - 1)).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz =
        SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return nullptr;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return nullptr;
  // Iterate through all pointers and check if all distances are unique
  // multiples of the stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    int64_t Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return nullptr;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return nullptr;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the offsets are not unique, we can't vectorize.
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return nullptr;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return nullptr;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices only for non-consecutive accesses.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  return Stride;
}
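// Worked example (hypothetical SCEVs, not from the original source): pointers
// %p, %p+3s, %p+s, %p+2s with a runtime stride s. The lowest SCEV is %p;
// dividing each distance by the candidate stride yields offsets {0,3,1,2},
// all unique, so the function succeeds and SortedIndices = {0,2,3,1}, the
// original element indices listed in increasing-offset order.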
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

/// Returns the cost of the shuffle instructions with the given \p Kind,
/// vector type \p Tp and optional \p Mask. Adds an SLP-specific cost
/// estimation for insert-subvector patterns.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr,
               ArrayRef<const Value *> Args = {}) {
  if (Kind != TTI::SK_PermuteTwoSrc)
    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(
          TTI::SK_InsertSubvector,
          getWidenedType(Tp->getElementType(), Mask.size()), Mask,
          TTI::TCK_RecipThroughput, Index, Tp);
  }
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
}

/// This is similar to TTI::getScalarizationOverhead, but if \p ScalarTy is a
/// vector type (REVEC mode), a subvector is inserted/extracted instead of a
/// scalar. (The scalar-type fallback is elided in this listing.)
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
                         bool Extract, TTI::TargetCostKind CostKind) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
             getNumElements(Ty) &&
         "Incorrect usage.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    InstructionCost Cost = 0;
    unsigned ScalarTyNumElements = VecTy->getNumElements();
    for (unsigned I : seq<unsigned>(DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      if (Insert)
        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
      if (Extract)
        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
    }
    return Cost;
  }
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind);
}

/// Calculates the cost of the vector instruction \p Opcode, treating
/// extracts of vector elements (REVEC mode) as subvector extracts.
static InstructionCost getVectorInstrCost(
    const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
    TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  if (Opcode == Instruction::ExtractElement) {
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
      return getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                            cast<VectorType>(Val), {}, CostKind,
                            Index * VecTy->getNumElements(), VecTy);
  }
  return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
                                ScalarUserAndIdx);
}

/// Calculates the extract-with-extend cost, treating vector destinations
/// (REVEC mode) as a subvector extract plus a cast.
static InstructionCost
getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode,
                         Type *Dst, VectorType *VecTy, unsigned Index,
                         TTI::TargetCostKind CostKind) {
  if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
    auto *SubTp =
        getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
    return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
                          Index * ScalarTy->getNumElements(), SubTp) +
           TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
                                CostKind);
  }
  return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
}

/// Creates a subvector insert, possibly as a shuffle (insertvector requires
/// the index to be a multiple of the subvector length).
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  const unsigned SubVecVF = getNumElements(V->getType());
  const unsigned VecVF = getNumElements(Vec->getType());
  SmallVector<int> Mask(VecVF, PoisonMaskElem);
  if (isa<PoisonValue>(Vec)) {
    auto *Begin = std::next(Mask.begin(), Index);
    std::iota(Begin, std::next(Begin, SubVecVF), 0);
    return Builder.CreateShuffleVector(V, Mask);
  }
  std::iota(Mask.begin(), Mask.end(), 0);
  std::iota(std::next(Mask.begin(), Index),
            std::next(Mask.begin(), Index + SubVecVF), VecVF);
  if (Generator)
    return Generator(Vec, V, Mask);
  // First resize V to the size of Vec, then blend.
  SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
  std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
  V = Builder.CreateShuffleVector(V, ResizeMask);
  return Builder.CreateShuffleVector(Vec, V, Mask);
}

/// Creates a subvector extract via a shuffle.
static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}
/// Builds the compress-like mask for the possible compressed loads, i.e. the
/// element position of each load within the widened load.
/// \returns true if the positions form a constant stride.
static bool buildCompressMask(ArrayRef<Value *> PointerOps,
                              ArrayRef<unsigned> Order, Type *ScalarTy,
                              const DataLayout &DL, ScalarEvolution &SE,
                              SmallVectorImpl<int> &CompressMask) {
  const unsigned Sz = PointerOps.size();
  CompressMask.assign(Sz, PoisonMaskElem);
  // The first element is always set.
  CompressMask[0] = 0;
  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
  std::optional<unsigned> Stride = 0;
  for (unsigned I : seq<unsigned>(1, Sz)) {
    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    std::optional<int64_t> OptPos =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
    if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
      return false;
    unsigned Pos = static_cast<unsigned>(*OptPos);
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (*Stride == 0) {
      *Stride = Pos;
      continue;
    }
    if (Pos != *Stride * I)
      Stride.reset();
  }
  return Stride.has_value();
}
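// Worked example (hypothetical offsets, not from the original source): loads
// at element positions {0,2,4,6} relative to the first pointer produce
// CompressMask = {0,2,4,6} with a constant stride of 2, so the gather can be
// emitted as one wide load plus a compressing shuffle, or as an interleaved
// load with factor 2. Positions {0,1,3,7} still give a valid mask but no
// constant stride, so only the masked-load-plus-compress form applies.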
/// Checks if the \p VL of loads can be represented as a wide (possibly
/// masked) load plus a compressing shuffle, and whether this beats a gather.
/// (The scalar gather-cost accumulation of the original is elided in this
/// listing.)
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const size_t Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Order, Mask);
  // Check external uses: extracting from the vector must not be more
  // expensive than keeping the scalar load.
  for (const auto [I, V] : enumerate(VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    InstructionCost ExtractCost =
        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                               Mask.empty() ? I : Mask[I]);
    InstructionCost ScalarCost =
        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
    if (ExtractCost <= ScalarCost)
      return false;
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
  if (!Diff)
    return false;
  const size_t MaxRegSize =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between pointers.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
  Align CommonAlignment = LI->getAlign();
  IsMasked = !isSafeToLoadUnconditionally(
      Ptr0, LoadVecTy, CommonAlignment, DL,
      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
      &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
                                         LI->getPointerAddressSpace()))
    return false;
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, PointerOps, PointerOps.front(),
                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
  // The cost of the equivalent gather (scalar loads + buildvector); the
  // detailed accumulation is elided in this listing.
  InstructionCost GatherCost = ScalarGEPCost;
  for (Value *V : VL)
    GatherCost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);
  InstructionCost LoadCost = 0;
  if (IsMasked) {
    LoadCost =
        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy,
                                  CommonAlignment,
                                  LI->getPointerAddressSpace(), CostKind);
  } else {
    LoadCost =
        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                            LI->getPointerAddressSpace(), CostKind);
  }
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for potential segmented (interleaved) loads.
    VectorType *AlignedLoadVecTy = LoadVecTy;
    if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                         CommonAlignment,
                                         LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost =
          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                              Instruction::Load, AlignedLoadVecTy,
                              CompressMask[1], {}, CommonAlignment,
                              LI->getPointerAddressSpace(), CostKind,
                              IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        return true;
      }
    }
  }
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Sz))
      NewMask[I] = CompressMask[Mask[I]];
    CompressMask.swap(NewMask);
  }
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}

/// Convenience overload that discards the mask/type details.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized) {
  bool IsMasked;
  unsigned InterleaveFactor;
  SmallVector<int> CompressMask;
  VectorType *LoadVecTy;
  return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
                              AreAllUsersVectorized, IsMasked, InterleaveFactor,
                              CompressMask, LoadVecTy);
}
/// Checks if strided loads can be generated out of the \p VL loads with the
/// pointers \p PointerOps:
/// 1. The target supports strided loads.
/// 2. The number of loads is large enough, or the potential stride is small
///    enough and a power of two, or the loads are simply reversed
///    consecutive (Diff == -(Sz - 1)).
/// 3. Every pairwise distance is a unique multiple of the stride.
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                            ArrayRef<unsigned> Order,
                            const TargetTransformInfo &TTI,
                            const DataLayout &DL, ScalarEvolution &SE,
                            const bool IsAnyPointerUsedOutGraph,
                            const int64_t Diff,
                            StridedPtrInfo &SPtrInfo) const {
  const size_t Sz = VL.size();
  const uint64_t AbsoluteDiff = std::abs(Diff);
  Type *ScalarTy = VL.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  if (IsAnyPointerUsedOutGraph ||
      (AbsoluteDiff > Sz &&
       (Sz > MinProfitableStridedLoads ||
        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
      Diff == -(static_cast<int64_t>(Sz) - 1)) {
    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
      return false;
    Align Alignment =
        cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
            ->getAlign();
    if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
      return false;
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    // Iterate through all pointers and check if all distances are unique
    // multiples of the stride.
    SmallSet<int64_t, 4> Dists;
    for (Value *Ptr : PointerOps) {
      int64_t Dist = 0;
      if (Ptr == PtrN)
        Dist = Diff;
      else if (Ptr != Ptr0)
        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
      // If the strides are not the same or repeated, we can't vectorize.
      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
        break;
    }
    if (Dists.size() == Sz) {
      Type *StrideTy = DL.getIndexType(Ptr0->getType());
      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
      SPtrInfo.Ty = VecTy;
      return true;
    }
  }
  return false;
}
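// Worked example (hypothetical diffs, not from the original source): 3 loads
// whose end-to-end distance is Diff == 6 elements pass the screen
// (6 % 3 == 0 and 6 / 3 == 2 is a power of two), giving
// Stride == 6 / (3 - 1) == 3; the per-pointer distances must then be exactly
// {0, 3, 6} and unique. 4 reversed consecutive loads hit the special case
// Diff == -(Sz - 1) == -3 and get Stride == -1.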
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           StridedPtrInfo &SPtrInfo, unsigned *BestVF,
                           bool TryRecursiveCheck) const {
  // (initial bookkeeping elided in this listing)
  Type *ScalarTy = VL0->getType();

  // Check that a vectorized load would load the same memory as a scalar
  // load: loads smaller than 8 bit disagree with their unvectorized version.
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const size_t Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);

  auto *VecTy = getWidenedType(ScalarTy, Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    if (const SCEV *Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
        Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
      SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
      SPtrInfo.StrideSCEV = Stride;
      return LoadsState::StridedVectorize;
    }

    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
      return LoadsState::Gather;
    // (the unsorted masked-gather path is elided in this listing)
    return LoadsState::ScatterVectorize;
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
  if (!Diff)
    return LoadsState::Gather;
  // Check that the sorted loads are consecutive.
  if (static_cast<uint64_t>(*Diff) == Sz - 1)
    return LoadsState::Vectorize;
  if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
                           *TLI, [&](Value *V) {
                             return areAllUsersVectorized(
                                 cast<Instruction>(V), UserIgnoreList);
                           }))
    return LoadsState::CompressVectorize;
  // Simple check if not a strided access - clear order.
  bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
  // Try to generate a strided load node if some of the pointers are used
  // outside of the graph (otherwise they would need extracting anyway).
  auto IsAnyPointerUsedOutGraph =
      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                 return !isVectorized(U) && !MustGather.contains(U);
               });
      });
  if (IsPossibleStrided &&
      isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
                    IsAnyPointerUsedOutGraph, *Diff, SPtrInfo))
    return LoadsState::StridedVectorize;
  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
    return LoadsState::Gather;
  // Correctly compare the cost of loads + shuffles against strided/masked
  // gather loads. Returns true if vectorized + shuffles representation is
  // better than just gather. (Large parts of the per-VF cost analysis are
  // elided in this listing.)
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    // Compare masked gather cost and loads + insert subvector costs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of the masked gather GEPs: roughly a buildvector if
    // not a splat.
    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
    auto *PtrVecTy = getWidenedType(PtrScalarTy, VecTy->getNumElements());
    if (static_cast<unsigned>(
            count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
        PointerOps.size() - 1)
      VectorGEPCost +=
          getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy, DemandedElts,
                                   /*Insert=*/true, /*Extract=*/false,
                                   CostKind);
    // The cost of the scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                        [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         cast<Instruction>(V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of the masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   cast<LoadInst>(VL0)->getPointerOperand(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost =
        getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
                                 /*Insert=*/true, /*Extract=*/false,
                                 CostKind) +
        ScalarLoadsCost;
    // If the list of loads is small, directly compare masked gather cost and
    // gather cost.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if
    // vectorized + shuffles is better than just gather.
    for (unsigned VF =
             getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
         VF >= MinVF;
         VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        StridedPtrInfo SPtrInfo;
        LoadsState LS =
            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                              SPtrInfo, BestVF, /*TryRecursiveCheck=*/false);
        if (LS == LoadsState::Gather)
          DemandedElts.setBits(Cnt, Cnt + VF);
        // (per-slice cost accumulation elided in this listing: depending on
        // LS it adds TTI.getMemoryOpCost, TTI.getStridedMemoryOpCost,
        // TTI.getMaskedMemoryOpCost or TTI.getGatherScatterOpCost for the
        // subvector, plus the insert-subvector shuffle cost.)
      }
      if (!DemandedElts.isZero()) {
        // (cost of gathering the remaining scalar loads elided)
      }
      // (final per-VF comparison against MaskedGatherCost elided)
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve analysis of the pointers; if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which is not
  // profitable. Also, if the loads are loop-invariant, a masked gather can
  // hoist them out of the loop.
  Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if a potential masked gather can be represented as a series of
    // loads + insertsubvectors.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::Gather;
    return LoadsState::ScatterVectorize;
  }
  return LoadsState::Gather;
}
/// Clusters the given list of pointer operands by underlying object and basic
/// block and, within each cluster, sorts them by distance from the base.
/// \returns true and fills \p SortedIndices on success.
static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {
  assert(
      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  // Map from (basic block, base) pairs to vectors of
  // (Ptr, distance-from-base, original index) tuples.
  SmallDenseMap<
      std::pair<BasicBlock *, Value *>,
      SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>>
      Bases;
  Bases
      .try_emplace(std::make_pair(
          BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
      .first->second.emplace_back()
      .emplace_back(VL.front(), 0U, 0U);

  SortedIndices.clear();
  for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
    auto Key = std::make_pair(BBs[Cnt + 1],
                              getUnderlyingObject(Ptr, RecursionMaxDepth));
    bool Found =
        any_of(Bases.try_emplace(Key).first->second, [&](auto &Base) {
          std::optional<int64_t> Diff =
              getPointersDiff(ElemTy, std::get<0>(Base.front()), ElemTy, Ptr,
                              DL, SE, /*StrictCheck=*/true);
          if (!Diff)
            return false;
          Base.emplace_back(Ptr, *Diff, Cnt + 1);
          return true;
        });
    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;
      // Not found already - add a new base.
      Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
    }
  }
  if (Bases.size() == VL.size())
    return false;
  if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
                            Bases.front().second.size() == VL.size()))
    return false;

  // For each base, sort the pointers by increasing distance; bases themselves
  // are ordered by walking to a common underlying-object root.
  auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
    SmallPtrSet<Value *, 13> FirstPointers;
    SmallPtrSet<Value *, 13> SecondPointers;
    Value *P1 = Ptr1;
    Value *P2 = Ptr2;
    while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
      FirstPointers.insert(P1);
      SecondPointers.insert(P2);
      P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
      P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
    }
    assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
           "Unable to find matching root.");
    return FirstPointers.contains(P2);
  };
  for (auto &Base : Bases) {
    for (auto &Vec : Base.second) {
      if (Vec.size() > 1) {
        stable_sort(Vec, llvm::less_second());
        int64_t InitialOffset = std::get<1>(Vec[0]);
        bool AnyConsecutive =
            all_of(enumerate(Vec), [InitialOffset](const auto &P) {
              return std::get<1>(P.value()) ==
                     int64_t(P.index()) + InitialOffset;
            });
        // Fill SortedIndices only if it looks worthwhile to sort the ptrs.
        if (!AnyConsecutive)
          return false;
      }
    }
    stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
      return ComparePointers(std::get<0>(V1.front()),
                             std::get<0>(V2.front()));
    });
  }

  for (auto &T : Bases)
    for (const auto &Vec : T.second)
      for (const auto &P : Vec)
        SortedIndices.push_back(std::get<2>(P));

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
/// Checks if two insertelement instructions belong to the same buildvector
/// sequence.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to
  // find either VU as the original vector for IE2 or V as the original vector
  // for IE1, bailing out if any insert index is reused along the way.
  SmallBitVector ReusedIdx(
      cast<FixedVectorType>(VU->getType())->getNumElements());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}

/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if reuse shuffle indices can be improved by reordering. For this,
    // check that the reuse mask is "clustered", i.e. each scalar value is
    // used once in each submask of size Sz.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(*TTI,
                           getWidenedType(TE.Scalars.front()->getType(),
                                          2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    if (!isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of VF size; reuse shuffles are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       TE.UserTreeIndex.UserTE->getOpcode() != Instruction::Store) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
           InsertElementInst>(TE.getMainOp()))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    TE.buildAltOpShuffleMask(
        [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(),
                                        *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (int I : seq<int>(VF))
      if (Mask[I] != PoisonMaskElem)
        ResOrder[Mask[I] % VF] = I;
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    // Find the head of each buildvector sequence fed by the phis, so phis can
    // be ordered by their users.
    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(*II->user_begin());
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      if (!DT->isReachableFromEntry(BB1))
        return false;
      if (!DT->isReachableFromEntry(BB2))
        return true;
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
        return false;
      if (V1->use_empty())
        return false;
      if (V2->use_empty())
        return true;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          if (UserBVHead[I1] && !UserBVHead[I2])
            return true;
          if (!UserBVHead[I1])
            return false;
          if (UserBVHead[I1] == UserBVHead[I2])
            return getElementIndex(IE1) < getElementIndex(IE2);
          if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
            return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                        UserBVHead[I2]->getParent());
          return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
        }
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) == EE2->getOperand(0))
            return getElementIndex(EE1) < getElementIndex(EE2);
          auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
          auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
          if (Inst1 && !Inst2)
            return true;
          if (!Inst1 && Inst2)
            return false;
          if (Inst1 && Inst2) {
            if (Inst1->getParent() != Inst2->getParent())
              return CompareByBasicBlocks(Inst1->getParent(),
                                          Inst2->getParent());
            return Inst1->comesBefore(Inst2);
          }
          auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
          auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
          assert(P1 && P2 &&
                 "Expected either instructions or arguments vector operands.");
          return P1->getArgNo() < P2->getArgNo();
        }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(TE.getMainOp())) &&
      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as just a
      // shuffle of a single/two vectors.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    //   insertelement poison, v, 0 [+ permute]
    // is cheaper than
    //   insertelement poison, v, n
    // then try to reorder.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      StridedPtrInfo SPtrInfo;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps, SPtrInfo);
      if (Res == LoadsState::Vectorize ||
          Res == LoadsState::StridedVectorize ||
          Res == LoadsState::CompressVectorize)
        return std::move(CurrentOrder);
    }
    if (std::optional<OrdersType> CurrentOrder =
            findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
      return CurrentOrder;
  }
  return std::nullopt;
}
/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}

void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses - just reorder reuses mask.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
7791 "Expected same size of orders");
7792 size_t Sz = Order.
size();
7795 if (Order[Idx] != Sz)
7796 UsedIndices.
set(Order[Idx]);
7798 if (SecondaryOrder.
empty()) {
7800 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7804 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7805 !UsedIndices.
test(SecondaryOrder[Idx]))
7806 Order[Idx] = SecondaryOrder[Idx];
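// Worked example (hypothetical orders, not from the original source): with
// Sz == 4, Order = {2,3,4,4} (a value of Sz means "unset") and
// SecondaryOrder = {2,3,0,1}, slots 2 and 3 are filled from the secondary
// order because their values 0 and 1 are not yet used, yielding the combined
// order {2,3,0,1}. With an empty secondary order, unset slots fall back to
// their own index where that index is still unused.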
bool BoUpSLP::isProfitableToReorder() const {
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Phi nodes with many operands are too expensive to check.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    // Check if the tree contains only stores and (reordered) split nodes; in
    // this case reordering is profitable only if there is more than one
    // reordered split node.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      const unsigned ReorderedSplitsCnt =
          count_if(VectorizableTree,
                   [&](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->State == TreeEntry::SplitVectorize &&
                            !TE->ReorderIndices.empty() &&
                            TE->UserTreeIndex.UserTE &&
                            TE->UserTreeIndex.UserTE->State ==
                                TreeEntry::Vectorize &&
                            isa<PHINode>(TE->getMainOp());
                   });
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              VectorizableTree,
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() ||
                        none_of(TE->Scalars, [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
          return false;
        continue;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = false;
          continue;
        }
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return true;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
               TE->Scalars.size() / 2))
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    // (the original also weighs HasLoad here; simplified in this listing)
    return !HasPhis;
  }
  return true;
}
void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                                          ArrayRef<int> MaskOrder) {
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  SmallVector<int> NewMask(getVectorFactor());
  SmallVector<int> NewMaskOrder(getVectorFactor());
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
  if (Idx == 0) {
    copy(Mask, NewMask.begin());
    copy(MaskOrder, NewMaskOrder.begin());
  } else {
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
    copy(Mask, std::next(NewMask.begin(), Offset));
    copy(MaskOrder, std::next(NewMaskOrder.begin(), Offset));
  }
  reorderScalars(Scalars, NewMask);
  reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
  if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
}
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

  // Phi nodes can have a preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;

  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g. the addsub instruction on x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;

  // Maps a TreeEntry to the reorder indices of its external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF: currently vectorized
  // stores, loads, extracts + some gathering of extracts.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // x86. Reordering them into [fsub,fadd] blocks this pattern, so take
    // their order into account when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target, consider its order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
      // TODO: Check the reverse order too.
    }

    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() ==
             Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the
      // bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      while (true) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We just look for the most used
    // order and reorder the scalar elements in the nodes according to it.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this node; still need to extend and use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() &&
          !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count the number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars, use the natural
        // order: it is an attempt to reorder a node with reused scalars but
        // with external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Order, 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But if a filled identity (non-empty order)
      // was found with the same number of uses as the new candidate order,
      // choose the candidate.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        combineOrders(Pair.first, BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set the order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to correctly handle reordering of the user node.
          assert((!TE->UserTreeIndex ||
                  TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
                  TE->UserTreeIndex.UserTE->Scalars.size() ==
                      TE->Scalars.size() ||
                  TE->UserTreeIndex.UserTE->State ==
                      TreeEntry::SplitVectorize) &&
                 "All users must be of VF size.");
          // Update the ordering of the operands with the smaller VF than the
          // given one. (A guard on the user node's state is elided in this
          // listing.)
          reorderNodeWithReuses(*TE, Mask);
          // Update orders in user split vectorize nodes.
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
               InsertElementInst>(TE->getMainOp()))) {
        assert((!TE->isAltShuffle() ||
                (TE->State == TreeEntry::SplitVectorize &&
                 TE->ReuseShuffleIndices.empty())) &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State ==
                     TreeEntry::SplitVectorize) {
        // Update orders in user split vectorize nodes.
        TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
      }
    }
  }
}
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      Edges.emplace_back(I, TE);
    if (TE->State == TreeEntry::ScatterVectorize &&
        TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
    if (ReorderableGathers.contains(TE))
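// Bottom-to-top reordering: entries are popped from a priority queue ordered
// by TreeEntryCompare (user node index first, then own index) and grouped by
// their common user. Each group then votes in OrdersUses for the operand
// order it prefers, e.g. for scalars <B, A, C, D> the order {1, 0, 2, 3}
// wins only if it outvotes the identity order.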
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
  while (!Queue.empty()) {
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    while (!Queue.empty()) {
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(Data.second.size() <= 2 &&
             "Expected not greater than 2 operands for split vectorize node.");
             [](const auto &Op) { return !Op.second->UserTreeIndex; }))
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
          const auto BestOrder =
        const unsigned E = Order.size();
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        Queue.push(Data.first);
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
        return OpTE->ReorderIndices;
      if (Order.size() == 1)
      Value *Root = OpTE->hasState()
      auto GetSameNodesUsers = [&](Value *Root) {
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
          return CI->arg_size();
        return TE->getNumOperands();
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
        if (!RevisitedOps.insert(UTE).second)
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      for (TreeEntry *UTE : Users) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          Queue.push(const_cast<TreeEntry *>(Op));
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
        if (TE->isGather()) {
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
        if (AllowsReordering(UserTE))
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
    if (OrdersUses.empty()) {
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    for (auto &Pair : OrdersUses) {
      IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    for (TreeEntry *Gather : GatherOps) {
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    if (Data.first->State != TreeEntry::Vectorize ||
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        Queue.push(Data.first);
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
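// getRootEntryInstruction special-cases strided load/store entries whose
// ReorderIndices form a reverse order; such entries are likely anchored on
// the opposite-end scalar, while all other entries fall through to the
// default choice.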
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
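// External-use collection: walk every vectorized entry lane by lane and
// record in ExternalUses the scalars that are used outside the tree. A
// scalar with at least ScalarToTreeEntries.size() + 1 users cannot have all
// of its users inside the tree, so it is recorded eagerly with a null user;
// values explicitly listed in ExternallyUsedValues are recorded the same way.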
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << " for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
            !UseEntries.empty()) {
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                all_of(UseEntries, [&](TreeEntry *UseEntry) {
                  return UseEntry->State == TreeEntry::ScatterVectorize ||
                      Scalar, getRootEntryInstruction(*UseEntry), TLI,
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                   [](TreeEntry *UseEntry) {
                     return UseEntry->isGather();
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
                   << " from lane " << FoundLane << " from " << *Scalar
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
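// collectUserStores groups the stores that consume scalars of \p TE by
// (parent block, stored type, underlying pointer). A store is appended only
// if it is simple, belongs to the same function, its lane does not collide
// with an already collected one, and its pointer is a known constant
// distance from the first store in the group.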
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      if (StoresVec.size() > Lane)
      if (!StoresVec.empty()) {
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
      StoresVec.push_back(SI);
  for (auto &P : PtrToStoresMap) {

  StoreInst *S0 = StoresVec[0];
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        SI->getPointerOperand(), *DL, *SE,
  if (StoreOffsetVec.size() != StoresVec.size())
  sort(StoreOffsetVec, llvm::less_first());
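  // After sorting by offset, the group forms a vectorizable chain only if
  // the offsets are strictly consecutive. E.g. offsets {0, 1, 2, 3} pass,
  // while {0, 1, 3, 4} fail at the first gap (P.first != PrevDist + 1).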
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
    ReorderIndices.clear();
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
    if (StoresVec.size() != NumLanes)
    if (!canFormVector(StoresVec, ReorderIndices))
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
  UserIgnoreList = &UserIgnoreLst;
  buildTreeRec(Roots, 0, EdgeInfo());

  buildTreeRec(Roots, 0, EdgeInfo());
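// The helper below clusters candidate loads: all members of a cluster share
// the parent block, the loaded type, and the underlying pointer, and each
// member is keyed by its constant pointer distance from the cluster front.
// A load whose distance slot is already taken by a different load starts a
// new cluster.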
                                       bool AddNew = true) {
  for (Value *V : VL) {
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
  auto FindMatchingLoads =
          int64_t &Offset, unsigned &Start) {
          return GatheredLoads.end();
        std::optional<int64_t> Dist =
            Data.front().first->getType(),
            Data.front().first->getPointerOperand(), DL, SE,
        for (std::pair<LoadInst *, int64_t> P : Data) {
        unsigned NumUniques = 0;
        for (auto [Cnt, Pair] : enumerate(Loads)) {
          bool Used = DataLoads.contains(Pair.first);
          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
            Repeated.insert(Cnt);
        if (NumUniques > 0 &&
            (Loads.size() == NumUniques ||
             (Loads.size() - NumUniques >= 2 &&
              Loads.size() - NumUniques >= Loads.size() / 2 &&
          return std::next(GatheredLoads.begin(), Idx);
        return GatheredLoads.end();
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset,
        return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        Loads.push_back(Data[Idx]);
          GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
      while (It != GatheredLoads.end()) {
            std::next(It), GatheredLoads.end(),
            [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
        GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
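// tryToVectorizeGatheredLoads attempts to build extra tree entries for loads
// that previously ended up in gather nodes: each cluster is sorted by
// distance, consecutive ranges are carved out (GetVectorizedRanges), and the
// pass keeps whichever of the sorted/original orders leaves fewer scalar
// loads. Masked gather is used only as a final fallback, and only when the
// target reports it legal and non-scalarized for the widened type.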
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
  GatheredLoadsEntriesFirst = VectorizableTree.size();
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
          SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
          unsigned MaxVF) {
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
        *TTI, Loads.front()->getType(), MaxVF);
        *TTI, Loads.front()->getType(), NumElts - 1)) {
    if (Final && CandidateVFs.empty())
    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
        bool AllowToVectorize = false;
        bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
          for (LoadInst *LI : Slice) {
            if (LI->hasOneUse())
            if (static_cast<unsigned int>(std::distance(
                    LI->user_begin(), LI->user_end())) != LI->getNumUses())
            if (!IsLegalBroadcastLoad)
            for (User *U : LI->users()) {
              for (const TreeEntry *UTE : getTreeEntries(U)) {
                for (int I : seq<int>(UTE->getNumOperands())) {
                    return V == LI || isa<PoisonValue>(V);
          AllowToVectorize = CheckIfAllowed(Slice);
            any_of(ValueToGatherNodes.at(Slice.front()),
                   [=](const TreeEntry *TE) {
                     return TE->Scalars.size() == 2 &&
                            ((TE->Scalars.front() == Slice.front() &&
                              TE->Scalars.back() == Slice.back()) ||
                             (TE->Scalars.front() == Slice.back() &&
                              TE->Scalars.back() == Slice.front()));
        if (AllowToVectorize) {
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          StridedPtrInfo SPtrInfo;
              PointerOps, SPtrInfo, &BestVF);
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (MaskedGatherVectorized.empty() ||
                Cnt >= MaskedGatherVectorized.back() + NumElts)
            Results.emplace_back(Values, LS);
            VectorizedLoads.insert_range(Slice);
            if (Cnt == StartIdx)
              StartIdx += NumElts;
            if (StartIdx >= Loads.size())
          if (!MaskedGatherVectorized.empty() &&
              Cnt < MaskedGatherVectorized.back() + NumElts)
        if (!AllowToVectorize || BestVF == 0)
      for (unsigned Cnt : MaskedGatherVectorized) {
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        VectorizedLoads.insert_range(Slice);
        if (Cnt == StartIdx)
          StartIdx += NumElts;
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
  auto ProcessGatheredLoads =
          bool Final = false) {
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
          if (Loads.size() <= 1)
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
            VectorizedLoads.clear();
                GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                    UnsortedNonVectorized, Final,
                                    OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
                              << Slice.size() << ")\n");
            for (Value *L : Slice)
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                    UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                    if (*CommonVF != E->Scalars.size())
                  if (Pos != Idx && InterleavedLoadsDistance) {
                        if (isa<Constant>(V))
                        if (isVectorized(V))
                        const auto &Nodes = ValueToGatherNodes.at(V);
                        return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                               !is_contained(Slice, V);
                      InterleavedLoadsDistance.reset();
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
              DeinterleavedNodes.clear();
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                StridedPtrInfo SPtrInfo;
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                  UserMaxVF = InterleaveFactor * VF;
                  InterleaveFactor = 0;
            unsigned ConsecutiveNodesSize = 0;
            if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                       [&, Slice = Slice](const auto &P) {
                           return std::get<1>(P).contains(V);
                         if (It == Slice.end())
                         const TreeEntry &TE =
                             *VectorizableTree[std::get<0>(P)];
                         StridedPtrInfo SPtrInfo;
                             VL, VL.front(), Order, PointerOps, SPtrInfo);
                         ConsecutiveNodesSize += VL.size();
                         size_t Start = std::distance(Slice.begin(), It);
                         size_t Sz = Slice.size() - Start;
                         return Sz < VL.size() ||
                                Slice.slice(Start, VL.size()) != VL;
            if (InterleaveFactor == 0 &&
                [&, Slice = Slice](unsigned Idx) {
                  SmallVector<Value *> PointerOps;
                  StridedPtrInfo SPtrInfo;
                  return canVectorizeLoads(
                             Slice.slice(Idx * UserMaxVF, UserMaxVF),
                             Slice[Idx * UserMaxVF], Order, PointerOps,
                             SPtrInfo) == LoadsState::ScatterVectorize;
            if (Slice.size() != ConsecutiveNodesSize)
              MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                    Slice.slice(I, std::min(VF, E - I));
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                               VectorizableTree[std::get<0>(P)]
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
        NonVectorized.append(SortedNonVectorized);
        return NonVectorized;
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    if (!Ref.empty() && !NonVectorized.empty() &&
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      for (LoadInst *LI : NonVectorized) {
      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    if (!E.ReorderIndices.empty()) {
      SmallVector<int> ReorderMask;
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
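// The hashing helper below computes a coarse (Key, SubKey) pair used to
// bucket potentially compatible instructions; for instance, GEPs hash on
// their pointer operand, and commutative instructions appear to fold both
// operand orders into one subkey.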
                                 bool AllowAlternate) {
        isValidForAlternation(I->getOpcode())) {
      std::pair<size_t, size_t> OpVals =
      if (CI->isCommutative())
      SubKey = hash_value(Gep->getPointerOperand());
  return std::make_pair(Key, SubKey);

                                  Instruction *AltOp,
                                  const TargetLibraryInfo &TLI);
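// areAltOperandsProfitable estimates whether emitting the bundle as a pair of
// alternating opcodes (Opcode0/Opcode1, selected via OpcodeMask) beats
// scalarization: it counts unique opcodes, undefs (UndefCnt) and the extra
// shuffles the alternate form would need (ExtraShuffleInsts), and rejects the
// alternate form when those overheads dominate the operand count.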
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
                            Opcode1, OpcodeMask))
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    for (Value *V : VL) {
      switch (Res.value_or(0)) {
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  unsigned ExtraShuffleInsts = 0;
        return is_contained(Operands.back(), V);
      ++ExtraShuffleInsts;
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  DenseMap<Value *, unsigned> Uniques;
      if (!Res.second && Res.first->second == 1)
        ++ExtraShuffleInsts;
      ++Res.first->getSecond();
        UniqueOpcodes.insert(I->getOpcode());
      else if (Res.second)
  return none_of(Uniques, [&](const auto &P) {
    return P.first->hasNUsesOrMore(P.second + 1) &&
           none_of(P.first->users(), [&](User *U) {
             return isVectorized(U) || Uniques.contains(U);
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());

                                         const unsigned VF, unsigned MinBW,

static std::pair<InstructionCost, InstructionCost>
    FMF = FPCI->getFastMathFlags();
                        LibCost.isValid() ? LibCost : ScalarLimit);
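// getScalarsVectorizationState classifies a bundle per opcode: it returns
// Vectorize/StridedVectorize/ScatterVectorize/CompressVectorize when the
// scalars can form a profitable node, and NeedToGather otherwise, with an
// LLVM_DEBUG message explaining each rejection.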
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
         "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      for (Value *Incoming : PHI->incoming_values()) {
        if (Term && Term->isTerminator()) {
              << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ExtractElement:
      return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::InsertElement: {
    for (Value *V : VL) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
           "Non-constant or undef index?");
      return !SourceVectors.contains(V);
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
      return SourceVectors.contains(V) && !V->hasOneUse();
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Load: {
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
      return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
        return TE->Idx >= *GatheredLoadsEntriesFirst;
      return TreeEntry::Vectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        return !LI || !LI->isSimple();
      return TreeEntry::NeedToGather;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    for (Value *V : VL) {
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    for (Value *V : VL) {
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && I->isBinaryOp() && !I->isFast();
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    for (Value *V : VL) {
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *Op = I->getOperand(1);
          (Op->getType() != Ty1 &&
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Store: {
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      std::optional<int64_t> Dist =
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && !I->isFast();
      return TreeEntry::NeedToGather;
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
      return TreeEntry::NeedToGather;
    unsigned NumArgs = CI->arg_size();
    for (unsigned J = 0; J != NumArgs; ++J)
    for (Value *V : VL) {
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
        return TreeEntry::NeedToGather;
      for (unsigned J = 0; J != NumArgs; ++J) {
          if (ScalarArgs[J] != A1J) {
                << "SLP: mismatched arguments in call:" << *CI
                << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      return TreeEntry::Vectorize;
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
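// PHIHandler canonicalizes the operands of a bundle of PHIs: for small PHIs
// (at most FastLimit = 4 incoming values) operands are matched by incoming
// block directly; larger PHIs go through a block-to-index map so repeated
// incoming blocks share one operand list.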
  PHINode *Main = nullptr;
  PHIHandler() = delete;
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
      for (auto [Idx, V] : enumerate(Phis)) {
               "Expected isa instruction or poison value.");
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
    for (auto [Idx, V] : enumerate(Phis)) {
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
      for (unsigned I : IncomingValues) {
            [&](const auto &Data) {
              return !Data.value() ||
               "Expected empty operands list.");
static std::pair<Instruction *, Instruction *>
  for (Value *V : VL) {
    if (MainOp->getOpcode() == I->getOpcode()) {
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
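// Duplicate handling: VL is reduced to its unique scalars while
// ReuseShuffleIndices records how to rebuild the original lane order. When
// TryPad is set and the unique scalars can reach a full register width PWSz,
// they are padded up to PWSz instead of being reshuffled.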
                                const InstructionsState &S,
                                bool TryPad = false) {
  for (Value *V : VL) {
  size_t NumUniqueScalarValues = UniqueValues.size();
  if (NumUniqueScalarValues == VL.size() &&
    ReuseShuffleIndices.clear();
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          ReuseShuffleIndices.clear();
                                               UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
          if (!S.areInstructionsWithCopyableElements() &&
            ReuseShuffleIndices.clear();
          VL = std::move(PaddedUniqueValues);
      ReuseShuffleIndices.clear();
  VL = std::move(UniqueValues);
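// canBuildSplitNode decides whether an alternating bundle should become a
// SplitVectorize node: scalars are partitioned into Op1/Op2 by opcode
// (Op1Indices), ReorderIndices describes the split permutation, and the
// split is accepted only when the estimated cost of two homogeneous vector
// ops plus the reassembling shuffle (NewCost) stays below the original
// alt-shuffle cost (OriginalCost).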
                                 const InstructionsState &LocalState,
                                 SmallVectorImpl<Value *> &Op1,
                                 SmallVectorImpl<Value *> &Op2,
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
                 << *LocalState.getMainOp() << ".\n");
  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
      Op1Indices.set(Idx);
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
                                 LocalState.getAltOp(), *TLI))) {
      Op1Indices.set(Idx);
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ReorderIndices[Op2Cnt] = Idx;
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts >= VL.size())
  FixedVectorType *SubVecTy =
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
        VecTy, OriginalMask, Kind);
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
  if (NewCost >= OriginalCost)
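// InstructionsCompatibilityAnalysis builds an InstructionsState for bundles
// with "copyable" elements: a main opcode is chosen by voting among the
// dominating candidates (only Add and LShr are supported, per
// isSupportedOpcode), and non-matching scalars are modeled as copies through
// an idempotent form of the main operation, e.g. x + 0 or x >> 0.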
class InstructionsCompatibilityAnalysis {
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;

  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
      return I && isSupportedOpcode(I->getOpcode()) &&
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    for (Value *V : VL) {
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I) && !Operands.contains(I)) {
          BestOpcodeNum = P.second.size();
      return I && I->getParent() == MainOp->getParent() &&

  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    if (!S.isCopyableElement(V))
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};

                             SmallVectorImpl<BoUpSLP::ValueList> &Operands)
      const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
    case Instruction::InsertElement:
    case Instruction::Load:
          Op = LI->getPointerOperand();
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
        auto [Op, ConvertedOps] = convertTo(I, S);
    case Instruction::GetElementPtr: {
      const unsigned IndexIdx = 1;
        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
              : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                    ->getPointerOperandType()
                                    ->getScalarType());
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
        auto *Op = GEP->getOperand(IndexIdx);
                  CI, Ty, CI->getValue().isSignBitSet(), DL)
    case Instruction::Call: {
      for (Value *V : VL) {
          Ops.push_back(I ? I->getOperand(Idx)

  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      bool TryCopyableElementsVectorization,
      bool WithProfitabilityCheck = false,
      bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
    findAndSetMainInstruction(VL, R);
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, true);
    if (!WithProfitabilityCheck)
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
           Value *V1,
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
    if (VL.size() == 2) {
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
        Candidates1.clear();
        Candidates2.clear();
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
        return InstructionsState::invalid();
    FixedVectorType *VecTy =
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::LShr:
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
      return InstructionsState::invalid();
      return InstructionsState::invalid();
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
          return C.second == 1;
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
      unsigned CopyableNum =
      return CopyableNum <= VL.size() / 2;
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    assert(S && "Invalid state!");
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
    buildOriginalOperands(S, VL, Operands);
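// getScalarsVectorizationLegality runs the cheap legality screens before any
// tree node is built: perfect-diamond reuse of an existing entry, recursion
// depth, scalable vector types, ephemeral values, scalars already on the
// user-ignore list, and alternate nodes whose extract/scalarize cost
// estimate says scalarization wins.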
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    bool TryCopyableElementsVectorization) const {
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);
    return ScalarsVectorizationLegality(S, false,
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
      return ScalarsVectorizationLegality(S, false);
      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
       LI->getLoopFor(S.getMainOp()->getParent()) &&
    return ScalarsVectorizationLegality(S, false);
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  if (S && S.getOpcode() == Instruction::ExtractElement &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
    return ScalarsVectorizationLegality(S, false,
    if (!S || !S.isAltShuffle() || VL.size() > 2)
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
    bool IsCommutative =
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
                              I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
            })) >= S.getMainOp()->getNumOperands() / 2)
    if (S.getMainOp()->getNumOperands() > 2)
    if (IsCommutative) {
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
                                I2->getOperand((Op + 1) % E));
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
  SmallVector<unsigned> SortedIndices;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE,
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
      NotProfitableForVectorization(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n";
      return ScalarsVectorizationLegality(S, false,
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
    return ScalarsVectorizationLegality(S, false);
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
                          << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, false,
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
            all_of(I->operands(), [&](const Use &U) {
              return isa<ExtractElementInst>(U.get());
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
      return std::make_pair(Vectorized, Extracted);
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      Type *ScalarTy = VL.front()->getType();
          false, true, Kind);
          *TTI, ScalarTy, VecTy, Vectorized,
          true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           "Expected pointers only.");
    assert(It != VL.end() && "Expected at least one GEP.");
        !DT->isReachableFromEntry(BB))) {
      return ScalarsVectorizationLegality(S, false);
  return ScalarsVectorizationLegality(S, true);
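// buildTreeRec: each bundle first tries the split path (TrySplitNode), falls
// back to a gather entry when legality or scheduling fails, and otherwise
// schedules the bundle, creates a TreeEntry of the computed state, and
// recurses into the operands with Depth + 1.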
11321 unsigned InterleaveFactor) {
11324 SmallVector<int> ReuseShuffleIndices;
11328 auto TrySplitNode = [&](
const InstructionsState &LocalState) {
11331 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11334 auto Invalid = ScheduleBundle::invalid();
11335 auto *
TE = newTreeEntry(VL, TreeEntry::SplitVectorize,
Invalid, LocalState,
11336 UserTreeIdx, {}, ReorderIndices);
11341 getSameValuesTreeEntry(S.getMainOp(),
Op,
true))) {
11343 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11344 Idx == 0 ? 0 : Op1.
size());
11345 (void)newTreeEntry(
Op, TreeEntry::NeedToGather,
Invalid, S, {
TE, Idx});
11347 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11348 Idx == 0 ? 0 : Op1.
size());
11358 bool AreConsts =
false;
11359 for (
Value *V : VL) {
11371 if (AreOnlyConstsWithPHIs(VL)) {
11372 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to all constants and PHIs.\n");
11373 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11377 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11378 VL,
Depth, UserTreeIdx,
false);
11379 InstructionsState S = Legality.getInstructionsState();
11380 if (!Legality.isLegal()) {
11381 if (Legality.trySplitVectorize()) {
11384 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11388 Legality = getScalarsVectorizationLegality(
11389 VL,
Depth, UserTreeIdx,
true);
11390 if (!Legality.isLegal()) {
11391 if (Legality.tryToFindDuplicates())
11395 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11398 S = Legality.getInstructionsState();
11402 if (S.isAltShuffle() && TrySplitNode(S))
11408 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11413 bool IsScatterVectorizeUserTE =
11414 UserTreeIdx.UserTE &&
11415 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11418 StridedPtrInfo SPtrInfo;
11419 TreeEntry::EntryState State = getScalarsVectorizationState(
11420 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11421 if (State == TreeEntry::NeedToGather) {
11422 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11428 auto &BSRef = BlocksSchedules[BB];
11430 BSRef = std::make_unique<BlockScheduling>(BB);
11432 BlockScheduling &BS = *BSRef;
11435 std::optional<ScheduleBundle *> BundlePtr =
11436 BS.tryScheduleBundle(UniqueValues.getArrayRef(),
this, S, UserTreeIdx);
11437#ifdef EXPENSIVE_CHECKS
11441 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11442 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
11444 if (S.isAltShuffle() && ReuseShuffleIndices.
empty() && TrySplitNode(S))
11446 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11447 NonScheduledFirst.insert(VL.front());
11448 if (S.getOpcode() == Instruction::Load &&
11449 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11453 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
11455 ScheduleBundle
Empty;
11456 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() :
Empty;
11457 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
11459 unsigned ShuffleOrOp =
11460 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
11461 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &
Operands) {
11463 SmallVector<unsigned> PHIOps;
11469 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11474 for (
unsigned I : PHIOps)
11477 switch (ShuffleOrOp) {
11478 case Instruction::PHI: {
11480 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11488 case Instruction::ExtractValue:
11489 case Instruction::ExtractElement: {
11490 if (CurrentOrder.empty()) {
11491 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
11494 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
11496 for (
unsigned Idx : CurrentOrder)
11497 dbgs() <<
" " << Idx;
11504 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11505 ReuseShuffleIndices, CurrentOrder);
11507 "(ExtractValueInst/ExtractElementInst).\n";
    case Instruction::InsertElement: {
      assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

      auto OrdCompare = [](const std::pair<int, int> &P1,
                           const std::pair<int, int> &P2) {
        return P1.first > P2.first;
      };
      PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                    decltype(OrdCompare)>
          Indices(OrdCompare);
      for (int I = 0, E = VL.size(); I < E; ++I) {
        // ... (Idx = constant lane index of the I-th insertelement)
        Indices.emplace(Idx, I);
      }
      OrdersType CurrentOrder(VL.size(), VL.size());
      bool IsIdentity = true;
      for (int I = 0, E = VL.size(); I < E; ++I) {
        CurrentOrder[Indices.top().second] = I;
        IsIdentity &= Indices.top().second == I;
        Indices.pop();
      }
      if (IsIdentity)
        CurrentOrder.clear();
      TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                   /*ReuseShuffleIndices=*/{}, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
                 TE->dump());
      // ...
      buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
      break;
    }
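      // The min-heap above recovers the lane order of the inserts: popping
      // yields insert positions in increasing order, so CurrentOrder[I] is
      // the rank of the I-th insert's lane. E.g. inserts into lanes {4, 0, 2}
      // produce CurrentOrder {2, 0, 1}, while an already-sorted sequence
      // degenerates to the identity order, which is dropped via
      // CurrentOrder.clear().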
    case Instruction::Load: {
      // ...
      TreeEntry *TE = nullptr;
      // ...
      switch (State) {
      case TreeEntry::Vectorize:
        TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                          CurrentOrder, InterleaveFactor);
        if (CurrentOrder.empty())
          LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                     TE->dump());
        else
          LLVM_DEBUG(dbgs()
                         << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                     TE->dump());
        break;
      case TreeEntry::CompressVectorize:
        // Vectorizing non-consecutive loads with a masked load + compress.
        TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
        LLVM_DEBUG(
            dbgs()
                << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
            TE->dump());
        break;
      case TreeEntry::StridedVectorize:
        // Vectorizing non-consecutive loads with a strided load.
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
        TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                   TE->dump());
        break;
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with a masked gather.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices);
        LLVM_DEBUG(
            dbgs()
                << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
            TE->dump());
        break;
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected loads state.");
      }
      if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
        assert(Operands.size() == 1 && "Expected a single operand only");
        SmallVector<int> Mask;
        // ...
      }
      if (State == TreeEntry::ScatterVectorize)
        buildTreeRec(PointerOps, Depth + 1, {TE, 0});
      break;
    }
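    // Summary of the load strategies chosen above: Vectorize = one wide
    // (possibly interleaved) load; CompressVectorize = masked load of the
    // covering range followed by a compressing shuffle; StridedVectorize =
    // strided load whose (possibly runtime) stride is recorded in SPtrInfo;
    // ScatterVectorize = gather from a vector of pointers, which is why only
    // that state recurses into PointerOps as operand 0.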
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
          std::make_pair(std::numeric_limits<unsigned>::min(),
                         std::numeric_limits<unsigned>::max()));
      if (ShuffleOrOp == Instruction::ZExt ||
          ShuffleOrOp == Instruction::SExt) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMaxBW),
            std::min<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMinBW));
      } else if (ShuffleOrOp == Instruction::Trunc) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMaxBW),
            std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMinBW));
      }
      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
                 TE->dump());
      // ...
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
      if (ShuffleOrOp == Instruction::Trunc) {
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      } else if (ShuffleOrOp == Instruction::SIToFP ||
                 ShuffleOrOp == Instruction::UIToFP) {
        unsigned NumSignBits =
            ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
        if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
          APInt Mask = DB->getDemandedBits(OpI);
          NumSignBits = std::max(NumSignBits, Mask.countl_zero());
        }
        if (NumSignBits * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      }
      break;
    }
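      // Bit-width narrowing hook: sitofp/uitofp only need the demanded low
      // bits of their integer source. If at least half of the source bits are
      // known sign/zero bits (NumSignBits * 2 >= source width), the operand
      // node is queued in ExtraBitWidthNodes so the MinBWs analysis may later
      // shrink it to a narrower integer type before the conversion.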
    case Instruction::ICmp:
    case Instruction::FCmp: {
      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      // ... (operand order is canonicalized against the main predicate P0;
      // commuted compares assert "Commutative Predicate mismatch")
      for (Value *V : VL) {
        auto *Cmp = cast<CmpInst>(V);
        if (Cmp->getPredicate() != P0) {
          // ... (swap this compare's operands to match P0)
        }
      }
      // ...
      if (ShuffleOrOp == Instruction::ICmp) {
        unsigned NumSignBits0 =
            ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
        if (NumSignBits0 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
        unsigned NumSignBits1 =
            ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
        if (NumSignBits1 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
      }
      break;
    }
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze: {
      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                           "(SelectInst/UnaryOperator/BinaryOperator/"
                           "FreezeInst).\n";
                 TE->dump());
      // ...
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
      break;
    }
    case Instruction::GetElementPtr: {
      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
                 TE->dump());
      // ...
      break;
    }
    case Instruction::Store: {
      bool Consecutive = CurrentOrder.empty();
      // ...
      TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                   ReuseShuffleIndices, CurrentOrder);
      if (Consecutive)
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                   TE->dump());
      buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
      break;
    }
    case Instruction::Call: {
      // ... (check the calls are all to the same vectorizable intrinsic or
      // library function)
      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
                 TE->dump());
      // ...
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
      break;
    }
    case Instruction::ShuffleVector: {
      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      if (S.isAltShuffle()) {
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                   TE->dump());
      } else {
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                   TE->dump());
      }
      // ... (for alternate compares the operands are rebuilt per lane;
      // asserts "Expected different main/alternate predicates.")
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
      break;
    }
// ... (end of buildTreeRec)

// Flattening of an aggregate type into N equal scalar elements; structs with
// heterogeneous member types cannot be mapped to a vector.
  if (auto *ST = dyn_cast<StructType>(EltTy)) {
    // Check that the struct is homogeneous.
    for (const auto *Ty : ST->elements())
      if (Ty != *ST->element_begin())
        return 0;
    N *= ST->getNumElements();
    EltTy = *ST->element_begin();
  } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
    N *= AT->getNumElements();
    EltTy = AT->getElementType();
  } else if (auto *VT = dyn_cast<FixedVectorType>(EltTy)) {
    N *= VT->getNumElements();
    EltTy = VT->getElementType();
  }
  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
                              bool ResizeAllowed) const {
  // ...
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // ...
  Value *Vec = E0->getOperand(0);
  CurrentOrder.clear();
  // ...
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
  }
  // ...
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // Check that all of the indices extract from the correct offset.
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    // ...
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  bool ShouldKeepOrder = true;
  // Assign the initial value E to every slot so a reused extract index can
  // be detected.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    // ...
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
}
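// The order check above is effectively a bucket sort of the extract lane
// indices (rebased by MinIdx): each rebased index must be unique and in
// range, CurrentOrder records the resulting permutation, and a fully
// in-order sequence clears CurrentOrder so callers treat it as the identity.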
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
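// Mask convention for alternate nodes: lane I selects element Idx from the
// "main" vector op when IsAltOp is false, or element Sz + Idx from the
// "alternate" vector op when it is true. For {add, sub, add, sub} with
// IsAltOp matching sub, the blend mask is {0, 5, 2, 7} (Sz == 4).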
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
}

// ...
  assert(MainP != AltP && "Expected different main/alternate predicates.");
// ...
  assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
         "CmpInst expected to match either main or alternate predicate or "
         "their swapped variants.");
  // An alternate instruction is one whose predicate matches neither the main
  // predicate nor its swapped form.
  return MainP != P && MainP != SwappedP;
// ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         AltOp;
}

// ... (operand-kind classification used for TTI cost queries)
  const auto *Op0 = Ops.front();
  // ...
  const bool IsPow2Constant = all_of(Ops, [](Value *V) {
    const auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isPowerOf2();
  });
  const bool IsNegatedPow2Constant = all_of(Ops, [](Value *V) {
    const auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isNegatedPowerOf2();
  });
  // ...
  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// Returns the number of "scalar" lanes in V, i.e. the element count of V
  /// divided by the element count of ScalarTy.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    // Consider an extract-subvector-like mask, checked per VF-sized slice.
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          // ...
        }))
      return true;
    // ...
  }
  /// Combines two shuffle masks: ExtMask is applied on top of Mask.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
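  // Example: with Mask = {3, 1, 2, 0} (VF = 4, LocalVF = 4) and
  // ExtMask = {2, 2, 0, P} (P = poison), the combined result is
  // {Mask[2], Mask[2], Mask[0], P} = {2, 2, 3, P}: applying ExtMask after
  // Mask is the same as one shuffle with the composed mask.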
  /// Looks through (single-input) shuffles, trying to reduce the final
  /// shuffle to a plain permute of the innermost operand.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      // Remember the identity mask, if it is not a resizing shuffle; it is
      // used as the final fallback.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask as well.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      // ... (IsOp1Undef/IsOp2Undef: is either shuffle operand fully unused?)
      if (!IsOp1Undef && !IsOp2Undef) {
        // Cannot peek through: update the mask and mark poisoned elements.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/false) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
  /// Smart shuffle instruction emission: walks through chains of shuffles and
  /// tries to find the best matching vectors for the actual shuffle.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ... (REVEC: scale the mask by the scalar vector width)
    }
    if (V2) {
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
        VF = FTy->getNumElements();
      // Split the request into per-operand masks.
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *Op1 = V1;
      Value *Op2 = V2;
      (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
      (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
      // Check if we have 2 resizing shuffles - need to peek through operands
      // again.
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1);
        auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2);
        if (!SV1 || !SV2)
          break;
        SmallVector<int> ExtMask1(CombinedMask1.size(), PoisonMaskElem);
        for (auto [Idx, I] : enumerate(CombinedMask1)) {
          if (I == PoisonMaskElem)
            continue;
          ExtMask1[Idx] = SV1->getMaskValue(I);
        }
        SmallBitVector UseMask1 = buildUseMask(
            cast<FixedVectorType>(SV1->getOperand(1)->getType())
                ->getNumElements(),
            ExtMask1, UseMask::SecondArg);
        SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
        for (auto [Idx, I] : enumerate(CombinedMask2)) {
          if (I == PoisonMaskElem)
            continue;
          ExtMask2[Idx] = SV2->getMaskValue(I);
        }
        SmallBitVector UseMask2 = buildUseMask(
            cast<FixedVectorType>(SV2->getOperand(1)->getType())
                ->getNumElements(),
            ExtMask2, UseMask::SecondArg);
        if (SV1->getOperand(0)->getType() == SV2->getOperand(0)->getType() &&
            SV1->getOperand(0)->getType() != SV1->getType()
            /* && second operands unused */) {
          Op1 = SV1->getOperand(0);
          Op2 = SV2->getOperand(0);
          SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
          int LocalVF = ShuffleMask1.size();
          if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
            LocalVF = FTy->getNumElements();
          combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
          CombinedMask1.swap(ShuffleMask1);
          SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
          LocalVF = ShuffleMask2.size();
          if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
            LocalVF = FTy->getNumElements();
          combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
          CombinedMask2.swap(ShuffleMask2);
        }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      unsigned VF2 = std::max(cast<VectorType>(Op1->getType())
                                  ->getElementCount()
                                  .getKnownMinValue(),
                              cast<VectorType>(Op2->getType())
                                  ->getElementCount()
                                  .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF2);
        }
      }
      // ...
      if (Op1 == Op2 /* && identity mask */)
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
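  // Design note: createShuffle first splits the request into per-operand
  // masks (indices < VF go to V1, indices >= VF are rebased for V2), then
  // repeatedly peeks through nested shufflevectors on both sides, composing
  // masks via combineMasks until the operands stop changing; degenerate
  // results collapse to Builder.createIdentity/createPoison instead of
  // emitting a redundant shuffle.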
  /// Transforms mask \p CommonMask per given \p Mask to make a proper mask
  /// after shuffle emission.
  static void transformMaskAfterShuffle(SmallVectorImpl<int> &CommonMask,
                                        ArrayRef<int> Mask) {
    // ...
  }
};

/// Calculates the scalar and the vector costs of the pointer operands of a
/// set of loads/stores or GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // ...
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Pointers feed a wide load/store: the scalar side is a unit-stride
    // pointer chain.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      // For simplicity, assume Ptr -> GEP dependency chains are dropped in
      // the vectorized code if the GEP has a single use.
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay alive, the chain cost does not change.
      // ...
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Pointers are themselves a vectorized operand (e.g. of a gather): stride
    // is unknown when any GEP has non-constant indices.
    TTI::PointersChainInfo PtrsInfo =
        any_of(Ptrs,
               [](const Value *V) {
                 const auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    // ...
    if (It != Ptrs.end())
      BaseGEP = cast<GEPOperator>(*It);
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  SmallSet<size_t, 2> LoadKeyUsed;
  // ...
  // Do not reorder nodes if they are small (just 2 elements), all-constant,
  // or all instructions already have the same opcode, or if the same scalars
  // were already vectorized in an earlier node.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      // ...
      any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // Group loads by base pointer: reuse the subkey of a load that is
    // provably adjacent or pointer-compatible.
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                            LI->getType(), LI->getPointerOperand(), *DL, *SE,
                            /*StrictCheck=*/true))
          return hash_value(RLI->getPointerOperand());
      }
      for (LoadInst *RLI : LIt->second) {
        if (arePointersCompatible(RLI->getPointerOperand(),
                                  LI->getPointerOperand(), *TLI))
          return hash_value(RLI->getPointerOperand());
      }
      if (LIt->second.size() > 2)
        return hash_value(LIt->second.back()->getPointerOperand());
    }
    // ...
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" the scalars so that extra vectorizable sub-nodes can be
  // carved out of the gather.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    // ... (compute Key/Idx for instructions, bump NumInstructions)
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          // ... (record the subvector starting at Cnt - Sz)
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          // ... (constant lanes are rebuilt by the buildvector itself)
        }
      }
    }
  }
  // Reuses always require shuffles, so a reorder cannot pay off there.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Estimate the cost of the reordered node vs the plain buildvector.
  auto *ScalarTy = TE.Scalars.front()->getType();
  // ...
  for (auto [Idx, Sz] : SubVectors) {
    // ... (add SK_InsertSubvector cost for each clustered subvector)
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  // ... (poisoned lanes get ReorderMask[I] = I + TE.ReorderIndices.size())
  InstructionCost Cost = ::getShuffleCost(
      *TTI,
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
          ? TTI::SK_PermuteTwoSrc
          : TTI::SK_PermuteSingleSrc,
      VecTy, ReorderMask);
  // ... (lanes covered by subvectors/constants: DemandedElts.clearBit(I) and
  // ReorderMask[I] = I; remaining lanes come from the buildvector side via
  // ReorderMask[I] = I + Sz; BVCost is the cost of that two-source shuffle)
  if (!DemandedElts.isAllOnes())
    BVCost += TTI->getScalarizationOverhead(VecTy, DemandedElts,
                                            /*Insert=*/true,
                                            /*Extract=*/false, CostKind);
  if (Cost >= BVCost) {
    // The reorder does not pay off: undo it.
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
                                     const InstructionsState &S,
                                     /* ... */) {
  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    assert(all_of(VL,
                  [](Value *V) {
                    return V->getType()->getScalarType()->isFloatingPointTy();
                  }) &&
           "Can only convert to FMA for floating point types");
    assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // Convertible to FMA only if the fmul operand is contractable as well.
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the cost of separate fmul + fadd/fsub against fused fmuladd.
  InstructionCost FMulPlusFAddCost = 0;
  InstructionCost FMACost = 0;
  FastMathFlags FMF;
  FMF.set();
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!S.isCopyableElement(I)) {
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
      FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
    }
    // ...
  }
  for (Value *V : Operands.front()) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(V);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      // The multiply stays live (other users): count it on the FMA side too.
      FMACost += TTI.getInstructionCost(OpI, CostKind);
    }
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  BaseGraphSize = VectorizableTree.size();
  // Turn graph-transforming mode on, and off when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) /* || best-root-pair lookahead score */;
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // Better to use full gathered-loads analysis if there are exactly 2
  // gathered load nodes, each with fewer than 16 elements.
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) == 2;

  // Checks if the scalars are used in another node.
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              function_ref<bool(Value *)> CheckContainer) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             // ...
             return CheckContainer(V);
           });
  };
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      // ...
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() &&
                       any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };
  // Partial vectorization of gather nodes: try to carve fully-vectorizable
  // slices out of wide gathers.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      // ...
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2 elements),
      // nodes already queued as loads, or unsupported opcodes.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load /* ... */))
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of another vector node.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try all register-sized slice widths, largest first.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          InstructionsState S = getSameOpcode(Slice, *TLI);
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // A two-lane splat is only worth keeping if the doubled slice
            // still occupies two registers.
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              // ...
              (S.getOpcode() == Instruction::Load /* ... */) ||
              (S.getOpcode() != Instruction::Load /* ... */)) {
            // ...
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
                    TTI::TCC_Expensive) {
              // ...
            }
            if (S.getOpcode() == Instruction::Load) {
              // Re-check the slice as a load bundle.
              StridedPtrInfo SPtrInfo;
              // ... (canVectorizeLoads over Slice, Order, PointerOps,
              // SPtrInfo; reject pure gathers)
              if (UserIgnoreList && E.Idx == 0)
                continue;
            } else if (S.getOpcode() == Instruction::ExtractElement ||
                       (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                            TTI::TCC_Expensive &&
                        !CheckOperandsProfitability(
                            /* ... */))) {
              // Do not vectorize extractelements or cheap operations with
              // unprofitable operands.
              continue;
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (auto It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            // If any instruction is vectorized already - do not try again.
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2) {
              // ...
            }
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
    // If the node is a gather with reorder - reorder the scalars directly.
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      reorderScalars(E.Scalars, Mask);
      E.ReorderIndices.clear();
    }
    // ...
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorization here.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      // ...
      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        // Check if a consecutive load (plus the reorder shuffle it implies)
        // is better represented as a strided load.
        SmallVector<int> Mask;
        // ...
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) /* + shuffle cost */;
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost) {
          // Strided load is more profitable than consecutive load + shuffle:
          // rewrite the node.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      // ...
      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        // Check if a consecutive store (plus reorder shuffle) is better
        // represented as a strided store.
        SmallVector<int> Mask;
        // ...
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) /* + shuffle cost */;
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reorder + consecutive
          // store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
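      // Interleave example: a reorder mask like {0, 4, 1, 5, 2, 6, 3, 7} over
      // 8 stores interleaves two 4-element halves (Factor = 2); targets with
      // native interleaved-store support can emit it directly instead of a
      // shuffle followed by a wide store.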
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ... (check whether the selects form a min/max pattern)
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if it is possible to convert (a*b)+c into fma/fmuladd.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      // This node is an fmuladd node.
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The fmul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with a small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        // ...
        count_if(ArrayRef(VectorizableTree).drop_front(),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) /* ... */) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        // ...
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand()),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize the gathered loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
/// Merges shuffle masks and estimates the cost of the final shuffle(s).
/// Cost-estimation counterpart of the shuffle-instruction emitter; the class
/// name is grounded by its later use in processBuildVector.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, common mask estimation of 2 nodes is postponed, to be merged
  /// into a single shuffle if possible.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    assert(It != VL.end() && "Expected at least one non-undef value.");
    // A broadcast is cheaper than inserting every lane separately.
    const bool NeedShuffle =
        count(VL, *It) > 1 /* && not already a splat-friendly layout */;
    if (!NeedShuffle) {
      // Single unique value: either an identity subvector shuffle...
      return TTI.getShuffleCost(
          /* ... */);
      // ...or a single insertelement.
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
                                    PoisonValue::get(VecTy), *It);
    }
    // Broadcast: insert lane 0, then an SK_Broadcast shuffle with poison
    // lanes masked out.
    SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
    transform(VL, ShuffleMask.begin(), [](Value *V) {
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    });
    InstructionCost InsertCost =
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                               PoisonValue::get(VecTy), *It);
    InstructionCost GatherCost = InsertCost + ::getShuffleCost(
        TTI, TTI::SK_Broadcast, VecTy, ShuffleMask, CostKind);
    // ...
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }

  /// Compute the cost of shuffling a gather of extractelements, split per
  /// vector register.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    // Check whether a sub-mask touches at most 2 source registers; if so, it
    // can be lowered as a per-register shuffle.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices,
                                        SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // ...
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (int &I : Mask) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          // ...
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    // ...
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // Fall back to a full-width shuffle unless the slice is an identity.
        if (!ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          // ... (add the full-width shuffle cost)
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ... (add SK_ExtractSubvector cost)
      }
      // Keep the cheaper of the per-register and full-width estimates.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Estimate the cost of merging the permutes of two nodes into one
  /// shuffle, delaying the estimation while compatible masks keep arriving.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation of the shuffles, combining them with
      // compatible shuffles, if possible.
      if ((InVectors.size() == 2 /* && same nodes as before */)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(SliceSize * Part, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(SliceSize * Part, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // part and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (const auto *E = P.dyn_cast<const TreeEntry *>())
        VF = std::max(VF, E->getVectorFactor());
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF));
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty or identity masks are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty or identity masks are free.
      // ...
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  /// Cost of a shuffle of two (possibly resized) inputs, taking the
  /// MinBWs-driven extends/truncates into account.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy =
            IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      // ...
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, getWidenedType(ScalarTy, getNumElements(V->getType())),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 tree-entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // ... (fold the nodes' own reuse masks into CommonMask)
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
            Idx = EMask[Idx];
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                  E->Scalars.size();
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle a single tree-entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 /* deinterleave mask of that factor */ true) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not an identity/broadcast mask? Check whether keeping the original
      // (pre-reorder) vector is cheaper.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() == PoisonMaskElem ||
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle a single vector value.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle a vector value and a tree-entry node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle a tree-entry node and a vector value.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if the same extractelements were vectorized already in an
    // earlier node; if so, the extracts can be treated as reused.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // ...
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        // Skip extracts already counted or with users that stay scalar.
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }))
          continue;
        // ...
        unsigned Idx = *EEIdx;
        // Take credit for each extract only once.
        if (EE->hasOneUse() || !PrevNodeFound) {
          // If the extract feeds a single s/zext user, fold the extend into
          // the extract cost.
          Cost -= TTI.getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
              Idx);
          // Add back the cost of the s|zext, which is subtracted separately.
          Cost += TTI.getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), EE->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          continue;
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that the gather of extractelements can be represented as just a
    // shuffle of one/two vectors the scalars are extracted from.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    return VecBase;
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors - may come only from shuffles of extractelements,
  /// which were already handled in adjustExtracts.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled in adjustExtracts.
      assert(InVectors.size() == 1 &&
             isa<const TreeEntry *>(InVectors.front()) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            cast<Value *>(InVectors.front())->getType())
                            ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(VF)) {
        // ... (undefs stay, everything else becomes a null placeholder)
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(VL.size()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::None, CostKind);
        }
        // ... (add SK_InsertSubvector cost for this subvector at Idx)
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }
  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  // ...
  // If we have computed a smaller type for the expression, update ScalarTy
  // so that the costs are accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    // ...
    ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
  }
  unsigned EntryVF = E->getVectorFactor();
  // ...
  if (E->isGather()) {
    // ...
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      // No reorder: the split node is just an insert of the second
      // subvector.
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      // ... (two-source permute over the common width)
    }
    LLVM_DEBUG(
        dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ... (CommonCost = cost of the single-source permute by Mask, if any)
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert((E->getOpcode() &&
          ((allSameType(VL) && allSameBlock(VL)) ||
           (E->getOpcode() == Instruction::GetElementPtr &&
            E->getMainOp()->getType()->isPointerTy()) ||
           E->hasCopyableElements())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                        : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For such instructions there is no need to calculate the cost per
          // element: use the cost of a single instruction times the number
          // of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }
        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized when the parent node is
        // not resized.
        if (It != MinBWs.end() &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
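  // Cost convention used throughout getEntryCost: each case computes
  // GetCostDiff(ScalarEltCost, VectorCost) = VecCost - ScalarCost, where the
  // CommonCost of reorder/reuse shuffles is folded into the vector side, so
  // a negative result means vectorization is expected to be profitable.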
  // Calculate the cost difference of vectorizing a set of GEPs.
  // A negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    auto [ScalarCost, VecCost] = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead,
    // so remove their cost.
    if (VI && SelectOnly) {
      assert(!Ty->isVectorTy() && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ... (cost of a contracted scalar fmuladd for instruction I)
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ... (collect the incoming values as Operands)
      }
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        // If the extract feeds a single s/zext user, fold the extend into
        // the extract cost, then add back the separately-counted cast.
        // ...
        Cost -= TTI->getCastInstrCost(
            Ext->getOpcode(), Ext->getType(), I->getType(),
            TTI::getCastContextHint(Ext), CostKind, Ext);
        // ...
      }
      // ... (otherwise record the demanded lane in DemandedElts)
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
            VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // If a whole subvector insert is not possible, generate the full-sized
    // vector and shuffle the source vector with the new subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate a correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ... (InsertIdx = lane index of the I-th insert, after reordering)
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ... (subtract scalarization overhead for DemandedElts, add the permute
    // cost for a non-identity Mask over InsertVecTy)
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ... (SK_InsertSubvector of the small vector into the wide one)
      } else {
        // Blend the inserted lanes with the original source vector.
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        // ... (add the SK_PermuteTwoSrc cost for this blend)
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind,
                                   VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost if minimum bitwidth reduced the cast to a
      // bitcast - it is a noop.
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpPredicate CurrentPred = VecPred;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) /* ... */) {
        // ... (fall back to an unknown predicate)
      }
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // The condition must be replicated to match the value width.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      FMF.set();
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      Value *Op1 = E->getOperand(0)[Idx];
      // ...
      SmallVector<const Value *, 2> Operands(1, Op1);
      // ...
      Op2 = E->getOperand(1)[Idx];
      // ...
      if (I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ...
      }
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // The 'and' is free if, for every lane's constant mask CI:
        //   return CI && CI->getValue().countr_one() >= It->second.first;
        // i.e. the mask is a no-op on the minimized bitwidth.
        // ...
      }
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
                                         Op1Info, Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
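  // Illustrative sketch (editorial aside, not part of this file): the
  // countr_one check above says an 'and' with at least MinBW trailing ones
  // in its mask is a no-op once the value is narrowed to MinBW bits.
  // Demonstrated on plain uint64_t; the helper name is hypothetical.
#if 0
#include <bit>
#include <cassert>
#include <cstdint>

static bool andIsNoOpAfterNarrowing(uint64_t Mask, unsigned MinBW) {
  return std::countr_one(Mask) >= static_cast<int>(MinBW);
}

int main() {
  uint64_t X = 0xABCD;
  unsigned MinBW = 8;   // value known to fit in 8 bits after minimization
  uint64_t Mask = 0xFF; // 8 trailing ones
  assert(andIsNoOpAfterNarrowing(Mask, MinBW));
  // On the narrowed value the 'and' changes nothing:
  uint64_t Narrowed = X & ((uint64_t{1} << MinBW) - 1);
  assert((Narrowed & Mask) == Narrowed);
}
#endif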
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), /*...*/);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(), /*...*/);
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment = /*...*/;
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          // ...
        }
        [[maybe_unused]] bool IsVectorized = /*...*/(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          // ... plus the compressing shuffle of LoadVecTy with
          //     CompressMask, CostKind ...
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment, /*...*/);
          // ... plus the compressing shuffle of LoadVecTy with
          //     CompressMask, CostKind ...
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment = /*...*/;
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal
    // node; the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    // ...
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
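  // Illustrative sketch (editorial aside, not part of this file): a
  // compressed load covers gaps by loading one contiguous span and then
  // packing the wanted lanes to the front with a shuffle mask — a
  // simplified model of what CompressMask above expresses. Names are
  // hypothetical; offsets are element distances from the first pointer.
#if 0
#include <cstdio>
#include <vector>

static std::vector<int> buildCompressMask(const std::vector<int> &Offsets) {
  std::vector<int> Mask;
  for (int Off : Offsets)
    Mask.push_back(Off); // lane I of the result reads loaded element Off
  return Mask;
}

int main() {
  // Scalars at base+0, +2, +3, +5: load 6 contiguous elements, then
  // shuffle with mask {0,2,3,5} to pack them into a 4-wide vector.
  for (int M : buildCompressMask({0, 2, 3, 5}))
    std::printf("%d ", M);
  std::printf("\n");
}
#endif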
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), /*...*/);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment = /*...*/;
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          // ...
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    // Collect the store pointers in (possibly reordered) lane order:
    // ...
    //   unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
    // ...
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Compare the library-call and intrinsic lowering costs and take the
      // cheaper one.
      auto VecCallCosts = /*...*/(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() && /*...*/ && "Invalid Shuffle Vector Operand");
    // Try to find a previous shuffle node with the same operands and the
    // same main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        // ...
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      // ...
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost CommonCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          // ...
        });
        // Reuse the same main/alternate vector ops, just do different
        // shuffling.
      } else if (/* binary alternate ops */ /*...*/) {
        // Both vector ops are materialized over all lanes and blended.
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (/* alternate compares */ /*...*/) {
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /*...*/);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, /*...swapped predicate...*/
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /*...*/);
      } else {
        // Alternate casts.
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost = TTIRef.getCastInstrCost(Instruction::Trunc, VecTy,
                                              SrcTy, /*...*/);
          LLVM_DEBUG(
              dbgs()
              << "SLP: alternate extension, which should be truncated.\n");
          return VecCost;
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          /*...*/);
        VecCost += TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                           /*...*/);
      }
      // Blend the two vector results with the alternate-op shuffle mask.
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return /*...*/;
          },
          Mask);
      // ...
      // If the target supports a fused alternate instruction, prefer it.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(/*...*/);
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (/* revec: shufflevector scalars */ /*...*/)
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            assert(/*...*/ && "Not supported shufflevector usage.");
            unsigned SVNumElements =
                cast<FixedVectorType>(/*...*/)->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              assert(/*...*/ && "Not supported shufflevector usage.");
              int Index;
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index)
                /*...*/;
              NextIndex += SV->getShuffleMask().size();
            }
            return ::getShuffleCost(/*...*/);
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
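  // Illustrative sketch (editorial aside, not part of this file): the
  // alternate-op blend priced above. For lanes alternating e.g. add/sub,
  // both vector results exist over all lanes; mask lane I picks index I
  // (main result) or VF+I (alternate result). A simplified stand-in for
  // buildAltOpShuffleMask; all names below are hypothetical.
#if 0
#include <cstdio>
#include <vector>

static std::vector<int> buildAltOpMask(const std::vector<bool> &IsAltLane) {
  const int VF = static_cast<int>(IsAltLane.size());
  std::vector<int> Mask(VF);
  for (int I = 0; I < VF; ++I)
    Mask[I] = IsAltLane[I] ? VF + I : I;
  return Mask;
}

int main() {
  // add, sub, add, sub -> mask {0, 5, 2, 7}
  for (int M : buildAltOpMask({false, true, false, true}))
    std::printf("%d ", M);
  std::printf("\n");
}
#endif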
  case Instruction::Freeze:
    return CommonCost;
  // ...
  }
}
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE,
                                         unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/*...*/ ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              /*...*/) ||
             (TE->hasState() && TE->getOpcode() == Instruction::Load &&
              !TE->isAltShuffle()) ||
             /*...*/));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny
  // trees with the second gather node if it can be fully matched.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional 'shl' that
  // are byte-sized shifts.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (/*...*/ (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
                  (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
                   ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      /*...*/)
    return false;

  // Require that the total load bit width is a legal integer type.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  // ...

  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  // ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations
  // are likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
  return true;
}
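// Illustrative sketch (editorial aside, not part of this file): the pattern
// the load-combine walk above recognizes is the classic byte-assembly idiom
// — zext each narrow load and OR it in at a shift that is a multiple of 8.
// A whole tree of these is better served by one wide load than by a
// vectorized reduction. Simplified arithmetic model, hypothetical names.
#if 0
#include <cassert>
#include <cstdint>

static uint32_t assembleLE(const uint8_t B[4]) {
  return uint32_t{B[0]} | (uint32_t{B[1]} << 8) | (uint32_t{B[2]} << 16) |
         (uint32_t{B[3]} << 24);
}

int main() {
  uint8_t Bytes[4] = {0x78, 0x56, 0x34, 0x12};
  // Equivalent to a single little-endian 32-bit load of the same memory.
  assert(assembleLE(Bytes) == 0x12345678u);
}
#endif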
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      /*...*/
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /*...*/)))
    return true;
  // ...
  constexpr int Limit = 4;
  if (/*...*/ !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                /*...*/) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;
  // ...
  if (/*...*/ VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       /*...*/) ||
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                        all_of(TE->Scalars, [&](Value *V) {
                          return isa<PoisonValue>(V) ||
                                 MustGather.contains(V);
                        })));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // ...
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  SmallVector<const TreeEntry *> StoreLoadNodes;
  if (/*...*/
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 // ...
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      /*...*/
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                        /*...*/ TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() ||
                         TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 ||
         none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;
  // ...
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      /*...*/
      [&](const std::unique_ptr<TreeEntry> &TE) {
        return !TE->isGather() && TE->UserTreeIndex.UserTE &&
               TE->UserTreeIndex.UserTE->Idx == 0;
      } /*...*/)
    return true;
  // ...
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;
  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // ...
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /*...*/);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  // ...
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      /*...*/
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          /*...*/))
    return true;
  // ...
}
bool BoUpSLP::isTreeNotExtendable() const {
  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      /*...*/
      count_if(/*...*/,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        /*...*/;
               }) /*...*/)
    return true;
  // ...
  for (/*...*/) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      return false;
    // ...
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        /*...*/)
      return false;
    // ...
  }
  // ...
}
InstructionCost BoUpSLP::getSpillCost() {
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;
  // Map vectorized entries to their last instructions and remember which
  // entries feed which users.
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
  // A call is "free" for spill purposes if it is an assume-like intrinsic
  // or cheaper as an intrinsic than as a real call.
  auto NoCallIntrinsic = [this](const Instruction *I) {
    // ...
    if (II->isAssumeLikeIntrinsic())
      return true;
    // ...
    return IntrCost < CallCost;
  };
  // Memoized, budgeted backward scan for real calls between two
  // instructions in the same block.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit = /*...*/;
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      // ...
      return It->second.getInt() != 0;
    }
    SmallVector<const Instruction *> LastInstsInRange;
    auto InstIt = ++First->getIterator().getReverse(),
         PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // ... if a real (non-free) call is found, record "spill needed" for
      //     all last-instructions seen so far:
      for (const Instruction *LastInst : LastInstsInRange)
        CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
      // ...
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      // ...
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  // Charge a spill/fill of the vector value and credit the scalars it
  // replaces.
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      /*... narrow ScalarTy to the minimized bitwidth ...*/;
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    // ...
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  // Memoize reachability between a block and a predecessor's block.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(/*...*/ && "Should not have been added before.");
      // ...
    }
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
      }
      ParentsPairsToAdd.insert(Pair);
      // ...
      if (Budget > BudgetLimit)
        /*...*/;
      // ...
    }
    // ...
  };
  // Walk the tree from the root; a vector value that stays live across a
  // non-free call pays the spill cost.
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    // ...
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    for (const TreeEntry *Op : EntriesToOperands.lookup(Entry)) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          /*...*/)
        continue;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        // ...
        for (Value *V : Op->Scalars) {
          // ...
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        // ...
      }
      // ...
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // ...
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // ...
      if (!CheckForNonVecCallsInSameBlock(OpLastInst, /*...*/)) {
        AddCosts(Op);
        continue;
      }
      // ...
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }
  return Cost;
}
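// Illustrative sketch (editorial aside, not part of this file): the spill
// estimate above scans the instructions between a vector value's definition
// and its use within a fixed budget, charging a spill/fill when a real call
// (one that is not free, like an assume) sits in between. Simplified model
// with hypothetical names; instructions reduced to "is this a call" flags.
#if 0
#include <cstdio>
#include <vector>

static bool mayNeedSpill(const std::vector<bool> &IsCallBetween,
                         unsigned BudgetLimit) {
  unsigned Budget = 0;
  for (bool IsCall : IsCallBetween) {
    if (++Budget > BudgetLimit)
      return true; // too far apart to reason about: assume the worst
    if (IsCall)
      return true; // vector registers are clobbered across the call
  }
  return false;
}

int main() {
  std::printf("%d\n", mayNeedSpill({false, true, false}, 16)); // 1
  std::printf("%d\n", mayNeedSpill({false, false}, 16));       // 0
}
#endif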
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // PrevI1/PrevI2 track the previous chain positions.
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) && /*...*/)
      /*... step I1 to the next insertelement in the chain ...*/;
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) && /*...*/)
      /*... step I2 likewise ...*/;
    // ...
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
template <typename T>
static T *performExtractsShuffleAction(/*...*/) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  // ...
  auto VMIt = std::next(ShuffleMask.begin());
  // ...
  SmallBitVector IsBaseUndef =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  T *Prev = nullptr;
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, false);
    // ...
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      // ...
      Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // ...
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*...*/);
    // ...
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // ...
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // Both vectors have the same VF: merge the masks into a two-source
      // shuffle, rebasing second-vector lanes by Vec1VF.
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes: resize, then reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first,
                                               Mask, /*...*/);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, false);
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, false);
    // ...
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      // ...
      assert(/*...*/ && "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
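// Illustrative sketch (editorial aside, not part of this file): the
// equal-VF branch above fuses two single-source masks into one two-source
// shuffle — lanes defined by the second mask are rebased by Vec1VF so they
// select from the second input. Hypothetical names, standard C++ only.
#if 0
#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

static std::vector<int> mergeMasks(std::vector<int> Mask,
                                   const std::vector<int> &SecMask,
                                   int Vec1VF) {
  for (size_t I = 0, VF = Mask.size(); I < VF; ++I)
    if (SecMask[I] != PoisonMaskElem)
      Mask[I] = SecMask[I] + Vec1VF;
  return Mask;
}

int main() {
  // First mask fills lanes 0-1 from V1, second fills lanes 2-3 from V2.
  auto M = mergeMasks({0, 1, PoisonMaskElem, PoisonMaskElem},
                      {PoisonMaskElem, PoisonMaskElem, 0, 1}, /*Vec1VF=*/4);
  for (int I : M)
    std::printf("%d ", I); // 0 1 4 5
  std::printf("\n");
}
#endif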
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given subvectors.
  MapVector<T, SmallVector<int>> ValueMasks;
};
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // ...
  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries; they are combined and
    // just skipped.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // Exclude cost of gather loads nodes which are not used.
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }

  if (/*...*/ none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;

  InstructionCost ExtractCost = 0;
  // ...
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  // ...
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << " User: " << *EU.User << "\n";
               else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");

    // Uses by ephemeral values are free.
    if (EphValues.count(EU.User))
      continue;

    // Check if the scalar for this (scalar, user) pair was already counted.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;
    // No extract in unreachable blocks or EH pads.
    if (/*...*/
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         /*...*/))
      continue;
    // ...
    if (/*...*/ !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // ...
    // No extract cost for a vector "scalar" that is part of a vectorized
    // insertelement subvector.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
      if (!UsedInserts.insert(VU).second)
        continue;
      // ...
      const TreeEntry *ScalarTE = &EU.E;
      auto *It = find_if(
          ShuffledInserts,
          [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
            // ...
            Value *Op0 = II->getOperand(0);
            // ...
          });
      if (It == ShuffledInserts.end()) {
        auto &Data = ShuffledInserts.emplace_back();
        Data.InsertElements.emplace_back(VU);
        // ...
        VecId = ShuffledInserts.size() - 1;
        auto It = MinBWs.find(ScalarTE);
        if (It != MinBWs.end() &&
            VectorCasts
                .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                .second) {
          unsigned BWSz = It->second.first;
          unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
          unsigned VecOpcode;
          if (DstBWSz < BWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ... add the cost of casting the whole subvector
          //     (getWidenedType(..., FTy->getNumElements())) ...
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " /*...*/
                            << " for extending externally used vector with "
                               "non-equal minimum bitwidth.\n");
        }
      } else {
        // ...
        It->InsertElements.front() = VU;
        VecId = std::distance(ShuffledInserts.begin(), It);
      }
      int InIdx = *InsertIdx;
      SmallVectorImpl<int> &Mask =
          ShuffledInserts[VecId].ValueMasks[ScalarTE];
      // ...
      Mask[InIdx] = EU.Lane;
      DemandedElts[VecId].setBit(InIdx);
      continue;
    }
    // ...
    InstructionCost ExtraCost = 0;
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    // ...
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
      unsigned Extend = /*...*/ ? Instruction::ZExt : Instruction::SExt;
      // ...
      LLVM_DEBUG(dbgs() << " ExtraCost: " << ExtraCost << "\n");
    } else {
      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind, EU.Lane, EU.Scalar,
                                          ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy
                        << " from " << *VecTy << ": " << ExtraCost << "\n");
    }
    // The scalar may be cheaply kept as a scalar if its operands are also
    // available as scalars; compare that against the extract cost.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // ...
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // ...
      auto OperandIsScalar = [&](Value *V) {
        // ...
        if (auto *EE = dyn_cast<ExtractElementInst>(V))
          return !EE->hasOneUse() || !MustGather.contains(EE);
        // ...
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (/*...*/ Op && all_of(Op->operands(), OperandIsScalar)) {
        // ...
        if (ScalarCost + OpCost <= ExtraCost) {
          CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
          ScalarCost += OpCost;
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Check if all the scalar users of the root PHI node are outside the
        // PHI's block - in that case keeping the scalar is likely profitable.
        bool IsProfitablePHIUser =
            (/*...*/ VectorizableTree.front()->Scalars.size() > 2) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            // ...
            all_of(Inst->users(), [&](User *U) {
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          cast<Instruction>(
                              VectorizableTree.front()->getMainOp())
                              ->getParent()) /*...*/;
            }) /*...*/ &&
            /*... the scalar has external uses:
                  return ValueToExtUses->contains(V); ...*/;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (/*...*/
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the scalar if the remaining use count is not a power of 2,
          // since such bundles are hard to vectorize anyway.
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
            for (Value *V : IOp->operands()) {
              auto It = ValueToExtUses->find(V);
              if (It != ValueToExtUses->end()) {
                // ...
                ExternalUses[It->second].User = nullptr;
              }
            }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // Insert externally used scalar operands of casts as external uses of
  // their original scalars.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
    }
  }
  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode =
              BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy, /*...*/);
      }
    }
  }

  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        /*...*/) {
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
        return std::make_pair(TE, true);
      }
      // ...
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        // ...
      }
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        unsigned VF = TEs.front()->getVectorFactor();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          // ...
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " /*...*/
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
        }
      } else {
        unsigned VF = 0;
        if (/*...*/
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        else
          VF = Mask.size();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " /*...*/
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); }
                   TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
      // ...
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              // ...
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   /*...*/},
                                  /*...*/);
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as free extend.
        else
          Opcode =
              It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        InstructionCost CastCost = 0;
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          CostKind);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from "
                          << SrcVecTy << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
  if (/*...*/) {
    SpillCost = getSpillCost();
    Cost += *SpillCost;
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    // ...
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
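// Illustrative sketch (editorial aside, not part of this file): the final
// tally assembled above, reduced to one line of arithmetic — per-node cost
// differences, plus extracts for external scalar users, minus insertelement
// sequences the vector code makes redundant, plus the spill estimate and
// any reduction-resize cast. Values below are made up for demonstration.
#if 0
#include <cstdio>

int main() {
  long NodeCosts = -6;  // sum of GetCostDiff over all tree entries
  long ExtractCost = 2; // external scalar users need extractelement
  long InsertCost = 1;  // insertelements folded into shuffles
  long SpillCost = 0;   // calls between vector def and use
  long CastCost = 0;    // reduction bitwidth resize, if any
  long Total = NodeCosts + ExtractCost - InsertCost + SpillCost + CastCost;
  std::printf("SLP: Total Cost = %ld\n", Total); // negative => vectorize
}
#endif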
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // ...
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a
  // shuffle of one or two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      /*...*/;
  if (!Res /*...*/) {
    // Restore the original VL if the attempt was not successful.
    // ...
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some extractelements were not
  // selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ...
  }
  // ...
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // Work register by register: cut the gather list into NumParts slices and
  // match each slice independently.
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
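// Illustrative sketch (editorial aside, not part of this file): the
// per-register slicing used above — the gather list is cut into NumParts
// contiguous slices, one per vector register, and each slice is matched
// against a source vector independently. Simplified index arithmetic with
// hypothetical names (SliceSize stands in for getPartNumElems).
#if 0
#include <cstdio>

int main() {
  const unsigned Sz = 8, NumParts = 2;
  const unsigned SliceSize = Sz / NumParts;
  for (unsigned Part = 0; Part < NumParts; ++Part)
    std::printf("part %u covers lanes [%u, %u)\n", Part, Part * SliceSize,
                (Part + 1) * SliceSize);
}
#endif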
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // ...
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // Main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  // ...
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Checks whether the vector code for another tree entry (one sharing
    // scalars with TE) would be emitted in a valid order relative to TE.
    // ...
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        /*...*/)
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found, the combined value cannot be vectorized as shuffles.
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // The node is reused - exit.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    // ...
    for (Value *V : VL) {
      // ...
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      // ...
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    // ...
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;

      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? /*...*/
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? /*...*/
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If two gathers are operands of the same entry, compare operand
        // indices and use the earlier one as the base.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            /*...*/) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              /*...*/)
            continue;
        }
        // If the user instruction is used in several vectorized nodes, make
        // the decision depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
      }
      // ...
      if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
          TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
          UseEI.UserTE->State == TreeEntry::Vectorize &&
          UseEI.UserTE->getOpcode() == Instruction::PHI &&
          TEUseEI.UserTE != UseEI.UserTE)
        continue;
      // ...
      if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        continue;
      // ...
      if (TEUseEI.UserTE != UseEI.UserTE &&
          (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
           HasGatherUser(TEUseEI.UserTE)))
        continue;
      // ...
      if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
        continue;
      // ...
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          /*...*/)
        continue;
      // ...
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx ||
           TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        return /*...*/;
      VToTEs.insert(TEPtr);
    }
    if (/*...*/) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt ||
              !CheckOrdering(&LastBundleInst))
            continue;
        }
        // The node is reused - exit.
        if (CheckAndUseSameNode(VTE))
          return /*...*/;
        VToTEs.insert(VTE);
      }
    }
    if (/*...*/) {
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt ||
            !CheckOrdering(&LastBundleInst))
          continue;
      }
      // The node is reused - exit.
      if (CheckAndUseSameNode(VTE))
        return /*...*/;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration: just record the candidate set.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Check whether a previously used set of tree nodes also covers V;
      // otherwise we have another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Keep only the non-empty intersection of entries.
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found: add a second set of possible
      // source vectors.
      if (Idx == UsedTEs.size()) {
        // More than 2 input vectors is not a permutation; fall back to a
        // regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // ...
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match: just shuffle, choosing the first tree node.
    Entries.push_back(FirstEntries.front());
    // ...
    for (auto &P : UsedValuesEntry)
      P.second = 0;
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Keep the order of tree nodes to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        // ...
      }
    }
    // No two source vectors with the same vector factor: choose two with
    // the maximal indices.
    if (Entries.empty()) {
      Entries.push_back(*max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  // ...
  for (const TreeEntry *E : Entries)
    /*... record E->Scalars in ValuesToEntries ...*/;
  for (auto &P : UsedValuesEntry) {
    // ...
    if (ValuesToEntries[Idx].contains(P.first)) {
      // ...
    }
  }
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    // ...
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      // ...
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
    }
    return true;
  };
  // Check if a value can be ignored during the gather-shuffle analysis:
  // values that do not form splats, are not vectorized, and are not
  // extractelements may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    // ...
    return /*...*/ !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that a neighbor may form a full vector node with V (same or
  // alternate opcode, same parent block).
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           /*...*/;
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not shuffle scalars that are constants or may be vectorized later
    // as part of a buildvector.
    if (/*...*/
        ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
         (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries, which can be
  // used for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar: first entry gets 0,
    // otherwise 1 (at most 2 nodes are selected).
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // One or two entries only, and VL differs from TE->Scalars: shuffles
    // exist already, so this case is not profitable. Cut it off.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask and check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (/*...*/
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Estimate whether the shuffle beats a plain buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), /*... power-of-two covering
                         (MaxElement % VF) - (MinElement % VF) + 1 ...*/);
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
      VF = NewVF;
    }
    // ...
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          /*... the mask deinterleaves with that factor:
                Mask, Entries.front()->getInterleaveFactor() ...*/)
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() == 1 ? TTI::SK_PermuteSingleSrc
                                                  : TTI::SK_PermuteTwoSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += ::getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += ::getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    // ...
    InstructionCost BuildVectorCost = ::getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      // ...
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
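// Illustrative sketch (editorial aside, not part of this file): the mask
// narrowing above in essence — when a permutation only touches a narrow
// index range of a wide source, the shuffle can be priced on a narrower
// vector: NewVF is the power of two covering (MaxElement - MinElement + 1)
// and indices are rebased. Simplified remapping, hypothetical names.
#if 0
#include <bit>
#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

int main() {
  std::vector<int> SubMask = {9, 8, 11, PoisonMaskElem};
  int Min = 8, Max = 11; // scanned from the defined mask elements
  unsigned NewVF = std::bit_ceil(static_cast<unsigned>(Max - Min + 1));
  for (int &Idx : SubMask)
    if (Idx != PoisonMaskElem)
      Idx -= Min; // rebase into [0, NewVF)
  std::printf("NewVF = %u, mask =", NewVF);
  for (int Idx : SubMask)
    std::printf(" %d", Idx);
  std::printf("\n"); // NewVF = 4, mask = 1 0 3 -1
}
#endif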
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(/*...*/,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // Gathering for nodes with padded scalars is not implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (/*...*/ ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       /*...*/
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && /*...*/
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  // ...
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    // ...
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
                                    V->getType(), /*...*/);
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // Constant lanes are redirected to a prebuilt constant vector.
    if (isConstant(V)) {
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    EstimateInsertCost(I, V);
  }
  // ...
  bool IsAnyNonUndefConst =
      any_of(VL, [](Value *V) { return isConstant(V) && !isa<UndefValue>(V); });
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc,
                             VecTy, ConstantShuffleMask);
    // ...
  }
  if (!DemandedElements.isZero())
    Cost += ::getScalarizationOverhead(*TTI, ScalarTy, VecTy,
                                       DemandedElements, /*Insert=*/true,
                                       /*Extract=*/false, CostKind,
                                       ForPoisonSrc && !IsAnyNonUndefConst,
                                       VL);
  return Cost;
}
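// Illustrative sketch (editorial aside, not part of this file): the gather
// cost above splits a buildvector into a cheap constant vector plus inserts
// of the non-constant lanes. The mask starts as identity and lanes holding
// constants are redirected (I + VF) to the prebuilt constant vector.
// Hypothetical names, standard C++ only.
#if 0
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int VF = 4;
  std::vector<bool> IsConstLane = {false, true, true, false};
  std::vector<int> ConstantShuffleMask(VF);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (int I = 0; I < VF; ++I)
    if (IsConstLane[I])
      ConstantShuffleMask[I] = I + VF;
  for (int M : ConstantShuffleMask)
    std::printf("%d ", M); // 0 5 6 3
  std::printf("\n");
}
#endif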
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;

  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions
  // with constant indices, gathered loads, or copyables).
  Instruction *Front;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  }
  // ...
  assert(
      ((GatheredLoadsEntriesFirst.has_value() &&
        Opcode == Instruction::Load && E->isGather() &&
        E->Idx < *GatheredLoadsEntriesFirst) ||
       E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
       all_of(E->Scalars,
              [=](Value *V) -> bool {
                if (Opcode == Instruction::GetElementPtr &&
                    !isa<GetElementPtrInst>(V))
                  return true;
                auto *I = dyn_cast<Instruction>(V);
                return !I || !E->getMatchingMainOpOrAltOp(I) ||
                       I->getParent() == BB ||
                       isVectorLikeInstWithConstOps(I);
              })) &&
      "Expected gathered loads or GEPs or instructions from same basic "
      "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              /*...*/
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };
  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /*...*/) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };
  Instruction *Res = nullptr;
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // Also take the combined entries into account.
    for (auto *E : Entries) {
      // ...
      I = &getLastInstructionInBundle(E);
      // ...
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ...
  auto FindScheduleBundle =
      [&](const TreeEntry *E) -> const ScheduleBundle * {
    // ...
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      // ...
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars, [&](Value *V) {
          return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
                 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
        }))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ...
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // If the instruction is a PHI, set the insert point after all PHIs
  // (skipping past a landing pad if needed).
  // ...
  LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
  if (/*...*/
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       E->doesNotNeedToSchedule()) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(/*...*/);
  }
  // Set the debug location to Front.
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // Postpone insertion of scalars from the current block and/or the current
  // loop body to the end of the vector, after all other lanes are inserted.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           /*...*/
           (L && (!Root || L->isLoopInvariant(Root)) &&
            L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(Scalar, Ty, /*...*/);
      // ...
    }
    // ...
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    if (auto *InsElt = dyn_cast<InsertElementInst>(Vec)) {
      GatherShuffleExtractSeq.insert(InsElt);
      // ...
      // Add to our 'need-to-extract' list.
      if (/*...*/) {
        User *UserOp = nullptr;
        if (V->getType()->isVectorTy()) {
          if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
              SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
            // Find the shufflevector caused by a resize.
            auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
              if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
                if (SV->getOperand(0) == V)
                  return SV;
                if (SV->getOperand(1) == V)
                  return SV;
              }
              return nullptr;
            };
            if (Instruction *User = FindOperand(SV->getOperand(0), V))
              UserOp = User;
            else if (Instruction *User = FindOperand(SV->getOperand(1), V))
              UserOp = User;
            assert(UserOp &&
                   "Failed to find shufflevector, caused by resize.");
          }
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entries.front()->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
        }
      }
    }
    return Vec;
  };
  // ...
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && /*...*/
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->use_empty() &&
          none_of(VectorizableTree,
                  [&](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->VectorizedValue == OI;
                  }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions which are/may be part of the loop, to make it
  // possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
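// Illustrative sketch (editorial aside, not part of this file): gather()
// above materializes a vector lane by lane with insertelement, but lanes
// whose scalars are defined inside the loop (relative to the insertion
// point) are postponed and inserted afterwards. Simplified model over plain
// arrays; -1 marks a lane left for the postponed pass; names hypothetical.
#if 0
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<int> Scalars = {10, 20, 30, 40};
  std::vector<bool> Postponed = {false, true, false, false};
  std::vector<int> Vec(Scalars.size(), -1);
  std::vector<std::pair<int, unsigned>> PostponedInsts;
  for (unsigned I = 0; I < Scalars.size(); ++I) {
    if (Postponed[I]) {
      PostponedInsts.emplace_back(Scalars[I], I);
      continue;
    }
    Vec[I] = Scalars[I]; // insertelement(Vec, Scalars[I], I)
  }
  for (const auto &[V, Pos] : PostponedInsts)
    Vec[Pos] = V; // emitted after the main sequence
  for (int V : Vec)
    std::printf("%d ", V); // 10 20 30 40
  std::printf("\n");
}
#endif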
17784 bool IsFinalized =
false;
  class ShuffleIRBuilder {
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;

    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
             "Expected integer vector types only.");
              ->getIntegerBitWidth())
        V2 = Builder.CreateIntCast(
        V1 = Builder.CreateIntCast(
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
    void resizeToMatch(Value *&V1, Value *&V2) {
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
  };

  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, ShuffleBuilder, ScalarTy);
  }
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
  }

  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    Value *VecBase = nullptr;
    if (!E->ReorderIndices.empty()) {
                              E->ReorderIndices.end());
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        VecBase = EI->getVectorOperand();
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                   is_contained(VL, EI);
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    Value *Vec = nullptr;
    constexpr int MaxBases = 2;
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty() && TEs.front()->VectorizedValue)
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      ArrayRef<int> SubMask =
          Mask.slice(P * SliceSize,
        return all_of(SubMask, [](int Idx) {
             "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        unsigned SubVecVF =
        NewVF = std::max(NewVF, SubVecVF);
        for (int &Idx : SubMask)
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> TEs) const {
        TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
      return std::nullopt;
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
  }
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               CommonMask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
18132 "castToScalarTyElem expects V1 to be FixedVectorType");
18133 V1 = castToScalarTyElem(V1);
18134 if (InVectors.empty()) {
18135 InVectors.push_back(V1);
18136 CommonMask.assign(Mask.begin(), Mask.end());
18139 const auto *It =
find(InVectors, V1);
18140 if (It == InVectors.end()) {
18141 if (InVectors.size() == 2 ||
18142 InVectors.front()->getType() != V1->
getType()) {
18143 Value *V = InVectors.front();
18144 if (InVectors.size() == 2) {
18145 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18146 transformMaskAfterShuffle(CommonMask, CommonMask);
18148 CommonMask.size()) {
18149 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
18150 transformMaskAfterShuffle(CommonMask, CommonMask);
18152 unsigned VF = std::max(CommonMask.size(), Mask.size());
18153 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18155 CommonMask[Idx] = V->getType() != V1->
getType()
18157 : Mask[Idx] + getVF(V1);
18158 if (V->getType() != V1->
getType())
18159 V1 = createShuffle(V1,
nullptr, Mask);
18160 InVectors.front() = V;
18161 if (InVectors.size() == 2)
18162 InVectors.back() = V1;
18164 InVectors.push_back(V1);
18169 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18171 InVectors.push_back(V1);
18176 for (
Value *V : InVectors)
18177 VF = std::max(VF, getVF(V));
18178 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18180 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  Value *finalize(ArrayRef<int> ExtMask,
                  ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
    IsFinalized = true;
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
    } else {
      Vec = createShuffle(Vec, nullptr, CommonMask);
    }
    transformMaskAfterShuffle(CommonMask, CommonMask);
           "Expected vector length for the final value before action.");
    std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
    Vec = createShuffle(Vec, nullptr, ResizeMask);
      return createShuffle(V1, V2, Mask);
    InVectors.front() = Vec;
    if (!SubVectors.empty()) {
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          Type *OrigScalarTy = ScalarTy;
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2,
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
          }
        }
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
            I1 = I2 + CommonMask.size();
        }
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
      } else {
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
        .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
    SubVectorsMask.clear();
  }
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
      return isa<UndefValue>(V) && !isa<PoisonValue>(V);
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      if (!(*It)->ReorderIndices.empty()) {
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
    if ((Mask.size() < InputVF &&
        (Mask.size() == InputVF &&
             std::next(Mask.begin(), I * SliceSize),
             std::next(Mask.begin(),
         std::next(Mask.begin(), I * SliceSize),
         std::next(Mask.begin(),
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  bool Resized = false;
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      PostponedGathers.insert(E);
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      if (VF == VecBaseTy->getNumElements() &&
          GatheredScalars.size() != VF) {
        GatheredScalars.append(VF - GatheredScalars.size(),
      }
    }
  }
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
           return isa<LoadInst>(V) && isVectorized(V);
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        PostponedGathers.insert(E);
      }
      if (GatherShuffles.size() == 1 &&
          Entries.front().front()->isSame(E->Scalars)) {
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
            Mask[I] = FrontTE->findLaneForValue(V);
        }
        ShuffleBuilder.resetForSameNode();
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      }
      if (GatheredScalars.size() != VF &&
          return any_of(TEs, [&](const TreeEntry *TE) {
            return TE->getVectorFactor() == VF;
          });
        GatheredScalars.append(VF - GatheredScalars.size(),
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    int NumNonConsts = 0;
        Scalars.front() = OrigV;
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
    if (NumNonConsts == 1) {
      if (!UndefPos.empty() && UndefPos.front() == 0)
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
          (E->UserTreeIndex &&
           any_of(V->uses(), [E](const Use &U) {
             return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                    is_contained(E->UserTreeIndex.UserTE->Scalars,
      if (It != Scalars.end()) {
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
        }
      }
      for (int I : UndefPos) {
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          Value *VecOp = EI->getVectorOperand();
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
        IsUsedInExpr = false;
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
      for (const auto [I, TEs] : enumerate(Entries)) {
               "No shuffles with empty entries list expected.");
               "Expected shuffle of 1 or 2 entries.");
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
              [](const std::optional<TTI::ShuffleKind> &SK) {
          none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         (!GatherShuffles.empty() &&
              [](const std::optional<TTI::ShuffleKind> &SK) {
          none_of(Mask, [&](int I) { return I >= MSz; }) &&
    bool EnoughConstsForShuffle =
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
    SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, BVMask, true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
    ShuffleBuilder.add(BV, BVMask);
    if (IsSingleShuffle && ((IsIdentityShuffle &&
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, false);
            auto CheckIfSplatIsProfitable = [&]() {
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, 0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              BV = CreateShuffle(BV, nullptr, SplatMask);
                Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
    for (auto [I, V] : enumerate(GatheredScalars)) {
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
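// createBuildVector instantiates processBuildVector with the IR-emitting
// ShuffleInstructionBuilder, producing the final Value * for a gather node.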
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
}

  for (Value *V : VL)
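// Per-entry vectorization: vectorizeTree(TreeEntry *) turns one tree entry
// into a vector value, caching the result in E->VectorizedValue so shared
// subtrees are emitted only once.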
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);
  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
  }
  if (E->VectorizedValue)
    return E->VectorizedValue;
  if (E->isGather()) {
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
           ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(OpTE2.isSame(
               ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
           "Expected same second part of scalars.");
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
        if (isa<PoisonValue>(V))
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
    };
      Op1 = Builder.CreateIntCast(
                                  GetOperandSignedness(&OpTE1));
      Op2 = Builder.CreateIntCast(
                                  GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
                std::next(Mask.begin(),
                          E->CombinedEntriesWithIndices.back().second),
      if (ScalarTyNumElements != 1) {
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
                E->CombinedEntriesWithIndices.back().second *
                    ScalarTyNumElements);
      E->VectorizedValue = Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
                0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
                0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
  }
  bool IsReverseOrder =
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
        E->CombinedEntriesWithIndices.size());
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert((E->CombinedEntriesWithIndices.empty() ||
            E->ReorderIndices.empty()) &&
           "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
      if (isa<PoisonValue>(V))
      return !isKnownNonNegative(R, SimplifyQuery(*DL));
  };
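// Dispatch on the (possibly alternate) opcode of the bundle. Each case emits
// the vector counterpart of the scalar bundle and runs FinalShuffle to apply
// any reordering/reuse masks before caching the result.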
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      if (!VisitedBBs.insert(IBB).second) {
      TreeEntry *OpTE = getOperandEntry(E, I);
      assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
      OpTE->VectorizedValue = VecOp;
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
               MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Value *V = vectorizeOperand(E, 1);
    Type *ScalarTy = Op.front()->getType();
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    const unsigned NumElts =
    const unsigned NumScalars = E->Scalars.size();
    assert(Offset < NumElts && "Failed to find vector index offset");
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      IsIdentity &= InsertIdx - Offset == I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        InsertMask[*InsertIdx] = *InsertIdx;
        if (!Ins->hasOneUse())
            Ins->getUniqueUndroppableUser());
          SmallBitVector UseMask =
              buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
          SmallBitVector IsFirstPoison =
          SmallBitVector IsFirstUndef =
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
                  IsFirstUndef.test(I)) {
                if (IsVNonPoisonous) {
                  InsertMask[I] = I < NumScalars ? I : 0;
                }
                if (Idx >= NumScalars)
                  Idx = NumScalars - 1;
                InsertMask[I] = NumScalars + Idx;
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
      for (unsigned I = 0; I < NumElts; I++) {
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          SmallBitVector IsFirstPoison =
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        } else {
          SmallBitVector IsFirstPoison =
          for (unsigned I = 0; I < NumElts; I++) {
              InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }
    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);
    Value *InVec = vectorizeOperand(E, 0);
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);
    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
              ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    Value *V = Builder.CreateCmp(P0, L, R);
      ICmp->setSameSign(false);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
            MinBWs.contains(getOperandEntry(E, 1)) ||
            MinBWs.contains(getOperandEntry(E, 2))) &&
           "Expected item in MinBWs.");
    if (True->getType() != VecTy)
      True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
    if (False->getType() != VecTy)
      False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      Cond = Builder.CreateShuffleVector(
    }
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    Value *V = Builder.CreateUnOp(
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    if (Op->getType() != VecTy) {
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        return CI && CI->getValue().countr_one() >= It->second.first;
      V = FinalShuffle(I == 0 ? RHS : LHS, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
            MinBWs.contains(getOperandEntry(E, 0)) ||
            MinBWs.contains(getOperandEntry(E, 1))) &&
           "Expected item in MinBWs.");
      LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    Value *V = Builder.CreateBinOp(
    if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
        I->setHasNoUnsignedWrap(false);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  }
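  // Loads are emitted according to the entry state: a plain wide load for
  // consecutive accesses (Vectorize), a masked or interleaved load for
  // CompressVectorize, llvm.experimental.vp.strided.load for
  // StridedVectorize, and a masked gather for ScatterVectorize.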
  case Instruction::Load: {
    setInsertPointAfterBundle(E);
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      for (int I : CompressMask)
        MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
    } else if (E->State == TreeEntry::StridedVectorize) {
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
      unsigned StridedLoadEC =
      Value *Stride = SPtrInfo.StrideVal;
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride, ConstantInt::get(
                         StrideTy, (IsReverseOrder ? -1 : 1) *
                                       DL->getTypeAllocSize(ScalarTy))));
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
        unsigned ScalarTyNumElements =
        unsigned VecTyNumElements =
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
          return Builder.getInt64(I % ScalarTyNumElements);
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    setInsertPointAfterBundle(E);
    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        Ptr = SI->getPointerOperand();
      }
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
    }
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  }
  case Instruction::GetElementPtr: {
    setInsertPointAfterBundle(E);
    Value *Op0 = vectorizeOperand(E, 0);
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
    }
    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      for (Value *V : E->Scalars) {
      }
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Call: {
    setInsertPointAfterBundle(E);
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
        VecCallCosts.first <= VecCallCosts.second;
    Value *ScalarArg = nullptr;
        ScalarArg = CEI->getArgOperand(I);
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
          It == MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
    if (!UseIntrinsic) {
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  }
  case Instruction::ShuffleVector: {
    setInsertPointAfterBundle(E);
    Value *Src = vectorizeOperand(E, 0);
      SmallVector<int> NewMask(ThisMask.size());
        return SVSrc->getShuffleMask()[Mask];
      V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                      SVSrc->getOperand(1), NewMask);
      V = Builder.CreateShuffleVector(Src, ThisMask);
    V = FinalShuffle(V, E);
           "Invalid Shuffle Vector Operand");
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0);
      RHS = vectorizeOperand(E, 1);
    } else {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0);
    }
    assert((It != MinBWs.end() ||
            getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
            getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
            MinBWs.contains(getOperandEntry(E, 0)) ||
            MinBWs.contains(getOperandEntry(E, 1))) &&
           "Expected item in MinBWs.");
    Type *CastTy = VecTy;
            ->getIntegerBitWidth())
      LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
      RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      V0 = Builder.CreateBinOp(
      V1 = Builder.CreateBinOp(
      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      unsigned SrcBWSz = DL->getTypeSizeInBits(
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz <= SrcBWSz) {
        if (BWSz < SrcBWSz)
          LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
               "Expected same type as operand.");
        E->VectorizedValue = LHS;
        ++NumVectorInstructions;
        return LHS;
      }
      V0 = Builder.CreateCast(
      V1 = Builder.CreateCast(
    for (Value *V : {V0, V1}) {
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
    SmallVector<int> Mask;
    E->buildAltOpShuffleMask(
        [E, this](Instruction *I) {
          assert(E->getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
        Mask, &OpScalars, &AltScalars);
    auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            if (isa<PoisonValue>(V))
            auto *IV = cast<Instruction>(V);
            return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
        I->setHasNoUnsignedWrap(false);
    };
    DropNuwFlag(V0, E->getOpcode());
    DropNuwFlag(V1, E->getAltOpcode());
    V = Builder.CreateShuffleVector(V0, V1, Mask);
    GatherShuffleExtractSeq.insert(I);
    CSEBlocks.insert(I->getParent());
    E->VectorizedValue = V;
    ++NumVectorInstructions;
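// Top-level vectorizeTree: schedules all blocks, emits vector code for every
// tree entry, materializes postponed gathers, extracts externally used
// scalars back out of the vectors, and finally erases the now-dead scalar
// instructions.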
Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  EntryToLastInstruction.clear();
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }
  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
    }
  }
  for (auto &Entry : GatherEntries) {
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
  }
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
    }
  }
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    TE->VectorizedValue = nullptr;
    if (UI->comesBefore(InsertPt))
      Builder.SetInsertPoint(InsertPt);
    else
      Builder.SetInsertPoint(PrevVec);
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
          }
        }
        if (IsSigned.value_or(false))
          break;
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
          }
        }
        if (IsSigned.value_or(false))
          break;
          IsSigned.value_or(false) ||
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
  }
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    if (E->getOpcode() == Instruction::GetElementPtr &&
    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");
    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
            }
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
            IgnoredExtracts.insert(EE);
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst->getIterator());
            if (Inst->hasName())
              CloneInst->takeName(Inst);
            Value *V = ES->getVectorOperand();
              V = ETEs.front()->VectorizedValue;
            if (!IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
            unsigned VecTyNumElements = VecTy->getNumElements();
                ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                           : &F->getEntryBlock(),
              std::make_pair(Ex, ExV));
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
             "In-tree scalar of vector type is not insertelement?");
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesWithNonUsers.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
             if (ExternalUsesAsOriginalScalar.contains(U))
               return true;
             ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
             return !UseEntries.empty() &&
                    (E->State == TreeEntry::Vectorize ||
                     E->State == TreeEntry::StridedVectorize ||
                     E->State == TreeEntry::CompressVectorize) &&
                    any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                      return (UseEntry->State == TreeEntry::Vectorize ||
                              UseEntry->State ==
                                  TreeEntry::StridedVectorize ||
                              UseEntry->State ==
                                  TreeEntry::CompressVectorize) &&
                             doesInTreeUserNeedToExtract(
                                 Scalar, getRootEntryInstruction(*UseEntry),
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
      if (PHI->getParent()->isLandingPad())
        Builder.SetInsertPoint(
            std::next(
                PHI->getParent()->getLandingPadInst()->getIterator()));
      else
        Builder.SetInsertPoint(PHI->getParent(),
                               PHI->getParent()->getFirstNonPHIIt());
      Builder.SetInsertPoint(VecI->getParent(),
                             std::next(VecI->getIterator()));
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
    Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    if (Scalar != NewInst) {
             "Extractelements should not be replaced.");
      Scalar->replaceAllUsesWith(NewInst);
    }
      if (!UsedInserts.insert(VU).second)
        continue;
      auto BWIt = MinBWs.find(E);
        auto *ScalarTy = FTy->getElementType();
        auto Key = std::make_pair(Vec, ScalarTy);
        auto VecIt = VectorCasts.find(Key);
        if (VecIt == VectorCasts.end()) {
          if (IVec->getParent()->isLandingPad())
            Builder.SetInsertPoint(IVec->getParent(),
                                   std::next(IVec->getParent()
                                                 ->getLandingPadInst()
                                                 ->getIterator()));
          else
            Builder.SetInsertPoint(
                IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
          Builder.SetInsertPoint(IVec->getNextNode());
          Vec = Builder.CreateIntCast(
              BWIt->second.second);
        } else {
          Vec = VecIt->second;
        }
          ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
      unsigned Idx = *InsertIdx;
      if (It == ShuffledInserts.end()) {
        It = std::next(ShuffledInserts.begin(), ShuffledInserts.size() - 1);
      }
      Mask[Idx] = ExternalUse.Lane;
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        if (PH->getIncomingValue(I) == Scalar) {
              PH->getIncomingBlock(I)->getTerminator();
            Builder.SetInsertPoint(VecI->getParent(),
                                   std::next(VecI->getIterator()));
          else
            Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
          PH->setOperand(I, NewInst);
        }
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    for (int I = 0, E = Mask.size(); I < E; ++I) {
        CombinedMask1[I] = Mask[I];
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      for (unsigned I = 0; I < VF; ++I) {
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    return std::make_pair(Vec, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
        II->moveAfter(NewI);
    }
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
      IE->replaceUsesOfWith(IE->getOperand(1),
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    assert(Entry->VectorizedValue && "Can't find vectorizable value");
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          EE && IgnoredExtracts.contains(EE))
        continue;
      for (User *U : Scalar->users()) {
        (UserIgnoreList && UserIgnoreList->contains(U)) ||
               "Deleting out-of-tree value");
      }
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
    }
  }
  V->mergeDIAssignID(RemovedInsts);
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                  (match(U.getUser(), m_LogicalAnd()) ||
                                   match(U.getUser(), m_LogicalOr())) &&
                                  U.getOperandNo() == 0;
      if (IsPoisoningLogicalOp) {
        LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
      }
      return UserIgnoreList->contains(U.getUser());
    }
  }
  for (SelectInst *SI : LogicalOpSelects)
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        It->second.second);
  }
  return Vec;
}
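// optimizeGatherSequence CSEs the insertelement/shufflevector sequences
// recorded in GatherShuffleExtractSeq: loop-invariant gathers are hoisted to
// the loop preheader, and identical (or less-defined) shuffles are merged
// while visiting the collected blocks in dominator-tree DFS order.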
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
    Loop *L = LI->getLoopFor(I->getParent());
    BasicBlock *PreHeader = L->getLoopPreheader();
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
    CSEBlocks.insert(PreHeader);
  CSEWorkList.reserve(CSEBlocks.size());
    assert(DT->isReachableFromEntry(N));
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
          NewMask[I] != SM1[I])
        return false;
        NewMask[I] = SM1[I];
    }
    return SM1.size() - LastUndefsCnt > 1 &&
               SM1.size() - LastUndefsCnt));
  };
20718 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
20720 (
I == CSEWorkList.
begin() || !DT->dominates(*
I, *std::prev(
I))) &&
20721 "Worklist not sorted properly!");
20728 !GatherShuffleExtractSeq.contains(&In))
20733 bool Replaced =
false;
20736 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20737 DT->dominates(V->getParent(), In.getParent())) {
20738 In.replaceAllUsesWith(V);
20741 if (!NewMask.
empty())
20742 SI->setShuffleMask(NewMask);
20747 GatherShuffleExtractSeq.contains(V) &&
20748 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20749 DT->dominates(In.getParent(), V->getParent())) {
20751 V->replaceAllUsesWith(&In);
20754 if (!NewMask.
empty())
20755 SI->setShuffleMask(NewMask);
20763 Visited.push_back(&In);
20768 GatherShuffleExtractSeq.clear();
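
/// Groups the scalars of \p VL into a new ScheduleBundle, registering each
/// schedulable member (or, for copyable elements, a ScheduleCopyableData
/// proxy) with the bundle.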
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // Add a proxy for the copyable instruction to the bundle.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // ...
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (/* ... */
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
    return nullptr;

  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // ...
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        // ...
        if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
            BundleMember && BundleMember->hasValidDependencies()) {
          BundleMember->clearDirectDependencies();
          if (RegionHasStackSave ||
              /* ... */ (BundleMember->getInst()))
            ControlDependentMembers.push_back(BundleMember);
        }
        // ...
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          // ...
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          // ...
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         /* ... */)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  /* ... */)
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // ...
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      // ...
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                        << BB->getName() << "\n");
      calculateDependencies(Bundle, !ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, !ReSchedule, SLP,
                            ControlDependentMembers);
    }
    if (ReSchedule) {
      // ...
      initialFillReadyList(ReadyInsts);
    }
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is necessary to recalculate
      // all dependencies; otherwise instructions may be emitted in the wrong
      // order at the actual scheduling.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ...
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    // ...
    ReadyInsts.remove(BundleMember);
    if (/* ... */
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. Reset the schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // ...
      if (BD->isReady()) {
        // ...
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the copyable data for the failed bundle.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        // ...
        const auto *It = find(Op, I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (/* ... */
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          // ...
          if (!Visited.insert(In).second) {
            // ...
          }
          ScheduleCopyableDataMapByInstUser
              [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
          /* ... */;
          // ...
        } while (It != Op.end());
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // ...
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              /* ... */)
            ControlDependentMembers.push_back(OpSD);
        }
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
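
/// Returns storage for a new ScheduleData, allocating a fresh chunk when the
/// current one is exhausted.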
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new chunk if the current one is full.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
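
/// Makes sure \p V is inside the current scheduling region, extending the
/// region upwards or downwards within the block, bounded by
/// ScheduleRegionSizeLimit.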
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // assume-like intrinsics so they are not counted towards the region size
  // limit.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  // ...
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         /* ... */) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    // ...
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    // ...
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   /* ... */);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (/* ... */
         Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
    // ...
    RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
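
/// Computes def-use, control and memory dependencies for every member of
/// \p Bundle (and anything reachable from it through the worklist), optionally
/// inserting entities that become ready into the ready list.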
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    // Handle copyable-element proxies first.
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      // ...
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      // ...
      const auto *It = find(Op, CD->getInst());
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (/* ... */
            !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        // ...
        if (EI.UserTE->isCopyableElement(In)) {
          // ...
          if (ScheduleCopyableData *UseSD =
                  getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        } else if (Visited.insert(In).second) {
          if (ScheduleData *UseSD = getScheduleData(In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        }
        // ...
      } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            /* ... */
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               // ...
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // ...
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    // ...
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      // ...
      if (ScheduleData *UseSD = getScheduleData(U)) {
        // ...
        if (areAllOperandsReplacedByCopyableData(/* ... */))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // ...
    for (Instruction *I = BundleMember->getInst()->getNextNode();
         I != ScheduleEnd; I = I->getNextNode()) {
      // ...
      MakeControlDependent(I);
      // ...
    }

    if (RegionHasStackSave) {
      // Any allocas past a stacksave/stackrestore must stay ordered with it.
      if (match(BundleMember->getInst(), /* ... */)) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
        }
      }
    }

    // Handle memory dependencies.
    if (/* ... */
        BundleMember->getInst()->mayReadOrWriteMemory()) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
      }
    }
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    assert(/* ... */
           "NextLoadStore list for non memory effecting bundle?");
    // ...
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // ...
      if (/* ... */
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           /* ... */
           SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        // ...
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      // ...
    }
  };

  assert(/* ... */
         "expected at least one instruction to schedule");
  WorkList.push_back(Bundle.getBundle().front());
  // ...
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    // ...
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        // ...
      }
    }
  }
}
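
/// Re-marks every ScheduleData/ScheduleBundle in the region as unscheduled so
/// the region can be scheduled again from scratch.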
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
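
/// Performs the actual scheduling of a block: assigns priorities in original
/// instruction order, fills the ready list, and moves each picked instruction
/// so the members of a vectorizable bundle end up contiguous.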
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // ...
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    // ...
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
      }
      // ...
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    // ...
    BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      assert((/* ... */
              SDTEs.front()->doesNotNeedToSchedule() ||
              /* ... */) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // ...
        ScheduleBundle Bundle;
        // ...
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // ...
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
        // ...
        EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                           LastScheduledInst);
      }
    } else {
      // ...
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    // ...
  }

#ifdef EXPENSIVE_CHECKS
  // ...
#endif
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    // ...
    assert(all_of(/* ... */,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
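
/// Returns the element width (in bits) to assume for \p V when picking a
/// vectorization factor, derived from the stored value type or from the
/// loads feeding the expression tree.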
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  // ...
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;
  // Otherwise traverse the expression tree in bottom-up order looking for
  // loads, keeping track of the maximum width seen.
  // ...
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    // ...
    auto *Ty = I->getType();
    // ...
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    // ...
    Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    // ...
    for (Use &U : I->operands()) {
      // ...
      if (Visited.insert(J).second &&
          /* ... */)
        Worklist.push_back(J);
      // ...
      FirstNonBool = U.get();
    }
  }
  // ...
  if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    V = FirstNonBool;
  // ...
  Width = DL->getTypeSizeInBits(V->getType());
  // ...
  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;
  return Width;
}
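
/// Walks the tree rooted at \p E and checks, opcode by opcode, whether its
/// scalars can be demoted to an integer type of \p BitWidth bits; demotable
/// tree entries are recorded in \p ToDemote.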
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // ...
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      // ...
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        // ...
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
      // ...
    }
    // ...
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // ...
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot))
            return true;
      }
      // ...
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        // ...
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          /* ... */)
        return true;
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        // ...
        if (Checker(BitWidth, OrigBitWidth))
          return true;
        if (BestFailBitwidth == 0 && FinalAnalysis())
          BestFailBitwidth = BitWidth;
        // ...
        if (BestFailBitwidth == 0) {
          // ...
        }
        // ...
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // ...
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndex && any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          // ...
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          // ...
        }
        // ...
        return IsProfitableToDemote;
      };

  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  switch (E.getOpcode()) {
  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and it is a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate lshr to a smaller lshr iff we know that the bits we
    // would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate ashr iff the sign bits stay preserved.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    // ...
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    // ...
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        // ...
        unsigned Op0SignBits = /* ... */;
        unsigned Op1SignBits = /* ... */;
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 /* ... */
                 SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 /* ... */));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    // ...
    unsigned VF = E.Scalars.size();
    // ...
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // ...
      if (Cost < BestCost) {
        // ...
      }
      // ...
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
  }
  // ...
  default:
    break;
  }
  return FinalAnalysis();
}
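
/// Computes the minimal bit widths the tree nodes (and the reduction, if any)
/// can be emitted with, populating MinBWs and ReductionBitWidth.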
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  // ...
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    // ...
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // ...
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    // ...
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    // ...
    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;
    // ...
    unsigned MaxBitWidth = 1u;

    // Check if signedness is known non-negative for all the roots.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      // ...
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // ...
      if (!IsKnownPositive)
        ++BitWidth1;
      // ...
      MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      // ...
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;
    // ...
    if (NumParts > 1 &&
        /* ... */)
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    // ...
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };

  // ...
  if (UserIgnoreList &&
      /* ... */) {
    // Reduction of an i1 expression: the reduced value is 1 bit wide.
    if (all_of(*UserIgnoreList,
               /* ... */) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        // ...
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        // ...
        unsigned BitWidth2 = BitWidth1;
        // ...
        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        // ...
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
        return /* ... */;
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    // ...
    unsigned Limit = 2;
    if (/* ... */
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              // ...
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the computed maximum bit width is not less than the width of the
    // roots' type, do nothing for this node.
    if (MaxBitWidth == 0 ||
        /* ... */) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      // ...
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
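
/// Pass entry point: scans the function's blocks in post order and attempts
/// store-chain, in-block, and GEP-index vectorization.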
bool SLPVectorizerPass::runImpl(Function &F, /* ... analyses ... */) {
  // ...
  DL = &F.getDataLayout();
  // ...
  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // ...
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    // ...
  }
  return Changed;
}
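
/// Tries to vectorize a chain of \p Chain consecutive stores starting at
/// offset \p Idx, reporting the resulting tree size through \p Size.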
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  // ...
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (/* ... */
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");
  for (Value *V : Chain)
    /* ... */;
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(/* ... */);
  // ...
  bool IsAllowedSize = /* ... */;

  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
        any_of(/* ... */, [&](Value *V) {
          return !isa<ExtractElementInst>(V) &&
                 (V->getNumUses() > Chain.size() ||
                  any_of(V->users(), [&](User *U) {
                    return !Stores.contains(U);
                  }));
        }))) ||
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
    return false;
  }
  // ...
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree is tiny and the store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        /* ... */)
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  // ...
  using namespace ore;
  R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                      /* ... */)
                   << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                   << " and with tree size "
                   << NV("TreeSize", R.getTreeSize()));
  // ...
}

/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
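
/// A group of related stores, keyed by the constant distance of each store's
/// pointer operand from the pointer operand of the group's base store.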
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that distance.
  /// \returns The previously associated instruction index, or std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand from the group's base store.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);
    // Re-insert stores that come after MinSafeIdx to try to vectorize them
    // again. Their distance needs to be updated.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have already been vectorized.
  void clearVectorizedStores(/* ... */ &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });
    // Erase all stores up to and including the last vectorized one so we
    // don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the base instruction, i.e. the one with pointer distance 0.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
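
/// Groups the stores seen in a block by value type and pointer distance, and
/// repeatedly invokes vectorizeStoreChain on consecutive slices, retrying
/// with several candidate vectorization factors.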
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // ...
  bool Changed = false;
  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    // Collect consecutive stores into Operands.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // ...
      Operands.push_back(Stores[InstIdx]);
      // ...
      if (/* ... */
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        /* ... */,
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        /* ... */})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      // ...
      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      // ...
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      // ...
      unsigned MinVF = /* ... */(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          /* ... */);
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < MinVF (" << MinVF << ")\n");
        continue;
      }
      // Try a non-power-of-2 VF first, if enabled.
      unsigned NonPowerOf2VF = 0;
      if (/* ... */) {
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (/* ... */) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      // ...
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < MinVF (" << MinVF << ")\n");
        continue;
      }

      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           /* ... */)
        CandidateVFs.push_back(VF);
      // ...
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore =
              std::distance(RangeSizes.begin(),
                            find_if(RangeSizes, std::bind(IsNotVectorized,
                                                          VF >= MaxRegVF, _1)));
          // ...
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              assert(/* ... */
                         ->getValueOperand()
                     /* ... */
                         ->getValueOperand()
                     /* ... */
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(/* ... */);
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // ...
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              // ...
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
              if (!Res) {
                // Remember the slice as non-schedulable.
                /* ... */
                    .first->getSecond()
                /* ... */;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                AnyProfitableGraph = RepeatChanged = Changed = true;
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              // ...
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            /* ... */,
            static_cast<unsigned>(
                /* ... */ -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        // ...
        CandidateVFs.clear();
        // ...
        CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          // ...
          P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Stores pre-sorted by distance.
  SmallVector<RelatedStoreInsts> SortedStores;
  // ...
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });
    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }
    // If there is already a store at that distance, try to vectorize the
    // existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(*PrevInst + 1, Idx, *PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
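
/// Collects the seed instructions of \p BB: simple stores and single-index,
/// scalar getelementptrs, bucketed by the underlying object of their pointer
/// operand.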
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // ...
  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      // ...
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      // ...
      if (GEP->getType()->isVectorTy())
        continue;
      // ...
    }
  }
}
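
/// Tries to vectorize the list of scalars \p VL, scanning it with decreasing
/// vectorization factors and emitting optimization remarks on failure.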
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type.
  // ...
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (/* ... */) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  // ...
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(/* ... */, MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  // ...
  bool CandidateFound = false;
  // Keep track of values that were deleted by vectorizing in the loop below.
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       /* ... */) {
    // ...
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;
      // ...
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          // ...
          if (Idx == ActualVF)
            break;
        }
      }
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        /* ... */);
      // ...
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        // ...
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      // ...
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);
      LLVM_DEBUG(/* ... */
                 << " for VF=" << ActualVF << "\n");
      // ...
      R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                          /* ... */)
                       << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                       << " and with tree size "
                       << ore::NV("TreeSize", R.getTreeSize()));
      // ...
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             /* ... */;
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
23412 using ReductionOpsType = SmallVector<Value *, 16>;
23413 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23414 ReductionOpsListType ReductionOps;
23418 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23419 WeakTrackingVH ReductionRoot;
23424 bool IsSupportedHorRdxIdentityOp =
false;
23431 static bool isCmpSelMinMax(Instruction *
I) {
23439 static bool isBoolLogicOp(Instruction *
I) {
23445 static bool isVectorizable(
RecurKind Kind, Instruction *
I,
23446 bool TwoElementReduction =
false) {
23447 if (Kind == RecurKind::None)
23456 if (TwoElementReduction)
23459 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23463 return I->getFastMathFlags().noNaNs();
23466 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23469 return I->isAssociative();
23472 static Value *getRdxOperand(Instruction *
I,
unsigned Index) {
23478 return I->getOperand(2);
23479 return I->getOperand(Index);
23484 Value *
RHS,
const Twine &Name,
bool UseSelect) {
23488 case RecurKind::Or: {
23497 case RecurKind::And: {
23506 case RecurKind::Add:
23507 case RecurKind::Mul:
23508 case RecurKind::Xor:
23509 case RecurKind::FAdd:
23510 case RecurKind::FMul: {
23515 case RecurKind::SMax:
23516 case RecurKind::SMin:
23517 case RecurKind::UMax:
23518 case RecurKind::UMin:
23525 case RecurKind::FMax:
23526 case RecurKind::FMin:
23527 case RecurKind::FMaximum:
23528 case RecurKind::FMinimum:
23529 case RecurKind::FMaximumNum:
23530 case RecurKind::FMinimumNum: {
23543 const ReductionOpsListType &ReductionOps) {
23544 bool UseSelect = ReductionOps.size() == 2 ||
23546 (ReductionOps.size() == 1 &&
23548 assert((!UseSelect || ReductionOps.size() != 2 ||
23550 "Expected cmp + select pairs for reduction");
23551 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
23569 return RecurKind::None;
23571 return RecurKind::Add;
23573 return RecurKind::Mul;
23576 return RecurKind::And;
23579 return RecurKind::Or;
23581 return RecurKind::Xor;
23583 return RecurKind::FAdd;
23585 return RecurKind::FMul;
23588 return RecurKind::FMax;
23590 return RecurKind::FMin;
23593 return RecurKind::FMaximum;
23595 return RecurKind::FMinimum;
23601 return RecurKind::SMax;
23603 return RecurKind::SMin;
23605 return RecurKind::UMax;
23607 return RecurKind::UMin;
23633 return RecurKind::None;
23637 return RecurKind::None;
23640 return RecurKind::None;
23644 return RecurKind::None;
23649 return RecurKind::None;
23652 return RecurKind::SMax;
23655 return RecurKind::SMin;
23658 return RecurKind::UMax;
23661 return RecurKind::UMin;
23664 return RecurKind::None;
23668 static unsigned getFirstOperandIndex(Instruction *
I) {
23669 return isCmpSelMinMax(
I) ? 1 : 0;
23674 static unsigned getNumberOfOperands(Instruction *
I) {
23675 return isCmpSelMinMax(
I) ? 3 : 2;
23680 static bool hasSameParent(Instruction *
I, BasicBlock *BB) {
23681 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
23684 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
23686 return I->getParent() == BB;
23690 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax, Instruction *
I) {
23691 if (IsCmpSelMinMax) {
23695 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23696 return I->hasNUses(2);
23704 void initReductionOps(Instruction *
I) {
23705 if (isCmpSelMinMax(
I))
23706 ReductionOps.assign(2, ReductionOpsType());
23708 ReductionOps.assign(1, ReductionOpsType());
23712 void addReductionOps(Instruction *
I) {
23713 if (isCmpSelMinMax(
I)) {
23715 ReductionOps[1].emplace_back(
I);
23717 ReductionOps[0].emplace_back(
I);
23722 int Sz =
Data.size();
23731 : ReductionRoot(
I), ReductionLimit(2) {
23732 RdxKind = HorizontalReduction::getRdxKind(
I);
23733 ReductionOps.emplace_back().push_back(
I);
23736 ReducedValsToOps[
V].push_back(
I);
23739 bool matchReductionForOperands() const {
23742 assert(ReductionRoot && "Reduction root is not set!");
23745 return Ops.size() == 2;
23753 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23754                                ScalarEvolution &SE, const DataLayout &DL,
23755                                const TargetLibraryInfo &TLI) {
23756 RdxKind = HorizontalReduction::getRdxKind(Root);
23757 if (!isVectorizable(RdxKind, Root))
23769 if (!Sel->getCondition()->hasOneUse())
23772 ReductionRoot = Root;
23777 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23779 1, std::make_pair(Root, 0));
23784 SmallVectorImpl<Value *> &PossibleReducedVals,
23785 SmallVectorImpl<Instruction *> &ReductionOps,
23788 getNumberOfOperands(TreeN)))) {
23789 Value *EdgeVal = getRdxOperand(TreeN, I);
23790 ReducedValsToOps[EdgeVal].push_back(TreeN);
23798 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23799 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23800 !isVectorizable(RdxKind, EdgeInst) ||
23801 (R.isAnalyzedReductionRoot(EdgeInst) &&
23803 PossibleReducedVals.push_back(EdgeVal);
23806 ReductionOps.push_back(EdgeInst);
23815 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23817 PossibleReducedVals;
23818 initReductionOps(Root);
23820 SmallSet<size_t, 2> LoadKeyUsed;
23822 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23827 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23828 if (LIt != LoadsMap.end()) {
23829 for (LoadInst *RLI : LIt->second) {
23835 for (LoadInst *RLI : LIt->second) {
23842 if (LIt->second.size() > 2) {
23844 hash_value(LIt->second.back()->getPointerOperand());
23850 .first->second.push_back(LI);
23854 while (!Worklist.empty()) {
23855 auto [TreeN, Level] = Worklist.pop_back_val();
23858 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23859 addReductionOps(TreeN);
23862 for (Value *V : PossibleRedVals) {
23866 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23868 for (Instruction *I : reverse(PossibleReductionOps))
23869 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23871 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23874 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23875 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23877 for (auto &Slice : PossibleRedVals) {
23879 auto RedValsVect = Slice.second.takeVector();
23881 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23882 PossibleRedValsVect.back().append(Data.second, Data.first);
23884 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23885 return P1.size() > P2.size();
23892 } else if (!isGoodForReduction(Data)) {
23895 if (!LI || !LastLI ||
23900 ReducedVals.back().append(Data.rbegin(), Data.rend());
23906 return P1.size() > P2.size();
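// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): the leaf-gathering walk
// above flattens an associative reduction tree into its reduced values. A
// minimal sketch, assuming a toy Node type instead of llvm::Instruction: the
// worklist visits every operand, re-queues operands that repeat the reduction
// opcode, and collects everything else as a leaf.
#include <vector>

struct Node {
  char Op;                       // '+' for a reduction op, 0 for a leaf
  int Leaf = 0;                  // payload when Op == 0
  Node *L = nullptr, *R = nullptr;
};

static std::vector<int> gatherLeaves(Node *Root) {
  std::vector<int> Leaves;
  std::vector<Node *> Worklist{Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    for (Node *Operand : {N->L, N->R}) {
      if (!Operand)
        continue;
      if (Operand->Op == '+')        // same reduction opcode: keep walking
        Worklist.push_back(Operand);
      else                           // anything else is a reduced value
        Leaves.push_back(Operand->Leaf);
    }
  }
  return Leaves;                     // ((a+b)+(c+d)) -> {a, b, c, d}
}
// ---------------------------------------------------------------------------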
23912 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23913 const TargetLibraryInfo &TLI, AssumptionCache *AC,
23914 DominatorTree &DT) {
23915 constexpr unsigned RegMaxNumber = 4;
23916 constexpr unsigned RedValsMaxNumber = 128;
23920 if (unsigned NumReducedVals = std::accumulate(
23921 ReducedVals.begin(), ReducedVals.end(), 0,
23923 if (!isGoodForReduction(Vals))
23925 return Num + Vals.size();
23927 NumReducedVals < ReductionLimit &&
23931 for (ReductionOpsType &RdxOps : ReductionOps)
23932 for (Value *RdxOp : RdxOps)
23937 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
23943 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
23944 ReducedVals.front().size());
23948 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
23950 "Expected min/max reduction to have select root instruction");
23953 "Expected min/max reduction to have compare condition");
23957 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
23958 return isBoolLogicOp(cast<Instruction>(V));
23961 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
23962 if (VectorizedTree) {
23966 if (AnyBoolLogicOp) {
23967 auto It = ReducedValsToOps.find(VectorizedTree);
23968 auto It1 = ReducedValsToOps.find(Res);
23969 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
23971 (It != ReducedValsToOps.end() &&
23972 any_of(It->getSecond(), [&](Instruction *I) {
23973 return isBoolLogicOp(I) &&
23974 getRdxOperand(I, 0) == VectorizedTree;
23978 (It1 != ReducedValsToOps.end() &&
23979 any_of(It1->getSecond(), [&](Instruction *I) {
23980 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
23984 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
23988 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
23994 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
23995 ReductionOps.front().size());
23996 for (ReductionOpsType &RdxOps : ReductionOps)
23997 for (Value *RdxOp : RdxOps) {
24000 IgnoreList.insert(RdxOp);
24003 FastMathFlags RdxFMF;
24005 for (Value *U : IgnoreList)
24007 RdxFMF &= FPMO->getFastMathFlags();
24013 for (Value *V : Candidates)
24014 TrackedVals.try_emplace(V, V);
24016 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24017 Value *V) -> unsigned & {
24018 auto *It = MV.find(V);
24019 assert(It != MV.end() && "Unable to find given key.");
24023 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24026 SmallPtrSet<Value *, 4> RequiredExtract;
24027 WeakTrackingVH VectorizedTree = nullptr;
24028 bool CheckForReusedReductionOps = false;
24033 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24035 InstructionsState S = States[I];
24038 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24039 for (Value *ReducedVal : OrigReducedVals) {
24040 Value *RdxVal = TrackedVals.at(ReducedVal);
24047 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24051 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24053 bool ShuffledExtracts = false;
24055 if (S && S.getOpcode() == Instruction::ExtractElement &&
24056 !S.isAltShuffle() && I + 1 < E) {
24058 for (Value *RV : ReducedVals[I + 1]) {
24059 Value *RdxVal = TrackedVals.at(RV);
24066 CommonCandidates.push_back(RdxVal);
24067 TrackedToOrig.try_emplace(RdxVal, RV);
24069 SmallVector<int> Mask;
24072 Candidates.swap(CommonCandidates);
24073 ShuffledExtracts = true;
24080 Value *OrigV = TrackedToOrig.at(Candidates.front());
24081 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24083 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24084 Value *OrigV = TrackedToOrig.at(VC);
24085 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24087 V.analyzedReductionRoot(ResI);
24089 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24093 unsigned NumReducedVals = Candidates.size();
24094 if (NumReducedVals < ReductionLimit &&
24095 (NumReducedVals < 2 || !isSplat(Candidates)))
24100 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24101 RdxKind != RecurKind::FMul &&
24102 RdxKind != RecurKind::FMulAdd;
24104 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24105 if (IsSupportedHorRdxIdentityOp)
24106 for (Value *V : Candidates) {
24107 Value *OrigV = TrackedToOrig.at(V);
24108 ++SameValuesCounter.try_emplace(OrigV).first->second;
24120 bool SameScaleFactor = false;
24121 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24122 SameValuesCounter.size() != Candidates.size();
24124 if (OptReusedScalars) {
24126 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24127 RdxKind == RecurKind::Xor) &&
24129 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24130 return P.second == SameValuesCounter.front().second;
24132 Candidates.resize(SameValuesCounter.size());
24133 transform(SameValuesCounter, Candidates.begin(),
24134 [&](const auto &P) { return TrackedVals.at(P.first); });
24135 NumReducedVals = Candidates.size();
24137 if (NumReducedVals == 1) {
24138 Value *OrigV = TrackedToOrig.at(Candidates.front());
24139 unsigned Cnt = At(SameValuesCounter, OrigV);
24141 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24142 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24143 VectorizedVals.try_emplace(OrigV, Cnt);
24144 ExternallyUsedValues.insert(OrigV);
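// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): when the same scalar
// occurs Cnt times in an add/fadd/xor reduction, the occurrences can be
// folded before vectorization: Cnt copies of X reduce to Cnt * X under add,
// and to X or 0 under xor depending on the parity of Cnt. A tiny check:
#include <cassert>

static void reusedScalarIdentityDemo() {
  int X = 5, Cnt = 4;
  int AddChain = 0, XorChain = 0;
  for (int I = 0; I < Cnt; ++I) {
    AddChain += X;
    XorChain ^= X;
  }
  assert(AddChain == Cnt * X);            // add: scale by the repeat count
  assert(XorChain == (Cnt % 2 ? X : 0));  // xor: only the parity survives
}
// ---------------------------------------------------------------------------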
24149 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24150 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24151 const unsigned MaxElts = std::clamp<unsigned>(
24153 RegMaxNumber * RedValsMaxNumber);
24155 unsigned ReduxWidth = NumReducedVals;
24156 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24157 unsigned NumParts, NumRegs;
24158 Type *ScalarTy = Candidates.front()->getType();
24165 while (NumParts > NumRegs) {
24166 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24167 ReduxWidth = bit_floor(ReduxWidth - 1);
24173 if (NumParts > NumRegs / 2)
24178 ReduxWidth = GetVectorFactor(ReduxWidth);
24179 ReduxWidth = std::min(ReduxWidth, MaxElts);
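// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): GetVectorFactor above
// shrinks the reduction width until the widened type fits the register
// budget; bit_floor(W - 1) steps down through powers of two. A standalone
// sketch with a hypothetical FitsBudget predicate:
#include <bit>

static unsigned shrinkToBudget(unsigned Width, bool (*FitsBudget)(unsigned)) {
  // 17 -> 16 -> 8 -> 4 ... ; each step is the next lower power of two.
  while (Width > 1 && !FitsBudget(Width))
    Width = std::bit_floor(Width - 1);
  return Width;
}
// ---------------------------------------------------------------------------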
24181 unsigned Start = 0;
24182 unsigned Pos = Start;
24184 unsigned PrevReduxWidth = ReduxWidth;
24185 bool CheckForReusedReductionOpsLocal = false;
24186 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24187 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24188 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24191 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24194 if (Pos < NumReducedVals - ReduxWidth + 1)
24195 return IsAnyRedOpGathered;
24198 if (ReduxWidth > 1)
24199 ReduxWidth = GetVectorFactor(ReduxWidth);
24200 return IsAnyRedOpGathered;
24202 bool AnyVectorized = false;
24203 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24204 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24205 ReduxWidth >= ReductionLimit) {
24208 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24210 CheckForReusedReductionOps = true;
24213 PrevReduxWidth = ReduxWidth;
24216 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24219 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24221 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24223 V.areAnalyzedReductionVals(VL)) {
24224 (void)AdjustReducedVals(true);
24231 return RedValI && V.isDeleted(RedValI);
24234 V.buildTree(VL, IgnoreList);
24235 if (V.isTreeTinyAndNotFullyVectorizable(true)) {
24236 if (!AdjustReducedVals())
24237 V.analyzedReductionVals(VL);
24240 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24241 if (!AdjustReducedVals())
24242 V.analyzedReductionVals(VL);
24245 V.reorderTopToBottom();
24248 VL.front()->getType()->isIntOrIntVectorTy() ||
24249 ReductionLimit > 2);
24253 ExternallyUsedValues);
24257 LocalExternallyUsedValues.insert(ReductionRoot);
24258 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24259 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24261 for (Value *V : ReducedVals[Cnt])
24263 LocalExternallyUsedValues.insert(TrackedVals[V]);
24265 if (!IsSupportedHorRdxIdentityOp) {
24268 "Reused values counter map is not empty");
24269 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24270 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24272 Value *V = Candidates[Cnt];
24273 Value *OrigV = TrackedToOrig.at(V);
24274 ++SameValuesCounter.try_emplace(OrigV).first->second;
24277 V.transformNodes();
24280 SmallPtrSet<Value *, 4> Visited;
24281 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24282 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24284 Value *RdxVal = Candidates[Cnt];
24285 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24286 RdxVal = It->second;
24287 if (!Visited.insert(RdxVal).second)
24291 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24292 LocalExternallyUsedValues.insert(RdxVal);
24295 Value *OrigV = TrackedToOrig.at(RdxVal);
24297 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24298 if (NumOps != ReducedValsToOps.at(OrigV).size())
24299 LocalExternallyUsedValues.insert(RdxVal);
24302 if (!IsSupportedHorRdxIdentityOp)
24303 SameValuesCounter.clear();
24304 for (Value *RdxVal : VL)
24305 if (RequiredExtract.contains(RdxVal))
24306 LocalExternallyUsedValues.insert(RdxVal);
24307 V.buildExternalUses(LocalExternallyUsedValues);
24309 V.computeMinimumValueSizes();
24313 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24316 << " for reduction\n");
24320 V.getORE()->emit([&]() {
24321 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24322 ReducedValsToOps.at(VL[0]).front())
24323 << "Vectorizing horizontal reduction is possible "
24324 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24325 << " and threshold "
24328 if (!AdjustReducedVals()) {
24329 V.analyzedReductionVals(VL);
24331 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24334 *TTI, VL.front()->getType(), ReduxWidth - 1);
24335 VF >= ReductionLimit;
24337 *TTI, VL.front()->getType(), VF - 1)) {
24339 V.getCanonicalGraphSize() != V.getTreeSize())
24342 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24349 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24350 << Cost << ". (HorRdx)\n");
24351 V.getORE()->emit([&]() {
24352 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24353 ReducedValsToOps.at(VL[0]).front())
24354 << "Vectorized horizontal reduction with cost "
24355 << ore::NV("Cost", Cost) << " and with tree size "
24356 << ore::NV("TreeSize", V.getTreeSize());
24365 if (IsCmpSelMinMax)
24366 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24369 Value *VectorizedRoot = V.vectorizeTree(
24370 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24373 for (Value *RdxVal : Candidates) {
24374 Value *OrigVal = TrackedToOrig.at(RdxVal);
24375 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24376 if (TransformedRdxVal != RdxVal)
24377 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24386 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24389 if (OptReusedScalars && !SameScaleFactor) {
24390 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24391 SameValuesCounter, TrackedToOrig);
24394 Type *ScalarTy = VL.front()->getType();
24399 OptReusedScalars && SameScaleFactor
24400 ? SameValuesCounter.front().second
24403 ? V.isSignedMinBitwidthRootNode()
24407 for (Value *RdxVal : VL) {
24408 Value *OrigV = TrackedToOrig.at(RdxVal);
24409 if (IsSupportedHorRdxIdentityOp) {
24410 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24413 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24414 if (!V.isVectorized(RdxVal))
24415 RequiredExtract.insert(RdxVal);
24419 ReduxWidth = NumReducedVals - Pos;
24420 if (ReduxWidth > 1)
24421 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24422 AnyVectorized = true;
24424 if (OptReusedScalars && !AnyVectorized) {
24425 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24426 Value *RdxVal = TrackedVals.at(P.first);
24427 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24428 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24429 VectorizedVals.try_emplace(P.first, P.second);
24434 if (!VectorValuesAndScales.empty())
24435 VectorizedTree = GetNewVectorizedTree(
24437 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24438 if (VectorizedTree) {
24459 if (!AnyBoolLogicOp)
24461 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24462 getRdxOperand(RedOp1, 0) == LHS ||
24465 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24466 getRdxOperand(RedOp2, 0) == RHS ||
24471 if (LHS != VectorizedTree)
24482 unsigned Sz = InstVals.size();
24485 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24488 Value *RdxVal1 = InstVals[I].second;
24489 Value *StableRdxVal1 = RdxVal1;
24490 auto It1 = TrackedVals.find(RdxVal1);
24491 if (It1 != TrackedVals.end())
24492 StableRdxVal1 = It1->second;
24493 Value *RdxVal2 = InstVals[I + 1].second;
24494 Value *StableRdxVal2 = RdxVal2;
24495 auto It2 = TrackedVals.find(RdxVal2);
24496 if (It2 != TrackedVals.end())
24497 StableRdxVal2 = It2->second;
24501 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24503 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24504 StableRdxVal2, "op.rdx", ReductionOps);
24505 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24508 ExtraReds[Sz / 2] = InstVals.back();
24514 SmallPtrSet<Value *, 8> Visited;
24516 for (Value *RdxVal : Candidates) {
24517 if (!Visited.insert(RdxVal).second)
24519 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24520 for (Instruction *RedOp :
24526 bool InitStep = true;
24527 while (ExtraReductions.size() > 1) {
24529 FinalGen(ExtraReductions, InitStep);
24530 ExtraReductions.swap(NewReds);
24533 VectorizedTree = ExtraReductions.front().second;
24535 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24542 SmallPtrSet<Value *, 4> IgnoreSet;
24551 for (auto *U : Ignore->users()) {
24553 "All users must be either in the reduction ops list.");
24556 if (!Ignore->use_empty()) {
24558 Ignore->replaceAllUsesWith(P);
24561 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24563 } else if (!CheckForReusedReductionOps) {
24564 for (ReductionOpsType &RdxOps : ReductionOps)
24565 for (Value *RdxOp : RdxOps)
24568 return VectorizedTree;
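// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): the FinalGen loop above
// combines leftover partial reductions pairwise, halving the list each round
// until one value remains; an odd trailing element is carried over unchanged.
// A plain-C++ sketch over ints (assumes a non-empty input):
#include <vector>

static int reducePairwise(std::vector<int> Vals) {
  while (Vals.size() > 1) {
    std::vector<int> Next;
    size_t E = (Vals.size() / 2) * 2;
    for (size_t I = 0; I < E; I += 2)
      Next.push_back(Vals[I] + Vals[I + 1]); // one "op.rdx" per pair
    if (E != Vals.size())
      Next.push_back(Vals.back());           // odd element survives the round
    Vals.swap(Next);
  }
  return Vals.front();
}
// ---------------------------------------------------------------------------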
24574 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24575 Value *Vec, unsigned Scale, bool IsSigned,
24599 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24602 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24604 if (Rdx->getType() != DestTy)
24610 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24617 bool IsCmpSelMinMax, FastMathFlags FMF,
24618 const BoUpSLP &R, DominatorTree &DT,
24619 const DataLayout &DL,
24620 const TargetLibraryInfo &TLI) {
24622 Type *ScalarTy = ReducedVals.front()->getType();
24623 unsigned ReduxWidth = ReducedVals.size();
24624 FixedVectorType *VectorTy = R.getReductionType();
24629 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24632 int Cnt = ReducedVals.size();
24633 for (Value *RdxVal : ReducedVals) {
24638 Cost += GenCostFn();
24642 for (User *U : RdxVal->users()) {
24644 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24645 if (RdxKind == RecurKind::FAdd) {
24655 FMACost -= FMulCost;
24657 ScalarCost += FMACost;
24664 ScalarCost = InstructionCost::getInvalid();
24668 Cost += ScalarCost;
24670 Cost += GenCostFn();
24679 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24681 case RecurKind::Add:
24682 case RecurKind::Mul:
24683 case RecurKind::Or:
24684 case RecurKind::And:
24685 case RecurKind::Xor:
24686 case RecurKind::FAdd:
24687 case RecurKind::FMul: {
24690 if (DoesRequireReductionOp) {
24693 unsigned ScalarTyNumElements = VecTy->getNumElements();
24698 ReducedVals.size()),
24709 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24710 std::make_pair(RedTy, true));
24711 if (RType == RedTy) {
24716 RdxOpcode, !IsSigned, RedTy,
24722 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24723 std::make_pair(RedTy, true));
24726 if (RdxKind == RecurKind::FAdd) {
24731 for (Value *RdxVal : ReducedVals) {
24737 FMF &= FPCI->getFastMathFlags();
24740 if (!Ops.empty()) {
24745 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
24746 {RVecTy, RVecTy, RVecTy}, FMF);
24752 Instruction::FMul, RVecTy, CostKind);
24754 << "Minus vector FMul cost: " << FMulCost << "\n");
24755 FMACost -= FMulCost;
24759 if (FMACost.isValid())
24760 VectorCost += FMACost;
24764 if (RType != RedTy) {
24765 unsigned Opcode = Instruction::Trunc;
24767 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24773 ScalarCost = EvaluateScalarCost([&]() {
24778 case RecurKind::FMax:
24779 case RecurKind::FMin:
24780 case RecurKind::FMaximum:
24781 case RecurKind::FMinimum:
24782 case RecurKind::SMax:
24783 case RecurKind::SMin:
24784 case RecurKind::UMax:
24785 case RecurKind::UMin: {
24788 if (DoesRequireReductionOp) {
24794 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24795 std::make_pair(RedTy, true));
24797 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24799 if (RType != RedTy) {
24800 unsigned Opcode = Instruction::Trunc;
24802 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24808 ScalarCost = EvaluateScalarCost([&]() {
24809 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24818 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24820 << " (It is a splitting reduction)\n");
24821 return VectorCost - ScalarCost;
24827 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24829 Value *ReducedSubTree = nullptr;
24831 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24832 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24833 if (ReducedSubTree)
24834 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24835 "op.rdx", ReductionOps);
24837 ReducedSubTree = Rdx;
24839 if (VectorValuesAndScales.size() == 1) {
24840 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24841 CreateSingleOp(Vec, Scale, IsSigned);
24842 return ReducedSubTree;
24846 Value *VecRes = nullptr;
24847 bool VecResSignedness = false;
24848 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24854 case RecurKind::Add: {
24855 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24858 << ". (HorRdx)\n");
24861 std::iota(std::next(Mask.begin(), VF * I),
24862 std::next(Mask.begin(), VF * (I + 1)), 0);
24863 ++NumVectorInstructions;
24874 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24875 << ". (HorRdx)\n");
24876 ++NumVectorInstructions;
24880 case RecurKind::Xor: {
24883 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24888 case RecurKind::FAdd: {
24892 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24893 << ". (HorRdx)\n");
24894 ++NumVectorInstructions;
24894 ++NumVectorInstructions;
24898 case RecurKind::And:
24899 case RecurKind::Or:
24900 case RecurKind::SMax:
24901 case RecurKind::SMin:
24902 case RecurKind::UMax:
24903 case RecurKind::UMin:
24904 case RecurKind::FMax:
24905 case RecurKind::FMin:
24906 case RecurKind::FMaximum:
24907 case RecurKind::FMinimum:
24910 case RecurKind::Sub:
24911 case RecurKind::AddChainWithSubs:
24912 case RecurKind::Mul:
24913 case RecurKind::FMul:
24914 case RecurKind::FMulAdd:
24915 case RecurKind::AnyOf:
24916 case RecurKind::FindFirstIVSMin:
24917 case RecurKind::FindFirstIVUMin:
24918 case RecurKind::FindLastIVSMax:
24919 case RecurKind::FindLastIVUMax:
24920 case RecurKind::FMaxNum:
24921 case RecurKind::FMinNum:
24922 case RecurKind::FMaximumNum:
24923 case RecurKind::FMinimumNum:
24924 case RecurKind::None:
24931 VecResSignedness = IsSigned;
24933 ++NumVectorInstructions;
24934 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
24940 std::iota(Mask.begin(), Mask.end(), 0);
24942 if (VecResVF < VecVF) {
24946 if (VecResVF != VecVF) {
24948 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
24965 if (VecResVF < VecVF) {
24971 if (VecResVF != VecVF)
24973 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
24974 if (VecResVF != VecVF)
24979 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
24980 CreateVecOp(Vec, Scale, IsSigned);
24981 CreateSingleOp(VecRes, 1, false);
24983 return ReducedSubTree;
24987 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
24988 const TargetTransformInfo *TTI, Type *DestTy) {
24989 assert(VectorizedValue && "Need to have a vectorized tree node");
24990 assert(RdxKind != RecurKind::FMulAdd &&
24991 "A call to the llvm.fmuladd intrinsic is not handled yet");
24994 if (FTy->getScalarType() == Builder.getInt1Ty() &&
24995 RdxKind == RecurKind::Add &&
25000 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25001 ++NumVectorInstructions;
25004 ++NumVectorInstructions;
25009 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25011 assert(IsSupportedHorRdxIdentityOp &&
25012 "The optimization of matched scalar identity horizontal reductions "
25013 "must be supported.");
25015 return VectorizedValue;
25017 case RecurKind::Add: {
25019 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25021 << VectorizedValue << ". (HorRdx)\n");
25022 return Builder.CreateMul(VectorizedValue, Scale);
25024 case RecurKind::Xor: {
25026 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25027 << ". (HorRdx)\n");
25030 return VectorizedValue;
25032 case RecurKind::FAdd: {
25034 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25036 << VectorizedValue << ". (HorRdx)\n");
25037 return Builder.CreateFMul(VectorizedValue, Scale);
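// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): emitScaleForReusedOps
// rewrites "reduce Cnt copies of V" into one instruction per kind: mul for
// add, fmul for fadd, parity for xor, and a no-op for idempotent kinds such
// as and/or/min/max, where repeating an operand never changes the result. A
// scalar sketch with hypothetical one-character kind tags:
static int scaleReused(char Kind, int V, unsigned Cnt) {
  switch (Kind) {
  case '+': return V * (int)Cnt;     // add -> mul by the repeat count
  case '^': return Cnt % 2 ? V : 0;  // xor -> only the parity matters
  case '<': return V;                // min/max/and/or: idempotent, unchanged
  default:  return V;
  }
}
// ---------------------------------------------------------------------------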
25039 case RecurKind::And:
25040 case RecurKind::Or:
25041 case RecurKind::SMax:
25042 case RecurKind::SMin:
25043 case RecurKind::UMax:
25044 case RecurKind::UMin:
25045 case RecurKind::FMax:
25046 case RecurKind::FMin:
25047 case RecurKind::FMaximum:
25048 case RecurKind::FMinimum:
25050 return VectorizedValue;
25051 case RecurKind::Sub:
25052 case RecurKind::AddChainWithSubs:
25053 case RecurKind::Mul:
25054 case RecurKind::FMul:
25055 case RecurKind::FMulAdd:
25056 case RecurKind::AnyOf:
25057 case RecurKind::FindFirstIVSMin:
25058 case RecurKind::FindFirstIVUMin:
25059 case RecurKind::FindLastIVSMax:
25060 case RecurKind::FindLastIVUMax:
25061 case RecurKind::FMaxNum:
25062 case RecurKind::FMinNum:
25063 case RecurKind::FMaximumNum:
25064 case RecurKind::FMinimumNum:
25065 case RecurKind::None:
25074 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25075 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25076 const DenseMap<Value *, Value *> &TrackedToOrig) {
25077 assert(IsSupportedHorRdxIdentityOp &&
25078 "The optimization of matched scalar identity horizontal reductions "
25079 "must be supported.");
25082 if (VTy->getElementType() != VL.front()->getType()) {
25086 R.isSignedMinBitwidthRootNode());
25089 case RecurKind::Add: {
25092 for (Value *V : VL) {
25093 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25094 Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
25098 << VectorizedValue << ". (HorRdx)\n");
25099 return Builder.CreateMul(VectorizedValue, Scale);
25101 case RecurKind::And:
25102 case RecurKind::Or:
25105 << ". (HorRdx)\n");
25106 return VectorizedValue;
25107 case RecurKind::SMax:
25108 case RecurKind::SMin:
25109 case RecurKind::UMax:
25110 case RecurKind::UMin:
25111 case RecurKind::FMax:
25112 case RecurKind::FMin:
25113 case RecurKind::FMaximum:
25114 case RecurKind::FMinimum:
25117 << ". (HorRdx)\n");
25118 return VectorizedValue;
25119 case RecurKind::Xor: {
25124 SmallVector<int> Mask(
25127 std::iota(Mask.begin(), Mask.end(), 0);
25128 bool NeedShuffle = false;
25129 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25131 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25132 if (Cnt % 2 == 0) {
25134 NeedShuffle = true;
25140 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25144 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25145 return VectorizedValue;
25147 case RecurKind::FAdd: {
25150 for (Value *V : VL) {
25151 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25152 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25155 return Builder.CreateFMul(VectorizedValue, Scale);
25157 case RecurKind::Sub:
25158 case RecurKind::AddChainWithSubs:
25159 case RecurKind::Mul:
25160 case RecurKind::FMul:
25161 case RecurKind::FMulAdd:
25162 case RecurKind::AnyOf:
25163 case RecurKind::FindFirstIVSMin:
25164 case RecurKind::FindFirstIVUMin:
25165 case RecurKind::FindLastIVSMax:
25166 case RecurKind::FindLastIVUMax:
25167 case RecurKind::FMaxNum:
25168 case RecurKind::FMinNum:
25169 case RecurKind::FMaximumNum:
25170 case RecurKind::FMinimumNum:
25171 case RecurKind::None:
25181 return HorizontalReduction::getRdxKind(V);
25187 unsigned AggregateSize = 1;
25189 Type *CurrentType = IV->getType();
25192 for (auto *Elt : ST->elements())
25193 if (Elt != ST->getElementType(0))
25194 return std::nullopt;
25195 AggregateSize *= ST->getNumElements();
25196 CurrentType = ST->getElementType(0);
25198 AggregateSize *= AT->getNumElements();
25199 CurrentType = AT->getElementType();
25201 AggregateSize *= VT->getNumElements();
25202 return AggregateSize;
25204 return AggregateSize;
25206 return std::nullopt;
25215 unsigned OperandOffset, const BoUpSLP &R) {
25218 std::optional<unsigned> OperandIndex =
25220 if (!OperandIndex || R.isDeleted(LastInsertInst))
25224 BuildVectorOpds, InsertElts, *OperandIndex, R);
25227 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25228 InsertElts[*OperandIndex] = LastInsertInst;
25231 } while (LastInsertInst != nullptr &&
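// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): the recursion above
// walks an insertelement/insertvalue chain from the last insert back to the
// first, filling one operand slot per constant index. A flat sketch over a
// toy (index, value) list, where later inserts in program order win:
#include <optional>
#include <utility>
#include <vector>

static std::vector<std::optional<int>>
collectBuildVector(const std::vector<std::pair<unsigned, int>> &Inserts,
                   unsigned VF) {
  std::vector<std::optional<int>> Opds(VF);
  // Walk from the last insert backwards; keep the first value seen per slot.
  for (auto It = Inserts.rbegin(); It != Inserts.rend(); ++It)
    if (It->first < VF && !Opds[It->first])
      Opds[It->first] = It->second;
  return Opds; // fully populated slots form the vectorizable operand list
}
// ---------------------------------------------------------------------------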
25258 "Expected insertelement or insertvalue instruction!");
25261 "Expected empty result vectors!");
25264 if (!AggregateSize)
25266 BuildVectorOpds.resize(*AggregateSize);
25267 InsertElts.resize(*AggregateSize);
25272 if (BuildVectorOpds.size() >= 2)
25290 auto DominatedReduxValue = [&](Value *R) {
25298 if (P->getIncomingBlock(0) == ParentBB) {
25300 } else if (P->getIncomingBlock(1) == ParentBB) {
25304 if (Rdx && DominatedReduxValue(Rdx))
25317 if (P->getIncomingBlock(0) == BBLatch) {
25319 } else if (P->getIncomingBlock(1) == BBLatch) {
25323 if (Rdx && DominatedReduxValue(Rdx))
25359 "Expected binop, select, or intrinsic for reduction matching");
25361 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25363 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25374 Value *Op0 = nullptr;
25375 Value *Op1 = nullptr;
25384 Value *B0 = nullptr, *B1 = nullptr;
25389bool SLPVectorizerPass::vectorizeHorReduction(
25390 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25391 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25400 auto SelectRoot = [&]() {
25419 std::queue<std::pair<Instruction *, unsigned>> Stack;
25420 Stack.emplace(SelectRoot(), 0);
25421 SmallPtrSet<Value *, 8> VisitedInstrs;
25424 if (R.isAnalyzedReductionRoot(Inst))
25429 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25431 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25433 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25434 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25446 while (!Stack.empty()) {
25449 std::tie(Inst, Level) = Stack.front();
25454 if (R.isDeleted(Inst))
25456 if (Value *VectorizedV = TryToReduce(Inst)) {
25460 Stack.emplace(I, Level);
25463 if (R.isDeleted(Inst))
25467 if (!TryAppendToPostponedInsts(Inst)) {
25478 if (VisitedInstrs.insert(Op).second)
25483 !R.isDeleted(I) && I->getParent() == BB)
25484 Stack.emplace(I, Level);
25489bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25496 if ((I->getOpcode() == Instruction::FAdd ||
25497 I->getOpcode() == Instruction::FSub) &&
25507 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25508 R.isDeleted(Op0) || R.isDeleted(Op1))
25518 if (A && B && B->hasOneUse()) {
25521 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25523 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25527 if (B && A && A->hasOneUse()) {
25530 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25532 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25536 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25540 Type *Ty = Inst->getType();
25544 if (!HorRdx.matchReductionForOperands())
25550 TTI.getScalarizationOverhead(
25553 TTI.getInstructionCost(Inst, CostKind);
25565 FMF = FPCI->getFastMathFlags();
25566 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25573 if (RedCost >= ScalarCost)
25576 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25578 if (Candidates.size() == 1)
25579 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25582 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25583 if (!BestCandidate)
25585 return (*BestCandidate == 0 &&
25586 TryToReduce(I, {Candidates[*BestCandidate].first,
25587 Candidates[*BestCandidate].second})) ||
25588 tryToVectorizeList({Candidates[*BestCandidate].first,
25589 Candidates[*BestCandidate].second},
25593bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25594                                                 BasicBlock *BB, BoUpSLP &R) {
25596 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25597 Res |= tryToVectorize(PostponedInsts, R);
25604 for (Value *V : Insts)
25606 Res |= tryToVectorize(Inst, R);
25610bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25613 if (!R.canMapToVector(IVI->getType()))
25616 SmallVector<Value *, 16> BuildVectorOpds;
25617 SmallVector<Value *, 16> BuildVectorInsts;
25621 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25622 R.getORE()->emit([&]() {
25623 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25624 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25625 "trying reduction first.";
25629 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25631 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25634bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25637 SmallVector<Value *, 16> BuildVectorInsts;
25638 SmallVector<Value *, 16> BuildVectorOpds;
25639 SmallVector<int> Mask;
25645 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25646 R.getORE()->emit([&]() {
25647 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25648 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25649 "trying reduction first.";
25653 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25654 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25657template <typename T>
25662 bool MaxVFOnly, BoUpSLP &R) {
25675 if (!I || R.isDeleted(I)) {
25679 auto *SameTypeIt = IncIt;
25682 AreCompatible(VL, *SameTypeIt))) {
25685 if (I && !R.isDeleted(I))
25690 unsigned NumElts = VL.size();
25691 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25692 << NumElts << ")\n");
25702 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25705 VL.swap(Candidates);
25706 Candidates.clear();
25714 auto GetMinNumElements = [&R](Value *V) {
25715 unsigned EltSize = R.getVectorElementSize(V);
25716 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25718 if (NumElts < GetMinNumElements(*IncIt) &&
25719 (Candidates.empty() ||
25720 Candidates.front()->getType() == (*IncIt)->getType())) {
25728 if (Candidates.size() > 1 &&
25729 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25730 if (TryToVectorizeHelper(Candidates, false)) {
25733 } else if (MaxVFOnly) {
25736 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25739 if (!I || R.isDeleted(I)) {
25743 auto *SameTypeIt = It;
25744 while (SameTypeIt != End &&
25747 AreCompatible(*SameTypeIt, *It))) {
25750 if (I && !R.isDeleted(I))
25753 unsigned NumElts = VL.size();
25754 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25760 Candidates.clear();
25764 IncIt = SameTypeIt;
25776template <bool IsCompatibility>
25781 "Expected valid element types only.");
25783 return IsCompatibility;
25786 if (CI1->getOperand(0)->getType()->getTypeID() <
25788 return !IsCompatibility;
25789 if (CI1->getOperand(0)->getType()->getTypeID() >
25792 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25794 return !IsCompatibility;
25795 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25804 if (BasePred1 < BasePred2)
25805 return !IsCompatibility;
25806 if (BasePred1 > BasePred2)
25809 bool CI1Preds = Pred1 == BasePred1;
25810 bool CI2Preds = Pred2 == BasePred1;
25811 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25812 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25817 return !IsCompatibility;
25822 if (IsCompatibility) {
25823 if (I1->getParent() != I2->getParent())
25830 return NodeI2 != nullptr;
25833 assert((NodeI1 == NodeI2) ==
25835 "Different nodes should have different DFS numbers");
25836 if (NodeI1 != NodeI2)
25840 if (S && (IsCompatibility || !S.isAltShuffle()))
25842 if (IsCompatibility)
25844 if (I1->getOpcode() != I2->getOpcode())
25845 return I1->getOpcode() < I2->getOpcode();
25848 return IsCompatibility;
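// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): the IsCompatibility
// template above serves double duty as a strict weak order (for sorting) and
// an equivalence test (for grouping). The driver pattern is: sort candidates
// by a coarse key, then try to vectorize each maximal run of compatible
// neighbors. Sketched below for plain integers keyed by parity:
#include <algorithm>
#include <vector>

static std::vector<std::vector<int>> sortAndGroup(std::vector<int> Vals) {
  std::stable_sort(Vals.begin(), Vals.end(), [](int A, int B) {
    return (A % 2) < (B % 2);            // coarse compatibility key
  });
  std::vector<std::vector<int>> Runs;
  for (size_t I = 0; I < Vals.size();) {
    size_t J = I + 1;
    while (J < Vals.size() && Vals[J] % 2 == Vals[I] % 2)
      ++J;                               // extend the compatible run
    Runs.emplace_back(Vals.begin() + I, Vals.begin() + J);
    I = J;
  }
  return Runs;                           // each run is one vectorization attempt
}
// ---------------------------------------------------------------------------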
25851template <typename ItT>
25853 BasicBlock *BB, BoUpSLP &R) {
25856 for (CmpInst *I : CmpInsts) {
25857 if (R.isDeleted(I))
25861 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25862 if (R.isDeleted(I))
25867 for (CmpInst *I : CmpInsts) {
25868 if (R.isDeleted(I))
25887 for (Instruction *V : CmpInsts)
25890 if (Vals.size() <= 1)
25893 Vals, CompareSorter, AreCompatibleCompares,
25896 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25897 return any_of(V->users(), [V](User *U) {
25898 auto *Select = dyn_cast<SelectInst>(U);
25900 Select->getParent() != cast<Instruction>(V)->getParent();
25903 if (ArePossiblyReducedInOtherBlock)
25905 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25911bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25912                                          BasicBlock *BB, BoUpSLP &R) {
25914 "This function only accepts Insert instructions");
25915 bool OpsChanged = false;
25917 for (auto *I : reverse(Instructions)) {
25923 vectorizeInsertValueInst(LastInsertValue, BB, R, true);
25926 vectorizeInsertElementInst(LastInsertElem, BB, R, true);
25929 if (R.isDeleted(I))
25931 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
25937 vectorizeInsertValueInst(LastInsertValue, BB, R, false);
25939 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
25944 OpsChanged |= tryToVectorize(PostponedInsts, R);
25950bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
25953 SmallPtrSet<Value *, 16> VisitedInstrs;
25957 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
25958 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
25961 "Expected vectorizable types only.");
25971 V2->getType()->getScalarSizeInBits())
25974 V2->getType()->getScalarSizeInBits())
25978 if (Opcodes1.size() < Opcodes2.size())
25980 if (Opcodes1.size() > Opcodes2.size())
25982 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
25991 return NodeI2 != nullptr;
25994 assert((NodeI1 == NodeI2) ==
25996 "Different nodes should have different DFS numbers");
25997 if (NodeI1 != NodeI2)
26000 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26016 DT->getNode(V1->getParent());
26018 DT->getNode(V2->getParent());
26020 return NodeI2 != nullptr;
26023 assert((NodeI1 == NodeI2) ==
26025 "Different nodes should have different DFS numbers");
26026 if (NodeI1 != NodeI2)
26028 return V1->comesBefore(V2);
26041 return *Id1 < *Id2;
26045 if (I1->getOpcode() == I2->getOpcode())
26047 return I1->getOpcode() < I2->getOpcode();
26070 auto ValID1 = Opcodes1[I]->getValueID();
26071 auto ValID2 = Opcodes2[I]->getValueID();
26072 if (ValID1 == ValID2)
26074 if (ValID1 < ValID2)
26076 if (ValID1 > ValID2)
26085 assert(U1 && U2 && "The only thing left should be undef & undef.");
26091 if (VL.empty() || V1 == VL.back())
26093 Value *V2 = VL.back();
26098 if (Opcodes1.size() != Opcodes2.size())
26100 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26106 if (R.isDeleted(I1) || R.isDeleted(I2))
26108 if (I1->getParent() != I2->getParent())
26116 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26122 bool HaveVectorizedPhiNodes = false;
26126 for (Instruction &I : *BB) {
26133 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26138 if (Incoming.size() <= 1)
26143 for (Value *V : Incoming) {
26144 SmallVectorImpl<Value *> &Opcodes =
26146 if (!Opcodes.empty())
26149 SmallPtrSet<Value *, 4> Visited;
26150 while (!Nodes.empty()) {
26154 for (Value *V : PHI->incoming_values()) {
26156 Nodes.push_back(PHI1);
26165 Incoming, PHICompare, AreCompatiblePHIs,
26167 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26170 Changed |= HaveVectorizedPhiNodes;
26171 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26173 return !PHI || R.isDeleted(PHI);
26175 PHIToOpcodes.clear();
26177 } while (HaveVectorizedPhiNodes);
26179 VisitedInstrs.clear();
26181 InstSetVector PostProcessInserts;
26182 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26185 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26186 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26187 if (VectorizeCmps) {
26189 PostProcessCmps.clear();
26191 PostProcessInserts.clear();
26197 return PostProcessCmps.contains(Cmp);
26199 PostProcessInserts.contains(I);
26205 return I->use_empty() &&
26215 if (R.isDeleted(&*It))
26218 if (!VisitedInstrs.insert(&*It).second) {
26219 if (HasNoUsers(&*It) &&
26220 VectorizeInsertsAndCmps(It->isTerminator())) {
26233 if (P->getNumIncomingValues() == 2) {
26236 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26250 if (BB == P->getIncomingBlock(I) ||
26251 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26257 PI && !IsInPostProcessInstrs(PI)) {
26259 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26261 if (Res && R.isDeleted(P)) {
26271 if (HasNoUsers(&*It)) {
26272 bool OpsChanged = false;
26283 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26284 SI->getValueOperand()->hasOneUse();
26286 if (TryToVectorizeRoot) {
26287 for (auto *V : It->operand_values()) {
26291 VI && !IsInPostProcessInstrs(VI))
26293 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26300 VectorizeInsertsAndCmps(It->isTerminator());
26312 PostProcessInserts.insert(&*It);
26320bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26322 for (auto &Entry : GEPs) {
26325 if (Entry.second.size() < 2)
26328 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26329 << Entry.second.size() << ".\n");
26337 return !R.isDeleted(GEP);
26339 if (It == Entry.second.end())
26341 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26342 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26343 if (MaxVecRegSize < EltSize)
26346 unsigned MaxElts = MaxVecRegSize / EltSize;
26347 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26348 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26361 Candidates.remove_if([&R](Value *I) {
26371 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26372 auto *GEPI = GEPList[I];
26373 if (!Candidates.count(GEPI))
26375 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26376 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26377 auto *GEPJ = GEPList[J];
26378 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26380 Candidates.remove(GEPI);
26381 Candidates.remove(GEPJ);
26382 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26383 Candidates.remove(GEPJ);
26390 if (Candidates.size() < 2)
26396 SmallVector<Value *, 16> Bundle(Candidates.size());
26397 auto BundleIndex = 0u;
26398 for (auto *V : Candidates) {
26400 auto *GEPIdx = GEP->idx_begin()->get();
26402 Bundle[BundleIndex++] = GEPIdx;
26414 Changed |= tryToVectorizeList(Bundle, R);
26420bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26425 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26426 if (V->getValueOperand()->getType()->getTypeID() <
26429 if (V->getValueOperand()->getType()->getTypeID() >
26432 if (V->getPointerOperandType()->getTypeID() <
26433 V2->getPointerOperandType()->getTypeID())
26435 if (V->getPointerOperandType()->getTypeID() >
26436 V2->getPointerOperandType()->getTypeID())
26438 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26441 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26447 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26448 DT->getNode(I1->getParent());
26449 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26450 DT->getNode(I2->getParent());
26451 assert(NodeI1 && "Should only process reachable instructions");
26452 assert(NodeI2 && "Should only process reachable instructions");
26453 assert((NodeI1 == NodeI2) ==
26455 "Different nodes should have different DFS numbers");
26456 if (NodeI1 != NodeI2)
26458 return I1->getOpcode() < I2->getOpcode();
26460 return V->getValueOperand()->getValueID() <
26464 bool SameParent = true;
26470 StoreInst *V2 = VL.back();
26495 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26497 for (auto [SI, V] : zip(VL, NewVL))
26498 V = SI->getValueOperand();
26499 NewVL.back() = V1->getValueOperand();
26500 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26501 InstructionsState S = Analysis.buildInstructionsState(
26509 return V1->getValueOperand()->getValueID() ==
26514 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26515 for (auto &Pair : Stores) {
26516 if (Pair.second.size() < 2)
26520 << Pair.second.size() << ".\n");
26529 Pair.second.rend());
26531 ReversedStores, StoreSorter, AreCompatibleStores,
26533 return vectorizeStores(Candidates, R, Attempted);
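// ---------------------------------------------------------------------------
// Illustrative aside (not part of the original file): vectorizeStoreChains
// records already-attempted bundles (the Attempted set above) so repeated
// sweeps do not re-analyze the same candidates. The memoization pattern, with
// a hypothetical (First, Last, Width) bundle signature:
#include <set>
#include <tuple>

static bool tryOnce(std::set<std::tuple<int, int, unsigned>> &Attempted,
                    int First, int Last, unsigned Width) {
  // insert() reports whether this bundle signature is new; only new ones
  // are worth handing to the (expensive) vectorization attempt.
  return Attempted.insert({First, Last, Width}).second;
}
// ---------------------------------------------------------------------------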
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns the power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try to get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and, if not, packs the duplicates, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned a special value (size) that is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
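A minimal sketch tying together the APInt helpers listed above; the 8-bit width and bit positions are illustrative only:

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void apintDemo() {
  // Start with only bit 3 set: 0b00001000.
  APInt Demanded = APInt::getOneBitSet(/*numBits=*/8, /*BitNo=*/3);
  Demanded.setBits(/*loBit=*/0, /*hiBit=*/2); // sets bits [0,2): 0b00001011
  Demanded.clearBit(3);                       // 0b00000011
  assert(!Demanded.isPowerOf2());             // two bits are set
  assert(APInt::getAllOnes(8).isAllOnes() && APInt::getZero(8).isZero());
}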
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
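The ArrayRef slicing interface above, in a short self-contained example (contents arbitrary):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;

void arrayRefDemo() {
  SmallVector<int, 8> Storage = {10, 20, 30, 40, 50};
  ArrayRef<int> VL(Storage);              // non-owning view over Storage
  assert(VL.take_front(2).back() == 20);  // {10, 20}
  assert(VL.drop_front(2).front() == 30); // {30, 40, 50}
  assert(VL.slice(1, 3).size() == 3);     // {20, 30, 40}
}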
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
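To make the swapped/inverse distinction above concrete, a small sketch (predicates chosen arbitrarily):

#include "llvm/IR/InstrTypes.h"
#include <cassert>
using namespace llvm;

void predicateDemo() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;
  // Swapping the operands of a < b yields b > a.
  assert(CmpInst::getSwappedPredicate(P) == CmpInst::ICMP_SGT);
  // Negating a < b yields a >= b.
  assert(CmpInst::getInversePredicate(P) == CmpInst::ICMP_SGE);
}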
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
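The DenseMap operations above in one compact, illustrative sketch:

#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

void denseMapDemo() {
  DenseMap<int, unsigned> LaneOf;
  LaneOf.try_emplace(42, 7u); // inserts {42 -> 7}
  LaneOf.try_emplace(42, 9u); // no-op: key already present
  assert(LaneOf.lookup(42) == 7u);
  assert(LaneOf.lookup(5) == 0u); // missing key: default-constructed value
  assert(LaneOf.contains(42) && LaneOf.count(5) == 0);
  LaneOf.erase(42);
}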
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
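A hedged sketch of the IRBuilder calls listed above; BB, A, and B are assumed to be an existing block and two i32 values, and the emitted pattern is arbitrary:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitClampedMul(BasicBlock *BB, Value *A, Value *B) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append to the end of BB
  Value *Mul = Builder.CreateMul(A, B, "mul");
  Value *Cmp = Builder.CreateICmp(CmpInst::ICMP_SGT, Mul, A, "cmp");
  // Select between the product and the original operand.
  return Builder.CreateSelect(Cmp, Mul, A, "sel");
}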
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
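These ScalarEvolution entry points are the building blocks of the runtime-stride logic above (e.g. calculateRtStride); a hedged sketch computing the symbolic distance between two pointers:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Returns Ptr2 - Ptr1 as a SCEV. A SCEVConstant result is a compile-time
// distance; anything else is a runtime value.
const SCEV *pointerDistance(ScalarEvolution &SE, Value *Ptr1, Value *Ptr2) {
  return SE.getMinusSCEV(SE.getSCEV(Ptr2), SE.getSCEV(Ptr1));
}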
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
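SetVector pairs set membership with insertion-order iteration; a minimal sketch:

#include "llvm/ADT/SetVector.h"
#include <cassert>
using namespace llvm;

void setVectorDemo() {
  SetVector<int> Worklist;
  Worklist.insert(3);
  Worklist.insert(1);
  assert(!Worklist.insert(3)); // duplicate is rejected
  assert(Worklist.size() == 2 && Worklist.contains(1));
  assert(Worklist.front() == 3); // iteration follows insertion order
}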
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
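The static mask classifiers above operate on plain index arrays; an illustrative sketch:

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void maskDemo() {
  // Lanes 4..7 of an 8-element source form an extract-subvector mask.
  int Extract[] = {4, 5, 6, 7};
  int Index = 0;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract,
                                                   /*NumSrcElts=*/8, Index));
  assert(Index == 4);
  int Rev[] = {3, 2, 1, 0}; // reverses a 4-element vector
  assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));
}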
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
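Walking the set bits with find_first/find_next, as the helpers above suggest; a brief sketch:

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>
using namespace llvm;

void bitVectorDemo() {
  SmallBitVector Lanes(8); // 8 bits, all clear
  Lanes.set(1);
  Lanes.set(5);
  assert(Lanes.any() && !Lanes.all() && Lanes.count() == 2);
  // Visits bit 1, then bit 5; find_next returns -1 past the last set bit.
  for (int I = Lanes.find_first(); I != -1; I = Lanes.find_next(I))
    (void)I; // process lane I
}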
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const bool IsAnyPointerUsedOutGraph, const int64_t Diff, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
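Read together, the BoUpSLP interface above is driven in roughly the following shape. This is a simplified, hypothetical sketch of the flow inside this translation unit (tryVectorize is not a real entry point; construction of R, transformNodes, and scheduling details are omitted; SLPCostThreshold is this file's cl::opt):

bool tryVectorize(BoUpSLP &R, ArrayRef<Value *> Roots,
                  const SmallDenseSet<Value *> &UserIgnoreLst) {
  R.buildTree(Roots, UserIgnoreLst);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();         // profitable root-to-leaf lane order
  R.reorderBottomToTop();         // and the leaves-to-root pass
  R.buildExternalUses();          // scalars still used outside the tree
  R.computeMinimumValueSizes();   // minimum-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || Cost >= -SLPCostThreshold)
    return false;                 // cost model says it does not pay off
  R.vectorizeTree();              // emit the vector code
  R.optimizeGatherSequence();     // LICM/CSE over gather sequences
  return true;
}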
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
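The PatternMatch combinators above compose into declarative IR queries; a small hedged sketch (the matched shape is arbitrary):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// If V is (load Ptr) << C with a single-use load and constant C,
// return Ptr; otherwise return nullptr.
static Value *matchShiftedLoad(Value *V) {
  Value *Ptr = nullptr;
  const APInt *ShAmt = nullptr;
  if (match(V, m_Shl(m_OneUse(m_Load(m_Value(Ptr))), m_APInt(ShAmt))))
    return Ptr;
  return nullptr;
}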
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling: none of its users need to be scheduled in the current basic block.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
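A common pattern this enables, sketched on a hypothetical block: erasing instructions during the walk, which plain range-for iteration would invalidate:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"

  void eraseDeadSketch(llvm::BasicBlock &BB) {
    // The iterator advances before the body runs, so erasing I is safe.
    for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
      if (llvm::isInstructionTriviallyDead(&I))
        I.eraseFromParent();
  }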
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal, MD_access_group, MD_mmra]; I's metadata is set to the intersection of that metadata over the elements of VL.
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
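Minimal sketch of the rounding helpers (bit_floor, listed further below, is the mirror image):

  #include "llvm/ADT/bit.h"
  #include <cassert>

  void roundingSketch() {
    assert(llvm::bit_ceil(5u) == 8u);  // smallest power of two >= 5
    assert(llvm::bit_ceil(8u) == 8u);  // powers of two are unchanged
    assert(llvm::bit_floor(5u) == 4u); // largest power of two <= 5
  }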
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container: C.erase(remove(C.begin(), C.end(), V), C.end());
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
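A minimal sketch, assuming the input values below:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  void doubleAllSketch() {
    llvm::SmallVector<int, 4> In = {1, 2, 3};
    llvm::SmallVector<int, 4> Out(In.size());
    // Writes 2, 4, 6 into Out; no explicit begin()/end() needed.
    llvm::transform(In, Out.begin(), [](int X) { return X * 2; });
  }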
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
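A minimal sketch (the predicate is illustrative):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  bool hasNegativeSketch(const llvm::SmallVectorImpl<int> &Vals) {
    // Equivalent to std::any_of(Vals.begin(), Vals.end(), Pred).
    return llvm::any_of(Vals, [](int X) { return X < 0; });
  }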
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction has no side effects.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
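A sketch of the mask produced for one small configuration (each entry is Start + I * Stride):

  #include "llvm/Analysis/VectorUtils.h"

  void strideMaskSketch() {
    // Start = 0, Stride = 2, VF = 4 selects elements 0, 2, 4, 6.
    llvm::SmallVector<int, 16> Mask = llvm::createStrideMask(0, 2, 4);
    // Mask now holds {0, 2, 4, 6}.
  }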
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
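inversePermutation is internal to this pass; the idea, re-sketched as standalone code (illustrative, not the pass's exact implementation): given a permutation Indices, build Mask such that Mask[Indices[I]] == I:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"

  void inversePermutationSketch(llvm::ArrayRef<unsigned> Indices,
                                llvm::SmallVectorImpl<int> &Mask) {
    Mask.assign(Indices.size(), -1); // -1 stands in for PoisonMaskElem
    for (unsigned I = 0, E = Indices.size(); I < E; ++I)
      Mask[Indices[I]] = I;
    // Indices = {2, 0, 1} yields Mask = {1, 2, 0}.
  }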
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
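A sketch for one small configuration:

  #include "llvm/Analysis/VectorUtils.h"

  void replicatedMaskSketch() {
    // ReplicationFactor = 3, VF = 2 repeats each lane index three times.
    llvm::SmallVector<int, 16> Mask = llvm::createReplicatedMask(3, 2);
    // Mask now holds {0, 0, 0, 1, 1, 1}.
  }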
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
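A minimal sketch:

  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  void divideCeilSketch() {
    assert(llvm::divideCeil(7, 2) == 4); // 7/2 rounded up
    assert(llvm::divideCeil(8, 2) == 4); // exact division is unchanged
  }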
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling: it is not an instruction, or it does not read/write memory and all of its operands are either not instructions, phi nodes, or instructions from different blocks.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
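A minimal sketch:

  #include "llvm/Support/Alignment.h"
  #include <cassert>

  void alignToSketch() {
    assert(llvm::alignTo(10, llvm::Align(8)) == 16); // next multiple of 8
    assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already aligned
  }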
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range Range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts that the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
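A minimal sketch (the end bound is exclusive, matching the description above):

  #include "llvm/ADT/Sequence.h"

  int sumBelowSketch(int N) {
    int Sum = 0;
    for (int I : llvm::seq(0, N)) // visits 0, 1, ..., N-1
      Sum += I;
    return Sum;
  }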
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
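A minimal sketch; the Key struct is hypothetical:

  #include "llvm/ADT/Hashing.h"

  struct Key {
    const void *Ptr;
    unsigned Idx;
  };

  llvm::hash_code hashKeySketch(const Key &K) {
    // Mixes both fields into a single hash_code.
    return llvm::hash_combine(K.Ptr, K.Idx);
  }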
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts, or llvm.threadlocal.address from the specified value, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx, or on the return type if OpdIdx is -1.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type T.
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; the incoming register Reg and incoming block Block are taken from the machine instruction.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair or std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair or std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const