74#ifdef EXPENSIVE_CHECKS
107using namespace slpvectorizer;
108using namespace std::placeholders;
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
113STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
203 cl::desc(
"Display the SLP trees with Graphviz"));
207 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
212 cl::desc(
"Try to replace values with the idempotent instructions for "
213 "better vectorization."));
244 if (
SLPReVec && isa<FixedVectorType>(Ty))
246 return VectorType::isValidElementType(Ty) && !Ty->
isX86_FP80Ty() &&
255 if (
auto *SI = dyn_cast<StoreInst>(V))
256 return SI->getValueOperand()->getType();
257 if (
auto *CI = dyn_cast<CmpInst>(V))
258 return CI->getOperand(0)->getType();
259 if (
auto *IE = dyn_cast<InsertElementInst>(V))
260 return IE->getOperand(1)->getType();
266 assert(!isa<ScalableVectorType>(Ty) &&
267 "ScalableVectorType is not supported.");
268 if (
auto *VecTy = dyn_cast<FixedVectorType>(Ty))
269 return VecTy->getNumElements();
283 Type *Ty,
unsigned Sz) {
288 if (NumParts == 0 || NumParts >= Sz)
303 if (NumParts == 0 || NumParts >= Sz)
308 return (Sz / RegVF) * RegVF;
318 for (
unsigned I : seq<unsigned>(Mask.size()))
320 I * VecTyNumElements, VecTyNumElements)))
322 : Mask[
I] * VecTyNumElements + J;
353 if (!
all_of(VL, IsaPred<ShuffleVectorInst>))
355 auto *SV = cast<ShuffleVectorInst>(VL.
front());
356 unsigned SVNumElements =
357 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
358 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
359 if (SVNumElements % ShuffleMaskSize != 0)
361 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
362 if (GroupSize == 0 || (VL.
size() % GroupSize) != 0)
364 unsigned NumGroup = 0;
365 for (
size_t I = 0, E = VL.
size();
I != E;
I += GroupSize) {
366 auto *SV = cast<ShuffleVectorInst>(VL[
I]);
367 Value *Src = SV->getOperand(0);
371 auto *SV = cast<ShuffleVectorInst>(V);
373 if (SV->getOperand(0) != Src)
376 if (!SV->isExtractSubvectorMask(Index))
378 ExpectedIndex.
set(Index / ShuffleMaskSize);
382 if (!ExpectedIndex.
all())
386 assert(NumGroup == (VL.
size() / GroupSize) &&
"Unexpected number of groups");
404 auto *SV = cast<ShuffleVectorInst>(VL.
front());
405 unsigned SVNumElements =
406 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
408 unsigned AccumulateLength = 0;
409 for (
Value *V : VL) {
410 auto *SV = cast<ShuffleVectorInst>(V);
411 for (
int M : SV->getShuffleMask())
413 : AccumulateLength + M);
414 AccumulateLength += SVNumElements;
422 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
429 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
430 !isa<ExtractValueInst, UndefValue>(V))
432 auto *
I = dyn_cast<Instruction>(V);
433 if (!
I || isa<ExtractValueInst>(
I))
435 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
437 if (isa<ExtractElementInst>(
I))
439 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
455 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
464 OS <<
"Idx: " <<
Idx <<
", ";
465 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
473 auto *It =
find_if(VL, IsaPred<Instruction>);
482 if (isa<PoisonValue>(V))
484 auto *
II = dyn_cast<Instruction>(V);
488 if (BB !=
II->getParent())
505 Value *FirstNonUndef =
nullptr;
506 for (
Value *V : VL) {
507 if (isa<UndefValue>(V))
509 if (!FirstNonUndef) {
513 if (V != FirstNonUndef)
516 return FirstNonUndef !=
nullptr;
531 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
532 return Cmp->isCommutative();
533 if (
auto *BO = dyn_cast<BinaryOperator>(
I))
534 return BO->isCommutative() ||
535 (BO->getOpcode() == Instruction::Sub &&
542 if (match(U.getUser(),
543 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
544 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
548 return match(U.getUser(),
549 m_Intrinsic<Intrinsic::abs>(
550 m_Specific(U.get()), m_ConstantInt(Flag))) &&
551 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
554 (BO->getOpcode() == Instruction::FSub &&
557 return match(U.getUser(),
558 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
560 return I->isCommutative();
580 constexpr unsigned IntrinsicNumOperands = 2;
581 return IntrinsicNumOperands;
583 return I->getNumOperands();
589 static_assert(std::is_same_v<T, InsertElementInst> ||
590 std::is_same_v<T, ExtractElementInst>,
593 if (
const auto *IE = dyn_cast<T>(Inst)) {
594 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
597 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
600 if (CI->getValue().uge(VT->getNumElements()))
602 Index *= VT->getNumElements();
603 Index += CI->getZExtValue();
614 if (
auto Index = getInsertExtractIndex<InsertElementInst>(Inst,
Offset))
616 if (
auto Index = getInsertExtractIndex<ExtractElementInst>(Inst,
Offset))
621 const auto *
IV = dyn_cast<InsertValueInst>(Inst);
625 Type *CurrentType =
IV->getType();
626 for (
unsigned I :
IV->indices()) {
627 if (
const auto *ST = dyn_cast<StructType>(CurrentType)) {
628 Index *= ST->getNumElements();
629 CurrentType = ST->getElementType(
I);
630 }
else if (
const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
631 Index *= AT->getNumElements();
632 CurrentType = AT->getElementType();
646 auto *It =
find_if(VL, IsaPred<Instruction>);
651 bool IsCmpOp = isa<CmpInst>(MainOp);
654 return std::all_of(It, VL.
end(), [&](
Value *V) {
655 if (auto *CI = dyn_cast<CmpInst>(V))
656 return BasePred == CI->getPredicate();
657 if (auto *I = dyn_cast<Instruction>(V))
658 return I->getOpcode() == Opcode;
659 return isa<PoisonValue>(V);
687 if (MaskArg == UseMask::UndefsAsMask)
691 if (MaskArg == UseMask::FirstArg &&
Value < VF)
692 UseMask.reset(
Value);
693 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
694 UseMask.reset(
Value - VF);
702template <
bool IsPoisonOnly = false>
706 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
709 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
712 auto *
C = dyn_cast<Constant>(V);
714 if (!UseMask.empty()) {
716 while (
auto *
II = dyn_cast<InsertElementInst>(
Base)) {
718 if (isa<T>(
II->getOperand(1)))
725 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
733 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
740 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
741 if (
Constant *Elem =
C->getAggregateElement(
I))
743 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
771static std::optional<TargetTransformInfo::ShuffleKind>
774 const auto *It =
find_if(VL, IsaPred<ExtractElementInst>);
778 std::accumulate(VL.
begin(), VL.
end(), 0u, [](
unsigned S,
Value *V) {
779 auto *EI = dyn_cast<ExtractElementInst>(V);
782 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
785 return std::max(S, VTy->getNumElements());
788 Value *Vec1 =
nullptr;
789 Value *Vec2 =
nullptr;
791 auto *EE = dyn_cast<ExtractElementInst>(V);
794 Value *Vec = EE->getVectorOperand();
795 if (isa<UndefValue>(Vec))
800 ShuffleMode CommonShuffleMode =
Unknown;
802 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
804 if (isa<UndefValue>(VL[
I]))
806 auto *EI = cast<ExtractElementInst>(VL[
I]);
807 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
809 auto *Vec = EI->getVectorOperand();
811 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
814 if (isa<UndefValue>(Vec)) {
817 if (isa<UndefValue>(EI->getIndexOperand()))
819 auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
825 unsigned IntIdx =
Idx->getValue().getZExtValue();
832 if (!Vec1 || Vec1 == Vec) {
834 }
else if (!Vec2 || Vec2 == Vec) {
840 if (CommonShuffleMode == Permute)
844 if (Mask[
I] %
Size !=
I) {
845 CommonShuffleMode = Permute;
848 CommonShuffleMode =
Select;
851 if (CommonShuffleMode ==
Select && Vec2)
862 assert((Opcode == Instruction::ExtractElement ||
863 Opcode == Instruction::ExtractValue) &&
864 "Expected extractelement or extractvalue instruction.");
865 if (Opcode == Instruction::ExtractElement) {
866 auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
869 return CI->getZExtValue();
871 auto *EI = cast<ExtractValueInst>(E);
872 if (EI->getNumIndices() != 1)
874 return *EI->idx_begin();
900bool isValidForAlternation(
unsigned Opcode) {
909class BinOpSameOpcodeHelper {
910 using MaskType = std::uint_fast16_t;
912 constexpr static std::initializer_list<unsigned> SupportedOp = {
913 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
914 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
924 MainOpBIT = 0b100000000,
932 static std::pair<ConstantInt *, unsigned>
934 unsigned Opcode =
I->getOpcode();
937 auto *BinOp = cast<BinaryOperator>(
I);
938 if (
auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
940 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
941 Opcode == Instruction::AShr)
943 if (
auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
947 struct InterchangeableInfo {
950 MaskType
Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
951 MulBIT | AShrBIT | ShlBIT;
956 MaskType SeenBefore = 0;
961 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
962 if (Mask & InterchangeableMask) {
963 SeenBefore |= OpcodeInMaskForm;
964 Mask &= InterchangeableMask;
969 bool equal(
unsigned Opcode) {
970 if (Opcode ==
I->getOpcode())
971 return trySet(MainOpBIT, MainOpBIT);
975 MaskType Candidate =
Mask & SeenBefore;
976 if (Candidate & MainOpBIT)
977 return I->getOpcode();
978 if (Candidate & ShlBIT)
979 return Instruction::Shl;
980 if (Candidate & AShrBIT)
981 return Instruction::AShr;
982 if (Candidate & MulBIT)
983 return Instruction::Mul;
984 if (Candidate & AddBIT)
985 return Instruction::Add;
986 if (Candidate & SubBIT)
987 return Instruction::Sub;
988 if (Candidate & AndBIT)
989 return Instruction::And;
990 if (Candidate & OrBIT)
991 return Instruction::Or;
992 if (Candidate & XorBIT)
993 return Instruction::Xor;
998 bool hasCandidateOpcode(
unsigned Opcode)
const {
999 MaskType Candidate =
Mask & SeenBefore;
1001 case Instruction::Shl:
1002 return Candidate & ShlBIT;
1003 case Instruction::AShr:
1004 return Candidate & AShrBIT;
1005 case Instruction::Mul:
1006 return Candidate & MulBIT;
1007 case Instruction::Add:
1008 return Candidate & AddBIT;
1009 case Instruction::Sub:
1010 return Candidate & SubBIT;
1011 case Instruction::And:
1012 return Candidate & AndBIT;
1013 case Instruction::Or:
1014 return Candidate & OrBIT;
1015 case Instruction::Xor:
1016 return Candidate & XorBIT;
1017 case Instruction::LShr:
1018 case Instruction::FAdd:
1019 case Instruction::FSub:
1020 case Instruction::FMul:
1021 case Instruction::SDiv:
1022 case Instruction::UDiv:
1023 case Instruction::FDiv:
1024 case Instruction::SRem:
1025 case Instruction::URem:
1026 case Instruction::FRem:
1036 unsigned FromOpcode =
I->getOpcode();
1037 if (FromOpcode == ToOpcode)
1040 auto [CI, Pos] = isBinOpWithConstantInt(I);
1041 const APInt &FromCIValue = CI->getValue();
1042 unsigned FromCIValueBitWidth = FromCIValue.
getBitWidth();
1044 switch (FromOpcode) {
1045 case Instruction::Shl:
1046 if (ToOpcode == Instruction::Mul) {
1050 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1051 ToCIValue = ToOpcode == Instruction::And
1053 :
APInt::getZero(FromCIValueBitWidth);
1056 case Instruction::Mul:
1058 if (ToOpcode == Instruction::Shl) {
1059 ToCIValue =
APInt(FromCIValueBitWidth, FromCIValue.
logBase2());
1061 assert(FromCIValue.
isOne() &&
"Cannot convert the instruction.");
1062 ToCIValue = ToOpcode == Instruction::And
1064 :
APInt::getZero(FromCIValueBitWidth);
1067 case Instruction::Add:
1068 case Instruction::Sub:
1069 if (FromCIValue.
isZero()) {
1073 "Cannot convert the instruction.");
1074 ToCIValue = FromCIValue;
1078 case Instruction::And:
1080 ToCIValue = ToOpcode == Instruction::Mul
1082 :
APInt::getZero(FromCIValueBitWidth);
1085 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1091 ConstantInt::get(
I->getOperand(Pos)->getType(), ToCIValue);
1095 (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
1100 InterchangeableInfo MainOp;
1101 InterchangeableInfo AltOp;
1102 bool isValidForAlternation(
const Instruction *
I)
const {
1103 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1104 ::isValidForAlternation(
I->getOpcode());
1109 if (!isValidForAlternation(
I))
1118 : MainOp(MainOp), AltOp(AltOp) {
1122 assert(isa<BinaryOperator>(
I) &&
1123 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1124 unsigned Opcode =
I->getOpcode();
1125 MaskType OpcodeInMaskForm;
1128 case Instruction::Shl:
1129 OpcodeInMaskForm = ShlBIT;
1131 case Instruction::AShr:
1132 OpcodeInMaskForm = AShrBIT;
1134 case Instruction::Mul:
1135 OpcodeInMaskForm = MulBIT;
1137 case Instruction::Add:
1138 OpcodeInMaskForm = AddBIT;
1140 case Instruction::Sub:
1141 OpcodeInMaskForm = SubBIT;
1143 case Instruction::And:
1144 OpcodeInMaskForm = AndBIT;
1146 case Instruction::Or:
1147 OpcodeInMaskForm = OrBIT;
1149 case Instruction::Xor:
1150 OpcodeInMaskForm = XorBIT;
1153 return MainOp.equal(Opcode) ||
1154 (initializeAltOp(
I) && AltOp.equal(Opcode));
1156 MaskType InterchangeableMask = OpcodeInMaskForm;
1159 constexpr MaskType CanBeAll =
1160 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1163 case Instruction::Shl:
1165 InterchangeableMask = CIValue.
isZero() ? CanBeAll : MulBIT | ShlBIT;
1167 case Instruction::Mul:
1168 if (CIValue.
isOne()) {
1169 InterchangeableMask = CanBeAll;
1173 InterchangeableMask = MulBIT | ShlBIT;
1175 case Instruction::Add:
1176 case Instruction::Sub:
1177 InterchangeableMask = CIValue.
isZero() ? CanBeAll : SubBIT | AddBIT;
1179 case Instruction::And:
1181 InterchangeableMask = CanBeAll;
1185 InterchangeableMask = CanBeAll;
1189 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1190 (initializeAltOp(
I) &&
1191 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1193 unsigned getMainOpcode()
const {
return MainOp.
getOpcode(); }
1195 bool hasCandidateOpcode(
unsigned Opcode)
const {
1196 return MainOp.hasCandidateOpcode(Opcode);
1198 bool hasAltOp()
const {
return AltOp.I; }
1199 unsigned getAltOpcode()
const {
1200 return hasAltOp() ? AltOp.
getOpcode() : getMainOpcode();
1208class InstructionsState {
1234 bool HasCopyables =
false;
1238 assert(valid() &&
"InstructionsState is invalid.");
1243 assert(valid() &&
"InstructionsState is invalid.");
1248 unsigned getOpcode()
const {
return getMainOp()->getOpcode(); }
1250 unsigned getAltOpcode()
const {
return getAltOp()->getOpcode(); }
1253 bool isAltShuffle()
const {
return getMainOp() != getAltOp(); }
1263 assert(MainOp &&
"MainOp cannot be nullptr.");
1267 assert(AltOp &&
"AltOp cannot be nullptr.");
1270 if (!
I->isBinaryOp())
1272 BinOpSameOpcodeHelper
Converter(MainOp);
1276 BinOpSameOpcodeHelper AltConverter(AltOp);
1277 if (AltConverter.add(
I) && AltConverter.add(AltOp) &&
1278 AltConverter.hasCandidateOpcode(AltOp->
getOpcode()))
1281 if (
Converter.hasAltOp() && !isAltShuffle())
1283 return Converter.hasAltOp() ? AltOp : MainOp;
1287 bool isShiftOp()
const {
1288 return getMainOp()->
isShift() && getAltOp()->isShift();
1293 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1297 bool isMulDivLikeOp()
const {
1298 constexpr std::array<unsigned, 8> MulDiv = {
1299 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1300 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1301 Instruction::URem, Instruction::FRem};
1307 bool isAddSubLikeOp()
const {
1308 constexpr std::array<unsigned, 4>
AddSub = {
1309 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1316 bool isCmpOp()
const {
1317 return (
getOpcode() == Instruction::ICmp ||
1323 bool valid()
const {
return MainOp && AltOp; }
1325 explicit operator bool()
const {
return valid(); }
1327 InstructionsState() =
delete;
1329 bool HasCopyables =
false)
1330 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1331 static InstructionsState invalid() {
return {
nullptr,
nullptr}; }
1334 bool isCopyableElement(
Value *V)
const {
1335 assert(valid() &&
"InstructionsState is invalid.");
1338 if (isAltShuffle() ||
getOpcode() == Instruction::GetElementPtr)
1340 auto *
I = dyn_cast<Instruction>(V);
1342 return !isa<PoisonValue>(V);
1349 if (!
I->isBinaryOp())
1351 BinOpSameOpcodeHelper
Converter(MainOp);
1357 bool isNonSchedulable(
Value *V)
const {
1358 assert(valid() &&
"InstructionsState is invalid.");
1359 auto *
I = dyn_cast<Instruction>(V);
1365 if (getMainOp() == V)
1367 if (isCopyableElement(V)) {
1368 auto IsNonSchedulableCopyableElement = [
this](
Value *
V) {
1369 auto *
I = dyn_cast<Instruction>(V);
1370 return !
I || isa<PHINode>(
I) ||
I->getParent() != MainOp->
getParent() ||
1378 return IsNonSchedulableCopyableElement(V);
1385 bool areInstructionsWithCopyableElements()
const {
1386 assert(valid() &&
"InstructionsState is invalid.");
1387 return HasCopyables;
1391std::pair<Instruction *, SmallVector<Value *>>
1393 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(
I);
1394 assert(SelectedOp &&
"Cannot convert the instruction.");
1395 if (
I->isBinaryOp()) {
1397 return std::make_pair(SelectedOp,
Converter.getOperand(SelectedOp));
1416 for (
Value *V : VL) {
1417 if (isa<PoisonValue>(V))
1419 assert(isa<Instruction>(V) &&
"Only accepts PoisonValue and Instruction.");
1420 auto *Inst = cast<Instruction>(V);
1421 if (Inst->getOpcode() == Opcode)
1433 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1434 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1435 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1446 "Assessing comparisons of different types?");
1456 return (BasePred == Pred &&
1458 (BasePred == SwappedPred &&
1468 if (!
all_of(VL, IsaPred<Instruction, PoisonValue>))
1469 return InstructionsState::invalid();
1471 auto *It =
find_if(VL, IsaPred<Instruction>);
1473 return InstructionsState::invalid();
1476 unsigned InstCnt = std::count_if(It, VL.
end(), IsaPred<Instruction>);
1477 if ((VL.
size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.
size() / 2) ||
1478 (VL.
size() == 2 && InstCnt < 2))
1479 return InstructionsState::invalid();
1481 bool IsCastOp = isa<CastInst>(MainOp);
1482 bool IsBinOp = isa<BinaryOperator>(MainOp);
1483 bool IsCmpOp = isa<CmpInst>(MainOp);
1488 unsigned AltOpcode = Opcode;
1490 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1491 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1493 UniquePreds.
insert(BasePred);
1494 UniqueNonSwappedPreds.
insert(BasePred);
1495 for (
Value *V : VL) {
1496 auto *
I = dyn_cast<CmpInst>(V);
1502 UniqueNonSwappedPreds.
insert(CurrentPred);
1503 if (!UniquePreds.
contains(CurrentPred) &&
1504 !UniquePreds.
contains(SwappedCurrentPred))
1505 UniquePreds.
insert(CurrentPred);
1510 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
1516 if (
auto *
CallBase = dyn_cast<CallInst>(MainOp)) {
1520 return InstructionsState::invalid();
1522 bool AnyPoison = InstCnt != VL.
size();
1526 auto *
I = dyn_cast<Instruction>(V);
1533 if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() || isa<CallInst>(
I)))
1534 return InstructionsState::invalid();
1535 unsigned InstOpcode =
I->getOpcode();
1536 if (IsBinOp && isa<BinaryOperator>(
I)) {
1537 if (BinOpHelper.add(
I))
1539 }
else if (IsCastOp && isa<CastInst>(
I)) {
1542 Value *Op1 =
I->getOperand(0);
1545 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1547 if (Opcode == AltOpcode) {
1548 assert(isValidForAlternation(Opcode) &&
1549 isValidForAlternation(InstOpcode) &&
1550 "Cast isn't safe for alternation, logic needs to be updated!");
1551 AltOpcode = InstOpcode;
1556 }
else if (
auto *Inst = dyn_cast<CmpInst>(
I); Inst && IsCmpOp) {
1557 auto *BaseInst = cast<CmpInst>(MainOp);
1558 Type *Ty0 = BaseInst->getOperand(0)->getType();
1559 Type *Ty1 = Inst->getOperand(0)->getType();
1561 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1562 assert(InstOpcode == AltOpcode &&
1563 "Alternate instructions are only supported by BinaryOperator "
1571 if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1572 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1577 auto *AltInst = cast<CmpInst>(AltOp);
1578 if (MainOp != AltOp) {
1581 }
else if (BasePred != CurrentPred) {
1583 isValidForAlternation(InstOpcode) &&
1584 "CmpInst isn't safe for alternation, logic needs to be updated!");
1589 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1590 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1593 }
else if (InstOpcode == Opcode) {
1594 assert(InstOpcode == AltOpcode &&
1595 "Alternate instructions are only supported by BinaryOperator and "
1597 if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
1598 if (Gep->getNumOperands() != 2 ||
1600 return InstructionsState::invalid();
1601 }
else if (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
1603 return InstructionsState::invalid();
1604 }
else if (
auto *LI = dyn_cast<LoadInst>(
I)) {
1605 auto *BaseLI = cast<LoadInst>(MainOp);
1606 if (!LI->isSimple() || !BaseLI->isSimple())
1607 return InstructionsState::invalid();
1608 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
1609 auto *
CallBase = cast<CallInst>(MainOp);
1611 return InstructionsState::invalid();
1612 if (Call->hasOperandBundles() &&
1614 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1615 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1618 return InstructionsState::invalid();
1621 return InstructionsState::invalid();
1624 if (Mappings.
size() != BaseMappings.
size() ||
1625 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1626 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1627 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1628 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1629 Mappings.
front().Shape.Parameters !=
1630 BaseMappings.
front().Shape.Parameters)
1631 return InstructionsState::invalid();
1636 return InstructionsState::invalid();
1641 assert(MainOp &&
"Cannot find MainOp with Opcode from BinOpHelper.");
1643 assert(MainOp &&
"Cannot find AltOp with Opcode from BinOpHelper.");
1646 "Incorrect implementation of allSameOpcode.");
1647 InstructionsState S(MainOp, AltOp);
1650 return isa<PoisonValue>(V) ||
1651 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1653 "Invalid InstructionsState.");
1661 return all_of(VL, [&](
Value *V) {
return V->getType() == Ty; });
1671 unsigned Opcode = UserInst->
getOpcode();
1673 case Instruction::Load: {
1674 LoadInst *LI = cast<LoadInst>(UserInst);
1677 case Instruction::Store: {
1678 StoreInst *SI = cast<StoreInst>(UserInst);
1679 return (SI->getPointerOperand() == Scalar);
1681 case Instruction::Call: {
1682 CallInst *CI = cast<CallInst>(UserInst);
1685 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1686 Arg.value().get() == Scalar;
1698 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1705 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1706 return LI->isSimple();
1708 return SI->isSimple();
1710 return !
MI->isVolatile();
1718 bool ExtendingManyInputs =
false) {
1719 if (SubMask.
empty())
1722 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1725 "SubMask with many inputs support must be larger than the mask.");
1727 Mask.append(SubMask.
begin(), SubMask.
end());
1731 int TermValue = std::min(Mask.size(), SubMask.
size());
1732 for (
int I = 0, E = SubMask.
size();
I < E; ++
I) {
1734 (!ExtendingManyInputs &&
1735 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1737 NewMask[
I] = Mask[SubMask[
I]];
1753 const size_t Sz = Order.
size();
1756 for (
unsigned I = 0;
I < Sz; ++
I) {
1758 UnusedIndices.
reset(Order[
I]);
1760 MaskedIndices.
set(
I);
1762 if (MaskedIndices.
none())
1765 "Non-synced masked/available indices.");
1769 assert(
Idx >= 0 &&
"Indices must be synced.");
1779 unsigned Opcode0,
unsigned Opcode1) {
1782 for (
unsigned Lane : seq<unsigned>(VL.
size())) {
1783 if (isa<PoisonValue>(VL[Lane]))
1785 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
1786 OpcodeMask.
set(Lane * ScalarTyNumElements,
1787 Lane * ScalarTyNumElements + ScalarTyNumElements);
1796 "Expected scalar constants.");
1799 std::fill_n(NewVal.
begin() +
I * VF, VF, V);
1808 const unsigned E = Indices.
size();
1810 for (
unsigned I = 0;
I < E; ++
I)
1811 Mask[Indices[
I]] =
I;
1817 assert(!Mask.empty() &&
"Expected non-empty mask.");
1821 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
1823 Scalars[Mask[
I]] = Prev[
I];
1831 auto *
I = dyn_cast<Instruction>(V);
1836 auto *IO = dyn_cast<Instruction>(V);
1839 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1848 auto *
I = dyn_cast<Instruction>(V);
1852 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1854 auto *IU = dyn_cast<Instruction>(U);
1857 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1873 return !VL.
empty() &&
1889 return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1898 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1900 if (NumParts == 0 || NumParts >= Limit)
1903 if (NumParts >= Sz || Sz % NumParts != 0 ||
1909namespace slpvectorizer {
1914 class ScheduleEntity;
1916 class ScheduleCopyableData;
1917 class ScheduleBundle;
1942 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1943 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1975 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1996 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1997 return VectorizableTree.
front()->Scalars;
2003 const TreeEntry &Root = *VectorizableTree.
front();
2004 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2005 !Root.Scalars.front()->getType()->isIntegerTy())
2006 return std::nullopt;
2007 auto It = MinBWs.
find(&Root);
2008 if (It != MinBWs.
end())
2012 if (Root.getOpcode() == Instruction::ZExt ||
2013 Root.getOpcode() == Instruction::SExt)
2014 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2015 Root.getOpcode() == Instruction::SExt);
2016 return std::nullopt;
2022 return MinBWs.
at(VectorizableTree.
front().get()).second;
2027 if (ReductionBitWidth == 0 ||
2028 !VectorizableTree.
front()->Scalars.front()->getType()->isIntegerTy() ||
2029 ReductionBitWidth >=
2030 DL->getTypeSizeInBits(
2031 VectorizableTree.
front()->Scalars.front()->getType()))
2033 VectorizableTree.
front()->Scalars.front()->getType(),
2034 VectorizableTree.
front()->getVectorFactor());
2037 VectorizableTree.
front()->Scalars.front()->getContext(),
2039 VectorizableTree.
front()->getVectorFactor());
2054 VectorizableTree.
clear();
2055 ScalarToTreeEntries.clear();
2056 OperandsToTreeEntry.
clear();
2057 ScalarsInSplitNodes.clear();
2059 NonScheduledFirst.
clear();
2060 EntryToLastInstruction.clear();
2061 LoadEntriesToVectorize.
clear();
2062 IsGraphTransformMode =
false;
2063 GatheredLoadsEntriesFirst.reset();
2064 CompressEntryToData.clear();
2065 ExternalUses.
clear();
2066 ExternalUsesAsOriginalScalar.clear();
2067 ExternalUsesWithNonUsers.clear();
2068 for (
auto &Iter : BlocksSchedules) {
2069 BlockScheduling *BS = Iter.second.get();
2073 ReductionBitWidth = 0;
2075 CastMaxMinBWSizes.reset();
2076 ExtraBitWidthNodes.
clear();
2077 InstrElementSize.clear();
2078 UserIgnoreList =
nullptr;
2079 PostponedGathers.
clear();
2080 ValueToGatherNodes.
clear();
2096 assert(!Order.
empty() &&
"expected non-empty order");
2097 const unsigned Sz = Order.
size();
2099 return P.value() ==
P.index() ||
P.value() == Sz;
2112 bool IgnoreReorder);
2125 std::optional<OrdersType>
2163 return MaxVecRegSize;
2168 return MinVecRegSize;
2176 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
2178 return MaxVF ? MaxVF : UINT_MAX;
2230 unsigned *BestVF =
nullptr,
2231 bool TryRecursiveCheck =
true)
const;
2239 template <
typename T>
2266 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
2267 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
2292 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
2293 MaxLevel(MaxLevel) {}
2347 if (isa<LoadInst>(V1)) {
2349 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
2354 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
2356 return U == U1 || U == U2 || R.isVectorized(U);
2359 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2362 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2364 ((
int)V1->getNumUses() == NumLanes ||
2365 AllUsersAreInternal(V1, V2)))
2371 auto CheckSameEntryOrFail = [&]() {
2376 any_of(TEs2, [&](TreeEntry *E) {
return Set.contains(E); }))
2382 auto *LI1 = dyn_cast<LoadInst>(V1);
2383 auto *LI2 = dyn_cast<LoadInst>(V2);
2385 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2387 return CheckSameEntryOrFail();
2390 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2391 LI2->getPointerOperand(),
DL, SE,
true);
2392 if (!Dist || *Dist == 0) {
2395 R.TTI->isLegalMaskedGather(
2398 return CheckSameEntryOrFail();
2402 if (std::abs(*Dist) > NumLanes / 2)
2411 auto *C1 = dyn_cast<Constant>(V1);
2412 auto *C2 = dyn_cast<Constant>(V2);
2417 if ((C1 && isa<InsertElementInst>(V2)) ||
2418 (C2 && isa<InsertElementInst>(V1)))
2431 if (isa<UndefValue>(V2))
2435 Value *EV2 =
nullptr;
2448 int Dist = Idx2 - Idx1;
2451 if (std::abs(Dist) == 0)
2453 if (std::abs(Dist) > NumLanes / 2)
2460 return CheckSameEntryOrFail();
2463 auto *I1 = dyn_cast<Instruction>(V1);
2464 auto *I2 = dyn_cast<Instruction>(V2);
2466 if (I1->getParent() != I2->getParent())
2467 return CheckSameEntryOrFail();
2475 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
2476 !S.isAltShuffle()) &&
2478 return isa<PoisonValue>(V) ||
2479 cast<Instruction>(V)->getNumOperands() ==
2480 S.getMainOp()->getNumOperands();
2486 if (I1 && isa<PoisonValue>(V2))
2489 if (isa<UndefValue>(V2))
2492 return CheckSameEntryOrFail();
2526 int ShallowScoreAtThisLevel =
2535 auto *I1 = dyn_cast<Instruction>(
LHS);
2536 auto *I2 = dyn_cast<Instruction>(
RHS);
2537 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2539 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2540 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2541 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2542 ShallowScoreAtThisLevel))
2543 return ShallowScoreAtThisLevel;
2544 assert(I1 && I2 &&
"Should have early exited.");
2551 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2552 OpIdx1 != NumOperands1; ++OpIdx1) {
2554 int MaxTmpScore = 0;
2555 unsigned MaxOpIdx2 = 0;
2556 bool FoundBest =
false;
2560 ? I2->getNumOperands()
2561 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2562 assert(FromIdx <= ToIdx &&
"Bad index");
2563 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2565 if (Op2Used.
count(OpIdx2))
2570 I1, I2, CurrLevel + 1, {});
2573 TmpScore > MaxTmpScore) {
2574 MaxTmpScore = TmpScore;
2581 Op2Used.
insert(MaxOpIdx2);
2582 ShallowScoreAtThisLevel += MaxTmpScore;
2585 return ShallowScoreAtThisLevel;
2616 struct OperandData {
2617 OperandData() =
default;
2618 OperandData(
Value *V,
bool APO,
bool IsUsed)
2619 : V(V), APO(APO), IsUsed(IsUsed) {}
2629 bool IsUsed =
false;
2638 enum class ReorderingMode {
2652 unsigned ArgSize = 0;
2658 const Loop *L =
nullptr;
2661 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2662 return OpsVec[
OpIdx][Lane];
2666 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
2667 return OpsVec[
OpIdx][Lane];
2672 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2674 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2676 OpsVec[
OpIdx][Lane].IsUsed =
false;
2680 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2681 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2693 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx,
2695 Value *IdxLaneV = getData(
Idx, Lane).V;
2696 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(
OpIdx, Lane).V ||
2697 isa<ExtractElementInst>(IdxLaneV))
2700 for (
unsigned Ln : seq<unsigned>(getNumLanes())) {
2704 if (!isa<Instruction>(OpIdxLnV))
2708 unsigned UniquesCount = Uniques.
size();
2709 auto IdxIt = Uniques.
find(IdxLaneV);
2710 unsigned UniquesCntWithIdxLaneV =
2711 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2713 auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2714 unsigned UniquesCntWithOpIdxLaneV =
2715 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2716 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2718 return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2719 UniquesCntWithOpIdxLaneV,
2720 UniquesCntWithOpIdxLaneV -
2722 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2723 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2724 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2733 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
2734 Value *IdxLaneV = getData(
Idx, Lane).V;
2744 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2745 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2747 return R.areAllUsersVectorized(IdxLaneI)
2755 static const int ScoreScaleFactor = 10;
2763 int Lane,
unsigned OpIdx,
unsigned Idx,
2773 int SplatScore = getSplatScore(Lane,
OpIdx,
Idx, UsedLanes);
2774 if (Score <= -SplatScore) {
2778 Score += SplatScore;
2784 Score *= ScoreScaleFactor;
2785 Score += getExternalUseScore(Lane,
OpIdx,
Idx);
2803 std::optional<unsigned>
2804 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2808 unsigned NumOperands = getNumOperands();
2811 Value *OpLastLane = getData(
OpIdx, LastLane).V;
2814 ReorderingMode RMode = ReorderingModes[
OpIdx];
2815 if (RMode == ReorderingMode::Failed)
2816 return std::nullopt;
2819 bool OpIdxAPO = getData(
OpIdx, Lane).APO;
2825 std::optional<unsigned>
Idx;
2835 bool IsUsed = RMode == ReorderingMode::Splat ||
2836 RMode == ReorderingMode::Constant ||
2837 RMode == ReorderingMode::Load;
2839 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
2841 OperandData &OpData = getData(
Idx, Lane);
2843 bool OpAPO = OpData.APO;
2852 if (OpAPO != OpIdxAPO)
2857 case ReorderingMode::Load:
2858 case ReorderingMode::Opcode: {
2859 bool LeftToRight = Lane > LastLane;
2860 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2861 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
2862 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2864 if (Score >
static_cast<int>(BestOp.Score) ||
2865 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
2868 BestOp.Score = Score;
2869 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] = Score;
2873 case ReorderingMode::Constant:
2874 if (isa<Constant>(
Op) ||
2875 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
2877 if (isa<Constant>(
Op)) {
2879 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
2882 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
2886 case ReorderingMode::Splat:
2887 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
2888 IsUsed =
Op == OpLastLane;
2889 if (
Op == OpLastLane) {
2891 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
2897 case ReorderingMode::Failed:
2903 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2907 return std::nullopt;
2914 unsigned getBestLaneToStartReordering()
const {
2915 unsigned Min = UINT_MAX;
2916 unsigned SameOpNumber = 0;
2927 for (
int I = getNumLanes();
I > 0; --
I) {
2928 unsigned Lane =
I - 1;
2929 OperandsOrderData NumFreeOpsHash =
2930 getMaxNumOperandsThatCanBeReordered(Lane);
2933 if (NumFreeOpsHash.NumOfAPOs < Min) {
2934 Min = NumFreeOpsHash.NumOfAPOs;
2935 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2937 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2938 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2939 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2942 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2943 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2944 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2945 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2946 auto [It, Inserted] =
2947 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2953 unsigned BestLane = 0;
2954 unsigned CntMin = UINT_MAX;
2956 if (
Data.second.first < CntMin) {
2957 CntMin =
Data.second.first;
2958 BestLane =
Data.second.second;
2965 struct OperandsOrderData {
2968 unsigned NumOfAPOs = UINT_MAX;
2971 unsigned NumOpsWithSameOpcodeParent = 0;
2985 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
2986 unsigned CntTrue = 0;
2987 unsigned NumOperands = getNumOperands();
2997 bool AllUndefs =
true;
2998 unsigned NumOpsWithSameOpcodeParent = 0;
3003 const OperandData &OpData = getData(
OpIdx, Lane);
3008 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
3010 I->getParent() != Parent) {
3011 if (NumOpsWithSameOpcodeParent == 0) {
3012 NumOpsWithSameOpcodeParent = 1;
3014 Parent =
I->getParent();
3016 --NumOpsWithSameOpcodeParent;
3019 ++NumOpsWithSameOpcodeParent;
3024 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3028 OperandsOrderData
Data;
3029 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3030 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3037 const InstructionsState &S) {
3041 return VL.
size() == getNumLanes();
3043 "Expected same number of lanes");
3044 assert(S.valid() &&
"InstructionsState is invalid.");
3051 unsigned NumLanes = VL.
size();
3053 Ops.resize(NumLanes);
3054 for (
unsigned Lane : seq<unsigned>(NumLanes)) {
3065 auto *
I = dyn_cast<Instruction>(VL[Lane]);
3066 if (!
I && isa<PoisonValue>(VL[Lane])) {
3067 for (
unsigned OpIdx : seq<unsigned>(NumOperands))
3071 bool IsInverseOperation =
false;
3072 if (S.isCopyableElement(VL[Lane])) {
3076 assert(
I &&
"Expected instruction");
3077 auto [SelectedOp, Ops] = convertTo(
I, S);
3083 for (
unsigned OpIdx : seq<unsigned>(ArgSize)) {
3084 bool APO = (
OpIdx == 0) ?
false : IsInverseOperation;
3091 unsigned getNumOperands()
const {
return ArgSize; }
3094 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
3097 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
3098 return getData(
OpIdx, Lane).V;
3102 bool empty()
const {
return OpsVec.
empty(); }
3105 void clear() { OpsVec.
clear(); }
3110 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
3112 "Op is expected to be getValue(OpIdx, Lane).");
3114 if (isa<LoadInst>(
Op) && getNumLanes() == 2 && getNumOperands() == 2)
3116 bool OpAPO = getData(
OpIdx, Lane).APO;
3117 bool IsInvariant = L && L->isLoopInvariant(
Op);
3119 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3123 bool FoundCandidate =
false;
3124 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3125 OperandData &
Data = getData(OpI, Ln);
3126 if (
Data.APO != OpAPO ||
Data.IsUsed)
3128 Value *OpILane = getValue(OpI, Lane);
3129 bool IsConstantOp = isa<Constant>(OpILane);
3138 ((Lns > 2 && isa<Constant>(
Data.V)) ||
3143 isa<Constant>(
Data.V)))) ||
3150 (IsInvariant && !isa<Constant>(
Data.V) &&
3152 L->isLoopInvariant(
Data.V))) {
3153 FoundCandidate =
true;
3160 if (!FoundCandidate)
3163 return getNumLanes() == 2 || Cnt > 1;
3170 "Op is expected to be getValue(OpIdx, Lane).");
3171 bool OpAPO = getData(
OpIdx, Lane).APO;
3172 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3175 if (
any_of(seq<unsigned>(getNumOperands()), [&](
unsigned OpI) {
3176 const OperandData &
Data = getData(OpI, Ln);
3177 if (
Data.APO != OpAPO ||
Data.IsUsed)
3179 Value *OpILn = getValue(OpI, Ln);
3180 return (L && L->isLoopInvariant(OpILn)) ||
3192 const InstructionsState &S,
const BoUpSLP &R)
3193 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
3194 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3196 appendOperands(RootVL,
Operands, S);
3204 "Expected same num of lanes across all operands");
3205 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3206 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3214 unsigned NumOperands = getNumOperands();
3215 unsigned NumLanes = getNumLanes();
3235 unsigned FirstLane = getBestLaneToStartReordering();
3242 if (
auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3244 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3245 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3246 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3247 else if (isa<LoadInst>(OpILane0))
3248 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3250 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3251 }
else if (isa<Constant>(OpLane0)) {
3252 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3253 }
else if (isa<Argument>(OpLane0)) {
3255 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3265 auto &&SkipReordering = [
this]() {
3268 for (
const OperandData &
Data : Op0)
3272 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3279 return UniqueValues.
size() != 2 &&
3281 UniqueValues.
size());
3293 if (SkipReordering())
3296 bool StrategyFailed =
false;
3304 for (
unsigned I = 0;
I < NumOperands; ++
I)
3305 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3308 UsedLanes.
set(FirstLane);
3309 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3312 int Lane = FirstLane +
Direction * Distance;
3313 if (Lane < 0 || Lane >= (
int)NumLanes)
3315 UsedLanes.
set(Lane);
3317 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3322 std::optional<unsigned> BestIdx =
3323 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3324 MainAltOps[
OpIdx], UsedLanes);
3331 swap(
OpIdx, *BestIdx, Lane);
3334 StrategyFailed =
true;
3338 OperandData &AltOp = getData(
OpIdx, Lane);
3339 InstructionsState OpS =
3341 if (OpS && OpS.isAltShuffle())
3348 if (!StrategyFailed)
3353#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3356 case ReorderingMode::Load:
3358 case ReorderingMode::Opcode:
3360 case ReorderingMode::Constant:
3362 case ReorderingMode::Splat:
3364 case ReorderingMode::Failed:
3385 const unsigned Indent = 2;
3388 OS <<
"Operand " << Cnt++ <<
"\n";
3389 for (
const OperandData &OpData : OpDataVec) {
3391 if (
Value *V = OpData.V)
3395 OS <<
", APO:" << OpData.APO <<
"}\n";
3417 int BestScore = Limit;
3418 std::optional<int> Index;
3419 for (
int I : seq<int>(0, Candidates.size())) {
3421 Candidates[
I].second,
3424 if (Score > BestScore) {
3439 DeletedInstructions.insert(
I);
3444 template <
typename T>
3447 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3449 for (
T *V : DeadVals) {
3450 auto *
I = cast<Instruction>(V);
3454 for (
T *V : DeadVals) {
3455 if (!V || !Processed.
insert(V).second)
3457 auto *
I = cast<Instruction>(V);
3460 for (
Use &U :
I->operands()) {
3461 if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3462 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3464 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
3465 return Entry->VectorizedValue == OpI;
3469 I->dropAllReferences();
3471 for (
T *V : DeadVals) {
3472 auto *
I = cast<Instruction>(V);
3473 if (!
I->getParent())
3478 cast<Instruction>(U.getUser()));
3480 "trying to erase instruction with users.");
3481 I->removeFromParent();
3485 while (!DeadInsts.
empty()) {
3488 if (!VI || !VI->getParent())
3491 "Live instruction found in dead worklist!");
3492 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
3499 for (
Use &OpU : VI->operands()) {
3500 Value *OpV = OpU.get();
3511 if (
auto *OpI = dyn_cast<Instruction>(OpV))
3512 if (!DeletedInstructions.contains(OpI) &&
3513 (!OpI->getType()->isVectorTy() ||
3514 none_of(VectorValuesAndScales,
3515 [&](
const std::tuple<Value *, unsigned, bool> &V) {
3516 return std::get<0>(V) == OpI;
3522 VI->removeFromParent();
3531 return AnalyzedReductionsRoots.count(
I);
3536 AnalyzedReductionsRoots.insert(
I);
3550 AnalyzedReductionsRoots.clear();
3551 AnalyzedReductionVals.
clear();
3552 AnalyzedMinBWVals.
clear();
3564 return NonScheduledFirst.
contains(V);
3569 assert(V &&
"V cannot be nullptr.");
3570 return ScalarToTreeEntries.contains(V);
3580 bool collectValuesToDemote(
3581 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
3584 bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
3593 void buildReorderableOperands(
3601 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
3604 bool areAllUsersVectorized(
3613 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
3614 TreeEntry *getOperandEntry(TreeEntry *E,
unsigned Idx) {
3615 return const_cast<TreeEntry *
>(
3616 getOperandEntry(
const_cast<const TreeEntry *
>(E),
Idx));
3622 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3626 getCastContextHint(
const TreeEntry &TE)
const;
3640 const InstructionsState &LocalState,
3647 unsigned InterleaveFactor = 0);
3658 bool ResizeAllowed =
false)
const;
3665 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx);
3670 template <
typename BVTy,
typename ResTy,
typename... Args>
3671 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3676 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy);
3682 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3689 std::optional<TargetTransformInfo::ShuffleKind>
3701 unsigned NumParts)
const;
3713 std::optional<TargetTransformInfo::ShuffleKind>
3714 isGatherShuffledSingleRegisterEntry(
3731 isGatherShuffledEntry(
3734 unsigned NumParts,
bool ForOrder =
false);
3740 Type *ScalarTy)
const;
3744 void setInsertPointAfterBundle(
const TreeEntry *E);
3754 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3759 void tryToVectorizeGatheredLoads(
3761 std::tuple<BasicBlock *, Value *, Type *>,
3769 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3785 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3789 void reorderGatherNode(TreeEntry &TE);
3794 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3798 if (State == TreeEntry::SplitVectorize)
3808 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3809 "Expected only split vectorize node.");
3811 unsigned CommonVF = std::max<unsigned>(
3812 CombinedEntriesWithIndices.back().second,
3813 Scalars.size() - CombinedEntriesWithIndices.back().second);
3814 for (
auto [Idx,
I] :
enumerate(ReorderIndices))
3816 Idx + (
Idx >= CombinedEntriesWithIndices.back().second
3817 ? CommonVF - CombinedEntriesWithIndices.back().second
3834 [Scalars](
Value *V,
int Idx) {
3835 return (isa<UndefValue>(V) &&
3836 Idx == PoisonMaskElem) ||
3837 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3840 if (!ReorderIndices.empty()) {
3847 return IsSame(Scalars, Mask);
3848 if (VL.
size() == ReuseShuffleIndices.size()) {
3850 return IsSame(Scalars, Mask);
3854 return IsSame(Scalars, ReuseShuffleIndices);
3858 bool hasEqualOperands(
const TreeEntry &TE)
const {
3859 if (
TE.getNumOperands() != getNumOperands())
3862 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
3863 unsigned PrevCount =
Used.count();
3864 for (
unsigned K = 0;
K < E; ++
K) {
3867 if (getOperand(K) ==
TE.getOperand(
I)) {
3873 if (PrevCount ==
Used.count())
3882 unsigned getVectorFactor()
const {
3883 if (!ReuseShuffleIndices.empty())
3884 return ReuseShuffleIndices.size();
3885 return Scalars.
size();
3889 bool isGather()
const {
return State == NeedToGather; }
3916 enum CombinedOpcode {
3918 MinMax = Instruction::OtherOpsEnd + 1,
3921 CombinedOpcode CombinedOp = NotCombinedOp;
3935 VecTreeTy &Container;
3938 EdgeInfo UserTreeIndex;
3958 InstructionsState S = InstructionsState::invalid();
3961 unsigned InterleaveFactor = 0;
3964 bool DoesNotNeedToSchedule =
false;
3970 assert(Operands[
OpIdx].empty() &&
"Already resized?");
3972 "Number of operands is greater than the number of scalars.");
3979 unsigned getInterleaveFactor()
const {
return InterleaveFactor; }
3981 void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
3984 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule =
true; }
3991 for (
unsigned I : seq<unsigned>(
Operands.size()))
3992 setOperand(
I, Operands[
I]);
4014 unsigned getNumOperands()
const {
return Operands.size(); }
4017 Value *getSingleOperand(
unsigned OpIdx)
const {
4019 assert(!Operands[
OpIdx].empty() &&
"No operand available");
4024 bool isAltShuffle()
const {
return S.isAltShuffle(); }
4027 return S.getMatchingMainOpOrAltOp(
I);
4034 auto *
I = dyn_cast<Instruction>(
Op);
4035 if (
I && getMatchingMainOpOrAltOp(
I))
4037 return S.getMainOp();
4040 void setOperations(
const InstructionsState &S) {
4041 assert(S &&
"InstructionsState is invalid.");
4045 Instruction *getMainOp()
const {
return S.getMainOp(); }
4047 Instruction *getAltOp()
const {
return S.getAltOp(); }
4050 unsigned getOpcode()
const {
return S.
getOpcode(); }
4052 unsigned getAltOpcode()
const {
return S.getAltOpcode(); }
4054 bool hasState()
const {
return S.valid(); }
4057 void addCopyableElement(
Value *V) {
4058 assert(S.isCopyableElement(V) &&
"Not a copyable element.");
4059 CopyableElements.
insert(V);
4063 bool isCopyableElement(
Value *V)
const {
4064 return CopyableElements.
contains(V);
4068 bool hasCopyableElements()
const {
return !CopyableElements.
empty(); }
4071 const InstructionsState &getOperations()
const {
return S; }
4075 unsigned findLaneForValue(
Value *V)
const {
4076 unsigned FoundLane = getVectorFactor();
4077 for (
auto *It =
find(Scalars, V), *
End = Scalars.end(); It !=
End;
4078 std::advance(It, 1)) {
4081 FoundLane = std::distance(Scalars.begin(), It);
4082 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4083 if (!ReorderIndices.
empty())
4084 FoundLane = ReorderIndices[FoundLane];
4085 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4086 if (ReuseShuffleIndices.
empty())
4088 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
4089 RIt != ReuseShuffleIndices.
end()) {
4090 FoundLane = std::distance(ReuseShuffleIndices.
begin(), RIt);
4094 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
4107 bool isNonPowOf2Vec()
const {
4109 return IsNonPowerOf2;
4118 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
4119 "Reshuffling not supported with non-power-of-2 vectors yet.");
4120 return IsNonPowerOf2;
4123 Value *getOrdered(
unsigned Idx)
const {
4124 assert(
isGather() &&
"Must be used only for buildvectors/gathers.");
4125 if (ReorderIndices.
empty())
4126 return Scalars[
Idx];
4136 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
4137 dbgs() <<
"Operand " << OpI <<
":\n";
4138 for (
const Value *V : Operands[OpI])
4141 dbgs() <<
"Scalars: \n";
4142 for (
Value *V : Scalars)
4144 dbgs() <<
"State: ";
4145 if (S && hasCopyableElements())
4146 dbgs() <<
"[[Copyable]] ";
4149 if (InterleaveFactor > 0) {
4150 dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
4153 dbgs() <<
"Vectorize\n";
4156 case ScatterVectorize:
4157 dbgs() <<
"ScatterVectorize\n";
4159 case StridedVectorize:
4160 dbgs() <<
"StridedVectorize\n";
4162 case CompressVectorize:
4163 dbgs() <<
"CompressVectorize\n";
4166 dbgs() <<
"NeedToGather\n";
4168 case CombinedVectorize:
4169 dbgs() <<
"CombinedVectorize\n";
4171 case SplitVectorize:
4172 dbgs() <<
"SplitVectorize\n";
4176 dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
4177 dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
4179 dbgs() <<
"MainOp: NULL\n";
4180 dbgs() <<
"AltOp: NULL\n";
4182 dbgs() <<
"VectorizedValue: ";
4183 if (VectorizedValue)
4184 dbgs() << *VectorizedValue <<
"\n";
4187 dbgs() <<
"ReuseShuffleIndices: ";
4188 if (ReuseShuffleIndices.
empty())
4191 for (
int ReuseIdx : ReuseShuffleIndices)
4192 dbgs() << ReuseIdx <<
", ";
4194 dbgs() <<
"ReorderIndices: ";
4195 for (
unsigned ReorderIdx : ReorderIndices)
4196 dbgs() << ReorderIdx <<
", ";
4198 dbgs() <<
"UserTreeIndex: ";
4200 dbgs() << UserTreeIndex;
4202 dbgs() <<
"<invalid>";
4204 if (!CombinedEntriesWithIndices.
empty()) {
4205 dbgs() <<
"Combined entries: ";
4207 dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
4216 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
4219 dbgs() <<
"SLP: " << Banner <<
":\n";
4221 dbgs() <<
"SLP: Costs:\n";
4222 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
4223 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
4224 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
4225 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4226 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
4232 const InstructionsState &S,
4233 const EdgeInfo &UserTreeIdx,
4235 auto Invalid = ScheduleBundle::invalid();
4236 return newTreeEntry(VL,
Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4241 const InstructionsState &S,
4242 const EdgeInfo &UserTreeIdx,
4245 unsigned InterleaveFactor = 0) {
4246 TreeEntry::EntryState EntryState =
4247 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4248 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4249 ReuseShuffleIndices, ReorderIndices);
4250 if (E && InterleaveFactor > 0)
4251 E->setInterleave(InterleaveFactor);
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original
    // one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-2 vectors.
    assert((hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
                                     VL.size()) ||
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build the final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
              Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      if (isa<PHINode>(S.getMainOp()) ||
          isVectorLikeInstWithConstOps(S.getMainOp()) ||
          (!S.areInstructionsWithCopyableElements() &&
           doesNotNeedToSchedule(VL)) ||
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          continue;
        }
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(
              Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Gather node: remember the gathered scalars and check if all of them
      // are constants or integer casts.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
      return {};
    return It->getSecond();
  }

  ArrayRef<TreeEntry *> getSplitTreeEntries(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
      return {};
    return It->getSecond();
  }

  /// Returns the tree entry which contains \p V and matches the scalars in
  /// \p VL (optionally requiring the same vector factor).
  TreeEntry *getSameValuesTreeEntry(const Value *V, ArrayRef<Value *> VL,
                                    bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
        return TE;
    return nullptr;
  }
  /// Checks if vectorization with alternate operands is profitable for the
  /// given list of instructions.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool IsLegal;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

  public:
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    }
    const InstructionsState &getInstructionsState() const { return S; }
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  };

  ScalarsVectorizationLegality
  getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
                                  const EdgeInfo &UserTreeIdx,
                                  bool TryCopyableElementsVectorization) const;

  TreeEntry::EntryState getScalarsVectorizationState(
      const InstructionsState &S, ArrayRef<Value *> VL,
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps);

  /// Maps a (user tree entry, operand index) pair to the tree entry built for
  /// that operand.
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// True if the graph nodes are being transformed (no new nodes allowed).
  bool IsGraphTransformMode = false;

  /// The index of the first gathered-loads entry in the tree, if any.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// Cached mask/type data for the compressed-load entries.
  SmallDenseMap<const TreeEntry *, std::pair<SmallVector<int>, bool>>
      CompressEntryToData;
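  /// Bookkeeping for scalars that remain live outside of the vectorized
  /// tree: each ExternalUser records the scalar, its out-of-tree user and
  /// the lane it occupies, so an extractelement can be emitted later.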
  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar = nullptr;
    /// Which user that uses the scalar.
    llvm::User *User = nullptr;
    /// The tree entry containing the scalar.
    const TreeEntry &E;
    /// Which lane does the scalar belong to.
    int Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;

  /// Checks if two instructions may access the same memory, caching the
  /// result to avoid repeated alias-analysis queries.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  DenseMap<AliasCacheKey, bool> AliasCache;

  /// A list of values that need to be extracted out of the tree.
  UserList ExternalUses;
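  /// Scheduling is modeled with a small class hierarchy rooted at
  /// ScheduleEntity: ScheduleData describes a single instruction,
  /// ScheduleBundle a group of entities vectorized together, and
  /// ScheduleCopyableData a virtual (copyable) operand element. The base
  /// class dispatches the common queries to the concrete kind.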
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled.
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->isReady();
      return cast<ScheduleBundle>(this)->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->hasValidDependencies();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->hasValidDependencies();
      return cast<ScheduleBundle>(this)->hasValidDependencies();
    }
    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getUnscheduledDeps();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->getUnscheduledDeps();
      return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
    }
    /// Increments the number of unscheduled dependencies.
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
    }
    /// Gets the number of dependencies.
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(this)->getDependencies();
    }
    /// Gets the instruction.
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(this)->getInst();
    }

    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->dump(OS);
      return cast<ScheduleBundle>(this)->dump(OS);
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
    SE.dump(OS);
    return OS;
  }
#endif
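  /// Per-instruction scheduling state: dependency counters plus the lists of
  /// memory and control dependencies. Dependencies == InvalidDeps means the
  /// dependencies have not been calculated yet for the current region.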
  class ScheduleData final : public ScheduleEntity {
  public:
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears the direct (non-memory, non-control) dependency information.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    Instruction *getInst() const { return Inst; }

    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);
    }
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);
    }
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << *Inst; }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    Instruction *Inst = nullptr;

    /// Points to the next instruction in the window which reads or writes
    /// memory (can be null).
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on.
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// Used to identify the current scheduling region.
    int SchedulingRegionID = 0;

    /// The number of dependencies. InvalidDeps means the dependencies are not
    /// calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions.
    int UnscheduledDeps = InvalidDeps;
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
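  /// A bundle groups the schedule entities of the lanes of one tree entry;
  /// its readiness is the aggregate of the readiness of all members.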
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The tree entry that this bundle corresponds to.
    TreeEntry *TE = nullptr;
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties of all the bundle members.
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }
        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated for all
    /// bundle members.
    bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the bundle of scheduling data, associated with the current
    /// instruction.
    ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }

    /// Adds an instruction to the bundle.
    void add(ScheduleEntity *SD) { Bundle.push_back(SD); }

    /// Sets/gets the associated tree entry.
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    static ScheduleBundle invalid() { return {false}; }

    operator bool() const { return IsValid; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      if (!*this) {
        OS << "[]";
        return;
      }
      for (const ScheduleEntity *SD : Bundle) {
        if (isa<ScheduleCopyableData>(SD))
          OS << "[Copyable]";
        OS << *SD->getInst();
      }
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(OS);
    return OS;
  }
#endif
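  /// Scheduling state for a copyable element: an instruction that is
  /// duplicated (as an idempotent operation) into a node. It mirrors the
  /// dependency tracking of ScheduleData but is keyed by the (user edge,
  /// instruction) pair and always belongs to exactly one bundle.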
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The copyable instruction.
    Instruction *Inst = nullptr;
    /// The edge in the tree for this copyable element.
    EdgeInfo EI;
    /// Used to identify the current scheduling region.
    int SchedulingRegionID = 0;
    /// The bundle this copyable data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies and returns the
    /// remaining count.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    Instruction *getInst() const { return Inst; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    const EdgeInfo &getEdgeInfo() const { return EI; }

    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << *Inst; }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    /// The number of dependencies. InvalidDeps means the dependencies are not
    /// calculated yet.
    int Dependencies = ScheduleData::InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
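  /// Per-basic-block scheduling state. All ScheduleData is tagged with a
  /// region ID, so starting a new scheduling region only requires bumping
  /// SchedulingRegionID instead of clearing all the maps.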
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleDataMap.clear();
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (!I || BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(*SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      return getScheduleData(dyn_cast<Instruction>(V));
    }

    /// Returns the copyable data for the given edge and value, if any.
    ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
                                                  const Value *V) const {
      if (ScheduleCopyableDataMap.empty())
        return nullptr;
      auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
      if (It == ScheduleCopyableDataMap.end())
        return nullptr;
      ScheduleCopyableData *SD = It->getSecond().get();
      if (!isInSchedulingRegion(*SD))
        return nullptr;
      return SD;
    }

    /// Returns the copyable data for \p V used by \p User at operand
    /// \p OperandIdx, restricted to the current scheduling region.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Value *User, unsigned OperandIdx,
                            const Value *V) const {
      if (ScheduleCopyableDataMapByInstUser.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInstUser.find(
          std::make_pair(std::make_pair(User, OperandIdx), V));
      if (It == ScheduleCopyableDataMapByInstUser.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }

    /// Checks if all \p NumOps uses of \p Op by \p User are modeled as
    /// copyable data.
    bool areAllOperandsReplacedByCopyableData(Instruction *User, Value *Op,
                                              const BoUpSLP &R,
                                              unsigned NumOps) const {
      assert(NumOps > 0 && "No operands");
      if (ScheduleCopyableDataMap.empty())
        return false;
      ArrayRef<TreeEntry *> Entries = R.getTreeEntries(User);
      if (Entries.empty())
        return false;
      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
      SmallDenseMap<const TreeEntry *, unsigned>
          PotentiallyReorderedEntriesCount;
      for (TreeEntry *TE : Entries) {
        for (const Use &U : User->operands()) {
          if (U.get() != Op)
            continue;
          // For non-commutative (and non-compare) users the operand position
          // is fixed, so the copyable data can be looked up directly.
          bool IsCommutativeUser = isCommutative(User);
          EdgeInfo EI(TE, U.getOperandNo());
          if (!IsCommutativeUser && !isa<CmpInst>(User)) {
            unsigned &OpCnt =
                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
            if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
              return false;
            ++OpCnt;
          } else {
            ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
                  .first->getSecond();
          }
        }
      }
      // Commutative (or compare) users may have their operands reordered, so
      // check all operand positions of the user for matching copyable data.
      if (!PotentiallyReorderedEntriesCount.empty()) {
        for (auto &P : PotentiallyReorderedEntriesCount) {
          auto *It = find(P.first->Scalars, User);
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(P.first->Scalars.begin(), It);
          assert(Lane >= 0 && "Lane is not found");
          if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          for (unsigned OpIdx :
               seq<unsigned>(P.first->getNumOperands())) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
              --P.second;
          }
        }
        return all_of(PotentiallyReorderedEntriesCount,
                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
                        return P.second == NumOps - 1;
                      });
      }
      return true;
    }

    /// Returns all copyable data for the given instruction in the current
    /// scheduling region.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Instruction *I) const {
      if (ScheduleCopyableDataMapByInst.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInst.find(I);
      if (It == ScheduleCopyableDataMapByInst.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }

    /// Returns all copyable data, whose user is the given instruction, in the
    /// current scheduling region.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableDataForUser(const Instruction *User) const {
      if (ScheduleCopyableDataMapByUsers.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByUsers.find(User);
      if (It == ScheduleCopyableDataMapByUsers.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Registers new copyable data for the given edge and instruction and
    /// records it in all the lookup maps.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(std::make_pair(EI, I),
                           std::make_unique<ScheduleCopyableData>(
                               SchedulingRegionID, I, EI, Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(CD);
      if (EI.UserTE) {
        // Register the copyable data for every lane of the user entry that
        // references it.
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        SmallPtrSet<Instruction *, 4> Visited;
        auto *It = find(Op, I);
        assert(It != Op.end() && "Lane not set");
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (!Visited.insert(In).second) {
            It = std::find(std::next(It), Op.end(), I);
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
              .first->getSecond()
              .push_back(CD);
          ScheduleCopyableDataMapByUsers.try_emplace(I)
              .first->getSecond()
              .insert(CD);
          // If the user instruction is itself a copyable element of its
          // parent node, it does not count as a user here.
          EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
          if (ScheduleCopyableData *UserCD =
                  getScheduleCopyableData(UserEI, In))
            ScheduleCopyableDataMapByUsers[I].remove(UserCD);
          It = std::find(std::next(It), Op.end(), I);
        } while (It != Op.end());
      } else {
        ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
            CD);
      }
      return *CD;
    }

    ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        return {};
      auto It = ScheduledBundles.find(I);
      if (It == ScheduledBundles.end())
        return {};
      return It->getSecond();
    }
    bool isInSchedulingRegion(const ScheduleEntity &SD) const {
      if (const auto *Data = dyn_cast<ScheduleData>(&SD))
        return Data->getSchedulingRegionID() == SchedulingRegionID;
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
        return CD->getSchedulingRegionID() == SchedulingRegionID;
      return all_of(cast<ScheduleBundle>(SD).getBundle(),
                    [&](const ScheduleEntity *BundleMember) {
                      return isInSchedulingRegion(*BundleMember);
                    });
    }
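    /// After an entity is scheduled, the unscheduled-dependency counters of
    /// everything that depends on it (operand definitions, memory and control
    /// dependencies) are decremented; whatever reaches zero is moved to the
    /// ready list.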
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ScheduleBundle *Bundle) {
        // Decrement the unscheduled counter and insert to ready list if
        // ready.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            SmallVector<ScheduleBundle *> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              CopyableBundle.push_back(&CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP:    gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(!isa<ScheduleCopyableData>(Data) &&
                   "Expected non-copyable data");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP:    gets ready: " << *Data << "\n");
          }
        };

        // Decrements the deps of a single operand instruction, preferring the
        // copyable data, if any.
        auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
                                      Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OpIdx, I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        SmallVector<ScheduleBundle *> PassedBundle;
        if (Bundle)
          PassedBundle.push_back(Bundle);
        ArrayRef<ScheduleBundle *> Bundles =
            Bundle ? ArrayRef(PassedBundle)
                   : getScheduleBundles(BundleMember->getInst());
        // Handle the def-use chain dependencies.
        if (!Bundles.empty()) {
          auto *In = BundleMember->getInst();
          // Count how many times each operand instruction is used - a value
          // used several times must be decremented once per use.
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (isa<ScheduleCopyableData>(BundleMember)) {
            // Copyable data is modeled as a single (virtual) operand use.
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              if (auto *I = dyn_cast<Instruction>(U.get())) {
                auto Res = OperandsUses.try_emplace(I, 0);
                ++Res.first->getSecond();
                ++TotalOpCount;
              }
            }
          }
          auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
                                        unsigned OpIdx) {
            if (!ScheduleCopyableDataMap.empty()) {
              const EdgeInfo EI = {UserTE, OpIdx};
              if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
                DecrUnsched(CD, /*IsControl=*/false);
                return;
              }
            }
            auto It = OperandsUses.find(I);
            assert(It != OperandsUses.end() && "Operand not found");
            if (It->second > 0) {
              --It->getSecond();
              assert(TotalOpCount > 0 && "No more operands to decrement");
              --TotalOpCount;
              if (ScheduleData *OpSD = getScheduleData(I))
                DecrUnsched(OpSD, /*IsControl=*/false);
            }
          };
          for (ScheduleBundle *Bundle : Bundles) {
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            // Need to search for the lane since the tree entry can be
            // reordered.
            int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
                                     find(Bundle->getTreeEntry()->Scalars, In));
            assert(Lane >= 0 && "Lane not set");
            if (isa<StoreInst>(In) &&
                !Bundle->getTreeEntry()->ReorderIndices.empty())
              Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(
                              Bundle->getTreeEntry()->Scalars.size()) &&
                   "Couldn't find extract lane");
            assert((isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
                    In->getNumOperands() ==
                        Bundle->getTreeEntry()->getNumOperands() ||
                    Bundle->getTreeEntry()->isCopyableElement(In)) &&
                   "Missed TreeEntry operands?");
            for (unsigned OpIdx :
                 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
              if (auto *I = dyn_cast<Instruction>(
                      Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): " << *I
                                  << "\n");
                DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
              }
          }
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands()) {
            if (auto *I = dyn_cast<Instruction>(U.get())) {
              LLVM_DEBUG(dbgs()
                         << "SLP:   check for readiness (def): " << *I << "\n");
              DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
            }
          }
        }
        auto *SD = dyn_cast<ScheduleData>(BundleMember);
        if (!SD)
          return;
        // Handle the memory dependencies.
        SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
        for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
          if (!VisitedMemory.insert(MemoryDep).second)
            continue;
          LLVM_DEBUG(dbgs() << "SLP:   check for readiness (mem): "
                            << *MemoryDep << "\n");
          DecrUnsched(MemoryDep);
        }
        // Handle the control dependencies.
        SmallPtrSet<const ScheduleData *, 4> VisitedControl;
        for (ScheduleData *Dep : SD->getControlDependencies()) {
          if (!VisitedControl.insert(Dep).second)
            continue;
          LLVM_DEBUG(dbgs()
                     << "SLP:   check for readiness (ctrl): " << *Dep << "\n");
          DecrUnsched(Dep, /*IsControl=*/true);
        }
      };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
        ProcessBundleMember(SD, nullptr);
      } else {
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << Bundle << "\n");
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              if (isa<ScheduleCopyableData>(SD))
                return true;
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          ArrayRef<ScheduleBundle *> SDBundles;
          if (!isa<ScheduleCopyableData>(SD))
            SDBundles = getScheduleBundles(SD->getInst());
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
                                                                  : nullptr);
          }
        }
      }
    }
    /// Verify basic self consistency properties of the data structures.
    void verify() {
      if (!ScheduleStart)
        return;
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
            Bundle->verify();
          }
          continue;
        }
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(*SD) &&
               "primary schedule data not in window?");
        SD->verify();
      }

      assert(all_of(ReadyInsts,
                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
                    }) &&
             "item in ready list not ready?");
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
          if (!Bundles.empty()) {
            for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
                continue;
              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
                ReadyList.insert(Bundle);
                LLVM_DEBUG(dbgs() << "SLP:    initially in ready list: "
                                  << *Bundle << "\n");
              }
            }
            continue;
          }
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
                                const InstructionsState &S,
                                const EdgeInfo &EI);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleBundle *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S, const EdgeInfo &EI);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to an Instruction.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches the bundles to the instructions.
    DenseMap<Instruction *, SmallVector<ScheduleBundle *>> ScheduledBundles;

    /// The list of all created bundles.
    SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleEntity *> ReadyInsts;

    /// Attaches ScheduleCopyableData to an (edge, value) pair.
    DenseMap<std::pair<EdgeInfo, const Value *>,
             std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    /// All copyable data for a given instruction.
    DenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    /// All copyable data for a given (user, operand index) pair.
    DenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
             SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInstUser;

    /// All copyable data, keyed by the user instructions.
    DenseMap<const Instruction *, SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID = 1;
  };

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {
    static OrdersType getEmptyKey() {
      OrdersType V;
      V.push_back(~1U);
      return V;
    }

    static OrdersType getTombstoneKey() {
      OrdersType V;
      V.push_back(~2U);
      return V;
    }

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  // Analysis and block reference.
  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// The reduction bit width of the minimal bit width reduction root, if any.
  unsigned ReductionBitWidth = 0;

  /// The base graph size, before any transformations.
  unsigned BaseGraphSize = 1;

  /// Indices of the vectorized trunc/ext nodes with the minimal/maximal
  /// bitwidths.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
}; // class BoUpSLP

} // end namespace slpvectorizer

template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
  using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
  using SecondInfo = DenseMapInfo<unsigned>;
  static BoUpSLP::EdgeInfo getEmptyKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
                             SecondInfo::getEmptyKey());
  }
  static BoUpSLP::EdgeInfo getTombstoneKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
                             SecondInfo::getTombstoneKey());
  }
  static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
    return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
                                    SecondInfo::getHashValue(Val.EdgeIdx));
  }
  static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
                      const BoUpSLP::EdgeInfo &RHS) {
    return LHS.UserTE == RHS.UserTE && LHS.EdgeIdx == RHS.EdgeIdx;
  }
};

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize ||
        Entry->State == TreeEntry::CompressVectorize)
      return "color=blue";
    return "";
  }
};

} // end namespace llvm
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back to be able to erase them from
      // the parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}

/// Reorders the given \p Order according to the given \p Mask. \p Order is the
/// original order of the scalars. If the resulting order is just an identity,
/// \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // Check if the gather node can be represented as a shuffle of previously
  // vectorized nodes and derive the preferred order from the shuffle masks.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      ExtractShuffles =
          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized node.
    // The node has no user - no need to reorder.
    if (!TE.UserTreeIndex)
      return std::nullopt;
    // Exclude nodes matched with the same user as this gather node: the
    // order will be defined by the user node.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // If the full matched node is the root - keep identity order, the graph
    // itself will be reordered, if required.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // If the entry has reused scalars and the tiny mask mixes odd/even lanes,
    // keep the original order to avoid extra shuffles.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(enumerate(Mask), [](const auto &P) {
          return P.value() % 2 != static_cast<int>(P.index()) % 2;
        }))
      return std::nullopt;
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder =
      [&](MutableArrayRef<unsigned> CurrentOrder, ArrayRef<int> Mask,
          int PartSz, int NumParts, function_ref<unsigned(unsigned)> GetVF) {
        for (int I : seq<int>(0, NumParts)) {
          if (ShuffledSubMasks.test(I))
            continue;
          const int VF = GetVF(I);
          if (VF == 0)
            continue;
          unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
          MutableArrayRef<unsigned> Slice =
              CurrentOrder.slice(I * PartSz, Limit);
          // Shuffle of at least 2 vectors - ignore.
          if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
          // Try to include the whole registers in the order, checking if the
          // elements come from a single source vector.
          int FirstMin = INT_MAX;
          int SecondVecFound = false;
          for (int K : seq<int>(Limit)) {
            int Idx = Mask[I * PartSz + K];
            if (Idx == PoisonMaskElem) {
              Value *V = GatheredScalars[I * PartSz + K];
              if (isConstant(V) && !isa<PoisonValue>(V)) {
                SecondVecFound = true;
                break;
              }
              continue;
            }
            if (Idx < VF) {
              if (FirstMin > Idx)
                FirstMin = Idx;
            } else {
              SecondVecFound = true;
              break;
            }
          }
          FirstMin = (FirstMin / PartSz) * PartSz;
          // Shuffle of at least 2 vectors - ignore.
          if (SecondVecFound) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
          for (int K : seq<int>(Limit)) {
            int Idx = Mask[I * PartSz + K];
            if (Idx == PoisonMaskElem)
              continue;
            Idx -= FirstMin;
            if (Idx >= PartSz) {
              SecondVecFound = true;
              break;
            }
            if (CurrentOrder[I * PartSz + Idx] >
                    static_cast<unsigned>(I * PartSz + K) &&
                CurrentOrder[I * PartSz + Idx] !=
                    static_cast<unsigned>(I * PartSz + Idx))
              CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
          }
          // Shuffle of at least 2 vectors - ignore.
          if (SecondVecFound) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
        }
      };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  unsigned NumUndefs = count(CurrentOrder, NumScalars);
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
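/// Detects pointers that advance by a common non-constant stride, i.e.
/// Ptr_i == Base + i * Stride for some runtime SCEV Stride. The candidate
/// stride is peeled off the distance between the lowest and highest pointer
/// and then every pointer is checked to be a unique multiple of it.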
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest SCEV values).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiple of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV,
                           SE.getAddExpr(PtrSCEVLowest,
                                         SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Tries to detect an insert_subvector
/// pattern in two-source permutes and cost it as such.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr,
               ArrayRef<const Value *> Args = {}) {
  if (Kind != TTI::SK_PermuteTwoSrc)
    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(
          TTI::SK_InsertSubvector,
          getWidenedType(Tp->getElementType(), Mask.size()), Mask, CostKind,
          Index, Tp);
  }
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
}

/// Calculates the scalarization overhead, handling the REVEC case where the
/// "scalar" type is itself a fixed vector.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
                         bool Extract, TTI::TargetCostKind CostKind,
                         bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(Ty) == DemandedElts.getBitWidth() &&
         "Incorrect usage.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    unsigned ScalarTyNumElements = VecTy->getNumElements();
    InstructionCost Cost = 0;
    for (unsigned I : seq(DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      if (Insert)
        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
      if (Extract)
        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
    }
    return Cost;
  }
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind, ForPoisonSrc, VL);
}
/// Calculates the cost of a vector instruction, handling the REVEC case where
/// element extraction becomes a subvector extraction.
static InstructionCost getVectorInstrCost(
    const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
    TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  if (Opcode == Instruction::ExtractElement) {
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "Only supported by REVEC.");
      assert(isa<VectorType>(Val) && "Val must be a vector type.");
      return getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                            cast<VectorType>(Val), {}, CostKind,
                            Index * VecTy->getNumElements(), VecTy);
    }
  }
  return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
                                ScalarUserAndIdx);
}

/// Calculates the cost of an extract-with-extend, handling the REVEC case.
static InstructionCost
getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode,
                         Type *Dst, VectorType *VecTy,
                         TTI::TargetCostKind CostKind, unsigned Index) {
  if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
    assert(SLPReVec && "Only supported by REVEC.");
    auto *SubTp =
        getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
    return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
                          Index * ScalarTy->getNumElements(), SubTp) +
           TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
                                CostKind);
  }
  return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
}

/// Correctly creates insert_subvector, checking that the index is a multiple
/// of the subvector length. Otherwise, generates a shuffle using \p Generator
/// or an adjusted mask.
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
    return Vec;
  const unsigned SubVecVF = getNumElements(V->getType());
  const unsigned VecVF = getNumElements(Vec->getType());
  SmallVector<int> Mask(VecVF, PoisonMaskElem);
  if (isa<PoisonValue>(Vec)) {
    auto *Begin = std::next(Mask.begin(), Index);
    std::iota(Begin, std::next(Begin, SubVecVF), 0);
    return Builder.CreateShuffleVector(V, Mask);
  }
  std::iota(Mask.begin(), Mask.end(), 0);
  std::iota(std::next(Mask.begin(), Index),
            std::next(Mask.begin(), Index + SubVecVF), VecVF);
  if (Generator)
    return Generator(Vec, V, Mask);
  // 1. Resize V to the size of Vec.
  SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
  std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
  V = Builder.CreateShuffleVector(V, ResizeMask);
  // 2. Insert V into Vec.
  return Builder.CreateShuffleVector(Vec, V, Mask);
}

/// Correctly creates extract_subvector using an adjusted shuffle mask.
static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}
/// Builds the compress-like mask for the possible compressed load, checking if
/// the loads can be compressed. Returns true if the detected positions form a
/// constant stride.
static bool buildCompressMask(ArrayRef<Value *> PointerOps,
                              ArrayRef<unsigned> Order, Type *ScalarTy,
                              const DataLayout &DL, ScalarEvolution &SE,
                              SmallVectorImpl<int> &CompressMask) {
  const unsigned Sz = PointerOps.size();
  CompressMask.assign(Sz, PoisonMaskElem);
  // The first element is always set.
  CompressMask[0] = 0;
  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
  std::optional<unsigned> Stride = 0;
  for (unsigned I : seq<unsigned>(1, Sz)) {
    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    std::optional<int64_t> OptPos =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
    if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
      return false;
    unsigned Pos = static_cast<unsigned>(*OptPos);
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (I == 1) {
      Stride = Pos;
      continue;
    }
    if (Pos != *Stride * I)
      Stride.reset();
  }
  return Stride.has_value();
}

/// Checks if the \p VL of loads can be represented as a masked compress-load
/// (a wide (possibly masked) load plus a compressing shuffle) and if that is
/// more profitable than gathering the scalars.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const size_t Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Order, Mask);
  // Check external uses and the cost of keeping the scalars alive.
  InstructionCost ScalarLoadsCost = 0;
  InstructionCost ExtractCost = 0;
  for (const auto [I, V] : enumerate(VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    ScalarLoadsCost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);
    ExtractCost +=
        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                               Mask.empty() ? I : Mask[I]);
  }
  if (ExtractCost <= ScalarLoadsCost)
    return false;
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
  if (!Diff)
    return false;
  const size_t MaxRegSize =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between pointers.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
  Align CommonAlignment = LI->getAlign();
  IsMasked = !isSafeToLoadUnconditionally(
      Ptr0, LoadVecTy, CommonAlignment, DL,
      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
      &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
                                         LI->getPointerAddressSpace()))
    return false;
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, PointerOps, PointerOps.front(),
                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
  // The baseline to beat: the cost of gathering all scalars.
  InstructionCost GatherCost =
      getScalarizationOverhead(TTI, ScalarTy, VecTy, APInt::getAllOnes(Sz),
                               /*Insert=*/true, /*Extract=*/false, CostKind) +
      ScalarGEPCost + ScalarLoadsCost;
  InstructionCost LoadCost = 0;
  if (IsMasked)
    LoadCost = TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy,
                                         CommonAlignment,
                                         LI->getPointerAddressSpace(),
                                         CostKind);
  else
    LoadCost = TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
                                   CommonAlignment,
                                   LI->getPointerAddressSpace(), CostKind);
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for potential segmented (interleaved) loads.
    VectorType *AlignedLoadVecTy = getWidenedType(
        ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
    if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
                                     DL, cast<LoadInst>(VL.back()), &AC, &DT,
                                     &TLI))
      AlignedLoadVecTy = LoadVecTy;
    if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                         CommonAlignment,
                                         LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
          Instruction::Load, AlignedLoadVecTy, CompressMask[1], {},
          CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        return true;
      }
    }
  }
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Sz)) {
      NewMask[I] = CompressMask[Mask[I]];
    }
    CompressMask.swap(NewMask);
  }
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}

/// Convenience wrapper, discarding the mask/type results.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized) {
  bool IsMasked;
  unsigned InterleaveFactor;
  SmallVector<int> CompressMask;
  VectorType *LoadVecTy;
  return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
                              AreAllUsersVectorized, IsMasked, InterleaveFactor,
                              CompressMask, LoadVecTy);
}
/// Checks if strided loads can be generated out of the \p VL loads with the
/// pointers \p PointerOps.
static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                          ArrayRef<unsigned> Order,
                          const TargetTransformInfo &TTI, const DataLayout &DL,
                          ScalarEvolution &SE,
                          const bool IsAnyPointerUsedOutGraph,
                          const int64_t Diff) {
  const size_t Sz = VL.size();
  const uint64_t AbsoluteDiff = std::abs(Diff);
  Type *ScalarTy = VL.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  if (IsAnyPointerUsedOutGraph ||
      (AbsoluteDiff > Sz &&
       (Sz > MinProfitableStridedLoads ||
        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
      Diff == -(static_cast<int64_t>(Sz) - 1)) {
    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
      return false;
    Align Alignment =
        cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
            ->getAlign();
    if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
      return false;
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    // Iterate through all pointers and check if all distances are unique
    // multiples of the stride.
    SmallSet<int64_t, 4> Dists;
    for (Value *Ptr : PointerOps) {
      int64_t Dist = 0;
      if (Ptr == PtrN)
        Dist = Diff;
      else if (Ptr != Ptr0)
        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
      // If the strides are not the same or repeated, we can't vectorize.
      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
        break;
    }
    if (Dists.size() == Sz)
      return true;
  }
  return false;
}
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  if (BestVF)
    *BestVF = 0;
  Type *ScalarTy = VL0->getType();

  // Check that a vectorized load would load the same memory as a scalar load.
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const size_t Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  if (IsSorted) {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int64_t> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<uint64_t>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
                             *TLI, [&](Value *V) {
                               return areAllUsersVectorized(
                                   cast<Instruction>(V), UserIgnoreList);
                             }))
      return LoadsState::CompressVectorize;
    // Simple check if not a strided access - clear order.
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate a strided load node if some of the pointers escape the
    // graph.
    auto IsAnyPointerUsedOutGraph =
        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                   return !isVectorized(U) && !MustGather.contains(U);
                 });
        });
    if (IsPossibleStrided &&
        isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
                      IsAnyPointerUsedOutGraph, *Diff))
      return LoadsState::StridedVectorize;
  }
  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
    return LoadsState::Gather;
  // Correctly identify and compare the cost of loads + shuffles rather than
  // strided/masked gather loads. Returns true if vectorized + shuffles
  // representation is better than just gather.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    // Compare masked gather cost and loads + insert subvector costs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of the masked gather GEPs. If not a splat, roughly
    // estimate as a buildvector.
    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
    VectorType *PtrVecTy = getWidenedType(PtrScalarTy, VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        any_of(PointerOps, [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(PointerOps.front());
        }))
      VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
                                                DemandedElts, /*Insert=*/true,
                                                /*Extract=*/false, CostKind);
    // The cost of gathering all the scalars - the baseline to beat.
    InstructionCost GatherCost =
        getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
                                 /*Insert=*/true, /*Extract=*/false, CostKind) +
        ScalarGEPCost;
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   cast<LoadInst>(VL0)->getPointerOperand(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    // The list of loads is small - directly compare masked gather cost and
    // gather cost.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    // FIXME: The following code has not been updated for non-power-of-2
    // vectors.
    if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
      return false;

    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized +
    // shuffles is better than just gather.
    for (unsigned VF =
             getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
         VF >= MinVF;
         VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
      SmallVector<LoadsState> States;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS =
            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
                              /*TryRecursiveCheck=*/false);
        if (LS == LoadsState::Gather) {
          DemandedElts.setBits(Cnt, Cnt + VF);
          continue;
        }
        // If a reorder is needed - consider it as a high-cost masked gather
        // for now.
        if ((LS == LoadsState::Vectorize ||
             LS == LoadsState::StridedVectorize ||
             LS == LoadsState::CompressVectorize) &&
            !Order.empty() && !isReverseOrder(Order))
          LS = LoadsState::ScatterVectorize;
        States.push_back(LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try a smaller VF.
        continue;
      // Gathered parts can be vectorized later as loads + insertelements.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
                                             /*Insert=*/true,
                                             /*Extract=*/false, CostKind) +
                    ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            VecLdCost +=
                TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        InstructionCost VectorGEPCost =
            getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                        LI0->getPointerOperand(), Instruction::GetElementPtr,
                        CostKind, ScalarTy, SubVecTy)
                .second;
        if (LS == LoadsState::ScatterVectorize &&
            (static_cast<unsigned>(
                 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
                 PointerOps.size() - 1 ||
             any_of(PointerOps, [&](Value *V) {
               return getUnderlyingObject(V) !=
                      getUnderlyingObject(PointerOps.front());
             })))
          VectorGEPCost += getScalarizationOverhead(
              TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
              /*Insert=*/true, /*Extract=*/false, CostKind);
        switch (LS) {
        case LoadsState::Vectorize:
          VecLdCost += TTI.getMemoryOpCost(Instruction::Load, SubVecTy,
                                           LI0->getAlign(),
                                           LI0->getPointerAddressSpace(),
                                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::StridedVectorize:
          VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
                                                  LI0->getPointerOperand(),
                                                  /*VariableMask=*/false,
                                                  CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::CompressVectorize:
          VecLdCost += TTI.getMaskedMemoryOpCost(
                           Instruction::Load, SubVecTy, CommonAlignment,
                           LI0->getPointerAddressSpace(), CostKind) +
                       VectorGEPCost +
                       ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                        SubVecTy, {}, CostKind);
          break;
        case LoadsState::ScatterVectorize:
          VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                  LI0->getPointerOperand(),
                                                  /*VariableMask=*/false,
                                                  CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::Gather:
          // Gathers are already calculated - ignore.
          continue;
        }
        SmallVector<int> ShuffleMask(VL.size());
        for (int Idx : seq<int>(0, VL.size()))
          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
        if (I > 0)
          VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
                                        ShuffleMask, CostKind, I * VF,
                                        SubVecTy);
      }
      // If the masked gather cost is higher - better to vectorize, so consider
      // this a winning combination.
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve analysis of the pointers, if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return !GEP ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if a potential masked gather can be represented as a series of
    // loads + insertsubvectors.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
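/// Groups pointers by (basic block, underlying object) base, sorts each
/// group by constant offset, and emits indices that place pointers with
/// adjacent addresses next to each other, enabling consecutive-load
/// detection across partially ordered accesses.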
7164 "Expected list of pointer operands.");
7169 std::pair<BasicBlock *, Value *>,
7175 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
7177 SortedIndices.
clear();
7179 auto Key = std::make_pair(BBs[Cnt + 1],
7183 std::optional<int64_t> Diff =
7184 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7185 ElemTy, Ptr, DL, SE,
7190 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7196 if (Bases.
size() > VL.
size() / 2 - 1)
7200 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7207 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
7208 Bases.
front().second.size() == VL.
size()))
7213 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7222 FirstPointers.
insert(P1);
7223 SecondPointers.
insert(P2);
7229 "Unable to find matching root.");
7232 for (
auto &
Base : Bases) {
7233 for (
auto &Vec :
Base.second) {
7234 if (Vec.size() > 1) {
7236 int64_t InitialOffset = std::get<1>(Vec[0]);
7237 bool AnyConsecutive =
7239 return std::get<1>(
P.value()) ==
7240 int64_t(
P.index()) + InitialOffset;
7244 if (!AnyConsecutive)
7249 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7253 for (
auto &
T : Bases)
7254 for (
const auto &Vec :
T.second)
7255 for (
const auto &
P : Vec)
7259 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
/// Check if two insertelement instructions are from the same buildvector.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
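/// Computes the preferred scalar order for a tree entry, consulting (in
/// order) its reuse mask, strided/split state, explicit reorder indices,
/// PHI user placement, extractelement sources and load address ordering.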
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if reuse shuffle indices can be improved by reordering.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(*TTI,
                           getWidenedType(TE.Scalars.front()->getType(),
                                          2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they
    // are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    TE.buildAltOpShuffleMask(
        [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(),
                                        *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (unsigned I : seq<unsigned>(VF)) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      ResOrder[Mask[I] % VF] = I;
    }
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      if (V1->use_empty())
        return true;
      if (V2->use_empty())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(TE.getMainOp())) &&
      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions from the same source vectors.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize ||
          Res == LoadsState::StridedVectorize ||
          Res == LoadsState::CompressVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
        return CurrentOrder;
  }
  return std::nullopt;
}

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses - no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
7748 "Expected same size of orders");
7749 size_t Sz = Order.
size();
7751 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
7752 if (Order[
Idx] != Sz)
7753 UsedIndices.
set(Order[
Idx]);
7755 if (SecondaryOrder.
empty()) {
7756 for (
unsigned Idx : seq<unsigned>(0, Sz))
7757 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
7760 for (
unsigned Idx : seq<unsigned>(0, Sz))
7761 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
7762 !UsedIndices.
test(SecondaryOrder[
Idx]))
7763 Order[
Idx] = SecondaryOrder[
Idx];
bool BoUpSLP::isProfitableToReorder() const {
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only single store and single (unordered) load
    // node, other nodes are phis or geps/binops, combined with phis, and/or
    // a single gather load node.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      const unsigned ReorderedSplitsCnt =
          count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->State == TreeEntry::SplitVectorize &&
                   !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                   TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
                   isa<StoreInst>(TE->UserTreeIndex.UserTE->getMainOp());
          });
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              VectorizableTree,
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() ||
                        none_of(TE->Scalars, [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
            all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
          continue;
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
          continue;
        return true;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = true;
          continue;
        }
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return true;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
               TE->Scalars.size() / 2))
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    return !HasPhis || HasLoad;
  }
  return true;
}
void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                                          ArrayRef<int> MaskOrder) {
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  SmallVector<int> NewMask(getVectorFactor());
  SmallVector<int> NewMaskOrder(getVectorFactor());
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
  if (Idx == 0) {
    copy(Mask, NewMask.begin());
    copy(MaskOrder, NewMaskOrder.begin());
  } else {
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
    for (unsigned I : seq<unsigned>(Mask.size())) {
      NewMask[I + Offset] = Mask[I] + Offset;
      NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
    }
  }
  reorderScalars(Scalars, NewMask);
  reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
  if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
}
      ExternalUserReorderMap;
  // ...
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // ...
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }
    // ...
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        // ...
      }
    }
    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            // ...
      const TreeEntry *UserTE = TE.get();
      // ...
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
      // ...
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });
  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // ...
    for (const TreeEntry *OpTE : OrderedEntries) {
      // ...
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // ...
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
        }
        // ...
      }
      // ...
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // ...
        unsigned E = Order.size();
        // ...
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        // ...
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Order, 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    // ...
    for (auto &Pair : OrdersUses) {
      // ...
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
      // ...
    }
    // ...
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // ...
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           // ...
        BestOrder = Pair.first;
        // ...
      }
    }
    // ...
    unsigned E = BestOrder.size();
    // ...
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    // ...
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // ...
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // ...
          assert((!TE->UserTreeIndex ||
                  TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
                  TE->UserTreeIndex.UserTE->Scalars.size() ==
                      TE->Scalars.size() ||
                  TE->UserTreeIndex.UserTE->State ==
                      TreeEntry::SplitVectorize) &&
                 "All users must be of VF size.");
          // ...
          if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
              isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
            continue;
          // ...
          reorderNodeWithReuses(*TE, Mask);
          // ...
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           // ...
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
        assert((!TE->isAltShuffle() ||
                (TE->State == TreeEntry::SplitVectorize &&
                 TE->ReuseShuffleIndices.empty())) &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // ...
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        // ...
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // ...
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State ==
                     TreeEntry::SplitVectorize) {
        TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
      }
    }
  }
}
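
/// Collects the operand entries of \p UserTE that are candidates for
/// reordering: already-vectorized operands are appended to \p Edges, while
/// reorderable gather operands are tracked separately. Operands whose order
/// is fixed by the user (extracts, insertelement vector operand, store
/// pointer operand, load operands) are skipped.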
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    // ...
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
        continue;
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
        continue;
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
        continue;
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
        continue;
    }
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      // ...
      Edges.emplace_back(I, TE);
      // ...
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        // ...
    }
    // ...
    if (ReorderableGathers.contains(TE))
      // ...
  }
}
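
/// Comparator for the bottom-to-top reorder work list: entries are ordered
/// by the index of their user node in the graph, so that all operands of
/// the same user are processed together.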
struct TreeEntryCompare {
  bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
    if (LHS->UserTreeIndex && RHS->UserTreeIndex)
      return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
    return LHS->Idx < RHS->Idx;
  }
};
// ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            // ...
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }
  // ...
  while (!Queue.empty()) {
    // ...
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    // ...
    while (!Queue.empty()) {
      // ...
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      // ...
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      // ...
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    // ...
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(Data.second.size() <= 2 &&
             "Expected not greater than 2 operands for split vectorize node.");
      if (any_of(Data.second,
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
        continue;
      // ...
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        // ...
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
            continue;
          // ...
        }
        const auto BestOrder =
            // ...
        const unsigned E = Order.size();
        // ...
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        // ...
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        // ...
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          // ...
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
          // ...
        }
      }
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        // ...
        Queue.push(Data.first);
      }
      continue;
    }
    // ...
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
                             // ...
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
        continue;
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
          // ...
        return OpTE->ReorderIndices;
      }();
      // ...
      if (Order.size() == 1)
        continue;
      // ...
      Value *Root = OpTE->hasState()
                        // ...
      auto GetSameNodesUsers = [&](Value *Root) {
        // ...
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        // ...
      };
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
          return CI->arg_size();
        return TE->getNumOperands();
      };
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
        // ...
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
          // ...
        for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
          // ...
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
            // ...
          }
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
            // ...
        }
        // ...
      };
      // ...
        if (!RevisitedOps.insert(UTE).second)
          // ...
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      // ...
      for (TreeEntry *UTE : Users) {
        // ...
        if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
          // ...
        for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
          // ...
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          // ...
          Queue.push(const_cast<TreeEntry *>(Op));
        }
      }
      unsigned NumOps = count_if(
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
          });
      // ...
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // ...
        unsigned E = Order.size();
        // ...
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        // ...
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
      } else {
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      }
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
          return true;
        if (TE->isGather()) {
          // ...
        }
        // ...
      };
      // ...
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
          continue;
        // ...
        if (AllowsReordering(UserTE))
          continue;
        // ...
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
      }
    }
    if (OrdersUses.empty()) {
      // ...
    }
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    // ...
    for (auto &Pair : OrdersUses) {
      // ...
      IdentityCnt += Pair.second;
      // ...
    }
    // ...
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // ...
      if (Cnt < Pair.second) {
        // ...
        BestOrder = Pair.first;
        // ...
      }
    }
    // ...
    unsigned E = BestOrder.size();
    // ...
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    // ...
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
        continue;
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
        continue;
      }
      // ...
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
        continue;
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      // ...
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
      // ...
    }
    // ...
    for (TreeEntry *Gather : GatherOps) {
      assert(Gather->ReorderIndices.empty() &&
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
        // ...
      }
      // ...
    }
    // ...
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    };
    if (Data.first->State != TreeEntry::Vectorize ||
        !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
    if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      // ...
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        // ...
        Queue.push(Data.first);
      }
    }
  }
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
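
/// Returns the instruction that anchors the entry in the IR: for reversed
/// strided loads/stores this is the scalar pointed to by the first reorder
/// index, otherwise the first scalar of the entry.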
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
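
/// Walks the vectorized entries and records every in-tree scalar that is
/// used outside of the graph (or by too many users) as an external use, so
/// that an extractelement can be emitted for it during codegen.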
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // ...
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // ...
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    // ...
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
        continue;
      // ...
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;
      // ...
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << " for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        continue;
      }
      // ...
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
      }
      for (User *U : Scalar->users()) {
        // ...
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
        // ...
        if (// ...
            !UseEntries.empty()) {
          // ...
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(UserInst)) ||
                isa<CallInst>(UserInst)) ||
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       // ...
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
                           // ...
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              // ...
                [](TreeEntry *UseEntry) { return UseEntry->isGather(); }
            // ...
          }
        }
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          break;
        }
        // ...
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          // ...
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U
                          << " from lane " << FoundLane << " from " << *Scalar
                          // ...
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      }
    }
  }
}
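
/// Collects the stores that use the scalars of \p TE, grouped by basic
/// block, stored type and underlying pointer, as candidates for forming
/// vectorizable store groups later.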
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // ...
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // ...
    if (!isa<Instruction>(V))
      continue;
    // ...
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // ...
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          // ...
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // ...
      if (StoresVec.size() > Lane)
        continue;
      // ...
      if (!StoresVec.empty()) {
        // ...
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            // ...
      }
      StoresVec.push_back(SI);
    }
  }
  // ...
  for (auto &P : PtrToStoresMap) {
    // ...
  }
}
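
/// Checks whether the collected stores cover a consecutive memory region;
/// on success fills \p ReorderIndices with the lane order needed to make
/// them consecutive (cleared if the order is already the identity).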
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // ...
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    // ...
    std::optional<int64_t> Diff =
        // ...
            SI->getPointerOperand(), *DL, *SE,
            // ...
  }
  // ...
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  // ...
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    // ...
  }
  // ...
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  // ...
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  // ...
  if (IsIdentity)
    ReorderIndices.clear();
  // ...
}

// ...
  for (unsigned Idx : Order)
    // ...
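
/// Finds the reorder indices implied by the external store users of the
/// scalars of \p TE: one order is produced per group of stores that can
/// form a vector.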
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  // ...
    if (StoresVec.size() != NumLanes)
      continue;
    // ...
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // ...
    ExternalReorderIndices.push_back(ReorderIndices);
  // ...
  return ExternalReorderIndices;
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  // ...
  UserIgnoreList = &UserIgnoreLst;
  // ...
  buildTreeRec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  // ...
  buildTreeRec(Roots, 0, EdgeInfo());
}
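
/// Clusters scattered loads by parent block, type and constant distance
/// from a common base pointer, merging clusters that share loads, to
/// prepare them for a later gathered-loads vectorization attempt.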
static void gatherPossiblyVectorizableLoads(
    // ...
    bool AddNew = true) {
  // ...
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    // ...
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             // ...
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      // ...
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          // ...
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      // ...
    }
    // ...
  }
  auto FindMatchingLoads =
      [&](// ...
          int64_t &Offset, unsigned &Start) {
        // ...
          return GatheredLoads.end();
        // ...
        std::optional<int64_t> Dist =
            // ...
                Data.front().first->getType(),
                Data.front().first->getPointerOperand(), DL, SE,
                // ...
        for (std::pair<LoadInst *, int64_t> P : Data) {
          // ...
        }
        unsigned NumUniques = 0;
        for (auto [Cnt, Pair] : enumerate(Loads)) {
          bool Used = DataLoads.contains(Pair.first);
          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
            // ...
          }
          // ...
          Repeated.insert(Cnt);
        }
        if (NumUniques > 0 &&
            (Loads.size() == NumUniques ||
             (Loads.size() - NumUniques >= 2 &&
              Loads.size() - NumUniques >= Loads.size() / 2 &&
              // ...
          return std::next(GatheredLoads.begin(), Idx);
        // ...
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    // ...
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 // ...
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        // ...
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset,
                             // ...
    }
    // ...
      return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
    // ...
    for (unsigned Idx : seq<unsigned>(Data.size())) {
      // ...
    }
    // ...
        GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
          return PD.front().first->getParent() == LI->getParent() &&
                 PD.front().first->getType() == LI->getType();
        });
    while (It != GatheredLoads.end()) {
      // ...
      It = std::find_if(
          std::next(It), GatheredLoads.end(),
          [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
          });
      // ...
    }
    // ...
    GatheredLoads.emplace_back().append(Data.begin(), Data.end());
    // ...
    AddNewLoads(GatheredLoads.emplace_back());
  }
}
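
/// Tries to build vectorized tree nodes out of the previously gathered load
/// clusters: consecutive slices become vector, strided or compressed loads,
/// and the remainder may be emitted as masked gathers when the target
/// supports them.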
void BoUpSLP::tryToVectorizeGatheredLoads(
    // ...
        std::tuple<BasicBlock *, Value *, Type *>,
        // ...
  GatheredLoadsEntriesFirst = VectorizableTree.size();
  // ...
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
  };
  // ...
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    // ...
  auto GetVectorizedRanges =
      [this](// ...
             bool Final, unsigned MaxVF) {
        // ...
        unsigned StartIdx = 0;
        // ...
            *TTI, Loads.front()->getType(), MaxVF);
        // ...
                *TTI, Loads.front()->getType(), NumElts - 1)) {
          // ...
        }
        if (Final && CandidateVFs.empty())
          return Results;
        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          // ...
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
               // ...
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                // ...
              continue;
            // ...
            bool AllowToVectorize = false;
            // ...
              if (LI->hasOneUse())
                // ...
              if (static_cast<unsigned int>(std::distance(
                      LI->user_begin(), LI->user_end())) != LI->getNumUses())
                // ...
              if (!IsLegalBroadcastLoad)
                // ...
              for (User *U : LI->users()) {
                if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
                  continue;
                for (const TreeEntry *UTE : getTreeEntries(U)) {
                  for (int I : seq<int>(UTE->getNumOperands())) {
                    // ...
                      return V == LI || isa<PoisonValue>(V);
                    // ...
                  }
                }
              }
            // ...
            AllowToVectorize = CheckIfAllowed(Slice);
            // ...
                any_of(ValueToGatherNodes.at(Slice.front()),
                       [=](const TreeEntry *TE) {
                         return TE->Scalars.size() == 2 &&
                                ((TE->Scalars.front() == Slice.front() &&
                                  TE->Scalars.back() == Slice.back()) ||
                                 (TE->Scalars.front() == Slice.back() &&
                                  TE->Scalars.back() == Slice.front()));
                       });
            // ...
            if (AllowToVectorize) {
              // ...
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              // ...
                  PointerOps, &BestVF);
              if (// ...
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                // ...
                if (MaskedGatherVectorized.empty() ||
                    Cnt >= MaskedGatherVectorized.back() + NumElts)
                  // ...
              }
              // ...
              Results.emplace_back(Values, LS);
              VectorizedLoads.insert_range(Slice);
              // ...
              if (Cnt == StartIdx)
                StartIdx += NumElts;
            }
            // ...
            if (StartIdx >= Loads.size())
              break;
            // ...
            if (!MaskedGatherVectorized.empty() &&
                Cnt < MaskedGatherVectorized.back() + NumElts)
              continue;
            // ...
          }
          if (!AllowToVectorize || BestVF == 0)
            // ...
        }
        // ...
        for (unsigned Cnt : MaskedGatherVectorized) {
          // ...
              Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
          // ...
          VectorizedLoads.insert_range(Slice);
          // ...
          if (Cnt == StartIdx)
            StartIdx += NumElts;
        }
        // ...
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        // ...
      };
  auto ProcessGatheredLoads =
      [&](// ...
          bool Final = false) {
        // ...
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
             // ...
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          // ...
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            // ...
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              // ...
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                // ...
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            // ...
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          // ...
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (// ...
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              // ...
            VectorizedLoads.clear();
            // ...
                GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                    UnsortedNonVectorized, Final,
                                    OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          // ...
                            << Slice.size() << ")\n");
            // ...
            for (Value *L : Slice)
              // ...
                SortedNonVectorized.push_back(cast<LoadInst>(L));
            // ...
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            // ...
            std::optional<unsigned> InterleavedLoadsDistance = 0;
            // ...
            std::optional<unsigned> CommonVF = 0;
            // ...
              for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                // ...
                UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                // ...
                if (*CommonVF == 0) {
                  CommonVF = E->Scalars.size();
                  // ...
                }
                if (*CommonVF != E->Scalars.size())
                  // ...
                if (Pos != Idx && InterleavedLoadsDistance) {
                  if (!DeinterleavedNodes.contains(E) &&
                      // ...
                        if (isa<Constant>(V))
                          return false;
                        if (isVectorized(V))
                          return true;
                        const auto &Nodes = ValueToGatherNodes.at(V);
                        return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                               !is_contained(Slice, V);
                      // ...
                    InterleavedLoadsDistance.reset();
                    continue;
                  }
                  DeinterleavedNodes.insert(E);
                  if (*InterleavedLoadsDistance == 0) {
                    InterleavedLoadsDistance = Idx - Pos;
                    continue;
                  }
                  if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                      (Idx - Pos) / *InterleavedLoadsDistance < Order)
                    InterleavedLoadsDistance.reset();
                  Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                }
              }
            // ...
            DeinterleavedNodes.clear();
            // ...
            if (InterleavedLoadsDistance.value_or(0) > 1 &&
                CommonVF.value_or(0) != 0) {
              InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
              unsigned VF = *CommonVF;
              // ...
              if (InterleaveFactor <= Slice.size() &&
                  // ...
                      cast<LoadInst>(Slice.front())->getAlign(),
                      cast<LoadInst>(Slice.front())
                          // ...
                UserMaxVF = InterleaveFactor * VF;
              } else {
                InterleaveFactor = 0;
              }
            }
            // ...
            unsigned ConsecutiveNodesSize = 0;
            if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                       [&, Slice = Slice](const auto &P) {
                         // ...
                           return std::get<1>(P).contains(V);
                         // ...
                         if (It == Slice.end())
                           return false;
                         const TreeEntry &TE =
                             *VectorizableTree[std::get<0>(P)];
                         // ...
                             VL, VL.front(), Order, PointerOps);
                         // ...
                         ConsecutiveNodesSize += VL.size();
                         size_t Start = std::distance(Slice.begin(), It);
                         size_t Sz = Slice.size() - Start;
                         return Sz < VL.size() ||
                                Slice.slice(Start, VL.size()) != VL;
                       }))
              continue;
            // ...
            if (InterleaveFactor == 0 &&
                any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                       [&, Slice = Slice](unsigned Idx) {
                         // ...
                         SmallVector<Value *> PointerOps;
                         return canVectorizeLoads(
                                    Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                    Slice[Idx * UserMaxVF], Order,
                                    // ...
                                LoadsState::ScatterVectorize;
                       }))
              // ...
            if (Slice.size() != ConsecutiveNodesSize)
              MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            // ...
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                // ...
                    Slice.slice(I, std::min(VF, E - I));
                // ...
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             // ...
                                 VectorizableTree[std::get<0>(P)]
                             // ...
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // ...
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  // ...
                }
              }
              // ...
            }
          // ...
        }
        NonVectorized.append(SortedNonVectorized);
        // ...
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    // ...
    if (!Ref.empty() && !NonVectorized.empty() &&
        // ...
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      // ...
      for (LoadInst *LI : NonVectorized) {
        // ...
      }
      // ...
      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
    }
  }
  // ...
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    // ...
    if (!E.ReorderIndices.empty()) {
      // ...
    }
    // ...
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  }
  // ...
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
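
/// Computes a (key, subkey) hash pair for \p V, used to partition a bundle
/// into groups of potentially compatible values: loads hash via their base
/// pointer, casts/binops via alternation-compatible opcodes, compares via
/// commutable predicates, GEPs via their pointer operand.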
// ...
                  bool AllowAlternate) {
  // ...
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    // ...
    SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    // ...
  }
  if (isa<ExtractElementInst, UndefValue>(V))
    // ...
  if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
    // ...
        !isa<UndefValue>(EI->getIndexOperand()))
      // ...
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // ...
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      // ...
          : cast<CastInst>(I)->getOperand(0)->getType()));
      // ...
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            // ...
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      // ...
      if (CI->isCommutative())
        // ...
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // ...
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      // ...
    } else if (// ...
               !isa<ConstantInt>(I->getOperand(1))) {
      // ...
    }
  }
  return std::make_pair(Key, SubKey);
}
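
/// Estimates whether a node with alternate opcodes is profitable to keep as
/// a single shuffled node, or whether the operand mismatch would require too
/// many extra shuffles, unique opcodes and undef elements to be worth it.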
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       // ...
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  // ...
  if (// ...
      Opcode1, OpcodeMask))
    return true;
  // ...
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    // ...
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        // ...
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  // ...
  for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
    // ...
    switch (Res.value_or(0)) {
    // ...
    }
  }
  // ...
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  // ...
  unsigned UndefCnt = 0;
  // ...
  unsigned ExtraShuffleInsts = 0;
  // ...
        return is_contained(Operands.back(), V);
  // ...
    ++ExtraShuffleInsts;
  // ...
    if (isa<Constant, ExtractElementInst>(V) ||
        // ...
    if (isa<UndefValue>(V))
      // ...
    if (!Res.second && Res.first->second == 1)
      ++ExtraShuffleInsts;
    ++Res.first->getSecond();
    if (auto *I = dyn_cast<Instruction>(V))
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
      ++NonInstCnt;
  // ...
  return none_of(Uniques, [&](const auto &P) {
           return P.first->hasNUsesOrMore(P.second + 1) &&
                  // ...
                    return isVectorized(U) || Uniques.contains(U);
                  // ...
         }) ||
         // ...
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}

// ...
                      const unsigned VF, unsigned MinBW,
                      // ...
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, // ...
  // ...
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  // ...
      LibCost.isValid() ? LibCost : ScalarLimit);
  // ...
}
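
/// Decides how the bundle \p VL should be represented in the graph: as a
/// vectorized node of one of the supported states (vectorize, strided,
/// scatter, compress) or as a gather node, per opcode of the bundle.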
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    // ...
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    // ...
         "Expected instructions with same/alternate opcodes only.");
  // ...
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  // ...
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
      return TreeEntry::NeedToGather;
    // ...
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      // ...
      if (Term && Term->isTerminator()) {
        LLVM_DEBUG(dbgs()
                   << "SLP: Need to swizzle PHINodes (terminator use).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    // ...
      auto *EI = dyn_cast<ExtractElementInst>(V);
      // ...
    return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // ...
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    // ...
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // ...
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      // ...
             "Non-constant or undef index?");
    }
    // ...
      return !SourceVectors.contains(V);
    // ...
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    // ...
      return SourceVectors.contains(V) && !V->hasOneUse();
    // ...
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // ...
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      // ...
        if (isa<PoisonValue>(V))
          return false;
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      // ...
    };
    // ...
      return TreeEntry::Vectorize;
    // ...
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // ...
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    // ...
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // ...
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    // ...
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // ...
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    // ...
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      // ...
        auto *LI = dyn_cast<LoadInst>(V);
        return !LI || !LI->isSimple();
      // ...
      return TreeEntry::NeedToGather;
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // ...
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      // ...
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        // ...
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        // ...
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // ...
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      // ...
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      // ...
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      // ...
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // ...
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // ...
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        // ...
        return TreeEntry::NeedToGather;
      }
      // ...
    }
    // ...
    if (CurrentOrder.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[CurrentOrder.front()];
      PtrN = PointerOps[CurrentOrder.back()];
    }
    std::optional<int64_t> Dist =
        // ...
    if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
      return TreeEntry::Vectorize;
    // ...
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        // ...
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        // ...
      return TreeEntry::NeedToGather;
    // ...
    CallInst *CI = cast<CallInst>(VL0);
    // ...
      return TreeEntry::NeedToGather;
    // ...
    unsigned NumArgs = CI->arg_size();
    // ...
    for (unsigned J = 0; J != NumArgs; ++J)
      // ...
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      // ...
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          // ...
        return TreeEntry::NeedToGather;
      // ...
      for (unsigned J = 0; J != NumArgs; ++J) {
        // ...
        if (ScalarArgs[J] != A1J) {
          LLVM_DEBUG(dbgs()
                     << "SLP: mismatched arguments in call:" << *CI
                     << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
          return TreeEntry::NeedToGather;
        }
      }
      // ...
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                          << *CI << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      // ...
    }
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    // ...
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // ...
        return TreeEntry::Vectorize;
      // ...
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    // ...
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  default:
    // ...
    return TreeEntry::NeedToGather;
  }
}
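
/// Helper for building the operands of a PHI bundle: incoming values are
/// grouped per incoming block so that all lanes select their value for the
/// same edge, with a fast path for PHIs with few incoming values.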
  PHIHandler() = delete;
  // ...
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 // ...
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    // ...
        auto *P = dyn_cast<PHINode>(V);
        // ...
          assert(isa<PoisonValue>(V) &&
                 "Expected isa instruction or poison value.");
        // ...
        if (P->getIncomingBlock(I) == InBB)
          // ...
      // ...
      Blocks.try_emplace(InBB).first->second.push_back(I);
    // ...
      if (isa<PoisonValue>(V)) {
        // ...
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // ...
        auto *It = Blocks.find(InBB);
        // ...
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    // ...
    for (const auto &P : Blocks) {
      // ...
      if (IncomingValues.size() <= 1)
        continue;
      // ...
      for (unsigned I : IncomingValues) {
        // ...
            [&](const auto &Data) {
              return !Data.value() ||
                     Data.value() == Operands[BasicI][Data.index()];
            // ...
            "Expected empty operands list.");
        // ...
      }
    }
  }
static std::pair<Instruction *, Instruction *>
getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
  // ...
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *I = dyn_cast<Instruction>(V);
    // ...
    if (MainOp->getOpcode() == I->getOpcode()) {
      // ...
    }
    // ...
  }
  // ...
  assert(MainOp != AltOp && "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}
// ...
                               const InstructionsState &S,
                               // ...
                               bool TryPad = false) {
  // ...
  for (Value *V : VL) {
    // ...
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  // ...
  if (NumUniqueScalarValues == VL.size() &&
      // ...
    ReuseShuffleIndices.clear();
    // ...
  }
  // ...
  if ((UserTreeIdx.UserTE &&
       UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      // ...
    LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                         "for nodes with padding.\n");
    ReuseShuffleIndices.clear();
    return false;
  }
  // ...
  if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      // ...
        return isa<UndefValue>(V) || !isConstant(V);
      // ...
    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
        S.getMainOp()->isSafeToRemove() &&
        (S.areInstructionsWithCopyableElements() ||
         all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
      // ...
          TTI, UniqueValues.front()->getType(), UniqueValues.size());
      PWSz = std::min<unsigned>(PWSz, VL.size());
      if (PWSz == VL.size()) {
        // ...
        ReuseShuffleIndices.clear();
        // ...
      } else {
        // ...
            UniqueValues.end());
        PaddedUniqueValues.append(
            PWSz - UniqueValues.size(),
            // ...
        if (!S.areInstructionsWithCopyableElements() &&
            // ...
          ReuseShuffleIndices.clear();
          return false;
        }
        VL = std::move(PaddedUniqueValues);
        // ...
      }
    }
    // ...
    ReuseShuffleIndices.clear();
    // ...
    VL = std::move(UniqueValues);
    // ...
  }
  // ...
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                // ...
                                OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  // ...
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    // ...
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      // ...
                 << *LocalState.getMainOp() << ".\n");
      return false;
    }
    // ...
      return isa<PoisonValue>(V) || Values.contains(V);
    // ...
  }
  // ...
  ReorderIndices.assign(VL.size(), VL.size());
  // ...
    auto *I = dyn_cast<Instruction>(V);
    // ...
      Op1Indices.set(Idx);
      // ...
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         // ...
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         // ...
                         LocalState.getAltOp(), *TLI))) {
      // ...
      Op1Indices.set(Idx);
      // ...
    }
  // ...
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  // ...
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      // ...
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  // ...
    ReorderIndices.clear();
  // ...
  if (!ReorderIndices.empty())
    // ...
  if (NumParts >= VL.size())
    return false;
  // ...
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    // ...
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    // ...
        VecTy, OriginalMask, Kind);
    // ...
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
         // ...
    if (NewCost >= OriginalCost)
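
/// Analyzes the instructions of a bundle for compatibility, including
/// "copyable" elements that can be modeled as idempotent instructions
/// (e.g. add %x, 0 or lshr %x, 0) to make the bundle uniform, and builds
/// the resulting instructions state and operand lists.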
class InstructionsCompatibilityAnalysis {
  // ...
  unsigned MainOpcode = 0;
  // ...
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
  }
  // ...
    return I && isSupportedOpcode(I->getOpcode()) &&
           // ...
  void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
    // ...
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      // ...
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        // ...
      }
      Operands.insert(I->op_begin(), I->op_end());
    }
    unsigned BestOpcodeNum = 0;
    // ...
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
        continue;
      // ...
      if (IsSupportedInstruction(I) && !Operands.contains(I)) {
        // ...
        BestOpcodeNum = P.second.size();
      }
    }
  }
  // ...
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    // ...
  }
  // ...
    if (isa<PoisonValue>(V))
      // ...
    if (!S.isCopyableElement(V))
      return convertTo(cast<Instruction>(V), S).second;
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
  // ...
  void buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
                             // ...
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                         : S.getOpcode();
    // ...
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);
      // ...
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
      break;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // ...
      break;
    case Instruction::InsertElement:
      // ...
        auto *IE = cast<InsertElementInst>(V);
        // ...
      break;
    case Instruction::Load:
      // ...
        auto *LI = dyn_cast<LoadInst>(V);
        // ...
        Op = LI->getPointerOperand();
        // ...
      break;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
      // ...
        auto *I = dyn_cast<Instruction>(V);
        // ...
        auto [Op, ConvertedOps] = convertTo(I, S);
        // ...
      break;
    case Instruction::GetElementPtr: {
      // ...
      const unsigned IndexIdx = 1;
      // ...
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
      // ...
          : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                ->getPointerOperandType()
                                ->getScalarType());
      // ...
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        // ...
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
          // ...
        auto *Op = GEP->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        // ...
            CI, Ty, CI->getValue().isSignBitSet(), DL)
        // ...
      break;
    }
    case Instruction::Call: {
      auto *CI = cast<CallInst>(VL0);
      // ...
      for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
        // ...
        auto &Ops = Operands.emplace_back();
        for (Value *V : VL) {
          auto *I = dyn_cast<Instruction>(V);
          Ops.push_back(I ? I->getOperand(Idx)
                          // ...
        }
      }
      break;
    }
    // ...
    }
  }
  // ...
  InstructionsState buildInstructionsState(
      ArrayRef<Value *> VL, const BoUpSLP &R,
      bool TryCopyableElementsVectorization,
      bool WithProfitabilityCheck = false, bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              // ...
    findAndSetMainInstruction(VL, R);
    // ...
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, true);
    if (!WithProfitabilityCheck)
      return S;
    // ...
    auto BuildCandidates =
        // ...
      if (V1 != V2 && isa<PHINode>(V1))
        return;
      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
          I1->getParent() != I2->getParent())
        return;
      // ...
    };
    if (VL.size() == 2) {
      // ...
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      // ...
        Candidates1.clear();
        Candidates2.clear();
        // ...
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      // ...
        return InstructionsState::invalid();
      // ...
    }
    // ...
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::LShr:
      // ...
    }
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    // ...
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // ...
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         // ...
           return isa<PHINode>(V) || !S.isCopyableElement(V);
         // ...
      return InstructionsState::invalid();
    // ...
      if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
        // ...
      }
      if (isa<Constant>(Ops.front())) {
        // ...
      }
      // ...
        return InstructionsState::invalid();
      // ...
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        // ...
        for (Value *V : Ops) {
          if (isa<UndefValue>(V))
            continue;
          // ...
        }
        // ...
          return C.second == 1;
        // ...
      }
      // ...
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          // ...
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        return false;
      unsigned CopyableNum =
          count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    // ...
    return S;
  }
  // ...
    assert(S && "Invalid state!");
    // ...
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      // ...
      for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
        // ...
    } else {
      buildOriginalOperands(S, VL, Operands);
    }
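
/// Performs the legality checks for vectorizing the bundle \p VL: bails out
/// on catch-switch blocks, already-vectorized bundles, exceeded recursion
/// depth, scalable vector types, ephemeral values and unprofitable
/// alternate-opcode nodes.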
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
    bool TryCopyableElementsVectorization) const {

  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);

  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {

    return ScalarsVectorizationLegality(S, false,

  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()

      return ScalarsVectorizationLegality(S, false);

        return isa<PoisonValue>(V) || Values.contains(V);

      return ScalarsVectorizationLegality(S, false);

      !(S && !S.isAltShuffle() && VL.size() >= 4 &&

                 cast<Instruction>(I)->getOpcode() == S.getOpcode();

    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);

  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);

    return ScalarsVectorizationLegality(S, false,

    if (!S || !S.isAltShuffle() || VL.size() > 2)

    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);

        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);

    bool IsCommutative =

    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||

         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))

    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");

    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))

                              I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {

            })) >= S.getMainOp()->getNumOperands() / 2)

    if (S.getMainOp()->getNumOperands() > 2)

    if (IsCommutative) {

      Candidates.clear();

                                I2->getOperand((Op + 1) % E));

              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {

  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&

         auto *I = dyn_cast<GetElementPtrInst>(V);

           BB = I->getParent();
         return BB == I->getParent() && I->getNumOperands() == 2;

  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;

       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(

       NotProfitableForVectorization(VL)) {

      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle.\n");

      return ScalarsVectorizationLegality(S, false,

    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle.\n");

    return ScalarsVectorizationLegality(S, false);

  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {

                          << ") is ephemeral.\n");

        return ScalarsVectorizationLegality(S, false,

  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {

        auto *I = dyn_cast<Instruction>(V);

              return isa<ExtractElementInst>(U.get());

        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))

      return std::make_pair(Vectorized, Extracted);

    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();

    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {

      Type *ScalarTy = VL.front()->getType();

          false, true, Kind);

          *TTI, ScalarTy, VecTy, Vectorized,
          true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;

    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);

  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);

  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");

    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");

    return ScalarsVectorizationLegality(S, false);

  return ScalarsVectorizationLegality(S, true);
}
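/// Recursively builds the vectorizable tree for \p VL: checks legality,
/// tries split nodes for alternate-opcode bundles, schedules the bundle and
/// then creates a tree entry per instruction kind, recursing into operands.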
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VL, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {

  auto TrySplitNode = [&](const InstructionsState &LocalState) {

    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))

    copy(Op1, NewVL.begin());
    copy(Op2, std::next(NewVL.begin(), Op1.size()));
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);

      if (S && (isa<LoadInst>(S.getMainOp()) ||
                getSameValuesTreeEntry(S.getMainOp(), Op, true))) {

        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),

        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});

      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),

    bool AreConsts = false;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))

      if (isa<Constant>(V)) {

      if (!isa<PHINode>(V))

  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);

  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {

      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))

    Legality = getScalarsVectorizationLegality(
        VL, Depth, UserTreeIdx, true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())

      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);

    S = Legality.getInstructionsState();

  if (S.isAltShuffle() && TrySplitNode(S))

    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;

  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);

  auto &BSRef = BlocksSchedules[BB];

    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS

  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");

    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))

    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)

  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);

  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {

    for (unsigned I : seq<unsigned>(Operands.size())) {

      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())

    for (unsigned I : PHIOps)
  switch (ShuffleOrOp) {
  case Instruction::PHI: {

    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);

  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");

        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "

        for (unsigned Idx : CurrentOrder)

    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);

               "(ExtractValueInst/ExtractElementInst).\n";

  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;

                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {

      Indices.emplace(Idx, I);

    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;

      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,

    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";

    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});

  case Instruction::Load: {

    TreeEntry *TE = nullptr;

    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";

               << "SLP: added a new TreeEntry (jumbled LoadInst).\n";

    case TreeEntry::CompressVectorize:

      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);

             << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";

    case TreeEntry::StridedVectorize:

      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";

    case TreeEntry::ScatterVectorize:

      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);

             << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";

    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:

    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");

    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),

          std::min<unsigned>(

    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(

          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),

    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =

      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {

        NumSignBits = std::max(NumSignBits, Mask.countl_zero());

      if (NumSignBits * 2 >=

        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);

  case Instruction::ICmp:
  case Instruction::FCmp: {

    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);

    VLOperands Ops(VL, Operands, S, *this);

           "Commutative Predicate mismatch");

      if (isa<PoisonValue>(V))

      auto *Cmp = cast<CmpInst>(V);
      if (Cmp->getPredicate() != P0)

    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =

      if (NumSignBits0 * 2 >=

        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =

      if (NumSignBits1 * 2 >=

        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    VLOperands Ops(VL, Operands, S, *this);

      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});

  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());

    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)

  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();

    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);

      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";

        dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";

    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});

  case Instruction::Call: {

    CallInst *CI = cast<CallInst>(VL0);

    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());

    VLOperands Ops(VL, Operands, S, *this);

    for (unsigned I : seq<unsigned>(CI->arg_size())) {

      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});

  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";

        dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";

    auto *CI = dyn_cast<CmpInst>(VL0);

      return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();

      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());

             "Expected different main/alternate predicates.");

        if (isa<PoisonValue>(V))

        auto *Cmp = cast<CmpInst>(V);

    if (isa<BinaryOperator>(VL0) || CI) {
      VLOperands Ops(VL, Operands, S, *this);

      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
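// Walks nested struct/array/fixed-vector types down to the scalar element
// type, accumulating the total number of flattened elements in N, and then
// checks that the flattened type fits exactly into a legal vector register.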
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {

    if (auto *ST = dyn_cast<StructType>(EltTy)) {

      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())

      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();

      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();

  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
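/// Checks that every extractelement/extractvalue in \p VL reads a distinct
/// lane of the same source vector, computing the lane order in CurrentOrder;
/// returns true if the natural order can be kept (no extra shuffle needed).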
                             bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);

      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&

  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  if (E0->getOpcode() == Instruction::ExtractValue) {

    LoadInst *LI = dyn_cast<LoadInst>(Vec);

    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)

  unsigned MinIdx = NElts, MaxIdx = 0;

    auto *Inst = dyn_cast<Instruction>(V);

    if (Inst->getOperand(0) != Vec)

    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))

    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)

    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)

    if (MaxIdx < ExtIdx)

  if (MaxIdx - MinIdx + 1 > E)

  if (MaxIdx + 1 <= E)

  bool ShouldKeepOrder = true;

  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {

    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();

    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;

  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
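/// Returns true if all users of instruction \p I are already part of the
/// vectorized graph (or are vector-like/must-gather values), so extracting
/// the scalar result is unnecessary.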
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
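/// Builds a shuffle mask for an alternate-opcode node: for every scalar,
/// selects either the main or the alternate operation lane, honoring the
/// node's reordering and reuse indices.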
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(

  unsigned Sz = Scalars.size();

  if (!ReorderIndices.empty())

  for (unsigned I = 0; I < Sz; ++I) {

    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))

    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {

  if (!ReuseShuffleIndices.empty()) {

      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;

    Mask.swap(NewMask);

  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);

    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "

    return MainP != P && MainP != SwappedP;

  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
  const auto *Op0 = Ops.front();

    return isConstant(V) && !isa<UndefValue>(V);

  const bool IsUniform = all_of(Ops, [=](Value *V) {

  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {

    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();

  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {

    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();

  if (IsConstant && IsUniform)

  else if (IsConstant)

  else if (IsUniform)
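/// Base class for shuffle-mask analysis and simplification shared by the
/// cost-estimation and codegen shuffle builders below.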
class BaseShuffleAnalysis {

  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");

    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

    int Limit = Mask.size();

    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {

    unsigned VF = Mask.size();

    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {

      int MaskedIdx = Mask[ExtMask[I] % VF];

    Mask.swap(NewMask);

                                  bool SinglePermute) {

    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {

      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());

      if (isIdentityMask(Mask, SVTy, false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, true) &&

                 IdentityMask.size()))) {

          IdentityMask.assign(Mask);

      if (SV->isZeroEltSplat()) {

        IdentityMask.assign(Mask);

      int LocalVF = Mask.size();

              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();

            static_cast<unsigned>(I) >= SV->getShuffleMask().size())

        ExtMask[Idx] = SV->getMaskValue(I);

      if (!IsOp1Undef && !IsOp2Undef) {

        for (int &I : Mask) {

          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==

        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);

        Op = SV->getOperand(0);

        Op = SV->getOperand(1);

    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||

             "Expected masks of same sizes.");

      Mask.swap(IdentityMask);
      auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
      return SinglePermute &&
             (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),

              (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
               Shuffle->isZeroEltSplat() &&

                 Shuffle->getShuffleMask()[P.index()] == 0;
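  /// Smart shuffle emission: looks through preceding shuffles of the inputs,
  /// combines their masks with \p Mask, and lets \p Builder emit the minimal
  /// number of real shuffles (or none at all for identity masks).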
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");

    if (ScalarTyNumElements != 1) {

      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
        VF = FTy->getNumElements();
      if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(

          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();

      for (int I = 0, E = Mask.size(); I < E; ++I) {

          CombinedMask1[I] = Mask[I];

          CombinedMask2[I] = Mask[I] - VF;

        (void)peekThroughShuffles(Op1, CombinedMask1, false);
        (void)peekThroughShuffles(Op2, CombinedMask2, false);

        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {

                ExtMask1[Idx] = SV1->getMaskValue(I);

                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);

                ExtMask2[Idx] = SV2->getMaskValue(I);

                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&

              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);

              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);

              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);

      } while (PrevOp1 != Op1 || PrevOp2 != Op2);

      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {

               "Expected undefined mask element");
        CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);

          isa<ShuffleVectorInst>(Op1) &&
          cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==

        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(

    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());

    bool IsIdentity = peekThroughShuffles(V1, NewMask, true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

    for (unsigned I : seq<unsigned>(CommonMask.size()))
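/// Computes the scalar and vectorized GEP cost for a chain of pointers
/// \p Ptrs based at \p BasePtr, used to compare scalar vs. vectorized
/// address computation for loads and stores.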
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {

  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {

        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,

    for (Value *V : Ptrs) {
      if (V == BasePtr) {

      auto *Ptr = dyn_cast<GetElementPtrInst>(V);

      if (!Ptr || !Ptr->hasOneUse())

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {

          TTI::PointersChainInfo::getKnownStride(),

                   [](const Value *V) {
                     auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                     return Ptr && !Ptr->hasAllConstantIndices();

            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);

      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);

          BaseGEP->getPointerOperand(), Indices, VecTy,

  return std::make_pair(ScalarCost, VecCost);
}
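/// Tries to reorder the scalars of a gather node so that compatible values
/// (same key/operation, adjacent indices) become contiguous, keeping the new
/// order only if the reordered gather is cheaper than the original build
/// vector.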
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");

  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {

    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {

      for (LoadInst *RLI : LIt->second) {

      if (LIt->second.size() > 2) {

            hash_value(LIt->second.back()->getPointerOperand());

    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);

  bool IsOrdered = true;
  unsigned NumInstructions = 0;

    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&

    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||

        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))

    auto &KTI = KeyToIndex[V];

      Container[Idx].push_back(V);

  if (!IsOrdered && NumInstructions > 1) {

    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {

        for (Value *V : P.second) {

          TE.ReorderIndices[Cnt + K] = Idx;
          TE.Scalars[Cnt + K] = V;

        Sz += Indices.size();
        Cnt += Indices.size();

        if (Sz > 1 && isa<Instruction>(P.second.front())) {

              *TTI, TE.Scalars.front()->getType(), Sz);

          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))

        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))

  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())

  auto *ScalarTy = TE.Scalars.front()->getType();

  for (auto [Idx, Sz] : SubVectors) {

  int Sz = TE.Scalars.size();

                           TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {

    if (isa<PoisonValue>(V)) {

      ReorderMask[I] = I + TE.ReorderIndices.size();

      any_of(ReorderMask, [&](int I) { return I >= Sz; })

          VecTy, ReorderMask);

  for (unsigned I : seq<unsigned>(Sz)) {

    if (!isa<PoisonValue>(V))
      ReorderMask[I] = I;

      ReorderMask[I] = I + Sz;

  if (Cost >= BVCost) {

    TE.ReorderIndices.clear();
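// Checks whether an fadd/fsub bundle whose operands are one-use fmuls can be
// contracted into fused multiply-add, merging the fast-math flags of all
// participating instructions.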
                            const InstructionsState &S,

        return V->getType()->getScalarType()->isFloatingPointTy();

         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);

      if (S.isCopyableElement(I))

      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)

      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();

  if (!CheckForContractable(VL))

  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);

  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)

  if (!CheckForContractable(Operands.front()))

  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);

    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();

  unsigned NumOps = 0;

    if (S.isCopyableElement(V))

    auto *I = dyn_cast<Instruction>(Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      if (auto *OpI = dyn_cast<Instruction>(V))

    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
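/// Post-processing of the built graph: reorders gather nodes, combines
/// adjacent gather slices into vectorizable sub-nodes, switches profitable
/// reversed loads/stores to strided form, recognizes min/max and fmuladd
/// combined nodes, and collects gathered loads for another vectorization
/// attempt.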
void BoUpSLP::transformNodes() {

  BaseGraphSize = VectorizableTree.size();

  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);

                             const InstructionsState &S) {

    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))

                              I2->getOperand(Op));

        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {

                 [](const std::pair<Value *, Value *> &P) {
                   return isa<Constant>(P.first) ||
                          isa<Constant>(P.second) || P.first == P.second;

  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];

      reorderGatherNode(E);
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;

    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
      if (isa<PoisonValue>(V))

      auto *I = dyn_cast<Instruction>(V);

  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {

          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);

          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);

      auto *It = find_if(E.Scalars, IsaPred<Instruction>);
      if (It != E.Scalars.end()) {

            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                  return is_contained(TEs, TE);

  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {

      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);

      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||

      if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)

      if (CheckForSameVectorNodes(E))
      unsigned StartIdx = 0;

               *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)

        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {

          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, true))

          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {

            IsTwoRegisterSplat = NumRegs2VF == 2;

          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||

                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1

                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load &&

            if ((!UserIgnoreList || E.Idx != 0) &&

                  if (isa<PoisonValue>(V))

                  return areAllUsersVectorized(cast<Instruction>(V),

            if (S.getOpcode() == Instruction::Load) {

                  canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
              AllStrided &= Res == LoadsState::StridedVectorize ||
                            Res == LoadsState::ScatterVectorize ||
                            Res == LoadsState::Gather;

              if (Res == LoadsState::ScatterVectorize ||
                  Res == LoadsState::Gather) {
                if (Res == LoadsState::Gather) {
                  registerNonVectorizableLoads(Slice);

                  if (UserIgnoreList && E.Idx == 0)
                    analyzedReductionVals(Slice);

            } else if (S.getOpcode() == Instruction::ExtractElement ||

                       !CheckOperandsProfitability(

                               IsaPred<Instruction>)),

        if (VF == 2 && AllStrided && Slices.size() > 2)

        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)

        for (auto [Cnt, Sz] : Slices) {

          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {

            SameTE = getSameValuesTreeEntry(*It, Slice);

          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&

            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");

          AddCombinedNode(PrevSize, Cnt, Sz);
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {

      E.ReorderIndices.clear();

    switch (E.getOpcode()) {
    case Instruction::Load: {

      if (E.State != TreeEntry::Vectorize)

      Type *ScalarTy = E.getMainOp()->getType();

      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);

      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&

        auto *BaseLI = cast<LoadInst>(E.Scalars.back());

            BaseLI->getPointerAddressSpace(), CostKind,

            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)

          E.State = TreeEntry::StridedVectorize;

    case Instruction::Store: {

          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();

      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);

      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&

        auto *BaseSI = cast<StoreInst>(E.Scalars.back());

            BaseSI->getPointerAddressSpace(), CostKind,

            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)

          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {

        auto *BaseSI = cast<StoreInst>(E.Scalars.front());
        assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
        if (Mask.size() < 4)

        for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {

                  VecTy, Factor, BaseSI->getAlign(),
                  BaseSI->getPointerAddressSpace()))

        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);

    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)

      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {

        CondEntry->State = TreeEntry::CombinedVectorize;

    case Instruction::FSub:
    case Instruction::FAdd: {

      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())

      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {

        FMulEntry->State = TreeEntry::CombinedVectorize;
  if (LoadEntriesToVectorize.empty()) {

    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)

    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&

               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,

                 return isa<LoadInst>(V) &&
                        !isVectorized(V) &&
                        !isDeleted(cast<Instruction>(V));

      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);

        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())

            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(

  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
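/// Estimates the cost of building a vector from scalars and shuffles during
/// cost modeling; mirrors the codegen shuffle builder but only accumulates
/// TTI costs instead of emitting instructions.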
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;

  bool SameNodesEstimated = true;

    if (auto *VTy = dyn_cast<VectorType>(Ty))

    const auto *It = find_if_not(VL, IsaPred<UndefValue>);
    assert(It != VL.end() && "Expected at least one non-undef value.");

        count(VL, *It) > 1 &&

    if (!NeedShuffle) {
      if (isa<FixedVectorType>(ScalarTy)) {

            cast<FixedVectorType>(ScalarTy));

          CostKind, std::distance(VL.begin(), It),

          return isa<PoisonValue>(V) ? PoisonMaskElem : 0;

          VecTy, ShuffleMask, CostKind,

    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)

                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),

                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");

        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);

          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());

          return std::max(Sz, VecTy->getNumElements());

        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;

          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,

                                      if (I == PoisonMaskElem)

                                      return std::min(S, I);

      int OffsetReg1 = OffsetReg0;

      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);

        int Idx = I - OffsetReg0;

            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {

          if (Indices.size() == 1) {

                std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                [&](int S, int I) {
                  if (I == PoisonMaskElem)

                  int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                              ((I - OffsetReg0) % NumElts) / EltsPerVector;
                  if (RegId == FirstRegId)

                  return std::min(S, I);

            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));

          Idx = I - OffsetReg1;

        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);

      return ShuffleKind;

    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])

          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));

      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {

                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))

          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {

               "SK_ExtractSubvector index out of range");

    if (OriginalCost < Cost)
      Cost = OriginalCost;
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,

                                unsigned SliceSize) {
    if (SameNodesEstimated) {

      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);

               "Expected all poisoned elements.");

        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));

      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),

      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);

    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
        VF = std::max(VF, getVF(V1));

        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());

      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)

          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);

      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = dyn_cast<Value *>(P)) {

        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());

      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)

          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
  class ShuffleCostBuilder {

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {

      return Mask.empty() ||
             (VF == Mask.size() &&

    ~ShuffleCostBuilder() = default;

          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))

          cast<VectorType>(V1->getType()), Mask);

          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))

          cast<VectorType>(V1->getType()), Mask);

    void resizeToMatch(Value *&, Value *&) const {}
    ShuffleCostBuilder Builder(TTI);

    unsigned CommonVF = Mask.size();

    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,

      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {

        IsSigned = It->second.second;

      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);

          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;

      if (isa<Constant>(V))

      auto *VecTy = cast<VectorType>(V->getType());

      if (EScalarTy != ScalarTy) {

        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);

          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;

    if (!V1 && !V2 && !P2.isNull()) {

      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());

                      return Idx < 2 * static_cast<int>(CommonVF);

             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {

        for (int &Idx : CommonMask) {

          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())

          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +

        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);

        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());

      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {

      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();

                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {

        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {

        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&

        std::iota(CommonMask.begin(), CommonMask.end(), 0);

      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);

      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&

                 [](const auto &&P) {

                          static_cast<unsigned>(P.value()) != P.index();

    } else if (V1 && P2.isNull()) {

      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);

                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {

      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());

                      return Idx < 2 * static_cast<int>(CommonVF);

             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {

        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {

          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;

      ExtraCost += GetValueMinBWAffectedCost(V1);

      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {

      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());

                      return Idx < 2 * static_cast<int>(CommonVF);

             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {

        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {

          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;

      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));

      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));

      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));

                      return Idx < 2 * static_cast<int>(CommonVF);

             "All elements in mask must be less than 2 * CommonVF.");

          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);

        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));

    if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)

    if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));

    InVectors.front() =

    if (InVectors.size() == 2)

    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}

                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;

    Value *VecBase = nullptr;

    if (!E->ReorderIndices.empty()) {

                                   E->ReorderIndices.end());

    bool PrevNodeFound = any_of(

        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||

                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);

    for (unsigned Part : seq<unsigned>(NumParts)) {

      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);

        if (isa<UndefValue>(V) ||

        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);

        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||

              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),

        unsigned Idx = *EEIdx;

        if (EE->hasOneUse() || !PrevNodeFound) {

          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {

                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),

                Ext->getOpcode(), Ext->getType(), EE->getType(),

        APInt &DemandedElts =
            VectorOpsToExtracts

                .first->getSecond();

    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)

                                            DemandedElts, false,

    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);

    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;

  std::optional<InstructionCost>

      return std::nullopt;

    IsFinalized = false;
    CommonMask.clear();

    VectorizedVals.clear();
    SameNodesEstimated = true;
             return Idx < static_cast<int>(E1.getVectorFactor());

           "Expected single vector shuffle mask.");

    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});

    assert(!CommonMask.empty() && "Expected non-empty common mask.");

    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);

    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);

    assert(!CommonMask.empty() && "Expected non-empty common mask.");

    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)

             auto *EI = cast<ExtractElementInst>(
                 cast<const TreeEntry *>(InVectors.front())
                     ->getOrdered(P.index()));
             return EI->getVectorOperand() == V1 ||
                    EI->getVectorOperand() == V2;

           "Expected extractelement vectors.");

    if (InVectors.empty()) {

             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);

    assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
           !CommonMask.empty() &&

             Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                 ->getOrdered(P.index());

               return P.value() == Mask[P.index()] ||
                      isa<UndefValue>(Scalar);
             if (isa<Constant>(V1))

             auto *EI = cast<ExtractElementInst>(Scalar);
             return EI->getVectorOperand() == V1;

           "Expected only tree entry for extractelement vectors.");

           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());

          VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
                  ->getNumElements());

    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)

        CommonMask[Idx] = Mask[Idx] + VF;
  void gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
              Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);

      unsigned VF = VL.size();

        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();

        if (isa<PoisonValue>(V)) {

        if (isa<UndefValue>(V)) {

      if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {

              cast<FixedVectorType>(Root->getType())->getNumElements()),

      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,

    IsFinalized = true;

      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);

        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);

             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);

      Cost += createShuffle(V1, V2, Mask);

      InVectors.front() = V;

    if (!SubVectors.empty()) {

      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);

        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);

      if (!SubVectorsMask.empty()) {

               "Expected same size of masks for subvectors and common mask.");

        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {

            I1 = I2 + CommonMask.size();

      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {

          IsSigned = It->second.second;

        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);

            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;

        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {

        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {

          NewMask[I] = CommonMask[ExtMask[I]];

        CommonMask.swap(NewMask);

    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");

        createShuffle(InVectors.front(),
                      InVectors.size() == 2 ? InVectors.back() : nullptr,

           "Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)

  if (TE.State == TreeEntry::CompressVectorize)

  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                      ArrayRef<Value *> VectorizedVals,
                                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;
  // ...
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    // ...
  }
  // ...
  unsigned EntryVF = E->getVectorFactor();
  // ...
  if (E->isGather()) {
    // ...
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    if (E->ReorderIndices.empty()) {
      // ...
      //     E->CombinedEntriesWithIndices.back().second,
      //     VectorizableTree[E->CombinedEntriesWithIndices.back().first]
      //         ->getVectorFactor()));
    }
    unsigned CommonVF =
        std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                     ->getVectorFactor(),
                 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                     ->getVectorFactor());
    // ...
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0,
                             "Calculated costs for Tree"));
    // ...
  }
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       /* ... */)) {
    if (E->getOpcode() == Instruction::Store) {
      // ...
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
      // ...
    }
    // ...
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         (/* ... */
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         /* ... */);
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  // ...
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
    // ... (inside the GetCostDiff helper)
    InstructionCost ScalarCost = 0;
    if (isa<CastInst, CallInst>(VL0)) {
      // ...
      ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
    } else {
      for (unsigned I = 0; I < Sz; ++I) {
        if (UsedScalars.test(I))
          continue;
        ScalarCost += ScalarEltCost(I);
      }
    }
    // ...
    if (/* ... */
        (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
      const EdgeInfo &EI = E->UserTreeIndex;
      if (!EI.UserTE->hasState() ||
          EI.UserTE->getOpcode() != Instruction::Select ||
          /* ... */) {
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            (EI.UserTE->isGather() ||
             EI.UserTE->State == TreeEntry::SplitVectorize)
                ? EI.UserTE->Scalars.front()->getType()
                : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
          UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                          UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
    }
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    // ...
    auto [ScalarCost, VecCost] = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(/* ... */, CanonicalType,
                                      {CanonicalType, CanonicalType});
    // ...
    if (VI && SelectOnly) {
      assert(!Ty->isVectorTy() && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost += TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
        if (const TreeEntry *OpTE = /* ... */)
          if (CountedOps.insert(OpTE).second &&
              !OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
      }
    }
    return CommonCost - ScalarCost;
  }
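  // For extracts, the vector side is mostly free: the scalars already live in
  // a vector, so the entry is only credited the scalarization overhead of the
  // demanded elements, and a sign/zero extension whose only users are GEPs
  // can be treated as folded into the addressing computation.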
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        // ...
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        // ...
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        // ...
      }
      // ...
      if (I->hasOneUse()) {
        // ...
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // ...
          //     Ext->getOpcode(), Ext->getType(), I->getType(),
          // ...
        }
      }
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      if (DemandedElts.isZero())
        // ...
      return CommonCost - (DemandedElts.isZero()
                               ? /* ... */
                               : /* ... getScalarizationOverhead( */
                                 // SrcVecTy, DemandedElts, false, ...
                                 /* ... */);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (/* ... */) {
      // ...
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        /* ... */,
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    //     InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
    //     buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (/* ... */
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      // ...
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 /* ... */},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             /* ... */ (VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
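  // Compare/select costs are queried per scalar with each lane's own
  // predicate and once for the widened vector; a vectorized select may also
  // need its condition widened (or repeated) to match the value vector's
  // element count.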
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // ...
    if (/* ... */ match(VL0, MatchCmp))
      // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if ((/* ... */ !match(VI, MatchCmp)) ||
          /* ... */)
        // ...
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          /* ... */
          CostKind, getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        // ...
        unsigned CondNumElements = CondType->getNumElements();
        // ...
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        // ...
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      // ...
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes CostAttrs(/* ... */, VecTy,
                                        {VecTy, VecTy, VecTy}, FMF);
      // ...
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        // ...
      Value *Op1 = E->getOperand(0)[Idx];
      Value *Op2;
      // ...
      if (isa<UnaryOperator>(UniqueValues[Idx])) {
        // ...
      } else {
        Op2 = E->getOperand(1)[Idx];
        // ...
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ...
      }
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          // ...
          all_of(E->getOperand(I), [&](Value *Op) {
            auto *CI = dyn_cast<ConstantInt>(Op);
            return CI && CI->getValue().countr_one() >= It->second.first;
          });
          // ...
        }
      }
      // ...
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
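  // Loads are costed according to how the entry will be emitted: consecutive
  // loads as a (possibly interleaved) wide load, strided entries via the
  // strided-load cost hook, compressed entries as a masked or plain wide load
  // plus a decompressing shuffle, and scattered entries as a gather.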
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), CostKind,
                                  /* ... */);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              /* ... */);
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        // ...
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          // ...
        }
        for (auto [I, V] : enumerate(Scalars))
          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(/* ... */,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              /* ... */);
          VecLdCost += ::getShuffleCost(/* ... */,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // ...
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    // ...
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
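  // Stores mirror the load logic: strided entries use the strided-store cost,
  // otherwise the entry must be a consecutive store and is costed as a
  // (possibly interleaved) wide store; pointer-chain GEP savings are then
  // added on top via GetGEPCostDiff.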
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), CostKind,
                                  /* ... */);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          // ...
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    // ...
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // ...
      auto VecCallCosts = getVectorCallCosts(
          CI, VecTy, /* ... */,
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
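  // Alternate-opcode nodes pay for both widened instructions plus the
  // blending shuffle, unless the target can issue the two opcodes as a single
  // alternating instruction (isLegalAltInstr), in which case the cheaper of
  // the two estimates is used.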
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             (/* ... */
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // ...
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        // ...
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          // ...
        });
        // ...
      }
      // ...
      if (/* ... */) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        // ...
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /* ... */);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /* ... */);
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        // ...
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        /* ... */);
          LLVM_DEBUG({
            dbgs() << "SLP: alternate extension, which should be truncated.\n";
            // ...
          });
          return VecCost;
        }
        // ...
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          /* ... */);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    /* ... */);
      }
      // ...
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            // ...
          },
          Mask);
      // ...
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // ...
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // ...
            //     "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              // ...
              assert(isa<ShuffleVectorInst>(V) &&
                     "Not supported shufflevector usage.");
              auto *SV = cast<ShuffleVectorInst>(V);
              int Index;
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index)
                // ...
              NextIndex += SV->getShuffleMask().size();
            }
            return ::getShuffleCost(
                /* ... */);
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
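/// Checks whether the tree is "tiny but fully vectorizable": a single
/// vectorizable root (for reductions also a wide, cheap gather), or exactly
/// two nodes where the second is a gather that is small, extract-based, or
/// load-based.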
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE,
                                         unsigned Limit) {
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/* ... */
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             /* ... */) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };
  // ...
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (/* ... */
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;
  // ...
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;
  // ...
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
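/// Pattern-matcher for the load-combine idiom: walks an or/shl-by-a-multiple-
/// of-8 chain down to a zero-extended load. Vectorizing such chains would
/// block the backend's load combining, so callers bail out when this matches.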
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // ...
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (/* ... */
          (/* ... m_Shl(..., m_APInt(ShAmtC)) ... */
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // ...
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      /* ... */)
    return false;
  // ...
  Type *SrcTy = Load->getType();
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  // ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // ...
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
  return false;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize if the tree is empty.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /* ... */)))
    return true;
  // ...
  constexpr int Limit = 4;
  if (/* ... */
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;
  // ...
  if (/* ... */
      VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
                           Limit) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) ||
                                  MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // ...
  SmallVector<const TreeEntry *> StoreLoadNodes;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (/* ... */
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (/* ... */
                       none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
                      (/* ... */
                       ((TE->getOpcode() == Instruction::PHI ||
                         (TE->hasCopyableElements() &&
                          /* ... */ count_if(
                              TE->Scalars, IsaPred<PHINode, Constant>)) >=
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() ||
                          TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 ||
         none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;
  // ...
  if (/* ... */
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      none_of(/* ... */
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                       TE->UserTreeIndex.UserTE->Idx == 0;
              }))
    return true;
  // ...
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;
  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // ...
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /* ... */);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  // ...
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      /* ... */
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      /* ... getWidenedType(
             VectorizableTree.back()->Scalars.front()->getType(),
             VectorizableTree.back()->getVectorFactor()), ... */)
    return true;
  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      /* ... */
      any_of(/* ... */,
             [](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather() && TE->hasState() &&
                      TE->getOpcode() == Instruction::Load &&
                      /* ... */;
             }))
    return true;
  // ...
  for (/* ... */) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      continue;
    // ...
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        (/* ... */
         all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
        (isa<ExtractElementInst>(E.Scalars.front()) &&
         /* ... */)) {
      // ...
    }
  }
  // ...
}
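// Spill-cost estimation: walk the tree from the root, tracking the last
// instruction of every vectorized bundle. A vector value that has to stay
// live across a call that is not a cheap intrinsic is assumed to be spilled
// and reloaded, so its cost is added via AddCosts; block-local scans are
// capped by a fixed budget to bound compile time.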
InstructionCost BoUpSLP::getSpillCost() {
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;
  // ...
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
  // ...
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    // ...
    return IntrCost < CallCost;
  };
  // ...
  //     CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      /* ... */;
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (/* ... */)
        return It->second.getInt() != 0;
      // ...
    }
    BasicBlock::reverse_iterator InstIt =
        ++First->getIterator().getReverse(),
                                 PrevInstIt =
                                     Last->getIterator().getReverse();
    SmallVector<const Instruction *> LastInstsInRange;
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // ...
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && /* ... */) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      // ...
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      // ...
    // ...
  };
  // ...
  //     ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      // ...
    // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      // ...
    }
    // ...
    while (!Worklist.empty()) {
      // ...
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
        ParentsPairsToAdd.insert(Pair);
        continue;
      }
      // ...
      if (Budget > BudgetLimit)
        return false;
      // ...
    }
    // ...
  };
  while (!LiveEntries.empty()) {
    // ...
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    // ...
    for (const TreeEntry *Op : EntriesToOperands.lookup(Entry)) {
      if (!Op->isGather())
        // ...
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          /* ... */) {
        // ...
        if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
          Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        // ...
        if (Op->isGather()) {
          assert(Entry->getOpcode() == Instruction::PHI &&
                 "Expected phi node only.");
          OpParent = cast<PHINode>(Entry->getMainOp())
                         ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
          // ...
          for (Value *V : Op->Scalars) {
            auto *Inst = dyn_cast<Instruction>(V);
            // ...
          }
        } else {
          OpLastInst = EntriesToLastInstruction.at(Op);
          // ...
        }
        // ...
        if (OpParent == Parent) {
          if (Entry->getOpcode() == Instruction::PHI) {
            if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
              // ...
            continue;
          }
          if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
            // ...
          continue;
        }
        // ...
        if (Entry->getOpcode() != Instruction::PHI &&
            !CheckForNonVecCallsInSameBlock(
                &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
                LastInst)) {
          // ...
        }
        // ...
        if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                            /* ... */)) {
          // ...
        }
        // ...
        if (!CheckPredecessors(Parent, Pred, OpParent)) {
          // ...
        }
      }
    }
  }
  return Cost;
}

static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  // ...
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        /* ... */)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        /* ... */)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  // ...
}
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
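/// Combines a chain of (vector, mask) pairs produced for insertelement
/// sequences into a single value: the base vector (if not undef) is merged
/// first, then the remaining masks are folded in two inputs at a time, with
/// \p ResizeAction reconciling vector factors and \p Action emitting (or
/// costing) each shuffle.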
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    // ...
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        // ...
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining shuffle between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          // ...
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          // ...
        } else if (SecMask[I] != PoisonMaskElem) {
          // ...
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    // ...
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      }
      // ...
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
                                     InstructionCost ReductionCost) {
  InstructionCost Cost = ReductionCost;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // ...
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // ...
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG({
        dbgs() << "SLP: Skipping cost for combined node that starts with "
               << *TE.Scalars[0] << ".\n";
        TE.dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
  }
  if (/* ... */
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      })) {
    // ...
  }
  // ...
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << "  User: " << *EU.User << "\n";
               else dbgs() << "  User: nullptr\n");
    LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand()
                      << "\n");
    // ...
    if (EphValues.count(EU.User))
      continue;
    // ...
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;
    // ...
    BasicBlock *UserParent =
        EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
    if (/* ... */
        isa_and_present<UnreachableInst>(UserParent->getTerminator()))
      continue;
    // ...
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
15946 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
15948 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
15949 if (!UsedInserts.
insert(VU).second)
15953 const TreeEntry *ScalarTE = &EU.E;
15956 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
15961 Value *Op0 =
II->getOperand(0);
15968 if (It == ShuffledInserts.
end()) {
15970 Data.InsertElements.emplace_back(VU);
15972 VecId = ShuffledInserts.
size() - 1;
15973 auto It = MinBWs.
find(ScalarTE);
15974 if (It != MinBWs.
end() &&
15976 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
15978 unsigned BWSz = It->second.first;
15979 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
15980 unsigned VecOpcode;
15981 if (DstBWSz < BWSz)
15982 VecOpcode = Instruction::Trunc;
15985 It->second.second ? Instruction::SExt : Instruction::ZExt;
15990 FTy->getNumElements()),
15993 <<
" for extending externally used vector with "
15994 "non-equal minimum bitwidth.\n");
15999 It->InsertElements.front() = VU;
16000 VecId = std::distance(ShuffledInserts.
begin(), It);
16002 int InIdx = *InsertIdx;
16004 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16007 Mask[InIdx] = EU.Lane;
16008 DemandedElts[VecId].setBit(InIdx);
    // ...
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    // ...
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
        // ...
      unsigned Extend = /* ... */
                            ? Instruction::ZExt
                            : Instruction::SExt;
      // ...
      LLVM_DEBUG(dbgs() /* ... */ << ExtraCost << "\n");
    } else {
      ExtraCost = getVectorInstrCost(/* ... */,
                                     CostKind, EU.Lane, EU.Scalar,
                                     ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << "  ExtractElement cost for " << *ScalarTy
                        << " from " << *VecTy << ": " << ExtraCost << "\n");
    }
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // ...
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // ...
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // ...
      auto *Inst = cast<Instruction>(EU.Scalar);
      // ...
      auto OperandIsScalar = [&](Value *V) {
        // ...
        if (auto *EE = dyn_cast<ExtractElementInst>(V))
          return !EE->hasOneUse() || !MustGather.contains(EE);
        // ...
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          // ...
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // ...
        bool IsProfitablePHIUser =
            (/* ... */
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            /* ... */
            all_of(Inst->users(), [&](User *U) {
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          /* ... */ cast<Instruction>(
                              VectorizableTree.front()->getMainOp())
                              /* ... */);
            }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) /* ... */;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && /* ... */
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount =
              count_if(Entry->Scalars, [&](Value *V) {
                return ValueToExtUses->contains(V);
              });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // ...
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // ...
                  ExternalUses[It->second].User = nullptr;
                }
              }
            }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // ...
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
    }
  }
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      // ...
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          // ...
        }
        // ...
      }
    }
  }
  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    // ...
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        /* ... */) {
      // ...
      if (HasLargeIndex) {
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  /* ... */);
        // ...
      }
      // ...
      LLVM_DEBUG({
        dbgs() << "SLP: Adding cost " << C
               << " for final shuffle of insertelement external users.\n";
        TE->dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      // ...
      return std::make_pair(TE, true);
    }
    // ...
    if (!ForSingleMask) {
      // ...
      for (unsigned I = 0; I < VF; ++I) {
        // ...
        ResizeMask[Mask[I]] = Mask[I];
      }
      // ...
      LLVM_DEBUG({
        dbgs() << "SLP: Adding cost " << C
               << " for final shuffle of insertelement external users.\n";
        TE->dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      // ...
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        VF = TEs.front()->getVectorFactor();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (/* ... */
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return /* ... */
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          // ...
          LLVM_DEBUG({
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement "
                      "external users.\n";
            TEs.front()->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n";
          });
          // ...
        }
      } else {
        // ...
        if (/* ... */
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of vector node and external "
                    "insertelement users.\n";
          if (TEs.front()) { TEs.front()->dump(); }
          TEs.back()->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
      }
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // ...
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   /* ... */},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        // ...
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " /* ... */
                 << " for final resize for reduction from " << SrcVecTy
                 << " to " << DstVecTy << "\n";
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
  // ...
  if (SpillCost)
    Cost += *SpillCost;
#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    // ...
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
  return Cost;
}
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    // ...
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  // ...
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // ...
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // ...
  std::optional<TTI::ShuffleKind> Res =
      /* ... */;
  if (!Res /* ... */) {
    // ...
    return std::nullopt;
  }
  // Restore unused scalars from mask, if some of the extractelements were not
  // selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      // ...
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        /* ... */) {
      // ...
    }
  }
  return Res;
}

SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
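/// For a gather node, looks for existing tree entries that already produce
/// the requested scalars in a single register, so the gather can be emitted
/// as a one- or two-source shuffle of those entries instead of a fresh
/// build-vector. Dominance and scheduling order of the candidate entries'
/// last instructions are validated before a candidate is accepted.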
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // ...
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  // ...
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  // ...
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // ...
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // ...
    if (TEInsertPt->getParent() != InsertBlock &&
        /* ... */)
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        /* ... */)
      return false;
    return true;
  };
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // ...
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    // ...
    for (Value *V : VL) {
      // ...
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    // ...
    while (/* ... */) {
      // ...
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (/* ... */) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    // ...
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? /* ... */
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // ...
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            /* ... */) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              /* ... */)
            continue;
        }
        // ...
        if (/* ... */
            (TEUseEI.UserTE != UseEI.UserTE ||
             TEUseEI.EdgeIdx < UseEI.EdgeIdx))
          continue;
        // ...
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
          continue;
      }
      // ...
      if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        continue;
      // ...
      if (TEUseEI.UserTE != UseEI.UserTE &&
          (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
           HasGatherUser(TEUseEI.UserTE)))
        continue;
      // ...
      if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
        continue;
      // ...
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          /* ... */)
        continue;
      // ...
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx ||
           TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      if (CheckAndUseSameNode(TEPtr))
        break;
      // ...
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt ||
              !CheckOrdering(&LastBundleInst))
            continue;
        }
        if (CheckAndUseSameNode(VTE))
          break;
        // ...
      }
      // ...
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // ...
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        // ...
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      if (CheckAndUseSameNode(VTE))
        break;
      // ...
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // ...
    } else {
      // ...
      if (!VToTEs.empty()) {
        // ...
        VToTEs = SavedVToTEs;
      }
      // ...
      if (UsedTEs.size() == 2)
        continue;
      UsedTEs.push_back(SavedVToTEs);
      // ...
    }
  }
  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        // ...
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          // ...
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
    // ...
    for (auto &P : UsedValuesEntry)
      // ...
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // ...
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      // ...
    }
    // Same, keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        // ...
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // ...
    if (Entries.empty()) {
      Entries.push_back(*max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  // ...
  for (const TreeEntry *E : Entries)
    ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
                                          E->Scalars.end());
  // ...
  for (auto &P : UsedValuesEntry) {
    for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
      // ...
  }
    bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
    // Checks if the 2 PHIs are compatible in terms of high possibility to be
    // vectorized.
    auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
      auto *PHI = cast<PHINode>(V);
      auto *PHI1 = cast<PHINode>(V1);
      // Check that all incoming values are compatible/from the same parent
      // (if they are instructions).
      for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
        Value *In = PHI->getIncomingValue(I);
        Value *In1 = PHI1->getIncomingValue(I);
        // ...
        if (cast<Instruction>(In)->getParent() !=
            cast<Instruction>(In1)->getParent())
          return false;
      }
      return true;
    };
    // Check if the value can be ignored during analysis for shuffled gathers.
    auto MightBeIgnored = [=](Value *V) {
      auto *I = dyn_cast<Instruction>(V);
      return I && !IsSplatOrUndefs && !isVectorized(I) &&
             !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
    };
    // Check that the neighbor instruction may form a full vector node with the
    // current instruction V.
    auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
      Value *V1 = VL[Idx];
      bool UsedInSameVTE = false;
      auto It = UsedValuesEntry.find(V1);
      if (It != UsedValuesEntry.end())
        UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
      return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
             cast<Instruction>(V)->getParent() ==
                 cast<Instruction>(V1)->getParent() &&
             (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
    };
    // Build a shuffle mask for better cost estimation and vector emission.
    SmallBitVector UsedIdxs(Entries.size());
    SmallVector<std::pair<unsigned, int>> EntryLanes;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      Value *V = VL[I];
      auto It = UsedValuesEntry.find(V);
      if (It == UsedValuesEntry.end())
        continue;
      // Do not try to shuffle scalars if they are constants, or instructions
      // that may form a vector node on their own later.
      if (isConstant(V) ||
          (MightBeIgnored(V) &&
           ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
            (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
        continue;
      unsigned Idx = It->second;
      EntryLanes.emplace_back(Idx, I);
      UsedIdxs.set(Idx);
    }
    // Iterate through all shuffled scalars and select entries, which can be
    // used for the final shuffle.
    SmallVector<const TreeEntry *> TempEntries;
    for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
      if (!UsedIdxs.test(I))
        continue;
      // Fix the entry number for the given scalar. These indices are used as
      // the vector offset when calculating the final shuffle mask.
      for (std::pair<unsigned, int> &Pair : EntryLanes)
        if (Pair.first == I)
          Pair.first = TempEntries.size();
      TempEntries.push_back(Entries[I]);
    }
    Entries.swap(TempEntries);
    if (EntryLanes.size() == Entries.size() &&
        !VL.equals(ArrayRef(TE->Scalars)
                       .slice(Part * VL.size(),
                              std::min<int>(VL.size(), TE->Scalars.size())))) {
      // If the number of scalars equals the number of entries, the analysis is
      // not profitable; there are already shuffles before this point, so cut
      // this case off.
      Entries.clear();
      return std::nullopt;
    }
    // Build the final mask and check for the identity shuffle, if possible.
    bool IsIdentity = Entries.size() == 1;
    // Pair.first is the offset to the vector, while Pair.second is the index
    // of the scalar in the list.
    for (const std::pair<unsigned, int> &Pair : EntryLanes) {
      unsigned Idx = Part * VL.size() + Pair.second;
      Mask[Idx] =
          Pair.first * VF +
          (ForOrder ? std::distance(
                          Entries[Pair.first]->Scalars.begin(),
                          find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                    : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
      IsIdentity &= Mask[Idx] == Pair.second;
    }
    if (ForOrder || IsIdentity || Entries.empty()) {
      switch (Entries.size()) {
      case 1:
        if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
          return TargetTransformInfo::SK_PermuteSingleSrc;
        break;
      case 2:
        if (EntryLanes.size() > 2 || VL.size() <= 2)
          return TargetTransformInfo::SK_PermuteTwoSrc;
        break;
      default:
        break;
      }
    } else if (!isa<VectorType>(VL.front()->getType()) &&
               (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
      // Do the cost estimation if shuffle beneficial than buildvector.
      MutableArrayRef<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                                   std::next(Mask.begin(),
                                             (Part + 1) * VL.size()));
      int MinElement = SubMask.front(), MaxElement = SubMask.front();
      for (int Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        // ...
      }
      assert(MaxElement >= 0 && MinElement >= 0 &&
             MaxElement % VF >= MinElement % VF &&
             "Expected at least single element.");
      unsigned NewVF = std::max<unsigned>(
          VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                                   (MaxElement % VF) -
                                                       (MinElement % VF) + 1));
      if (NewVF < VF) {
        for (int &Idx : SubMask) {
          if (Idx == PoisonMaskElem)
            continue;
          Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
                (Idx >= static_cast<int>(VF) ? NewVF : 0);
        }
        VF = NewVF;
      }
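      // A minimal standalone sketch (assumed, not part of this file) of the
      // mask renumbering above: once the used lanes of both sources fit into a
      // narrower register of NewVF elements, every mask index is rebased so
      // lanes of the first source map to [0, NewVF) and lanes of the second to
      // [NewVF, 2 * NewVF).
      //
      //   #include <cstdio>
      //   #include <vector>
      //
      //   int main() {
      //     const unsigned VF = 8, NewVF = 4;
      //     const int MinElement = 4; // lowest used lane (mod VF)
      //     // lanes 4..5 of src0, lanes 4..5 of src1:
      //     std::vector<int> SubMask = {4, 5, 12, 13};
      //     for (int &Idx : SubMask)
      //       Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
      //             (Idx >= static_cast<int>(VF) ? NewVF : 0);
      //     for (int M : SubMask)
      //       std::printf("%d ", M); // 0 1 4 5 - a 2-source mask of width 4
      //     return 0;
      //   }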
      auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
      auto GetShuffleCost = [&,
                             &TTI = *TTI](ArrayRef<int> Mask,
                                          ArrayRef<const TreeEntry *> Entries,
                                          VectorType *VecTy) -> InstructionCost {
        if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
            ShuffleVectorInst::isDeInterleaveMaskOfFactor(
                Mask, Entries.front()->getInterleaveFactor()))
          return TTI::TCC_Free;
        return ::getShuffleCost(TTI,
                                Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                   : TTI::SK_PermuteSingleSrc,
                                VecTy, Mask, CostKind);
      };
      InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
      // Estimate the cost of using only the first entry.
      InstructionCost FirstShuffleCost = 0;
      if (Entries.size() == 1 || !Entries[0]->isGather()) {
        FirstShuffleCost = ShuffleCost;
      } else {
        // Transform the mask to include only lanes of the first entry.
        SmallVector<int> FirstMask(SubMask);
        bool IsIdentity = true;
        for (auto [I, Idx] : enumerate(FirstMask)) {
          if (Idx >= static_cast<int>(NewVF)) {
            Idx = PoisonMaskElem;
          } else {
            // ...
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
        if (!IsIdentity)
          FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
        FirstShuffleCost += ::getScalarizationOverhead(
            *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      // Estimate the cost of using only the second entry.
      InstructionCost SecondShuffleCost = 0;
      if (Entries.size() == 1 || !Entries[1]->isGather()) {
        SecondShuffleCost = ShuffleCost;
      } else {
        SmallVector<int> SecondMask(SubMask);
        bool IsIdentity = true;
        for (auto [I, Idx] : enumerate(SecondMask)) {
          if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
            Idx = PoisonMaskElem;
          } else {
            // ...
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
        if (!IsIdentity)
          SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
        SecondShuffleCost += ::getScalarizationOverhead(
            *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      InstructionCost BuildVectorCost = ::getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
      const TreeEntry *BestEntry = nullptr;
      if (FirstShuffleCost < ShuffleCost) {
        // Only the first entry is needed - poison lanes of the second source.
        std::for_each(std::next(Mask.begin(), Part * VL.size()),
                      std::next(Mask.begin(), (Part + 1) * VL.size()),
                      [&](int &Idx) {
                        if (Idx >= static_cast<int>(VF))
                          Idx = PoisonMaskElem;
                      });
        BestEntry = Entries.front();
        ShuffleCost = FirstShuffleCost;
      }
      if (SecondShuffleCost < ShuffleCost) {
        // Only the second entry is needed - poison lanes of the first source.
        std::for_each(std::next(Mask.begin(), Part * VL.size()),
                      std::next(Mask.begin(), (Part + 1) * VL.size()),
                      [&](int &Idx) {
                        if (Idx < static_cast<int>(VF))
                          Idx = PoisonMaskElem;
                        // ...
                      });
        BestEntry = Entries[1];
        ShuffleCost = SecondShuffleCost;
      }
      if (BuildVectorCost >= ShuffleCost) {
        if (BestEntry) {
          Entries.clear();
          Entries.push_back(BestEntry);
        }
        return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                  : TargetTransformInfo::SK_PermuteSingleSrc;
      }
    }
    Entries.clear();
    // Clear the corresponding mask elements.
    std::fill(std::next(Mask.begin(), Part * VL.size()),
              std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
    return std::nullopt;
  }
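  // Hedged standalone sketch (all names assumed) of the decision just made:
  // keep the two-source shuffle, fall back to a single-source shuffle of the
  // cheaper entry, or give up and build the vector from scalars - always
  // picking the minimum modeled cost.
  //
  //   #include <cstdio>
  //
  //   int main() {
  //     int TwoSrcCost = 6, FirstOnlyCost = 3, SecondOnlyCost = 5, BVCost = 4;
  //     int ShuffleCost = TwoSrcCost;
  //     const char *Choice = "two-source shuffle";
  //     if (FirstOnlyCost < ShuffleCost) {
  //       ShuffleCost = FirstOnlyCost;
  //       Choice = "first entry only";
  //     }
  //     if (SecondOnlyCost < ShuffleCost) {
  //       ShuffleCost = SecondOnlyCost;
  //       Choice = "second entry only";
  //     }
  //     if (BVCost < ShuffleCost)
  //       Choice = "buildvector from scalars";
  //     std::printf("%s\n", Choice); // first entry only
  //     return 0;
  //   }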
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
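// Standalone sketch (assumed names) of the per-register driver above: the
// gather is split into register-sized parts, each part is matched
// independently, and if no part found a reusable entry the whole query fails.
//
//   #include <cstdio>
//   #include <optional>
//   #include <vector>
//
//   int main() {
//     const unsigned NumParts = 2;
//     std::vector<std::optional<int>> Res; // per-part shuffle kind, if any
//     for (unsigned Part = 0; Part < NumParts; ++Part)
//       // Pretend only the first slice matched an existing entry.
//       Res.push_back(Part == 0 ? std::optional<int>(0) : std::nullopt);
//     bool AllFailed = true;
//     for (const auto &SK : Res)
//       AllFailed &= !SK.has_value();
//     std::printf(AllFailed ? "gather from scalars\n" : "partial reuse\n");
//     return 0;
//   }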
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  // Find the cost of inserting/extracting values from the vector.
  APInt DemandedElements = APInt::getZero(VF);
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    DemandedElements.setBit(I);
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
      continue;
    if (isConstant(V)) {
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    EstimateInsertCost(I, V);
  }
  bool IsAnyNonUndefConst = any_of(
      ConstantShuffleMask, [&](int Idx) { return Idx >= static_cast<int>(VF); });
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
                             ConstantShuffleMask, CostKind);
  }
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true, /*Extract=*/false,
                                     CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  return Cost;
}
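// Standalone sketch (assumed names) of the constant handling above: lanes
// holding non-undef constants are redirected to a second, constant vector
// (index I + VF) while all other lanes keep the identity index, so the gather
// becomes "insert the variables, then blend in the constants with one
// shuffle".
//
//   #include <cstdio>
//   #include <numeric>
//   #include <vector>
//
//   int main() {
//     const unsigned VF = 4;
//     std::vector<bool> IsConst = {false, true, false, true};
//     std::vector<int> Mask(VF);
//     std::iota(Mask.begin(), Mask.end(), 0); // identity
//     for (unsigned I = 0; I < VF; ++I)
//       if (IsConst[I])
//         Mask[I] = I + VF; // take the lane from the constant vector
//     for (int M : Mask)
//       std::printf("%d ", M); // 0 5 2 7
//     return 0;
//   }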
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *cast<Instruction>(It->second);
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  Instruction *Front;
  unsigned Opcode;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
    Opcode = Front->getOpcode();
  }
  BasicBlock *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              // ...
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };
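  // Minimal standalone sketch (assumed toy graph, not LLVM's DominatorTree)
  // of the DFS-number trick above: pre-order DFS-in numbers give a cheap "is
  // block A visited before block B" test; the bundle keeps the instruction
  // whose block has the larger DFS-in number as the "last" one.
  //
  //   #include <cstdio>
  //   #include <vector>
  //
  //   int main() {
  //     // Tree: 0 -> {1, 2}, 1 -> {3}
  //     std::vector<std::vector<int>> Children = {{1, 2}, {3}, {}, {}};
  //     std::vector<int> DFSNumIn(4, -1);
  //     int Counter = 0;
  //     std::vector<int> Stack = {0}; // iterative pre-order DFS from the root
  //     while (!Stack.empty()) {
  //       int N = Stack.back();
  //       Stack.pop_back();
  //       DFSNumIn[N] = Counter++;
  //       for (auto It = Children[N].rbegin(); It != Children[N].rend(); ++It)
  //         Stack.push_back(*It);
  //     }
  //     int A = 3, B = 2; // candidate blocks
  //     std::printf("later block: %d\n", DFSNumIn[A] < DFSNumIn[B] ? B : A);
  //     return 0;
  //   }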
  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              // ...
              ) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  Instruction *Res = nullptr;
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // Take the vectorized values of the combined entries into account.
    for (auto *E : Entries) {
      auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
      if (!I)
        I = &getLastInstructionInBundle(E);
      // ...
    }
    // ...
  }

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    // ...
  }
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    if (E->isGather())
      return nullptr;
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
        continue;
      ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars, [&](Value *V) {
          return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
                 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
        }))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    // ...
  }
  // ...
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
  }
  // ...
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(LastInstIt);
  }
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       E->doesNotNeedToSchedule()) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(LastInst->getParent(), std::next(LastInstIt));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
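// Hedged sketch of the insert-point rule above, using the IRBuilder API: when
// the anchor is a PHI, new code must go after the whole PHI cluster, otherwise
// immediately after the anchor instruction. `Anchor` and `Builder` are
// assumed to come from the surrounding context.
//
//   if (isa<PHINode>(Anchor))
//     Builder.SetInsertPoint(Anchor->getParent(),
//                            Anchor->getParent()->getFirstNonPHIIt());
//   else
//     Builder.SetInsertPoint(Anchor->getParent(),
//                            std::next(Anchor->getIterator()));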
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // Instructions from the current block / current loop are postponed to the
  // end so loop-invariant code can still be hoisted out of the loop body.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           // ...
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      // Strip a redundant extension if its source is not vectorized.
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op); !IOp || !isVectorized(IOp))
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // ...
      auto *II = dyn_cast<Instruction>(Vec);
      if (!II)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          if (V->getType()->isVectorTy()) {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
                SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
              // Find the shufflevector caused by a resize.
              if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
                if (SV->getOperand(0) == V)
                  InsElt = SV;
                if (SV->getOperand(1) == V)
                  InsElt = SV;
              }
              assert(InsElt &&
                     "Failed to find shufflevector, caused by resize.");
            }
          }
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entries.front()->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
        }
      }
    }
    return Vec;
  };

  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->use_empty() &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append postponed, possibly loop-resident instructions last.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
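// Illustrative IR (hedged, hand-written) for the gather sequence produced
// here: constants first, then non-constant scalars, each via insertelement
// into a poison vector.
//
//   %v0 = insertelement <4 x float> poison, float 1.0, i32 1
//   %v1 = insertelement <4 x float> %v0, float %a, i32 0
//   %v2 = insertelement <4 x float> %v1, float %b, i32 2
//   %v3 = insertelement <4 x float> %v2, float %c, i32 3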
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// Supports 2 input vectors and lazy emission: the shuffle is generated only
/// when actually needed.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (cast<VectorType>(V2->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(
              V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
        else
          V1 = Builder.CreateIntCast(
              V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given mask,
    /// if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match, if they are not equal yet. The
    /// smaller vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through shuffle trees and tries
  /// to find the best matching vector for the actual shuffle instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder, ScalarTy);
  }
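  // Illustrative IR (hedged) for resizeToMatch above: the narrower operand is
  // widened to the common width with an identity-plus-poison shuffle before
  // the real two-source shuffle is emitted.
  //
  //   %wide = shufflevector <2 x i32> %narrow, <2 x i32> poison,
  //                         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>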
  /// Cast value \p V to a vector type with the same number of elements, but
  /// with the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If there is only one use, used in the vectorized node, drop the
      // scalar; it becomes an external user of the corresponding entry.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE == UTEs.front() &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform a multi-register vector shuffle, joining the inputs into a
    // single virtual long vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty() && TEs.front()->VectorizedValue)
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert(/* all previous parts masked */ true &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission; a stub is emitted and replaced after the end
    // of the process to keep the correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    // ...
  }
  /// Reset the builder to handle a perfect diamond match.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    // ...
  }
  /// Adds 2 input vectors (in the form of tree entries) and the mask for
  /// their shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in the form of a tree entry) and the mask for
  /// its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // ...
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
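  // Standalone sketch (assumed names) of the mask composition performed after
  // an intermediate shuffle: once the accumulated operands are shuffled into
  // one vector, the common mask is rewritten so each previously-used lane now
  // refers to its own position in the freshly shuffled vector.
  //
  //   #include <cstdio>
  //   #include <vector>
  //
  //   constexpr int PoisonMaskElem = -1;
  //
  //   int main() {
  //     std::vector<int> CommonMask = {3, PoisonMaskElem, 0, 5};
  //     // After emitting shuffle(Vec, Vec2, CommonMask), lane I of the result
  //     // already holds the right element, so the mask becomes identity on
  //     // used lanes:
  //     for (unsigned I = 0; I < CommonMask.size(); ++I)
  //       if (CommonMask[I] != PoisonMaskElem)
  //         CommonMask[I] = I;
  //     for (int M : CommonMask)
  //       std::printf("%d ", M); // 0 -1 2 3
  //     return 0;
  //   }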
  /// Finalize emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                                     if (isa<PoisonValue>(V))
                                       return false;
                                     return !isKnownNonNegative(
                                         V, SimplifyQuery(*R.DL));
                                   }));
          // ...
          Type *OrigScalarTy = ScalarTy;
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
                        _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), I * SliceSize + SliceSize), 0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), I * SliceSize + SliceSize), IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(TEs.begin(), TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Check for full matched gathers only after the extract analysis.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.resetForSameNode();
        ShuffleBuilder.add(*FrontTE, Mask);
        // Full matched entry found, no need to insert subvectors.
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
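      // Illustrative IR (hedged) for the "perfect diamond match" above: the
      // gather node's scalars are exactly the scalars of an already-vectorized
      // entry, so no new gather is emitted; the existing vector is reused, at
      // most with a single identity-like shuffle that poisons unused lanes:
      //
      //   %reused = shufflevector <4 x i32> %existing, <4 x i32> poison,
      //                           <4 x i32> <i32 0, i32 1, i32 poison, i32 3>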
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          })) {
        Resized = true;
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-constant values and all constant values; for repeated
    // values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        // ...
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast; this is
      // valid if the broadcast value is guaranteed non-poisonous, otherwise a
      // freeze is emitted at the end.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(E->UserTreeIndex.UserTE->Scalars,
                                       U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalar and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by poisons, emit a broadcast and then a freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // the single/two vectors the scalars are extracted from.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(StoredGS[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize,
                                            getNumElems(Mask.size(), SliceSize, I));
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &= isGuaranteedNotToBePoison(
                                 TEs.front()->VectorizedValue, AC) &&
                             isGuaranteedNotToBePoison(
                                 TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Decide how to combine the remaining values: build a constant vector and
    // shuffle with it, or emit several separate gathers.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if we currently
    // have a single permutation and more than 1 scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splatting + shuffle and compare it with
              // the cost of insert + shuffle.
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0);
              // ...
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  static_cast<int>(BVMask.size() - 1));
              // ...
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // Insert the non-constant scalars directly.
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
              // ...
            } else {
              // Splat the single non-constant value and blend it in.
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
                BV = CreateShuffle(BV, nullptr, SplatMask);
              // ...
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
// ...
// for (Value *V : VL)
//   if (isa<Instruction>(V))
//     ... (helper body elided by extraction)
// ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  // ...
  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->VectorizedValue)
    return E->VectorizedValue;
  if (E->isGather()) {
    // Set the insert point for non-reduction initial nodes.
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    Value *Op1 = vectorizeTree(&OpTE1);
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(OpTE2.isSame(
               ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
           "Expected same second part of scalars.");
    Value *Op2 = vectorizeTree(&OpTE2);
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *V) {
          if (isa<PoisonValue>(V))
            return false;
          return !isKnownNonNegative(V, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    if (cast<VectorType>(Op1->getType())->getElementType() !=
        ScalarTy->getScalarType())
      Op1 = Builder.CreateIntCast(
          Op1,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(Op1->getType())->getNumElements()),
          GetOperandSignedness(&OpTE1));
    if (cast<VectorType>(Op2->getType())->getElementType() !=
        ScalarTy->getScalarType())
      Op2 = Builder.CreateIntCast(
          Op2,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(Op2->getType())->getNumElements()),
          GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
          0);
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      if (ScalarTyNumElements != 1) {
        // ...
      }
      // Insert both halves into one vector at their recorded offsets.
      Value *Vec = /* insert Op1 at offset 0 and Op2 at */
          createInsertVector(Builder, PoisonValue::get(VecTy), Op2,
                             E->CombinedEntriesWithIndices.back().second *
                                 ScalarTyNumElements);
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (getNumElements(Op1->getType()) != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
                0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (getNumElements(Op2->getType()) != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
                0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, Mask);
    E->VectorizedValue = Vec;
    return Vec;
  }
  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Value *V = NewPhi;
    // Adjust insertion point once all PHI's have been generated.
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
    for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);
      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }
      if (!VisitedBBs.insert(IBB).second) {
        Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
        NewPhi->addIncoming(VecOp, IBB);
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        continue;
      }
      Builder.SetInsertPoint(IBB->getTerminator());
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }
    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(ScalarTy,
                         cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create InsertVector instructions.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create the shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // Create InsertVector instructions.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insert element instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            }
            // ...
          }
        }
        if (V2) {
          V = Builder.CreateShuffleVector(V, V2, InsertMask);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }

      SmallVector<int> InsertMask2(NumElts, PoisonMaskElem);
      for (unsigned I = 0; I < NumElts; I++) {
        if (Mask[I] != PoisonMaskElem)
          InsertMask2[Offset + I] = NumElts + I;
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask2, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          isUndefVector(FirstInsert->getOperand(0), UseMask);
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          if (!ShuffleVectorInst::isIdentityMask(InsertMask2, NumElts)) {
            SmallBitVector IsFirstPoison =
                isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
            if (!IsFirstPoison.all()) {
              for (unsigned I = 0; I < NumElts; I++) {
                if (InsertMask2[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                  InsertMask2[I] = I + NumElts;
              }
            }
            V = Builder.CreateShuffleVector(
                V, IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                       : FirstInsert->getOperand(0),
                InsertMask2, cast<Instruction>(E->Scalars.back())->getName());
            if (auto *I = dyn_cast<Instruction>(V)) {
              GatherShuffleExtractSeq.insert(I);
              CSEBlocks.insert(I->getParent());
            }
          }
        } else {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask2[I] == PoisonMaskElem)
              InsertMask2[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
            else
              InsertMask2[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask2,
              cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
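  // Standalone sketch (assumed names) of the opcode rewrite above: when both
  // sides of a cast have been demoted to narrower integers, the cast is
  // re-derived from the effective bit widths rather than taken from the
  // scalar instruction.
  //
  //   #include <cstdio>
  //
  //   const char *pickCastOpcode(unsigned DstBits, unsigned SrcBits,
  //                              bool SrcSigned) {
  //     if (DstBits == SrcBits)
  //       return "bitcast"; // same width, no-op cast
  //     if (DstBits < SrcBits)
  //       return "trunc"; // narrowing
  //     return SrcSigned ? "sext" : "zext"; // widening keeps signedness
  //   }
  //
  //   int main() {
  //     std::printf("%s\n", pickCastOpcode(16, 32, true));  // trunc
  //     std::printf("%s\n", pickCastOpcode(32, 16, false)); // zext
  //     return 0;
  //   }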
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is an i1 vector and the condition is a scalar,
      // the condition has to be replicated to match.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);
    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);
    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0);
    Value *RHS = vectorizeOperand(E, 1);
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      // ...
      for (int I : CompressMask)
        if (I != PoisonMaskElem)
          // ...
      if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
        // ...
      }
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (std::optional<int64_t> Diff = getPointersDiff(
              /* ... */)) {
        int64_t Stride =
            *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride = calculateRtStride(/* ... */);
        // ...
        StrideVal = Builder.CreateMul(
            /* ... */,
            ConstantInt::get(StrideTy,
                             (IsReverseOrder ? -1 : 1) *
                                 static_cast<int>(
                                     DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      if (isa<FixedVectorType>(ScalarTy)) {
        // CreateMaskedGather expects VecTy and VecPtr to have the same size.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        // ...
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? NewLI
                   : ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
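  // Illustrative IR (hedged, hand-written) for the strided-load path above:
  // a bundle of loads with a constant negative stride becomes one call to the
  // llvm.experimental.vp.strided.load intrinsic, with the stride scaled by
  // the element allocation size.
  //
  //   %v = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i64(
  //            ptr %base, i64 -16, <4 x i1> splat (i1 true), i32 4)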
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // ...
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        // ...
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      // ...
    }

    Function *CF;
    if (!UseIntrinsic) {
      // Use the vector library variant of the call.
      VFShape Shape = VFShape::get(
          CI->getFunctionType(),
          ElementCount::getFixed(VecTy->getNumElements()),
          /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle vector mask to pick the alternate values correctly.
      SmallVector<int> Mask;
      SmallVector<Value *> OpScalars, AltScalars;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
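// Illustrative IR (hedged) for the alternate-opcode case above: both opcodes
// are applied to the whole vector and the per-lane results are blended with a
// single shufflevector whose mask picks lane I from the first vector for
// main-opcode lanes and lane I+VF for alternate-opcode lanes.
//
//   %va = add <4 x i32> %x, %y
//   %vs = sub <4 x i32> %x, %y
//   %v  = shufflevector <4 x i32> %va, <4 x i32> %vs,
//                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>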
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  EntryToLastInstruction.clear();
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
    (void)getLastInstructionInBundle(TE.get());
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
  for (auto &Entry : GatherEntries) {
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
    if (isa<PHINode>(UserI)) {
      for (User *U : PrevVec->users()) {
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
        if (UI->comesBefore(InsertPt))
    if (auto *VecI = dyn_cast<Instruction>(Vec);
    if (Vec->getType() != PrevVec->getType()) {
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
        if (IsSigned.value_or(false))
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
        if (IsSigned.value_or(false))
        if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned.value_or(false) ||
        if (IsSigned.value_or(false))
      if (IsSigned.value_or(false)) {
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
             "Expected user node or perfect diamond match in MinBWs.");
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
            IgnoredExtracts.insert(EE);
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst->getIterator());
            if (Inst->hasName())
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
                ExternalUse.Lane * VecTyNumElements);
        if (Scalar->getType() != Ex->getType())
              Ex, Scalar->getType(),
        auto *I = dyn_cast<Instruction>(Ex);
                           : &F->getEntryBlock(),
                       std::make_pair(Ex, ExV));
        if (auto *ExI = dyn_cast<Instruction>(Ex);
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
    if (!ScalarsWithNullptrUser.insert(Scalar).second)
           (ExternallyUsedValues.count(Scalar) ||
            ExternalUsesWithNonUsers.count(Scalar) ||
            ExternalUsesAsOriginalScalar.contains(Scalar) ||
              if (ExternalUsesAsOriginalScalar.contains(U))
              ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
              return !UseEntries.empty() &&
                     (E->State == TreeEntry::Vectorize ||
                      E->State == TreeEntry::StridedVectorize ||
                      E->State == TreeEntry::CompressVectorize) &&
                     any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                       return (UseEntry->State == TreeEntry::Vectorize ||
                               TreeEntry::StridedVectorize ||
                               TreeEntry::CompressVectorize) &&
                              doesInTreeUserNeedToExtract(
                                  Scalar, getRootEntryInstruction(*UseEntry),
           "Scalar with nullptr User must be registered in "
           "ExternallyUsedValues map or remain as scalar in vectorized "
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
                PHI->getParent()->getLandingPadInst()->getIterator()));
                PHI->getParent()->getFirstNonPHIIt());
            std::next(VecI->getIterator()));
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
    if (auto *VU = dyn_cast<InsertElementInst>(User);
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
          auto BWIt = MinBWs.find(E);
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                      std::next(IVec->getParent()
                                    ->getLandingPadInst()
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                  cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              Vec = VecIt->second;
          ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
        unsigned Idx = *InsertIdx;
        if (It == ShuffledInserts.end()) {
          It = std::next(ShuffledInserts.begin(),
                         ShuffledInserts.size() - 1);
        Mask[Idx] = ExternalUse.Lane;
        It->InsertElements.push_back(cast<InsertElementInst>(User));
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        if (PH->getIncomingValue(I) == Scalar) {
            PH->getIncomingBlock(I)->getTerminator();
          if (isa<CatchSwitchInst>(IncomingTerminator)) {
                std::next(VecI->getIterator()));
          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
          PH->setOperand(I, NewInst);
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
        CombinedMask1[I] = Mask[I];
        CombinedMask2[I] = Mask[I] - VF;
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
                                 bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    if (!ForSingleMask) {
      for (unsigned I = 0; I < VF; ++I) {
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    return std::make_pair(Vec, false);
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          return CreateShuffle(Vals.front() ? Vals.front()
                               Vals.back(), Mask);
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    if (It != ShuffledInserts[I].InsertElements.rend())
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
    IE->replaceUsesOfWith(IE->getOperand(0),
    IE->replaceUsesOfWith(IE->getOperand(1),
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
    assert(Entry->VectorizedValue && "Can't find vectorizable value");
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
      if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
      Type *Ty = Scalar->getType();
        for (User *U : Scalar->users()) {
          assert((isVectorized(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  if (UserIgnoreList) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
        return UserIgnoreList->contains(U.getUser());
  removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
        ReductionRoot->getIterator());
            cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
                    << " gather sequences instructions.\n");
      Loop *L = LI->getLoopFor(I->getParent());
      BasicBlock *PreHeader = L->getLoopPreheader();
        auto *OpI = dyn_cast<Instruction>(V);
        return OpI && L->contains(OpI);
      CSEBlocks.insert(PreHeader);
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
    if (I1->getType() != I2->getType())
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
          NewMask[I] != SM1[I])
        NewMask[I] = SM1[I];
    return SM1.size() - LastUndefsCnt > 1 &&
               SM1.size() - LastUndefsCnt));
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
           "Worklist not sorted properly!");
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
      bool Replaced = false;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          V->replaceAllUsesWith(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
        Visited.push_back(&In);
  GatherShuffleExtractSeq.clear();
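// Groups the instructions of VL into a ScheduleBundle. Copyable elements get
// dedicated ScheduleCopyableData; regular members must already have
// ScheduleData inside the current scheduling region.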
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
  ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (isa<PHINode>(S.getMainOp()) ||
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
          if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
              BundleMember && BundleMember->hasValidDependencies()) {
            BundleMember->clearDirectDependencies();
            if (RegionHasStackSave ||
                    BundleMember->getInst()))
              ControlDependentMembers.push_back(BundleMember);
        auto *SD = cast<ScheduleData>(SE);
        for (const Use &U : SD->getInst()->operands()) {
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
            if (ScheduleData *OpSD = getScheduleData(Op)) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                ControlDependentMembers.push_back(OpSD);
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
      calculateDependencies(Bundle, !ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, !ReSchedule, SLP,
                            ControlDependentMembers);
      initialFillReadyList(ReadyInsts);
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    if (!extendSchedulingRegion(V, S)) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(false, Invalid);
      return std::nullopt;
  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
        getScheduleCopyableData(cast<Instruction>(V));
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
      ReadyInsts.remove(BundleMember);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      if (isa<ScheduleCopyableData>(BD))
      if (BD->isReady()) {
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
        for (ScheduleBundle *B : Bundles)
          ReadyInsts.insert(B);
    ScheduledBundlesList.pop_back();
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (!Visited.insert(In).second) {
          ScheduleCopyableDataMapByInstUser
              [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
        } while (It != Op.end());
        EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        if (ScheduleData *OpSD = getScheduleData(I)) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
            ControlDependentMembers.push_back(OpSD);
      ScheduledBundles.find(I)->getSecond().pop_back();
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, false, SLP,
                            ControlDependentMembers);
    return std::nullopt;
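// ScheduleData objects are pool-allocated in fixed-size chunks to avoid
// per-instruction heap traffic.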
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
  return &(ScheduleDataChunks.back()[ChunkPos++]);
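// Extends the scheduling region so that it covers V, scanning up and down
// from the current region bounds (skipping assume-like intrinsics) and
// bailing out once ScheduleRegionSizeLimit is exceeded.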
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
  if (!ScheduleStart) {
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
      ++ScheduleStart->getIterator().getReverse();
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
    if (isa<PHINode>(I))
    ScheduleData *SD = ScheduleDataMap.lookup(I);
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);
    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
    LastLoadStoreInRegion = CurrentLoadStore;
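// Computes def-use, control, and memory dependencies for every member of the
// bundle. Copyable elements additionally depend on the user lane they feed;
// stacksave/stackrestore and allocas introduce extra control dependencies to
// keep stack adjustments ordered.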
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
            !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
        if (EI.UserTE->isCopyableElement(In)) {
          if (ScheduleCopyableData *UseSD =
                  getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
        } else if (Visited.insert(In).second) {
          if (ScheduleData *UseSD = getScheduleData(In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
      } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (isa<PHINode>(EI.UserTE->getMainOp()) &&
             (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
              any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
                auto *IU = dyn_cast<Instruction>(U);
                return IU->getParent() == EI.UserTE->getMainOp()->getParent();
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    for (User *U : BundleMember->getInst()->users()) {
      if (isa<PHINode>(U))
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
      if (!Visited.insert(I).second)
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        MakeControlDependent(I);
    if (RegionHasStackSave) {
      if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
              match(I, m_Intrinsic<Intrinsic::stackrestore>()))
          if (!isa<AllocaInst>(I))
          MakeControlDependent(I);
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
          MakeControlDependent(I);
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
           "NextLoadStore list for non memory effecting bundle?");
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
         "expected at least one instruction to schedule");
  WorkList.push_back(Bundle.getBundle().front());
  while (!WorkList.empty()) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
      Bundles = getScheduleBundles(SD->getInst());
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
        ReadyInsts.insert(Bundle);
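// Re-schedule path: marks everything in the region unscheduled again and
// resets the per-node unscheduled-dependency counters.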
void BoUpSLP::BlockScheduling::resetSchedule() {
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(false);
      SD->resetUnscheduledDeps();
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
  ReadyInsts.clear();
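// List-schedules all bundles of the block: ready entities are picked from a
// set ordered by scheduling priority (roughly source order), and their
// instructions are moved so that each bundle ends up contiguous.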
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  BS->resetSchedule();
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, false, this);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, false, this);
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
             SDTEs.front()->doesNotNeedToSchedule() ||
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
        ScheduleBundle Bundle;
        BS->calculateDependencies(Bundle, false, this);
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, false, this);
  BS->initialFillReadyList(ReadyInsts);
  Instruction *LastScheduledInst = BS->ScheduleEnd;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
        if (PickedInst->getNextNode() != LastScheduledInst)
        LastScheduledInst = PickedInst;
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
      auto *SD = cast<ScheduleData>(Picked);
      if (PickedInst->getNextNode() != LastScheduledInst)
      LastScheduledInst = PickedInst;
    auto Invalid = InstructionsState::invalid();
    BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
#ifdef EXPENSIVE_CHECKS
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
        [](const ScheduleBundle *Bundle) {
          return Bundle->isScheduled();
        "must be scheduled at this point");
  BS->ScheduleStart = nullptr;
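// Determines the element width to use when picking a vectorization factor
// for V: walks V's in-block operand tree to the widest loaded/extracted
// scalar type and memoizes the result in InstrElementSize.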
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  if (auto *IEI = dyn_cast<InsertElementInst>(V))
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
  if (auto *I = dyn_cast<Instruction>(V)) {
    Value *FirstNonBool = nullptr;
    while (!Worklist.empty()) {
      auto *Ty = I->getType();
      if (isa<VectorType>(Ty))
      if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
              FirstNonBool = U.get();
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    Width = DL->getTypeSizeInBits(V->getType());
  InstrElementSize[I] = Width;
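// Checks whether the values of this tree entry (and, recursively, its
// operands) can be represented in a smaller integer type. On success the
// demotable entries are appended to ToDemote, and MaxDepthLevel reflects how
// deep the demotable subtree is.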
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  if (all_of(E.Scalars, IsaPred<Constant>))
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (NodesToKeepBWs.contains(E.Idx))
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
    if (getTreeEntries(V).size() > 1)
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      if (auto *I = dyn_cast<Instruction>(V)) {
        unsigned BitWidth2 =
            std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
        while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        BitWidth1 = std::min(BitWidth1, BitWidth2);
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        UniqueBases.insert(EE->getVectorOperand());
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
  if (E.isGather() || !Visited.insert(&E).second ||
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
    return FinalAnalysis();
      return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
        return isVectorized(U) ||
               (E.Idx == 0 && UserIgnoreList &&
                UserIgnoreList->contains(U)) ||
               (!isa<CmpInst>(U) && U->getType()->isSized() &&
                !U->getType()->isScalableTy() &&
                DL->getTypeSizeInBits(U->getType()) <= BitWidth);
      }) && !IsPotentiallyTruncated(V, BitWidth);
                           bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
        if (!FinalAnalysis())
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
  auto AttemptCheckBitwidth =
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
          if (Checker(BitWidth, OrigBitWidth))
          if (BestFailBitwidth == 0 && FinalAnalysis())
          if (BestFailBitwidth == 0) {
  auto TryProcessInstruction =
        for (Value *V : E.Scalars)
          (void)IsPotentiallyTruncated(V, BitWidth);
            return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
        bool NeedToExit = false;
        if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
        if (!ProcessOperands(Operands, NeedToExit))
        return IsProfitableToDemote;
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
  switch (E.getOpcode()) {
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
      if (isa<PoisonValue>(V))
      auto *I = cast<Instruction>(V);
      KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
      return AmtKnownBits.getMaxValue().ult(BitWidth);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  case Instruction::LShr: {
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::AShr: {
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::UDiv:
  case Instruction::URem: {
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
        [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        unsigned Op0SignBits =
        unsigned Op1SignBits =
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                SignBits <= Op1SignBits &&
                ((SignBits != Op1SignBits &&
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
      CallChecker = AbsChecker;
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned VF = E.Scalars.size();
      if (Cost < BestCost) {
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
  return FinalAnalysis();
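// BoUpSLP::computeMinimumValueSizes: top-level minimum-bitwidth analysis.
// Walks the tree from the root (and any trunc-fed subtrees recorded in
// ExtraBitWidthNodes), computes the maximal bit width actually required, and
// records the chosen width/signedness per entry in MinBWs.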
void BoUpSLP::computeMinimumValueSizes() {
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    IsProfitableToDemoteRoot = true;
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
          return V->hasOneUse() || isa<Constant>(V) ||
            ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
            const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
            if (TEs.empty() || is_contained(TEs, UserTE))
            if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                isa<SIToFPInst, UIToFPInst>(U) ||
                (UserTE->hasState() &&
                 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                      SelectInst>(UserTE->getMainOp()) ||
                  isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
            unsigned UserTESz = DL->getTypeSizeInBits(
                UserTE->Scalars.front()->getType());
            if (all_of(TEs, [&](const TreeEntry *TE) {
                  auto It = MinBWs.find(TE);
                  return It != MinBWs.end() &&
                         It->second.first > UserTESz;
            return DL->getTypeSizeInBits(U->getType()) > UserTESz;
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
      return MaxBitWidth;
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
        [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
    unsigned MaxBitWidth = 1u;
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
        std::min(DL->getTypeSizeInBits(
                     E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                 DL->getTypeSizeInBits(ScalarTy));
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!IsKnownPositive)
      auto *I = dyn_cast<Instruction>(Root);
        MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    if (MaxBitWidth < 8 && MaxBitWidth > 1)
    if (NumParts > 1 &&
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;
    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    if (all_of(*UserIgnoreList,
          return isa<PoisonValue>(V) ||
                 cast<Instruction>(V)->getOpcode() == Instruction::Add;
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
      ReductionBitWidth = 1;
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        unsigned BitWidth2 = BitWidth1;
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    IsTruncRoot = true;
  bool IsSignedCmp = false;
  if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    unsigned Limit = 2;
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    for (unsigned Idx : RootDemotes) {
          DL->getTypeSizeInBits(V->getType()->getScalarType());
      if (OrigBitWidth > MaxBitWidth) {
    RootDemotes.clear();
    IsProfitableToDemoteRoot = true;
    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
      unsigned NewIdx = 0;
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
        NodeIdx < VectorizableTree.size() &&
        VectorizableTree[NodeIdx]->UserTreeIndex &&
        VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
            Instruction::Trunc &&
        !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
        NodeIdx < VectorizableTree.size() &&
        VectorizableTree[NodeIdx]->UserTreeIndex &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
            Instruction::ICmp &&
            VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
              auto *IC = dyn_cast<ICmpInst>(V);
              return IC && (IC->isSigned() ||
                            !isKnownNonNegative(IC->getOperand(0),
                                                SimplifyQuery(*DL)) ||
                            !isKnownNonNegative(IC->getOperand(1),
                                                SimplifyQuery(*DL)));
    if (MaxBitWidth == 0 ||
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
      if (UserIgnoreList)
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        if (isa<PoisonValue>(R))
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
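// Pass driver: runImpl walks the function's blocks in post order, collecting
// store and GEP seeds per block and invoking the store-chain, in-block, and
// GEP vectorization entry points; gather sequences are CSE'd at the end.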
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  DL = &F.getDataLayout();
  bool Changed = false;
      dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  for (auto *BB : post_order(&F.getEntryBlock())) {
    R.clearReductionData();
    collectSeedInstructions(BB);
    if (!Stores.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    Changed |= vectorizeChainsInBlock(BB, R);
    if (!GEPs.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
  R.optimizeGatherSequence();
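// vectorizeStoreChain: builds an SLP tree for one chain of consecutive
// stores, runs reordering, node transformation, external-use collection, and
// min-bitwidth analysis, then vectorizes if the cost model approves; Size
// reports the canonical graph size back to the caller's VF search.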
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S =
      Analysis.buildInstructionsState(ValOps.getArrayRef(), R);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}

/// Checks if the quadratic mean deviation of the vectorization-tree sizes is
/// small enough (below roughly 1% of the squared mean) for the sizes to be
/// treated as uniform. Sizes of 1 are ignored.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using the signed distance of their address operand to the
/// address of this group's BaseInstr.
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that distance.
  /// \returns The previously associated instruction index, or std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's BaseInstr.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to calculate the new base
    // distance.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all stores that have been vectorized from this group.
  void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it so we don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the Base instruction, i.e. the one with a 0 pointer
  /// distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to a store index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
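// In outline, vectorizeStores (below) buckets the collected stores into
// RelatedStoreInsts groups keyed by constant pointer distance, carves each
// group into consecutive chains, and retries those chains over a shrinking
// set of candidate VFs, recording vectorized and proven-unprofitable ranges
// in RangeSizes so later, cheaper attempts can skip them.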
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  SmallDenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.second]);
        PrevDist = DataVar.first;
      });

      // Skip chains that were already visited (same head/tail stores, same
      // stored values and the same length).
      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueTy->getScalarType()));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // registers are filled.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      // Candidate VFs, from the largest down to MinVF (halving each step; the
      // non-power-of-2 candidate, if any, goes first).
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = bit_floor(VF) == VF ? VF / 2 : bit_floor(VF))
        CandidateVFs.push_back(VF);
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return Size >= (First ? P.first : P.second);
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        unsigned End = Operands.size();
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore =
              std::distance(RangeSizes.begin(),
                            find_if(RangeSizes, std::bind(IsNotVectorized,
                                                          VF >= MaxRegVF, _1)));
          // Form slices of size VF starting from the first unvectorized store
          // and try to vectorize them.
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // The slice is known to be non-schedulable - skip it.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize = 0;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
              if (!Res) {
                // Remember the non-schedulable slice size range.
                NonSchedulable.try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
  // Groups of related stores, each ordered by the distance of the store
  // address to the group's base address.
  SmallVector<RelatedStoreInsts> SortedStores;

  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store in the group with the same distance, try to
    // vectorize the existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[getUnderlyingObject(GEP->getPointerOperand())].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  Type *ScalarTy = getValueType(VL[0]);
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VF - 1)) {
    // No actual vectorization should happen, if number of parts is the same
    // as provided vectorization factor (i.e. the scalar type is used for
    // vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough candidates - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
namespace {
/// Model horizontal reductions.
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  /// The type of reduction operation.
  ReductionOpsListType ReductionOps;
  // ... (ReducedVals, ReducedValsToOps, ReductionRoot, ReductionLimit,
  //      RdxKind, VectorValuesAndScales members) ...
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)) &&
           isa<SelectInst>(I) &&
           isa<CmpInst>(cast<SelectInst>(I)->getCondition());
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    // No need to check for associativity if there are only 2 elements.
    if (TwoElementReduction)
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, skip the true-value operand by
    // offsetting the index.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect)
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::And:
      if (UseSelect)
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
      return Builder.CreateBinaryIntrinsic(
          llvm::getMinMaxReductionIntrinsicOp(Kind), LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max patterns based on select instructions,
      // verifying that the compare operands are the select operands,
      // possibly behind extractelements; bail out to RecurKind::None when
      // the operands do not line up.
      CmpPredicate Pred;
      Value *LHS, *RHS;
      if (!match(Select, m_Select(m_Cmp(Pred, m_Value(LHS), m_Value(RHS)),
                                  m_Deferred(LHS), m_Deferred(RHS))))
        return RecurKind::None;
      if (!isa<ExtractElementInst>(LHS) && !isa<ExtractElementInst>(RHS) &&
          (Select->getTrueValue() != LHS || Select->getFalseValue() != RHS))
        return RecurKind::None;
      if (Pred == CmpInst::ICMP_SGT)
        return RecurKind::SMax;
      if (Pred == CmpInst::ICMP_SLT)
        return RecurKind::SMin;
      if (Pred == CmpInst::ICMP_UGT)
        return RecurKind::UMax;
      if (Pred == CmpInst::ICMP_ULT)
        return RecurKind::UMin;
    }
    return RecurKind::None;
  }

  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

private:
  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }
  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have a
      // single use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
public:
  HorizontalReduction() = default;
  /// Construct a two-operand reduction rooted at \p I with operands \p Ops.
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    assert(ReductionRoot && "Reduction root is not set!");
    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
                        /*TwoElementReduction=*/true))
      return false;

    return true;
  }

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by their
    // pointer operands.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          // Loads within one chain of consecutive accesses share a subkey so
          // they land in the same group.
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI))
              return hash_value(RLI->getPointerOperand());
          }
          if (LIt->second.size() > 2)
            return hash_value(LIt->second.back()->getPointerOperand());
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by the number of same/alternate opcodes and/or
    // pointer operands.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
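  // tryToReduce (below) consumes the groups built by matchAssociativeReduction:
  // it repeatedly slices a group to the current ReduxWidth, builds an SLP tree
  // for the slice, keeps the slice only when TreeCost + ReductionCost beats
  // the scalar chain, and finally stitches the vectorized slices and any
  // leftover scalars back together with scalar reduction ops.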
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI,
                     AssumptionCache *AC) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce to a
    // nearby power-of-2. We can safely generate oversized vectors and rely on
    // the backend to split them to legal sizes.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit &&
        all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
          return RedV.size() < 2 || !isSplat(RedV);
        })) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });

    // Return new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() &&
               It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree, AC) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // The LHS operand is as-is safe.
          } else if (isGuaranteedNotToBePoison(Res, AC) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }

        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };

    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math-flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus requiring extract if fully vectorized in other trees.
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // Check if the reduction value was not overriden by the
        // extractelement instruction because of the vectorization and exclude
        // it, if it is not compatible with other values.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            (S && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // Check if the reduction value was not overriden by the
          // extractelement instruction because of the vectorization and
          // exclude it, if it is not compatible with other values.
          auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
          if (!Inst)
            continue;
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (Candidates.size() > 1 && allConstant(Candidates)) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Used to check if the reduced values used same number of times. In
      // this case the compiler may produce better code. E.g. if reduced
      // values are aabbccdd (8 x values), then the first node of the tree
      // will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth =
            getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = TTI.getNumberOfParts(Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
        ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with a lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with less number of reduction ops or with different
          // shuffled reduction values.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops - drop this attempt,
        // try later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              if (!RedValI)
                return false;
              return V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/!IsCmpSelMinMax ||
                             VL.front()->getType()->isIntOrIntVectorTy() ||
                             ReductionLimit > 2);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        // The reduction root is used as the insertion point for new
        // instructions, so set it as externally used to prevent it from being
        // deleted.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost TreeCost = V.getTreeCost(VL);
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
        InstructionCost Cost = TreeCost + ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            unsigned Offset = Pos == Start ? Pos : Pos - 1;
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a select (min/max idiom), the
        // insert point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update TrackedToOrig mapping, since the tracked values might be
        // updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        // Record the vector value and its scale; for the "same scale" case
        // the whole vector is scaled once in the final reduction.
        VectorValuesAndScales.emplace_back(
            VectorizedRoot,
            OptReusedScalars && SameScaleFactor
                ? SameValuesCounter.front().second
                : 1,
            V.isSignedMinBitwidthRootNode());

        // Count vectorized reduced values to exclude them from the final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
        SameValuesCounter.clear();
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problem with poison propagation. If not possible to reorder
      // (both operands are originally RHS), emit an extra freeze instruction
      // for the LHS operand.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                      getRdxOperand(RedOp1, 0) == LHS ||
                                      isGuaranteedNotToBePoison(LHS, AC)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS, AC))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction: add the extra arguments and the not-vectorized
      // reduction values, pairing them up to avoid long dependency chains
      // between the scalar remainders.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(
                Sz / 2 + Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Instruction *RedOp = InstVals[I + 1].first;
              Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2,
                                InstVals[I].first, RedOp, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx",
                                         ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (ArrayRef<Value *> Candidates : ReducedVals) {
        for (Value *RdxVal : Candidates) {
          if (!Visited.insert(RdxVal).second)
            continue;
          unsigned NumOps = VectorizedVals.lookup(RdxVal);
          for (Instruction *RedOp :
               ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
            ExtraReductions.emplace_back(RedOp, RdxVal);
        }
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining uses
      // outside the reduction ops, so remove it and the ops feeding it.
      for (ReductionOpsType &RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreList.count(U) &&
                   "All users must be either in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *P = PoisonValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(P);
          }
        }
        V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
      }
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
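  // The helpers below turn the collected (vector, scale, signedness) triples
  // into the final scalar: createSingleOp reduces one vector to a scalar,
  // emitReduction folds multiple vectors together first, and
  // emitScaleForReusedOps compensates for scalars that occurred several
  // times in the original reduction.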
  /// Emit a horizontal reduction of the vectorized value \p Vec, scaled by
  /// \p Scale, casting the result to \p DestTy if needed (sketch; the revec
  /// case reduces each lane group separately).
  Value *createSingleOp(IRBuilderBase &Builder, TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    Value *Rdx;
    if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
      unsigned DestTyNumElements = getNumElements(VecTy);
      unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
      Rdx = PoisonValue::get(
          getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
      for (unsigned I : seq<unsigned>(DestTyNumElements)) {
        Value *Lane = Builder.CreateExtractVector(
            getWidenedType(Vec->getType()->getScalarType(), VF), Vec, I * VF);
        Rdx = Builder.CreateInsertElement(
            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
      }
    } else {
      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    }
    if (Rdx->getType() != DestTy)
      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // A reduction operation is needed only if the vector values still have to
    // be folded into a scalar; previously emitted partial reductions are
    // combined with plain vector ops instead.
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
            assert(SLPReVec && "FixedVectorType is not expected.");
            // Revectorized scalars: reduce each strided sub-vector and add
            // the shuffle costs for splitting (details elided).
            for (unsigned I : seq<unsigned>(ReducedVals.size())) {
              (void)I;
              VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
                                                            FMF, CostKind);
            }
          } else {
            VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                         FMF, CostKind);
          }
        } else {
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          if (RType == RedTy) {
            VectorCost =
                TTI->getArithmeticInstrCost(RdxOpcode, VectorTy, CostKind);
          } else {
            VectorCost = TTI->getCastInstrCost(
                IsSigned ? Instruction::SExt : Instruction::ZExt, VectorTy,
                getWidenedType(RType, ReduxWidth), TTI::CastContextHint::None,
                CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost =
              TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
        } else {
          // The previous reduction already exists; account it as a series of
          // operations + a single min/max.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          IntrinsicCostAttributes ICA(Id, RType, {RType, RType}, FMF);
          VectorCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, VectorTy, getWidenedType(RType, ReduxWidth),
                TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
  /// Fold the partially reduced vectors in VectorValuesAndScales into the
  /// final scalar value.
  Value *emitReduction(IRBuilderBase &Builder, TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates a reduction and combines it with the previous reduction.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // Scales Vec using the given Cnt scale factor and then performs a vector
    // combine with the previous value of VecRes.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // Duplicate the i1 vector Cnt times instead of multiplying by Cnt.
          unsigned VF = getNumElements(Vec->getType());
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
          for (unsigned I : seq<unsigned>(Cnt))
            std::iota(std::next(Mask.begin(), VF * I),
                      std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          Vec = Builder.CreateShuffleVector(Vec, Mask);
          break;
        }
        // res = mul vv, n
        Value *Scale = ConstantVector::getSplat(
            ElementCount::getFixed(getNumElements(Vec->getType())),
            ConstantInt::get(ScalarTy, Cnt));
        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        Vec = Builder.CreateMul(Vec, Scale);
        break;
      }
      case RecurKind::Xor: {
        // res = n % 2 ? 0 : vv
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        if (Cnt % 2 == 0)
          Vec = Constant::getNullValue(Vec->getType());
        break;
      }
      case RecurKind::FAdd: {
        // res = fmul v, n
        Value *Scale = ConstantVector::getSplat(
            ElementCount::getFixed(getNumElements(Vec->getType())),
            ConstantFP::get(ScalarTy, Cnt));
        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        Vec = Builder.CreateFMul(Vec, Scale);
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // res = vv
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
      }
      // Combine with the previous vector value.
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      ++NumVectorInstructions;
      // ... (widen i1 values to DestTy, resize the narrower of VecRes/Vec
      //      with an identity shuffle so both have the same VF, then fold:
      //      VecRes = createOp(Builder, RdxKind, VecRes, Vec, "rdx.op",
      //                        ReductionOps)) ...
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);

    return ReducedSubTree;
  }
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateZExtOrTrunc(
          Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V), DestTy);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
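  // Example of the i1 special case above (value names invented): reducing
  // <8 x i1> %m with RecurKind::Add into an i32 becomes
  //   %int = bitcast <8 x i1> %m to i8
  //   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
  //   %res = zext i8 %cnt to i32
  // instead of a zext of the mask followed by @llvm.vector.reduce.add.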
  /// Emits optimized code for a unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
  /// Emits the actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
} // end anonymous namespace
/// Gets the recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
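// Example for getAggregateSize (hypothetical types): for an insertvalue of
// type [2 x <4 x float>] the walk multiplies 2 (array) by 4 (vector) and
// returns 8; for { float, i32 } it returns std::nullopt because the struct
// elements are not homogeneous.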
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                         R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
/// Try and get a reduction instruction from a phi node.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if the operand is not
/// an instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and the currently
  // analyzed instruction is a binary operation, try to vectorize the
  // operands, using pre-order DFS traversal order.
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst))
        break;
    }

    // Try to vectorize operands. Continue analysis for the instruction from
    // the same basic block only to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  // Skip unprofitable FAdd/FSub chains that would rather form FMAs
  // (details elided).
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of operations: a two-element reduction must beat the
    // scalar op.
    FastMathFlags FMF;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
      FMF = FPCI->getFastMathFlags();
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost RedCost = TTI.getArithmeticReductionCost(
        Inst->getOpcode(), getWidenedType(Ty, 2), FMF, CostKind);
    InstructionCost ScalarCost =
        TTI.getArithmeticInstrCost(Inst->getOpcode(), Ty, CostKind);
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
P, Root, BB, R, PostponedInsts);
25477 Res |= tryToVectorize(PostponedInsts, R);
25484 for (
Value *V : Insts)
25485 if (
auto *Inst = dyn_cast<Instruction>(V); Inst && !
R.isDeleted(Inst))
25486 Res |= tryToVectorize(Inst, R);
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
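// tryToVectorizeSequence (below) is shared by the phi and compare drivers: it
// sorts the candidates with Comparator so that AreCompatible-equivalent
// values become adjacent, then greedily vectorizes each maximal compatible
// run, first at the maximal VF and then with smaller groups as a fallback.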
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with
    //    the size of the maximal register at first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    //    possible. This may result in better vectorization results than if we
    //    try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    //    same/alternate ops only; this may result in some extra final
    //    vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps are compatible (have equal predicates and
/// matching operand types); otherwise it acts as a strict-weak-ordering "less
/// than" comparator for sorting.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          auto *NodeI1 = DT.getNode(I1->getParent());
          auto *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
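// Note on the template flag above: compareCmp<false> is used as the sort
// comparator, while compareCmp<true> answers "are these two compares
// compatible for one vector bundle"; both share the operand-by-operand walk,
// so the sort naturally groups compatible compares next to each other for
// tryToVectorizeSequence.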
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares. Sort by type, compare predicate,
  // etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
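// vectorizeChainsInBlock (below) handles everything in a block that is not a
// store chain: phi nodes are bucketed by the shape of their incoming values
// (PHIToOpcodes) and vectorized via tryToVectorizeSequence, reductions are
// chased through getReductionInstr, and inserts/compares are queued into
// post-process sets that are flushed at block terminators.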
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node.
  DenseMap<Value *, SmallVector<Value *>> PHIToOpcodes;

  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
            // For extractelements, prefer program order of the vector
            // operands, then the constant extract indices.
            const auto *E1 = dyn_cast<ExtractElementInst>(I1);
            const auto *E2 = dyn_cast<ExtractElementInst>(I2);
            if (E1 && E2) {
              const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
              const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
              if (V1 && V2 && V1 != V2) {
                if (V1->getParent() != V2->getParent()) {
                  DomTreeNodeBase<BasicBlock> *NodeI1 =
                      DT->getNode(V1->getParent());
                  DomTreeNodeBase<BasicBlock> *NodeI2 =
                      DT->getNode(V2->getParent());
                  if (!NodeI1)
                    return NodeI2 != nullptr;
                  if (!NodeI2)
                    return false;
                  assert((NodeI1 == NodeI2) ==
                             (NodeI1->getDFSNumIn() ==
                              NodeI2->getDFSNumIn()) &&
                         "Different nodes should have different DFS numbers");
                  if (NodeI1 != NodeI2)
                    return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
                }
                return V1->comesBefore(V2);
              }
              std::optional<unsigned> Id1 = getExtractIndex(E1);
              std::optional<unsigned> Id2 = getExtractIndex(E2);
              if (Id1 && Id2)
                return *Id1 < *Id2;
            }
            continue;
          }
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };
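// Editorial note: AreCompatiblePHIs is deliberately weaker than equality: two
// phis are treated as compatible when their gathered non-phi operands
// pairwise share an opcode and parent block (or are undefs/constants), which
// is the precondition for bundling them into one vectorizable tree.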
  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert_range(Incoming);
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
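// Editorial note: the do/while above repeats the collect-sort-vectorize cycle
// because a successful round deletes or replaces phis, which can make a
// previously incompatible group vectorizable on the next pass.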
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction has no users and thus must be vectorized
  // only as the root of a reduction or a build vector/aggregate.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  // Make a pass over the block, trying to vectorize chains that start at
  // "root" instructions.
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Some instructions were deleted; start over to keep the iterators
        // valid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here; delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here; delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions to try
      // to vectorize as many instructions as possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Some instructions were deleted; start over to keep the iterators
        // valid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
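// Editorial note: resetting the iterators ("It = BB->begin(); E = BB->end();")
// after a successful vectorization is the simple way to cope with iterator
// invalidation, since vectorized scalars are erased from the block.
// VisitedInstrs keeps the rescan from repeating work on untouched
// instructions.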
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. The maximum number of elements is based on the size of the
    // index expression, since the indices rather than the GEPs themselves are
    // being vectorized.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize the candidate getelementptrs in a SetVector to preserve
      // program order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some candidates may have been vectorized already, or their index may
      // have been folded to a constant; drop those.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove pairs of getelementptrs with constant differences (one can be
      // computed from the other) and ensure all candidate indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices: gather-like cases of the form
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      // where the loads of a, the loads of b, and the subtractions can be
      // performed in parallel.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
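// Editorial note: only the single, non-constant index operands are bundled
// here, not the getelementptrs themselves; vectorizing the index arithmetic
// (e.g. the a[i] - b[i] computations feeding a gather) can pay off even when
// the resulting addresses stay scalar.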
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (same opcode, same parent), otherwise it is definitely not
  // profitable to try to vectorize them.
  auto StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
        DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return bool(getSameOpcode({I1, I2}, *TLI));
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times, in which case
    // the stores order (reversed to meet the memory dependencies) must be
    // followed.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
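// Editorial note: stores are grouped by underlying base object (the Stores
// map) and analyzed bottom-up over the reversed program order. StoreSorter
// only pre-sorts by type/opcode so that AreCompatibleStores can split the
// sorted run into chains that vectorizeStores then costs and widens.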
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const bool IsAnyPointerUsedOutGraph, const int64_t Diff)
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
A private abstract base class describing the concept of an individual alias analysis implementation.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
ArrayRef< T > take_back(size_t N=1) const
Return a copy of *this with only the last N elements.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
LLVM_ABI CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
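Both containers share the count()/contains()/insert() vocabulary above; insert() reports whether the element was new. A sketch (the Value pointers are illustrative):
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/IR/Value.h"

void setDemo(llvm::Value *A, llvm::Value *B) {
  llvm::SmallPtrSet<llvm::Value *, 4> Seen; // 4 inline slots before heap use
  bool NewA = Seen.insert(A).second;        // true on first insertion
  Seen.insert(B);
  bool HasA = Seen.contains(A);             // true

  llvm::SmallSet<int, 8> Ints;              // same idea for non-pointer keys
  Ints.insert(42);
  unsigned N = Ints.count(42);              // 1
  (void)NewA; (void)HasA; (void)N;
}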
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
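A short sketch of the StoreInst accessors: the value operand is the datum being written, the pointer operand the destination (the helper name is hypothetical):
#include "llvm/IR/Instructions.h"

bool storesIntegerValue(llvm::Instruction *I) {
  auto *SI = llvm::dyn_cast<llvm::StoreInst>(I);
  if (!SI)
    return false;
  // Address space of the destination pointer.
  unsigned AS = SI->getPointerOperandType()->getPointerAddressSpace();
  (void)AS;
  return SI->getValueOperand()->getType()->isIntegerTy();
}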
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
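The scalar/vector-agnostic queries above combine naturally; a sketch that widens integer lanes to i64 while preserving element count (the helper is hypothetical):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"

llvm::Type *widenToI64(llvm::Type *Ty, llvm::LLVMContext &Ctx) {
  if (!Ty->isIntOrIntVectorTy())
    return nullptr;
  // getScalarSizeInBits sees through vectors to the element width.
  if (Ty->getScalarSizeInBits() >= 64)
    return Ty;
  llvm::Type *I64 = llvm::Type::getInt64Ty(Ctx);
  // getWithNewType swaps the element type but keeps the element count.
  return Ty->isVectorTy() ? Ty->getWithNewType(I64) : I64;
}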
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
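A sketch of the use-tracking API: rewrite a value only when its use count makes the rewrite trivially local (the helper is hypothetical):
#include "llvm/IR/Value.h"

bool replaceIfSingleUse(llvm::Value *V, llvm::Value *NewV) {
  if (!V->hasOneUse())          // equivalent to hasNUses(1)
    return false;
  NewV->takeName(V);            // keep the readable name on the replacement
  V->replaceAllUsesWith(NewV);  // rewrites the single remaining use
  return true;
}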
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
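VectorType::get with ElementCount is the canonical constructor; ElementCount::getFixed requests a non-scalable vector. A sketch:
#include "llvm/IR/DerivedTypes.h"

llvm::VectorType *makeV4I32(llvm::LLVMContext &Ctx) {
  return llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx),
                               llvm::ElementCount::getFixed(4));
}

unsigned fixedLanes(llvm::VectorType *VTy) {
  // getFixedValue() asserts that the count is not scalable.
  return VTy->getElementCount().getFixedValue();
}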
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness, if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking for deletion trivially dead operands.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction of externally used values.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the best SLP tree.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
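Typical use is documenting exhaustive switches; reaching the marker aborts in asserts builds and is undefined behavior otherwise. A sketch:
#include "llvm/Support/ErrorHandling.h"

int lanesFor(int Kind) {
  switch (Kind) {
  case 0: return 4;
  case 1: return 8;
  }
  llvm_unreachable("unhandled kind"); // all valid kinds handled above
}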
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
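The m_* combinators compose into tree patterns that bind sub-values as they match. A sketch recognizing (X + C1) << C2 where the add has a single use (the helper is hypothetical):
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"

bool matchShiftedAdd(llvm::Value *V, llvm::Value *&X, const llvm::APInt *&C1,
                     const llvm::APInt *&C2) {
  using namespace llvm::PatternMatch;
  // m_Value binds X, m_APInt binds the constants, m_OneUse guards the add.
  return match(V, m_Shl(m_OneUse(m_Add(m_Value(X), m_APInt(C1))),
                        m_APInt(C2)));
}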
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ..., are the values from the original input ranges.
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>'s and is nicer to use.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
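The bit-manipulation helpers scattered through this list cover the common rounding idioms; a sketch (values illustrative):
#include "llvm/ADT/bit.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"

void roundingDemo() {
  unsigned A = llvm::bit_ceil(5u);          // 8: smallest power of two >= 5
  uint64_t B = llvm::PowerOf2Ceil(5);       // 8: 64-bit variant
  bool Pow2 = llvm::has_single_bit(8u);     // true: exactly one bit set
  uint64_t C = llvm::alignTo(13, llvm::Align(8)); // 16: next multiple of 8
  (void)A; (void)B; (void)Pow2; (void)C;
}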
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type, i.e. adding extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
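The range wrappers above take the container directly, so predicate checks read as one-liners. A sketch:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

void rangeDemo() {
  llvm::SmallVector<int, 4> Vals = {1, 2, 3, 4};
  bool AllPos  = llvm::all_of(Vals, [](int V) { return V > 0; });        // true
  bool AnyEven = llvm::any_of(Vals, [](int V) { return V % 2 == 0; });   // true
  bool NoneNeg = llvm::none_of(Vals, [](int V) { return V < 0; });       // true
  auto Evens   = llvm::count_if(Vals, [](int V) { return V % 2 == 0; }); // 2
  (void)AllPos; (void)AnyEven; (void)NoneNeg; (void)Evens;
}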
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
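This is the primitive behind consecutive-access checks: a distance of exactly one element means the second pointer starts where the first element ends. A sketch mirroring what isConsecutiveAccess does (the helper name is hypothetical):
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"

bool areConsecutiveLoads(llvm::LoadInst *A, llvm::LoadInst *B,
                         const llvm::DataLayout &DL,
                         llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      A->getType(), A->getPointerOperand(), B->getType(),
      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1; // B loads the element right after A
}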
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ And
Bitwise or logical AND of integers.
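Given a RecurKind, createSimpleReduction (listed earlier in this index) emits the matching llvm.vector.reduce.* call. A sketch that folds a vector to a scalar sum (assumes the builder is already positioned at the insertion point):
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

llvm::Value *emitAddReduction(llvm::IRBuilderBase &Builder, llvm::Value *Vec) {
  // For RecurKind::Add on an integer vector this lowers to
  // llvm.vector.reduce.add(Vec).
  return llvm::createSimpleReduction(Builder, Vec, llvm::RecurKind::Add);
}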
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range Range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
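enumerate and seq are the indexed-iteration workhorses; a sketch:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"

void indexedDemo(llvm::SmallVector<int, 4> &Vals) {
  for (auto En : llvm::enumerate(Vals)) {
    std::size_t Idx = En.index(); // 0-based position
    int &V = En.value();          // reference into Vals
    (void)Idx; (void)V;
  }
  // Half-open integer range [0, Vals.size()).
  unsigned N = Vals.size();
  for (unsigned I : llvm::seq<unsigned>(0, N))
    (void)I;
}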
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the source type.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type T.
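The EdgeInfo specialization above follows the standard four-member DenseMapInfo contract. A generic sketch for a hypothetical key type (MyKey and its fields are illustrative, not part of this file):
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"

struct MyKey {
  void *Ptr;
  unsigned Idx;
  bool operator==(const MyKey &O) const {
    return Ptr == O.Ptr && Idx == O.Idx;
  }
};

namespace llvm {
template <> struct DenseMapInfo<MyKey> {
  // The sentinel keys must never compare equal to a real key.
  static MyKey getEmptyKey() { return {nullptr, ~0u}; }
  static MyKey getTombstoneKey() { return {nullptr, ~0u - 1}; }
  static unsigned getHashValue(const MyKey &K) {
    return static_cast<unsigned>(hash_combine(K.Ptr, K.Idx));
  }
  static bool isEqual(const MyKey &L, const MyKey &R) { return L == R; }
};
} // namespace llvm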
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair, std::tuple, etc.) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair, std::tuple, etc.) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.