#ifdef EXPENSIVE_CHECKS
// ...
#endif

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
203 cl::desc(
"Display the SLP trees with Graphviz"));
207 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
212 cl::desc(
"Try to replace values with the idempotent instructions for "
213 "better vectorization."));
/// \returns True if the type is a valid element type for vectorization.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value \p V. For stores, returns the type of
/// the stored value; for compares, the type of the compared operands; for
/// insertelement, the type of the inserted scalar. Otherwise the value's own
/// type is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for \p Ty (1 for scalars).
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}
/// Returns the number of elements of type \p Ty that, for \p Sz scalars,
/// still forms whole vector registers after legalization (rounding up).
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the number of elements of type \p Ty, not greater than \p Sz,
/// which forms a type that splits into whole vector registers during
/// legalization (rounding down).
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
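// Illustrative worked example (not from the original source): with Sz = 7
// scalars of type i32 on a target that legalizes <7 x i32> into NumParts = 2
// registers, RegVF = bit_ceil(divideCeil(7, 2)) = 4, so the floor variant
// returns (7 / 4) * 4 = 4 (the largest prefix filling whole registers), while
// the ceil variant returns bit_ceil(4) * 2 = 8.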
/// Expands a scalar shuffle mask into a per-element mask usable by
/// shufflevector when each SLP "scalar" is itself a vector (REVEC).
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
/// \returns the number of groups of shufflevector instructions in \p VL.
/// A group requires that all values are shufflevectors from the same source,
/// each mask is an extract-subvector mask, and together the masks use all
/// elements of the source.
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // All shuffles in a group must read from the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a combined shufflevector mask which vectorizes the shufflevectors
/// in \p VL by concatenating their masks with accumulated source offsets.
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
422 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
429 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
430 !isa<ExtractValueInst, UndefValue>(V))
432 auto *
I = dyn_cast<Instruction>(V);
433 if (!
I || isa<ExtractValueInst>(
I))
435 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
437 if (isa<ExtractElementInst>(
I))
439 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
455 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator,
/// including `sub`/`fsub` whose uses make operand order irrelevant.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0.
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
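// Example of the special-casing above (illustrative, not from the original
// source): a scalar `sub a, b` counts as commutative when every use is either
// `icmp eq/ne (sub a, b), 0` or `abs(sub a, b)` without strict signed
// wrapping, because (a - b) == 0 iff (b - a) == 0 and |a - b| == |b - a|, so
// swapping the operands cannot change the observable result.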
/// \returns the number of operands of \p I to consider for reordering.
/// (Enclosing declaration is approximated here; the original name is elided
/// in this excerpt.) Intrinsic calls are limited to their two data operands.
static unsigned getNumberOfOperands(const Instruction *I) {
  if (isa<IntrinsicInst>(I)) {
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  }
  return I->getNumOperands();
}
/// \returns the insert/extract index of \p Inst, using \p Offset as the base
/// offset, or std::nullopt if the index is not a valid immediate.
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns the linearized index of the element being inserted or extracted
/// by \p Inst, handling insertelement/extractelement and insertvalue.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  unsigned Index = Offset;
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
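// Worked example of the linearization above (illustrative): with aggregate
// type { [4 x i32], [4 x i32] } and insertvalue indices (1, 2), the loop
// computes (0 * 2 + 1) * 4 + 2 = 6, i.e. the flat position of the scalar in
// row-major order.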
/// \returns true if all non-poison values in \p VL use the same opcode; for
/// compares the predicate must also match.
static bool allSameOpcode(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return true;
  Instruction *MainOp = cast<Instruction>(*It);
  unsigned Opcode = MainOp->getOpcode();
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
  });
}
/// Prepares a use mask for the given shuffle \p Mask, either treating undefs
/// as "used" elements or masking out the elements of the first/second
/// argument that are consumed by the shuffle.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
/// Checks if the given value is actually an undefined constant vector.
/// If \p UseMask is not empty, also checks whether the non-masked elements
/// mask an insertelement buildvector.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
/// Checks if the vector of extractelement instructions in \p VL can be
/// represented as a single shuffle of (at most) two source vectors, filling
/// \p Mask with the resulting shuffle indices.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from an undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
        return std::nullopt;
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // For correct shuffling we have to have at most 2 different vector
    // operands.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the lane number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never filled, this is an identity or single-source shuffle;
  // otherwise it is a two-source permutation.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the extracting index of the extractelement or extractvalue
/// instruction \p E, or std::nullopt if it is not a single immediate index.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
/// Checks whether \p Opcode may appear as the alternate opcode of an
/// alternate-shuffle node.
static bool isValidForAlternation(unsigned Opcode) {
  // Integer division/remainder can trap, so it must not be blended
  // speculatively into an alternate vector operation.
  return !Instruction::isIntDivRem(Opcode);
}
/// Helper class to check if two binary operations with different opcodes can
/// be treated as having the same opcode by converting their constant operands
/// (e.g. `shl x, 1` can be emitted as `mul x, 2`).
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  /// Supported opcodes, sorted by priority.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  enum : MaskType {
    ShlBIT = 0b1,
    AShrBIT = 0b10,
    MulBIT = 0b100,
    AddBIT = 0b1000,
    SubBIT = 0b10000,
    AndBIT = 0b100000,
    OrBIT = 0b1000000,
    XorBIT = 0b10000000,
    MainOpBIT = 0b100000000,
  };
  /// Returns the ConstantInt operand of \p I, if any, together with its
  /// operand position. Non-commutative opcodes only accept the constant on
  /// the right-hand side.
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    auto *BinOp = cast<BinaryOperator>(I);
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
      return {CI, 1};
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// Each set bit represents an opcode this instruction can still be
    /// converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction whose opcode never
    /// occurred in VL; SeenBefore records the opcodes seen so far.
    MaskType SeenBefore = 0;

    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Record the opcode and narrow Mask; returns false if the candidate is
    /// incompatible with every remaining opcode.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
    }
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }
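    // Illustrative walk-through (not from the original source): for the
    // bundle {shl x, 1; mul y, 2}, both instructions report an
    // interchangeable mask of MulBIT | ShlBIT, so Mask & SeenBefore keeps
    // both bits and the chain above picks Instruction::Shl as the common
    // opcode for the whole bundle.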
    /// Checks if the instruction can be converted to \p Opcode.
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        return false;
      default:
        break;
      }
      llvm_unreachable("Cannot find interchangeable instruction.");
    }
    /// \returns the operand list of \p I rewritten so that it computes the
    /// same value under opcode \p ToOpcode (constant operand converted).
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      APInt ToCIValue;
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
                                          FromCIValue.getZExtValue());
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Mul:
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          ToCIValue = APInt::getZero(FromCIValueBitWidth);
        } else {
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
          ToCIValue.negate();
        }
        break;
      case Instruction::And:
        ToCIValue = ToOpcode == Instruction::Mul
                        ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
                        : APInt::getZero(FromCIValueBitWidth);
        break;
      default:
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = APInt::getZero(FromCIValueBitWidth);
        break;
      }
      Value *LHS = I->getOperand(1 - Pos);
      Constant *RHS =
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      if (Pos == 1 ||
          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
        return SmallVector<Value *>({LHS, RHS});
      return SmallVector<Value *>({RHS, LHS});
    }
  };
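  // Illustrative conversions driven by the table above (not from the original
  // source): rewriting `mul y, 8` to the Shl opcode yields operands {y, 3}
  // because log2(8) == 3, and rewriting `add x, 0` to Sub keeps {x, 0} since
  // adding or subtracting zero is the same identity operation.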
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  }
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
    assert(!AltOp || MainOp != AltOp);
  }
  bool add(const Instruction *I) {
    assert(isa<BinaryOperator>(I) &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    if (ConstantInt *CI = isBinOpWithConstantInt(I).first) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
};
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main instruction (also VL0) and the alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  /// True if the instruction list contains copyable elements.
  bool HasCopyables = false;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Returns MainOp/AltOp if \p I matches (or can be converted to) the
  /// corresponding opcode, nullptr otherwise.
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (!Converter.add(I) || !Converter.add(MainOp))
      return nullptr;
    if (MainOp != AltOp && isa<BinaryOperator>(AltOp)) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
        return AltOp;
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      return nullptr;
    return Converter.hasAltOp() ? AltOp : MainOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem-like operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul,  Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(MulDiv, getOpcode()) &&
           is_contained(MulDiv, getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub-like operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(AddSub, getOpcode()) &&
           is_contained(AddSub, getAltOpcode());
  }

  /// Checks if the main op is a compare.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp);
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp/AltOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  /// Checks if the value is a copyable element (an element that does not
  /// match the main opcode but can be modeled as an idempotent operation).
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (!HasCopyables)
      return false;
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      return false;
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return !isa<PoisonValue>(V);
    if (I->getParent() != MainOp->getParent())
      return true;
    if (I->getOpcode() == getOpcode())
      return false;
    if (!I->isBinaryOp())
      return true;
    BinOpSameOpcodeHelper Converter(MainOp);
    return !Converter.add(I) || !Converter.add(MainOp) ||
           Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
  }

  /// Checks if the value does not require scheduling.
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    auto *I = dyn_cast<Instruction>(V);
    if (!HasCopyables)
      return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
             doesNotNeedToBeScheduled(V);
    // MainOp for copyables is always schedulable, to correctly identify
    // non-schedulable copyables.
    if (getMainOp() == V)
      return false;
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
               // If the copyable instruction comes after MainOp
               // (non-schedulable, but used in the block) - cannot vectorize
               // it, will possibly generate use before def.
               (doesNotNeedToBeScheduled(I) && !MainOp->comesBefore(I));
      };
      return IsNonSchedulableCopyableElement(V);
    }
    return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
           doesNotNeedToBeScheduled(V);
  }

  /// Checks if the state represents instructions with copyable elements.
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};
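// Illustrative use of InstructionsState (hand-written sketch, not from the
// original source): for VL = {add a0, b0; sub a1, b1}, getSameOpcode returns
// a state with MainOp = add and AltOp = sub, so isAltShuffle() is true and
// codegen emits both vector opcodes and blends the results with a single
// shufflevector mask selecting lane 0 from the add and lane 1 from the sub.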
1389std::pair<Instruction *, SmallVector<Value *>>
1391 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(
I);
1392 assert(SelectedOp &&
"Cannot convert the instruction.");
1393 if (
I->isBinaryOp()) {
1395 return std::make_pair(SelectedOp,
Converter.getOperand(SelectedOp));
1414 for (
Value *V : VL) {
1415 if (isa<PoisonValue>(V))
1417 assert(isa<Instruction>(V) &&
"Only accepts PoisonValue and Instruction.");
1418 auto *Inst = cast<Instruction>(V);
1419 if (Inst->getOpcode() == Opcode)
1431 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1432 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1433 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1444 "Assessing comparisons of different types?");
1454 return (BasePred == Pred &&
1456 (BasePred == SwappedPred &&
/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the opcode(s) under which the whole list could be
/// vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions (or poison).
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // If the total number of predicates is > 2 but only 2 when swapped
    // predicates are unified, treat swappable predicates as compatible
    // opcodes rather than alternates.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (Value *V : iterator_range(It, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (BinOpHelper.add(I))
        continue;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = MainOp->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are
        // not compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  if (IsBinOp) {
    MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
    assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
    AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
    assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
  }
  assert((MainOp == AltOp || !allSameOpcode(VL)) &&
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
  assert(all_of(VL,
                [&](Value *V) {
                  return isa<PoisonValue>(V) ||
                         S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
                }) &&
         "Invalid InstructionsState.");
  return S;
}
/// \returns true if all of the values in \p VL have the same type.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
}

/// \returns True if an in-tree use of \p Scalar by \p UserInst also needs an
/// extract, i.e. the scalar is used as a pointer/scalar operand of the
/// vectorized user instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI,
                                        const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
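// Example of the composition above (illustrative): Mask = [3, 2, 1, 0]
// followed by SubMask = [1, 0, 3, 2] yields NewMask[I] = Mask[SubMask[I]] =
// [2, 3, 0, 1], i.e. the effect of applying SubMask after Mask as one shuffle.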
/// Order may contain elements assigned the special value (size), which marks
/// positions corresponding to undef values. This function assigns the unused
/// in-bounds indices to those positions so the order becomes a permutation.
/// Example with two undef positions receiving values 3 and 7:
///   before:  6 9 5 4 9 2 1 0
///   after:   6 3 5 4 7 2 1 0
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const size_t Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}
/// \returns a bitset for selecting opcodes: false for \p Opcode0 and true for
/// \p Opcode1 (expanded per scalar element for REVEC).
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
                                      unsigned Opcode0, unsigned Opcode1) {
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  return OpcodeMask;
}

/// Duplicates each scalar constant \p VF times when widening a constant list
/// for REVEC codegen. (The helper's name here is approximate; the original
/// declaration is elided in this excerpt.)
static SmallVector<Constant *> replicateConstants(ArrayRef<Constant *> VL,
                                                  unsigned VF) {
  assert(all_of(VL, [](Constant *V) { return isConstant(V); }) &&
         "Expected scalar constants.");
  SmallVector<Constant *> NewVal(VL.size() * VF);
  for (auto [I, V] : enumerate(VL))
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  return NewVal;
}
/// Builds the inverse permutation mask for the given \p Indices:
/// Mask[Indices[I]] = I for every position I.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
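// Example (illustrative): Indices = [2, 0, 1] inverts to Mask = [1, 2, 0],
// since Mask[2] = 0, Mask[0] = 1 and Mask[1] = 2.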
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
/// Checks if the provided value does not require scheduling: it is not an
/// instruction, or all of its operands are non-instructions, PHIs, or defined
/// in other blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling because all its
/// users are PHIs or live in other blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limit the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified array of instructions does not require scheduling.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if the widened type of \p Ty with \p Sz elements splits into
/// whole vectors (or is a power of 2) upon legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (!isValidElementType(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}

/// Returns the number of parts the vector type \p VecTy splits into during
/// legalization, or 0 if the split is not representable.
static unsigned
getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
                 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
    return 0;
  unsigned Sz = getNumElements(VecTy);
  if (NumParts >= Sz || Sz % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
    return 0;
  return NumParts;
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  class ScheduleEntity;
  class ScheduleData;
  class ScheduleCopyableData;
  class ScheduleBundle;

public:
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    // ...
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  Value *vectorizeTree(
      const ExtraValueToDebugLocsMap &ExternallyUsedValues,
      Instruction *ReductionRoot = nullptr,
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
  /// \returns the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the type/is-signed info of the root node, looking through any
  /// leading cast.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(IntegerType::get(
                                Root.Scalars.front()->getContext(),
                                It->second.first),
                            It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns the reduction type after min-bitwidth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }
  /// Checks whether the given non-empty order is an identity permutation
  /// (positions holding the out-of-bounds value Sz are ignored).
  static bool isIdentityOrder(ArrayRef<unsigned> Order) {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }

  /// Reorders the tree entries top-to-bottom/bottom-to-top.
  void reorderTopToBottom();
  void reorderBottomToTop(bool IgnoreReorder);
2123 std::optional<OrdersType>
2161 return MaxVecRegSize;
2166 return MinVecRegSize;
2174 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
2176 return MaxVF ? MaxVF : UINT_MAX;
2228 unsigned *BestVF =
nullptr,
2229 bool TryRecursiveCheck =
true)
const;
2237 template <
typename T>
2264 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
2265 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
2290 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
2291 MaxLevel(MaxLevel) {}
2345 if (isa<LoadInst>(V1)) {
2347 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
2352 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
2354 return U == U1 || U == U2 || R.isVectorized(U);
2357 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2360 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2362 ((
int)V1->getNumUses() == NumLanes ||
2363 AllUsersAreInternal(V1, V2)))
2369 auto CheckSameEntryOrFail = [&]() {
2374 any_of(TEs2, [&](TreeEntry *E) {
return Set.contains(E); }))
2380 auto *LI1 = dyn_cast<LoadInst>(V1);
2381 auto *LI2 = dyn_cast<LoadInst>(V2);
2383 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2385 return CheckSameEntryOrFail();
2388 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2389 LI2->getPointerOperand(),
DL, SE,
true);
2390 if (!Dist || *Dist == 0) {
2393 R.TTI->isLegalMaskedGather(
2396 return CheckSameEntryOrFail();
2400 if (std::abs(*Dist) > NumLanes / 2)
2409 auto *C1 = dyn_cast<Constant>(V1);
2410 auto *C2 = dyn_cast<Constant>(V2);
2415 if ((C1 && isa<InsertElementInst>(V2)) ||
2416 (C2 && isa<InsertElementInst>(V1)))
2429 if (isa<UndefValue>(V2))
2433 Value *EV2 =
nullptr;
2446 int Dist = Idx2 - Idx1;
2449 if (std::abs(Dist) == 0)
2451 if (std::abs(Dist) > NumLanes / 2)
2458 return CheckSameEntryOrFail();
2461 auto *I1 = dyn_cast<Instruction>(V1);
2462 auto *I2 = dyn_cast<Instruction>(V2);
2464 if (I1->getParent() != I2->getParent())
2465 return CheckSameEntryOrFail();
2473 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
2474 !S.isAltShuffle()) &&
2476 return isa<PoisonValue>(V) ||
2477 cast<Instruction>(V)->getNumOperands() ==
2478 S.getMainOp()->getNumOperands();
2484 if (I1 && isa<PoisonValue>(V2))
2487 if (isa<UndefValue>(V2))
2490 return CheckSameEntryOrFail();
    /// Recursively scores the operand trees rooted at \p LHS and \p RHS up to
    /// MaxLevel and returns the cumulative score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of LHS and RHS.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // Early-return the current score if we reached MaxLevel, the values are
      // not both instructions, or further recursion is not profitable.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recurse towards the operands of I1 and I2, trying all possible operand
      // pairs and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair OpIdx1 with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative, try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with another OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1),
                                 I2->getOperand(OpIdx2), I1, I2,
                                 CurrLevel + 1, {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best; never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
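    // Illustrative trace (not from the original source): comparing
    // LHS = load(p) + x against RHS = load(p+1) + y, the load pair adds
    // ScoreConsecutiveLoads at level 2 on top of the add pair's shallow score
    // at level 1, so candidates whose sub-trees match more deeply win the
    // reordering even when their top-level scores tie.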
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// Alternate Predicate Opcode: true if the operand belongs to the
      /// inverse operation of an alternate (e.g. +/-) sequence.
      bool APO = false;
      /// Helper flag for the reordering function: marks already-selected
      /// operands.
      bool IsUsed = false;
    };

    /// The strategy used to match the operand of the next lane against the
    /// operands already selected.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions with the same opcode
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;

    /// The number of operands to reorder.
    unsigned ArgSize = 0;

    /// Per-operand, per-lane operand data.
    SmallVector<OperandDataVec, 4> OpsVec;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    const Loop *L = nullptr;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swaps the operand at \p OpIdx1 with \p OpIdx2 at \p Lane.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    /// \returns the additional score for the candidate at \p Idx in \p Lane
    /// due to possible broadcasting of the elements in the operand column
    /// \p OpIdx.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \returns the additional score for scalars whose users are all
    /// vectorized, so no extra extractelement will be needed.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Vector-like instructions (extracts with constant indices) are already
      // externally used; vectorizing them adds no extra extract.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }
    /// Score scaling factor for fully compatible instructions but with
    /// different numbers of external uses.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed,
                          const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      int Score = LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                               /*U2=*/nullptr,
                                               /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to distinguish between operands with all-vectorized
          // and partially vectorized external uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
        }
        IsUsed = true;
      }
      return Score;
    }
2801 std::optional<unsigned>
2802 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2806 unsigned NumOperands = getNumOperands();
2809 Value *OpLastLane = getData(
OpIdx, LastLane).V;
2812 ReorderingMode RMode = ReorderingModes[
OpIdx];
2813 if (RMode == ReorderingMode::Failed)
2814 return std::nullopt;
2817 bool OpIdxAPO = getData(
OpIdx, Lane).APO;
2823 std::optional<unsigned>
Idx;
2833 bool IsUsed = RMode == ReorderingMode::Splat ||
2834 RMode == ReorderingMode::Constant ||
2835 RMode == ReorderingMode::Load;
2837 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
2839 OperandData &OpData = getData(
Idx, Lane);
2841 bool OpAPO = OpData.APO;
2850 if (OpAPO != OpIdxAPO)
2855 case ReorderingMode::Load:
2856 case ReorderingMode::Opcode: {
2857 bool LeftToRight = Lane > LastLane;
2858 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2859 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
2860 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2862 if (Score >
static_cast<int>(BestOp.Score) ||
2863 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
2866 BestOp.Score = Score;
2867 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] = Score;
2871 case ReorderingMode::Constant:
2872 if (isa<Constant>(
Op) ||
2873 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
2875 if (isa<Constant>(
Op)) {
2877 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
2880 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
2884 case ReorderingMode::Splat:
2885 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
2886 IsUsed =
Op == OpLastLane;
2887 if (
Op == OpLastLane) {
2889 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
2895 case ReorderingMode::Failed:
2901 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2905 return std::nullopt;
    /// Selects the lane with the most constrained operands as the starting
    /// point for greedy reordering.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> implements a simple voting algorithm:
      // the first element is a counter, the second the winning lane. Lanes
      // with fewer freely movable operands impose the most constraints.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Visit lanes from the last to the first so that, on ties, the lane
      // with the highest index wins (closer to the original order).
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash of the actual operands ordering.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that may be reordered for
    /// \p Lane, plus the count of compatible (same opcode/parent)
    /// instructions among them.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. With only two possible
      // APO values we count the 'true' ones and derive the others by
      // subtraction. Operands with the same opcode and parent are more
      // profitable since they rarely need to be moved.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting to find the majority opcode/parent.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).valid() ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands to OpsVec.
    void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
                        const InstructionsState &S) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      Instruction *MainOp = S.getMainOp();
      ArgSize = ::getNumberOfOperands(MainOp);
      unsigned NumOperands = MainOp->getNumOperands();
      unsigned NumLanes = VL.size();
      OpsVec.resize(ArgSize);
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);
      for (unsigned Lane : seq<unsigned>(NumLanes)) {
        // Poison lanes get poison operands with default APO.
        auto *I = dyn_cast<Instruction>(VL[Lane]);
        if (!I && isa<PoisonValue>(VL[Lane])) {
          for (unsigned OpIdx : seq<unsigned>(NumOperands))
            OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
          continue;
        }
        // APO == false means the operand is used with its original (main)
        // opcode; the inverse applies only to alternate sequences.
        bool IsInverseOperation = false;
        if (S.isCopyableElement(VL[Lane])) {
          // The value is a copyable element.
          IsInverseOperation = !isCommutative(S.getMainOp());
        } else {
          assert(I && "Expected instruction");
          auto [SelectedOp, Ops] = convertTo(I, S);
          // Commutativity is checked on the original instruction because
          // isCommutative also inspects def-use relationships.
          IsInverseOperation = !isCommutative(SelectedOp);
        }
        for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// Checks if there is at least one compatible operand in every lane other
    /// than \p Lane, making broadcasting \p Op the preferred strategy.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // For a small number of loads, try load matching instead.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // Pick the best operand for broadcasting in the other lanes.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. The same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane), making both operands constants.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                (Lns == 2 &&
                 (isa<Constant>(Data.V) || shouldBroadcast(Data.V, OpI, Ln)) &&
                 isa<Constant>(Data.V)))) ||
              // 3. The operand is loop-invariant and a non-constant
              // loop-invariant operand exists in another lane.
              (IsInvariant && !isa<Constant>(Data.V) &&
               !getSameOpcode({Op, Data.V}, TLI).valid() &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }
3168 "Op is expected to be getValue(OpIdx, Lane).");
3169 bool OpAPO = getData(
OpIdx, Lane).APO;
3170 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3173 if (
any_of(seq<unsigned>(getNumOperands()), [&](
unsigned OpI) {
3174 const OperandData &
Data = getData(OpI, Ln);
3175 if (
Data.APO != OpAPO ||
Data.IsUsed)
3177 Value *OpILn = getValue(OpI, Ln);
3178 return (L && L->isLoopInvariant(OpILn)) ||
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
               const InstructionsState &S, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      // Append all the operands of RootVL.
      appendOperands(RootVL, Operands, S);
    }

    /// \returns the operand gathered for \p OpIdx across all lanes.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    /// Greedily reorders the operands so that each lane matches best with the
    /// neighboring lanes.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand column has its own strategy mode, used to select the
      // instructions for each lane to match those selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm: visit each lane once and
      // decide the best order right away, without back-tracking. To increase
      // effectiveness, start at the lane whose operands can move the least,
      // as they introduce the most constraints.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat; it may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        } else {
          llvm_unreachable("Unexpected value kind.");
        }
      }

      // Skip reordering if the operands are already a perfect (possibly
      // shuffled) diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove the check for non-power-of-2 numbers
        // of unique scalars.
        return UniqueValues.size() != 2 &&
               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
                                        UniqueValues.size());
      };

      if (SkipReordering())
        return;
      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // Keep the original operand order for FirstLane and reorder the rest of
      // the lanes, visiting them in a circular fashion around FirstLane with
      // increasing distance.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // By not selecting a value we let the following operands pick a
            // better match in a later run of getBestOperand().
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // If no strategy failed, the collected order is final; otherwise a
      // second reordering pass is performed with the failed modes demoted.
      if (!StrategyFailed)
        return;
    }
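    // Illustrative effect of reorder() (hand-written sketch): for lanes
    // {a + b, d + c} where c pairs with a's sub-tree better than d does,
    // getBestOperand swaps d and c in the second lane, so the final operand
    // columns become {a, c} and {b, d}, maximizing the look-ahead score per
    // operand column.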
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
      }
      return OS;
    }
#endif
  };
  /// Evaluates each pair in \p Candidates and returns the index of the pair
  /// with the highest score (best chance to root a profitable tree), or
  /// std::nullopt if nothing scores above \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(
          Candidates[I].first, Candidates[I].second, /*U1=*/nullptr,
          /*U2=*/nullptr, /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }
  /// Removes an instruction from its block and eventually deletes it; like
  /// Instruction::eraseFromParent() except that the actual deletion is
  /// delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }
3442 template <
typename T>
3445 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3447 for (
T *V : DeadVals) {
3448 auto *
I = cast<Instruction>(V);
3452 for (
T *V : DeadVals) {
3453 if (!V || !Processed.
insert(V).second)
3455 auto *
I = cast<Instruction>(V);
3458 for (
Use &U :
I->operands()) {
3459 if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3460 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3462 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
3463 return Entry->VectorizedValue == OpI;
3467 I->dropAllReferences();
3469 for (
T *V : DeadVals) {
3470 auto *
I = cast<Instruction>(V);
3471 if (!
I->getParent())
3476 cast<Instruction>(U.getUser()));
3478 "trying to erase instruction with users.");
3479 I->removeFromParent();
3483 while (!DeadInsts.
empty()) {
3486 if (!VI || !VI->getParent())
3489 "Live instruction found in dead worklist!");
3490 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
3497 for (
Use &OpU : VI->operands()) {
3498 Value *OpV = OpU.get();
3509 if (
auto *OpI = dyn_cast<Instruction>(OpV))
3510 if (!DeletedInstructions.contains(OpI) &&
3511 (!OpI->getType()->isVectorTy() ||
3512 none_of(VectorValuesAndScales,
3513 [&](
const std::tuple<Value *, unsigned, bool> &V) {
3514 return std::get<0>(V) == OpI;
3520 VI->removeFromParent();
  /// Checks if the instruction was already analyzed as a possible reduction
  /// root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Registers the given instruction as analyzed for being a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Clears the list of analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if the specified value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
  /// Checks if the value is vectorized in the tree.
  bool isVectorized(Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  }
private:
  /// Checks if the given values/operations can be demoted to a smaller bit
  /// width, collecting the values to demote.
  bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
                             unsigned &BitWidth,
                             SmallVectorImpl<unsigned> &ToDemote,
                             DenseSet<const TreeEntry *> &Visited,
                             unsigned &MaxDepthLevel,
                             bool &IsProfitableToDemote,
                             bool IsTruncRoot) const;

  /// Builds the list of reorderable operands for the given tree entry.
  void buildReorderableOperands(TreeEntry *UserTE,
                                SmallVectorImpl<TreeEntry *> &Operands);

  /// Reorders a node that has reused scalars with the given mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Checks if all users of the scalar \p I are part of the vectorized graph.
  bool areAllUsersVectorized(Instruction *I) const;

  /// \returns the graph entry for the \p Idx operand of the entry \p E.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  }

  /// Returns the root instruction of the given node \p Entry.
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  /// \returns the cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// Tries to vectorize the bundle as a split (alternate) node with the given
  /// \p LocalState and an optional \p InterleaveFactor.
  // (exact signature elided in this excerpt)

  /// Checks whether the scalars can be reused, possibly after resizing, and
  /// computes the reuse mask.
  // (exact signature elided in this excerpt; takes bool ResizeAllowed = false)

  /// Vectorizes the operand \p NodeIdx of the entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  /// Processes a buildvector sequence for the given entry.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  /// Creates a buildvector for the given gather entry.
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  /// Returns the instruction in the bundle after which the vectorized code
  /// must be emitted.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Checks if a single register of the gathered entry can be shuffled from
  /// other graph entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Checks if the whole gathered entry can be shuffled from other graph
  /// entries, per register part.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
                        SmallVectorImpl<int> &Mask,
                        SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
                        unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering the list of scalars.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Sets the insertion point after the last instruction in the bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether a tiny tree is fully vectorizable.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Attempts to vectorize loads that were gathered during tree building.
  void tryToVectorizeGatheredLoads(
      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                           8> &GatheredLoads);

  /// Collects stores that use scalars of the given entry.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Finds reorder indices implied by external store users of the entry.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  /// Reorders a gather node.
  void reorderGatherNode(TreeEntry &TE);

  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns the common mask combining reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      if (State == TreeEntry::SplitVectorize)
        return {};
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      ::addMask(Mask, ReuseShuffleIndices);
      return Mask;
    }

    /// \returns the mask for split vectorize nodes.
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
        Mask[I] =
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
                       : 0);
      return Mask;
    }
    /// \returns true if the scalars in \p VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered; still can
        // treat the vector as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }
    /// \returns true if this entry has the same operands as \p TE, in any
    /// order.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }
    /// \returns the vectorization factor for this entry.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
    /// Opcodes for combined (fused) nodes.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Points back to the container with all the tree entries.
    VecTreeTy &Container;

    /// The user tree entry and operand index of this entry.
    EdgeInfo UserTreeIndex;

    /// The operations state of this entry.
    InstructionsState S = InstructionsState::invalid();

    /// The interleaving factor of interleaved-load Vectorize nodes.
    unsigned InterleaveFactor = 0;

    /// True if the node does not require scheduling.
    bool DoesNotNeedToSchedule = false;
    /// Sets this entry's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// \returns the interleaving factor.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets the interleaving factor for interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Marks the node as one that does not require scheduling.
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    /// \returns true if the node does not require scheduling.
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

    /// Sets all operands at once.
    void setOperands(ArrayRef<ValueList> Operands) {
      for (unsigned I : seq<unsigned>(Operands.size()))
        setOperand(I, Operands[I]);
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the \p OpIdx operand of this TreeEntry.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
    }

    /// \returns \p Op itself if it matches the main/alternate opcode, and the
    /// main operation otherwise.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && getMatchingMainOpOrAltOp(I))
        return Op;
      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    /// Returns true if the state of the operations is defined.
    bool hasState() const { return S.valid(); }
    /// Marks the value as a copyable element of the node.
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    }

    /// Returns true if the value is a copyable element of this node.
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    }

    /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    /// Returns the state of the operations.
    const InstructionsState &getOperations() const { return S; }
    /// When ReuseShuffleIndices is empty, returns the position of \p V within
    /// the vector of Scalars; otherwise remaps it via the reuse indices.
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }
    /// Returns true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    /// Returns true if this node does not form whole vector registers and is
    /// not a power-of-2 node.
    bool
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        break;
      }
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndex: ";
      if (UserTreeIndex)
        dbgs() << UserTreeIndex;
      else
        dbgs() << "<invalid>";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };
#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new gather TreeEntry (no scheduling bundle required).
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices,
                        ReorderIndices);
  }

  /// Create a new TreeEntry whose state is derived from \p Bundle.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    return E;
  }
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads are handled separately, do not create an entry here.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
    assert((!Last->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) ||
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build the final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
              Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      if (isa<PHINode>(S.getMainOp()) ||
          (!S.areInstructionsWithCopyableElements() &&
           all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          continue;
        }
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(
              Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Gather node: remember the values to gather and map them to the nodes
      // that will gather them.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
    }
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }
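  /// Note: every non-gather scalar is registered in ScalarToTreeEntries (or
  /// in ScalarsInSplitNodes for split nodes), so later queries can map a
  /// scalar back to all vector nodes containing it.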
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

  void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id)
      VectorizableTree[Id]->dump();
  }
  /// Returns the list of vector tree entries associated with the value \p V.
  ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
      return {};
    return It->getSecond();
  }

  /// Returns the list of split vector tree entries associated with \p V.
  ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
      return {};
    return It->getSecond();
  }

  /// Returns the first vector node for \p V matching the values \p VL (and,
  /// if \p SameVF is set, also the vector factor).
  TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
                                    bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
        return TE;
    return nullptr;
  }
  /// Checks whether the alternate instruction operands are profitable to
  /// vectorize as a single node with alternate opcodes.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Result of the legality analysis for a list of scalars.
  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool IsLegal;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

  public:
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    }
    const InstructionsState &getInstructionsState() const { return S; }
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  };
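  /// Note: the constructor assertion encodes the class invariant: a legal
  /// result requires a valid instructions state and permission to look for
  /// duplicates, while illegal results may carry any fallback combination.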
  /// Checks whether the given list of scalars is legal to vectorize and
  /// returns the analysis result.
  ScalarsVectorizationLegality
  getScalarsVectorizationLegality(ArrayRef<Value *> VL,
                                  const EdgeInfo &UserTreeIdx,
                                  bool TryCopyableElementsVectorization) const;

  /// Determines the final entry state for the given list of scalars.
  TreeEntry::EntryState
  getScalarsVectorizationState(const InstructionsState &S,
                               ArrayRef<Value *> VL,
                               bool IsScatterVectorizeUserTE,
                               OrdersType &CurrentOrder,
                               SmallVectorImpl<Value *> &PointerOps);

  /// Maps a (user entry, operand index) pair to the operand tree entry.
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// True if the graph nodes are being transformed rather than built.
  bool IsGraphTransformMode = false;

  /// The index of the first entry for the gathered loads, if any.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// Cached (CompressMask, LoadVecTy, InterleaveFactor, IsMasked) data for
  /// the compressed (masked) load entries.
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
  /// This POD struct describes one external user in the vectorized graph.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar = nullptr;
    /// Which user that uses the scalar.
    llvm::User *User = nullptr;
    /// Vector node, the scalar is part of.
    const TreeEntry &E;
    /// Which lane does the scalar belong to.
    int Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;

  /// Checks if two instructions may access the same memory, caching the
  /// result per instruction pair.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// A list of values that need to be extracted out of the tree.
  UserList ExternalUses;
  /// Base class for the scheduling entities: ScheduleData, ScheduleBundle and
  /// ScheduleCopyableData.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled.
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->isReady();
      return cast<ScheduleBundle>(this)->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->hasValidDependencies();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->hasValidDependencies();
      return cast<ScheduleBundle>(this)->hasValidDependencies();
    }
    /// Returns the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getUnscheduledDeps();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->getUnscheduledDeps();
      return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
    }
    /// Increments the number of unscheduled dependencies.
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
    }
    /// Returns the number of dependencies.
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(this)->getDependencies();
    }
    /// Returns the instruction this entity is built for.
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(this)->getInst();
    }
    /// Gets/sets if the entity is scheduled.
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->dump(OS);
      return cast<ScheduleBundle>(this)->dump(OS);
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
    SE.dump(OS);
    return OS;
  }
#endif
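  /// Note: this hierarchy uses LLVM-style RTTI: every subclass stores a Kind
  /// tag and defines classof(), which is what makes the isa<>/dyn_cast<>
  /// dispatch in the accessors above work without C++ RTTI.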
  /// Contains all scheduling relevant data for an instruction.
  class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    void init(int BlockSchedulingRegionID, Instruction *I) {
      Inst = I;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears the direct (non-memory, non-control) dependency information.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    Instruction *getInst() const { return Inst; }
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);
    }
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);
    }
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << *getInst(); }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    /// The instruction this data is built for.
    Instruction *Inst = nullptr;
    /// Points to the next load/store instruction in the block, if any.
    ScheduleData *NextLoadStore = nullptr;
    /// The memory and control dependencies of this instruction.
    SmallVector<ScheduleData *> MemoryDependencies;
    SmallVector<ScheduleData *> ControlDependencies;
    /// The ID of the scheduling region this instruction belongs to.
    int SchedulingRegionID = 0;
    /// The number of dependencies. Constant after calculating them.
    int Dependencies = InvalidDeps;
    /// The number of dependencies minus the number of already scheduled
    /// dependencies.
    int UnscheduledDeps = InvalidDeps;
  };

  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(OS);
    return OS;
  }
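  /// Note: Dependencies counts all known dependencies and stays constant once
  /// calculated, while UnscheduledDeps counts down as dependencies get
  /// scheduled; a node becomes ready exactly when it reaches zero.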
  /// A bundle of instructions that will be vectorized together.
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The TreeEntry that this bundle corresponds to.
    TreeEntry *TE = nullptr;
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties of all bundle members.
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }
        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated for
    /// all bundle members.
    bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the members of the bundle.
    ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
    /// Adds an entity to the bundle.
    void add(ScheduleEntity *SD) { Bundle.push_back(SD); }

    /// Gets/sets the associated tree entry.
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    static ScheduleBundle invalid() { return {false}; }

    operator bool() const { return IsValid; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      for (const ScheduleEntity *SD : Bundle) {
        if (isa<ScheduleCopyableData>(SD))
          OS << "[Copyable]";
        OS << *SD->getInst() << ' ';
      }
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

  friend inline raw_ostream &
  operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(OS);
    return OS;
  }
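  /// Note: a bundle becomes ready only when the unscheduled-dependency
  /// counters of all its members reach zero; a single member with InvalidDeps
  /// marks the whole bundle as not yet calculated.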
  /// Contains the scheduling data for a copyable element of a vectorized
  /// bundle.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The copyable instruction.
    Instruction *Inst = nullptr;
    /// The edge in the tree this copyable element belongs to.
    EdgeInfo EI;
    /// The ID of the scheduling region this data belongs to.
    int SchedulingRegionID = 0;
    /// The bundle this copyable element is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies and returns the
    /// remaining count.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    Instruction *getInst() const { return Inst; }
    const EdgeInfo &getEdgeInfo() const { return EI; }

    /// Returns the containing bundle.
    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    /// The number of dependencies. Constant after calculating them.
    int Dependencies = ScheduleData::InvalidDeps;
    /// The number of dependencies minus the number of already scheduled
    /// dependencies.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleDataMap.clear();
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(*SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      return getScheduleData(dyn_cast<Instruction>(V));
    }
    /// Returns the copyable schedule data for the given edge and value, if
    /// it is in the current scheduling region.
    ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
                                                  const Value *V) const {
      if (ScheduleCopyableDataMap.empty())
        return nullptr;
      auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
      if (It == ScheduleCopyableDataMap.end())
        return nullptr;
      ScheduleCopyableData *SD = It->getSecond().get();
      if (!isInSchedulingRegion(*SD))
        return nullptr;
      return SD;
    }
    /// Returns all copyable schedule data for the given (user, operand
    /// index, value) triple, restricted to the current scheduling region.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Value *User, unsigned OperandIdx,
                            const Value *V) const {
      if (ScheduleCopyableDataMapByInstUser.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInstUser.find(
          std::make_pair(std::make_pair(User, OperandIdx), V));
      if (It == ScheduleCopyableDataMapByInstUser.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Checks if all \p NumOps operand edges of the user \p User for the
    /// operand \p Op are modeled by copyable schedule data.
    bool areAllOperandsReplacedByCopyableData(Instruction *User, Value *Op,
                                              const BoUpSLP &R,
                                              unsigned NumOps) const {
      assert(NumOps > 0 && "No operands");
      if (ScheduleCopyableDataMap.empty())
        return false;
      ArrayRef<TreeEntry *> Entries = R.getTreeEntries(User);
      if (Entries.empty())
        return false;
      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
      SmallDenseMap<const TreeEntry *, unsigned>
          PotentiallyReorderedEntriesCount;
      for (TreeEntry *TE : Entries) {
        // The operand may be used several times, check every use.
        for (const Use &U : User->operands()) {
          if (U.get() != Op)
            continue;
          bool IsCommutativeUser = isCommutative(User);
          EdgeInfo EI(TE, U.getOperandNo());
          if (!IsCommutativeUser && !isa<CmpInst>(User)) {
            unsigned &OpCnt =
                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
            if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
              return false;
            ++OpCnt;
          } else {
            ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
                  .first->getSecond();
          }
        }
      }
      // For commutative/compare users the operands might have been swapped
      // during operand reordering, so check all candidate operand edges of
      // the user's lane.
      if (!PotentiallyReorderedEntriesCount.empty()) {
        for (auto &P : PotentiallyReorderedEntriesCount) {
          auto *It = find(P.first->Scalars, User);
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(P.first->Scalars.begin(), It);
          assert(Lane >= 0 && "Lane is not found");
          if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          for (unsigned OpIdx : seq<unsigned>(P.first->getNumOperands())) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
              --P.second;
          }
        }
        return all_of(PotentiallyReorderedEntriesCount,
                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
                        return P.second == NumOps - 1;
                      });
      }
      return true;
    }
    /// Returns all copyable schedule data for the given instruction.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableDataByInst(const Instruction *I) const {
      if (ScheduleCopyableDataMapByInst.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInst.find(I);
      if (It == ScheduleCopyableDataMapByInst.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }

    /// Returns all copyable schedule data for the given user value.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableDataByUser(const Value *User) const {
      if (ScheduleCopyableDataMapByUsers.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByUsers.find(User);
      if (It == ScheduleCopyableDataMapByUsers.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Registers copyable schedule data for the given edge/instruction and
    /// cross-indexes it by instruction and by users.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(std::make_pair(EI, I),
                           std::make_unique<ScheduleCopyableData>(
                               SchedulingRegionID, I, EI, Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(CD);
      // Find the lanes of the user entry that actually use I and register
      // the (user instruction, operand index) -> copyable data mapping.
      ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
      const auto *It = find(Op, I);
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
            !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
        if (!Visited.insert(In).second) {
          It = find(std::next(It), Op.end(), I);
          continue;
        }
        ScheduleCopyableDataMapByInstUser
            .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
            .first->getSecond()
            .push_back(CD);
        ScheduleCopyableDataMapByUsers.try_emplace(I)
            .first->getSecond()
            .insert(CD);
        // If the user itself is modeled as copyable data in its own user
        // entry, it no longer acts as a direct user of I.
        EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, In))
          ScheduleCopyableDataMapByUsers[I].remove(UserCD);
        It = find(std::next(It), Op.end(), I);
      } while (It != Op.end());
      ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
          CD);
      return *CD;
    }
    ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        return {};
      auto It = ScheduledBundles.find(I);
      if (It == ScheduledBundles.end())
        return {};
      return It->getSecond();
    }
    bool isInSchedulingRegion(const ScheduleEntity &SD) const {
      if (const auto *Data = dyn_cast<ScheduleData>(&SD))
        return Data->getSchedulingRegionID() == SchedulingRegionID;
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
        return CD->getSchedulingRegionID() == SchedulingRegionID;
      return all_of(cast<ScheduleBundle>(SD).getBundle(),
                    [&](const ScheduleEntity *BundleMember) {
                      return isInSchedulingRegion(*BundleMember);
                    });
    }
    /// Marks an instruction/bundle as scheduled and updates the dependency
    /// counters of all depending instructions/bundles, inserting newly ready
    /// ones into the ready list.
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ScheduleBundle *Bundle) {
        // Handle the def-use chain dependencies.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            SmallVector<ScheduleBundle *> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              CopyableBundle.push_back(&CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP:    gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(!isa<ScheduleCopyableData>(Data) &&
                   "Expected non-copyable data");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP:    gets ready: " << *Data << "\n");
          }
        };
        // Decrement the unscheduled counter for the operand: prefer the
        // copyable schedule data for the (user, operand index) edge, fall
        // back to the plain schedule data otherwise.
        auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
                                      Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OpIdx, I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };
        SmallVector<ScheduleBundle *> OwnBundle;
        ArrayRef<ScheduleBundle *> Bundles;
        if (Bundle) {
          OwnBundle.push_back(Bundle);
          Bundles = OwnBundle;
        } else {
          Bundles = getScheduleBundles(BundleMember->getInst());
        }
        if (!Bundles.empty()) {
          auto *In = BundleMember->getInst();
          // Count how many times In is used as an operand, to correctly
          // decrement the dependency counters.
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (isa<ScheduleCopyableData>(BundleMember)) {
            // Copyable data is modeled as a single use.
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              if (auto *I = dyn_cast<Instruction>(U.get())) {
                auto Res = OperandsUses.try_emplace(I, 0);
                ++Res.first->getSecond();
                ++TotalOpCount;
              }
            }
          }
          // Same as DecrUnschedForInst, but using the tree entry operands,
          // since operands might have been reordered during tree building.
          auto DecrUnschedForTreeOp = [&](Instruction *I, TreeEntry *UserTE,
                                          unsigned OpIdx) {
            if (!ScheduleCopyableDataMap.empty()) {
              const EdgeInfo EI = {UserTE, OpIdx};
              if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
                DecrUnsched(CD, /*IsControl=*/false);
                return;
              }
            }
            auto It = OperandsUses.find(I);
            assert(It != OperandsUses.end() && "Operand not found");
            if (It->second > 0) {
              --It->getSecond();
              assert(TotalOpCount > 0 && "No more operands to decrement");
              --TotalOpCount;
              if (ScheduleData *OpSD = getScheduleData(I))
                DecrUnsched(OpSD, /*IsControl=*/false);
            }
          };
          for (ScheduleBundle *Bundle : Bundles) {
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            // Need to search for the lane since the tree entry can be
            // reordered.
            int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
                                     find(Bundle->getTreeEntry()->Scalars, In));
            assert(Lane >= 0 && "Lane not set");
            if (isa<StoreInst>(In) &&
                !Bundle->getTreeEntry()->ReorderIndices.empty())
              Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(
                              Bundle->getTreeEntry()->Scalars.size()) &&
                   "Couldn't find extract lane");
            assert((isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
                    In->getNumOperands() ==
                        Bundle->getTreeEntry()->getNumOperands() ||
                    Bundle->getTreeEntry()->isCopyableElement(In)) &&
                   "Missed TreeEntry operands?");
            for (unsigned OpIdx :
                 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
              if (auto *I = dyn_cast<Instruction>(
                      Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): "
                                  << *I << "\n");
                DecrUnschedForTreeOp(I, Bundle->getTreeEntry(), OpIdx);
              }
          }
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands()) {
            if (auto *I = dyn_cast<Instruction>(U.get())) {
              LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): " << *I
                                << "\n");
              DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
            }
          }
        }
        // Handle the memory dependencies.
        auto *SD = dyn_cast<ScheduleData>(BundleMember);
        if (!SD)
          return;
        SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
        for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
          if (!VisitedMemory.insert(MemoryDep).second)
            continue;
          LLVM_DEBUG(dbgs() << "SLP:   check for readiness (mem): "
                            << *MemoryDep << "\n");
          DecrUnsched(MemoryDep);
        }
        // Handle the control dependencies.
        SmallPtrSet<const ScheduleData *, 4> VisitedControl;
        for (ScheduleData *Dep : SD->getControlDependencies()) {
          if (!VisitedControl.insert(Dep).second)
            continue;
          LLVM_DEBUG(dbgs() << "SLP:   check for readiness (ctrl): " << *Dep
                            << "\n");
          DecrUnsched(Dep, /*IsControl=*/true);
        }
      };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
        ProcessBundleMember(SD, {});
      } else {
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << Bundle << "\n");
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              if (isa<ScheduleCopyableData>(SD))
                return true;
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          ArrayRef<ScheduleBundle *> SDBundles;
          if (!isa<ScheduleCopyableData>(SD))
            SDBundles = getScheduleBundles(SD->getInst());
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
                                                                  : nullptr);
          }
        }
      }
    }
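    /// Note: schedule() implements the core of list scheduling: it walks the
    /// def-use, memory and control dependencies of the scheduled entity,
    /// decrements their UnscheduledDeps counters, and moves any entity (or
    /// whole bundle) that reaches zero into the ready list.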
    /// Verify basic self consistency properties of the scheduling region.
    void verify() {
      if (!ScheduleStart)
        return;
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
            Bundle->verify();
          }
          continue;
        }
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(*SD) &&
               "primary schedule data not in window?");
        SD->verify();
      }
      assert(all_of(ReadyInsts,
                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
                    }) &&
             "item in ready list not ready?");
    }
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
          if (!Bundles.empty()) {
            for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
                continue;
              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
                ReadyList.insert(Bundle);
                LLVM_DEBUG(dbgs() << "SLP:    initially in ready list: "
                                  << *Bundle << "\n");
              }
            }
            continue;
          }
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
                                const InstructionsState &S,
                                const EdgeInfo &EI);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle; not std::nullopt if \p VL is allowed
    /// to be scheduled.
    std::optional<ScheduleBundle *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S, const EdgeInfo &EI);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    /// Maps (edge, instruction) pairs to the copyable schedule data.
    SmallDenseMap<std::pair<EdgeInfo, const Value *>,
                  std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    /// Copyable schedule data, indexed by instruction.
    SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    /// Copyable schedule data, indexed by (user instruction, operand index).
    SmallDenseMap<
        std::pair<std::pair<const Instruction *, unsigned>, const Value *>,
        SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInstUser;

    /// Copyable schedule data, indexed by user value.
    SmallDenseMap<const Value *, SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    /// The first memory accessing instruction in the scheduling region.
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region.
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The ID of the current scheduling region. For a new vectorization
    /// iteration it is incremented, which "removes" all ScheduleData from
    /// the region.
    int SchedulingRegionID = 1;
  };
  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);

  /// DenseMapInfo for the reordering indices.
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }
    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  unsigned MaxVecRegSize; // Set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default 128).

  /// Final size of the reduced vector, if narrowing of the reduction was
  /// possible.
  unsigned ReductionBitWidth = 0;

  /// The size of the original vector graph before transformations.
  unsigned BaseGraphSize = 1;

  /// Max/min pair of the cast bitwidths, if the tree contains any cast nodes.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
};

template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
  using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
  using SecondInfo = DenseMapInfo<unsigned>;
  static BoUpSLP::EdgeInfo getEmptyKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
                             SecondInfo::getEmptyKey());
  }
  static BoUpSLP::EdgeInfo getTombstoneKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
                             SecondInfo::getTombstoneKey());
  }
  static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
    return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
                                    SecondInfo::getHashValue(Val.EdgeIdx));
  }
  static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
                      const BoUpSLP::EdgeInfo &RHS) {
    return LHS == RHS;
  }
};

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;
  using NodeRef = TreeEntry *;
  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Iterate over the user edges of a tree entry.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;
    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }
  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }
  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  /// Turn the TreeEntry iterator into a TreeEntry* iterator so that it
  /// dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }
  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }
  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize ||
        Entry->State == TreeEntry::CompressVectorize)
      return "color=blue";
    return "";
  }
};
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back to be able to erase them from
      // parent and memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask.
/// \p Reuses contains the original mask for the scalars reused in the node.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. If the resulting order is an identity
/// order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      ExtractShuffles =
          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Exclude nodes for strided geps, better to reorder them independently.
    if (Entries.front().front()->State == TreeEntry::StridedVectorize)
      return std::nullopt;
    // Perfect match in the graph, will reuse the previously vectorized node.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // If the root node matches, skip, it is reordered separately.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // Do not reorder 2-element shuffles with an odd/even swap mask if the
    // matching node has reuses.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(enumerate(Mask), [](const auto &P) {
          return P.value() % 2 != static_cast<int>(P.index()) % 2;
        }))
      return std::nullopt;
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder =
      [&](MutableArrayRef<unsigned> CurrentOrder, ArrayRef<int> Mask,
          int PartSz, int NumParts, function_ref<unsigned(unsigned)> GetVF) {
        for (int I : seq<int>(0, NumParts)) {
          if (ShuffledSubMasks.test(I))
            continue;
          const int VF = GetVF(I);
          if (VF == 0)
            continue;
          unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
          MutableArrayRef<unsigned> Slice =
              CurrentOrder.slice(I * PartSz, Limit);
          // Shuffle of at least 2 vectors - ignore.
          if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
          // Try to include as many elements from the mask as possible.
          int FirstMin = INT_MAX;
          int SecondVecFound = false;
          for (int K : seq<int>(Limit)) {
            int Idx = Mask[I * PartSz + K];
            if (Idx == PoisonMaskElem) {
              Value *V = GatheredScalars[I * PartSz + K];
              if (isConstant(V) && !isa<PoisonValue>(V)) {
                SecondVecFound = true;
                break;
              }
              continue;
            }
            if (Idx < VF) {
              if (FirstMin > Idx)
                FirstMin = Idx;
            } else {
              SecondVecFound = true;
              break;
            }
          }
          FirstMin = (FirstMin / PartSz) * PartSz;
          // Shuffle of at least 2 vectors - ignore.
          if (SecondVecFound) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
          for (int K : seq<int>(Limit)) {
            int Idx = Mask[I * PartSz + K];
            if (Idx == PoisonMaskElem)
              continue;
            Idx -= FirstMin;
            if (Idx >= PartSz) {
              SecondVecFound = true;
              break;
            }
            if (CurrentOrder[I * PartSz + Idx] >
                    static_cast<unsigned>(I * PartSz + K) &&
                CurrentOrder[I * PartSz + Idx] !=
                    static_cast<unsigned>(I * PartSz + Idx))
              CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
          }
          if (SecondVecFound) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
        }
      };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(
        CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
          if (!GatherShuffles[I])
            return 0U;
          return std::max(Entries[I].front()->getVectorFactor(),
                          Entries[I].back()->getVectorFactor());
        });
  unsigned NumUndefs = count(CurrentOrder, NumScalars);
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
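// For example, for 4 elements the reverse order is {3, 2, 1, 0}; unused slots
// may hold the sentinel value Sz and are accepted by the check above.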
/// Checks if the provided list of pointers \p PointerOps represents strided
/// pointers for the type \p ElemTy, where the stride is a runtime SCEV value.
/// Returns the base pointer if the pointers are strided, std::nullopt
/// otherwise.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find the lowest/highest pointers from the pointer operands and check the
  // distances between them.
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (SE.isKnownNegative(Diff)) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (SE.isKnownNegative(Diff1)) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV,
                           SE.getAddExpr(PtrSCEVLowest,
                                         SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  return PointerOps[Offsets.begin()->second];
}
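// Note: unlike the constant-stride analysis later in this file,
// calculateRtStride accepts a non-constant SCEV stride, as long as every
// pairwise pointer distance is a distinct constant multiple of that stride.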
/// Returns the cost of the scalar and the vector GEPs for the given list of
/// pointer operands.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

/// Correctly creates the shuffle cost estimation: turns an extract_subvector
/// mask into an insert_subvector query when that better matches the mask.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr) {
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  int SubIndex;
  if (!Mask.empty() && Kind == TTI::SK_PermuteSingleSrc &&
      ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, NumSubElts,
                                                SubIndex)) {
    if (SubIndex + NumSubElts > NumSrcElts &&
        SubIndex + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(TTI::SK_InsertSubvector, Tp, Mask, CostKind,
                                SubIndex, SubTp);
  }
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

/// Calculates the scalarization overhead, with awareness of revectorized
/// (vector-of-vector) scalar types.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts,
                         bool Insert, bool Extract,
                         TTI::TargetCostKind CostKind) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(Ty) == DemandedElts.getBitWidth() &&
         "Incorrect usage.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    // Revec case: demanded elements address whole subvectors.
    InstructionCost Cost = 0;
    unsigned ScalarTyNumElements = VecTy->getNumElements();
    for (unsigned I : seq<unsigned>(DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      if (Insert)
        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
      if (Extract)
        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
    }
    return Cost;
  }
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind);
}
/// Calculates the cost of a vector instruction, with awareness of
/// revectorized (vector-of-vector) scalar types.
static InstructionCost getVectorInstrCost(
    const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode,
    Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  if (Opcode == Instruction::ExtractElement) {
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(isa<VectorType>(Val) && "Val must be a vector type.");
      return getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                            cast<VectorType>(Val), {}, CostKind,
                            Index * VecTy->getNumElements(), VecTy);
    }
  }
  return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
                                ScalarUserAndIdx);
}

/// Calculates the cost of an extractelement with extension, with awareness
/// of revectorized scalar types.
static InstructionCost
getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode,
                         Type *Dst, VectorType *VecTy,
                         TTI::TargetCostKind CostKind, unsigned Index) {
  if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
    auto *SubTp =
        getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
    return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
                          Index * ScalarTy->getNumElements(), SubTp) +
           TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
                                CostKind);
  }
  return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
}
/// Creates an insertvector-like sequence: inserts the subvector \p V into
/// the main vector \p Vec at \p Index, via insert_vector when the index is a
/// multiple of the subvector length, otherwise via shuffles (optionally
/// produced by \p Generator).
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
    return Vec;
  const unsigned SubVecVF = getNumElements(V->getType());
  const unsigned VecVF = getNumElements(Vec->getType());
  if (Index % SubVecVF == 0 && !Generator) {
    Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, Index);
  } else {
    // Create shuffle, insertvector requires that the index is a multiple of
    // the subvector length.
    SmallVector<int> Mask(VecVF, PoisonMaskElem);
    if (isa<PoisonValue>(Vec)) {
      auto *Begin = std::next(Mask.begin(), Index);
      std::iota(Begin, std::next(Begin, SubVecVF), 0);
      return Builder.CreateShuffleVector(V, Mask);
    }
    std::iota(Mask.begin(), Mask.end(), 0);
    std::iota(std::next(Mask.begin(), Index),
              std::next(Mask.begin(), Index + SubVecVF), VecVF);
    if (Generator) {
      Vec = Generator(Vec, V, Mask);
    } else {
      // 1. Resize V to the size of Vec.
      SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF),
                0);
      V = Builder.CreateShuffleVector(V, ResizeMask);
      // 2. Insert V into Vec.
      Vec = Builder.CreateShuffleVector(Vec, V, Mask);
    }
  }
  return Vec;
}

/// Correctly creates an extract_subvector, checking that the index is a
/// multiple of the subvector length; otherwise generates a shuffle.
static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  if (Index % SubVecVF == 0)
    return Builder.CreateExtractVector(
        getWidenedType(Vec->getType()->getScalarType(), SubVecVF), Vec, Index);
  SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}
/// Builds the compress-like mask for a list of (possibly gapped) pointers.
/// \returns true if the positions additionally form a constant stride.
static bool buildCompressMask(ArrayRef<Value *> PointerOps,
                              ArrayRef<unsigned> Order, Type *ScalarTy,
                              const DataLayout &DL, ScalarEvolution &SE,
                              SmallVectorImpl<int> &CompressMask) {
  const unsigned Sz = PointerOps.size();
  CompressMask.assign(Sz, PoisonMaskElem);
  // The first element is always set.
  CompressMask[0] = 0;
  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
  std::optional<unsigned> Stride = 0;
  for (unsigned I : seq<unsigned>(1, Sz)) {
    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    std::optional<int64_t> OptPos =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
    if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
      return false;
    unsigned Pos = static_cast<unsigned>(*OptPos);
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (I == 1)
      Stride = Pos;
    else if (Pos != *Stride * I)
      Stride.reset();
  }
  return Stride.has_value();
}
/// Checks if the \p VL of loads can be represented as a masked compress-like
/// load (one wide vector load plus a shuffle that drops the gaps) and
/// whether such a load is more profitable than a gather.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const size_t Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Order, Mask);
  // Check external uses: loads with scalar users need extracts.
  InstructionCost ExtractCost = 0;
  InstructionCost ScalarCost = 0;
  for (const auto [I, V] : enumerate(VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    ExtractCost += TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind,
                                          Mask.empty() ? I : Mask[I]);
    ScalarCost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);
  }
  if (ExtractCost <= ScalarCost)
    return false;
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
  if (!Diff)
    return false;
  const size_t MaxRegSize =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between pointers.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
  Align CommonAlignment = LI->getAlign();
  IsMasked = !isSafeToLoadUnconditionally(
      Ptr0, LoadVecTy, CommonAlignment, DL,
      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
      &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
                                         LI->getPointerAddressSpace()))
    return false;
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, PointerOps, PointerOps.front(),
                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
  // The cost of the gathered (scalar loads + buildvector) alternative.
  InstructionCost GatherCost =
      getScalarizationOverhead(TTI, ScalarTy, VecTy, APInt::getAllOnes(Sz),
                               /*Insert=*/true, /*Extract=*/false, CostKind) +
      ScalarGEPCost +
      Sz * TTI.getMemoryOpCost(Instruction::Load, ScalarTy, LI->getAlign(),
                               LI->getPointerAddressSpace(), CostKind);
  InstructionCost LoadCost =
      IsMasked ? TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy,
                                           CommonAlignment,
                                           LI->getPointerAddressSpace(),
                                           CostKind)
               : TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
                                     CommonAlignment,
                                     LI->getPointerAddressSpace(), CostKind);
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for a potential segmented (interleaved) load.
    auto *AlignedLoadVecTy = LoadVecTy;
    if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
                                     DL, cast<LoadInst>(VL.back()), &AC, &DT,
                                     &TLI))
      AlignedLoadVecTy = LoadVecTy;
    if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                         CommonAlignment,
                                         LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
          Instruction::Load, AlignedLoadVecTy,
          CompressMask[1], {}, CommonAlignment,
          LI->getPointerAddressSpace(), CostKind, IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        return true;
      }
    }
  }
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Sz))
      NewMask[I] = CompressMask[Mask[I]];
    CompressMask.swap(NewMask);
  }
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}

/// Convenience wrapper that discards the detailed compress-load outputs.
static bool
isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                     ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
                     const DataLayout &DL, ScalarEvolution &SE,
                     AssumptionCache &AC, const DominatorTree &DT,
                     const TargetLibraryInfo &TLI,
                     const function_ref<bool(Value *)> AreAllUsersVectorized) {
  bool IsMasked;
  unsigned InterleaveFactor;
  SmallVector<int> CompressMask;
  VectorType *LoadVecTy;
  return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
                              AreAllUsersVectorized, IsMasked,
                              InterleaveFactor, CompressMask, LoadVecTy);
}
/// Checks if the \p VL of loads can be represented as a strided load with a
/// constant stride.
static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                          ArrayRef<unsigned> Order,
                          const TargetTransformInfo &TTI, const DataLayout &DL,
                          ScalarEvolution &SE,
                          const bool IsAnyPointerUsedOutGraph,
                          const int64_t Diff) {
  const size_t Sz = VL.size();
  const uint64_t AbsoluteDiff = std::abs(Diff);
  Type *ScalarTy = VL.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  // Try to generate a strided load node if:
  // 1. Some pointer is used outside the graph (cheap to keep the GEPs), or
  // 2. the distance is larger than the number of loads and either the load
  //    count is profitable or the stride is a reasonable power of two, or
  // 3. the loads are in exactly reversed order (stride -1).
  if (IsAnyPointerUsedOutGraph ||
      (AbsoluteDiff > Sz &&
       (Sz > MinProfitableStridedLoads ||
        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
      Diff == -(static_cast<int64_t>(Sz) - 1)) {
    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
      return false;
    Align Alignment =
        cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
            ->getAlign();
    if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
      return false;
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    // Iterate through all pointers and check if all distances are
    // unique multiples of Stride.
    SmallSet<int64_t, 4> Dists;
    for (Value *Ptr : PointerOps) {
      int64_t Dist = 0;
      if (Ptr == PtrN)
        Dist = Diff;
      else if (Ptr != Ptr0)
        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
      // If the strides are not the same or repeated, we can't vectorize.
      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
        break;
    }
    if (Dists.size() == Sz)
      return true;
  }
  return false;
}
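// Note: the candidate stride is derived as Diff / (Sz - 1), i.e. from the
// distance between the first and the last pointer, and is then validated by
// requiring all Sz pointer distances to be distinct multiples of it.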
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  if (BestVF)
    *BestVF = 0;
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit: loading a packed bit struct as a vector would read/write
  // packed bits disagreeing with the unvectorized version.
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const size_t Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
        calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
      return LoadsState::StridedVectorize;
    return LoadsState::Gather;
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
  // Check that the sorted loads are consecutive.
  if (static_cast<uint64_t>(*Diff) == Sz - 1)
    return LoadsState::Vectorize;
  if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
                           *TLI, [&](Value *V) {
                             return areAllUsersVectorized(
                                 cast<Instruction>(V), UserIgnoreList);
                           }))
    return LoadsState::CompressVectorize;
  bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
  // Try to generate a strided load node.
  auto IsAnyPointerUsedOutGraph =
      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                 return !isVectorized(U) && !MustGather.contains(U);
               });
      });
  if (IsPossibleStrided &&
      isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
                    IsAnyPointerUsedOutGraph, *Diff))
    return LoadsState::StridedVectorize;
  // Correctly compare the cost of vectorized loads + shuffles against masked
  // gather. Returns true if the split representation is the better choice.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of the masked gather GEPs: roughly a buildvector if
    // the pointers are not a splat, a broadcast otherwise.
    APInt DemandedElts = APInt::getAllOnes(Sz);
    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
    auto *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
    if (static_cast<unsigned>(
            count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
            PointerOps.size() - 1 ||
        any_of(PointerOps, [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(PointerOps.front());
        }))
      VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
                                                DemandedElts, /*Insert=*/true,
                                                /*Extract=*/false, CostKind);
    else
      VectorGEPCost +=
          getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
                                   APInt::getOneBitSet(Sz, 0),
                                   /*Insert=*/true, /*Extract=*/false,
                                   CostKind) +
          ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
    // The cost of the scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                        [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         cast<Instruction>(V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of the masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   cast<LoadInst>(VL0)->getPointerOperand(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost =
        getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
                                 /*Insert=*/true, /*Extract=*/false,
                                 CostKind) +
        ScalarLoadsCost;
    // The list of loads is small - directly compare the two costs.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if the
    // vectorized + shuffled form beats the plain gather.
    for (unsigned VF =
             getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
         VF >= MinVF;
         VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
      SmallVector<LoadsState> States;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
                                          PointerOps, BestVF,
                                          /*TryRecursiveCheck=*/false);
        if (LS == LoadsState::Gather) {
          DemandedElts.setBits(Cnt, Cnt + VF);
          continue;
        }
        // If a reorder is needed, treat it as a high-cost masked gather.
        if ((LS == LoadsState::Vectorize ||
             LS == LoadsState::StridedVectorize) &&
            !Order.empty() && !isReverseOrder(Order))
          LS = LoadsState::ScatterVectorize;
        States.push_back(LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try a smaller VF.
        continue;
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy,
                                             DemandedElts, /*Insert=*/true,
                                             /*Extract=*/false, CostKind) +
                    ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            VecLdCost += TTI.getInstructionCost(cast<Instruction>(VL[Idx]),
                                                CostKind);
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        InstructionCost VectorGEPCost =
            getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                        LI0->getPointerOperand(),
                        Instruction::GetElementPtr, CostKind, ScalarTy,
                        SubVecTy)
                .second;
        if (LS == LoadsState::ScatterVectorize &&
            (static_cast<unsigned>(
                 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
                 PointerOps.size() - 1 ||
             any_of(PointerOps, [&](Value *V) {
               return getUnderlyingObject(V) !=
                      getUnderlyingObject(PointerOps.front());
             })))
          VectorGEPCost += getScalarizationOverhead(
              TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
              /*Insert=*/true, /*Extract=*/false, CostKind);
        switch (LS) {
        case LoadsState::Vectorize:
          VecLdCost += TTI.getMemoryOpCost(Instruction::Load, SubVecTy,
                                           LI0->getAlign(),
                                           LI0->getPointerAddressSpace(),
                                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::StridedVectorize:
          VecLdCost += TTI.getStridedMemoryOpCost(
                           Instruction::Load, SubVecTy,
                           LI0->getPointerOperand(),
                           /*VariableMask=*/false, CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::ScatterVectorize:
          VecLdCost += TTI.getGatherScatterOpCost(
                           Instruction::Load, SubVecTy,
                           LI0->getPointerOperand(),
                           /*VariableMask=*/false, CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::CompressVectorize:
        case LoadsState::Gather:
          // Gathers are already calculated - ignore.
          continue;
        }
        SmallVector<int> ShuffleMask(VL.size());
        for (int Idx : seq<int>(0, VL.size()))
          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
        // Add the cost of inserting the subvector into the final vector.
        VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
                                      ShuffleMask, CostKind, I * VF, SubVecTy);
      }
      // If the masked gather cost is higher, vectorizing the split form is
      // better; it will be estimated more precisely later.
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve the analysis of the pointers; if not all of them
  // are GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return isa<UndefValue>(P) ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if the potential masked gather can be represented as a series of
    // loads + insertsubvectors instead.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
7162 "Expected list of pointer operands.");
7167 std::pair<BasicBlock *, Value *>,
7173 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
7175 SortedIndices.
clear();
7177 auto Key = std::make_pair(BBs[Cnt + 1],
7181 std::optional<int64_t> Diff =
7182 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7183 ElemTy, Ptr, DL, SE,
7188 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7194 if (Bases.
size() > VL.
size() / 2 - 1)
7198 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7205 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
7206 Bases.
front().second.size() == VL.
size()))
7211 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7220 FirstPointers.
insert(P1);
7221 SecondPointers.
insert(P2);
7227 "Unable to find matching root.");
7230 for (
auto &
Base : Bases) {
7231 for (
auto &Vec :
Base.second) {
7232 if (Vec.size() > 1) {
7234 int64_t InitialOffset = std::get<1>(Vec[0]);
7235 bool AnyConsecutive =
7237 return std::get<1>(
P.value()) ==
7238 int64_t(
P.index()) + InitialOffset;
7242 if (!AnyConsecutive)
7247 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7251 for (
auto &
T : Bases)
7252 for (
const auto &Vec :
T.second)
7253 for (
const auto &
P : Vec)
7257 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
/// Checks if two insertelement instructions belong to the same buildvector.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic block and of the same type.
  if (VU->getParent() != V->getParent() || VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operands of the insertelement instructions trying
  // to find either VU as the original vector for IE2 or V as the original
  // vector for IE1, while not reusing any insert index twice.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = *getElementIndex(IE1);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = *getElementIndex(IE2);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if the node needs to shuffle reuses anyway, but the
  // reuse mask itself may be improvable.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    if (isSplat(TE.Scalars))
      return std::nullopt;
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(*TTI,
                           getWidenedType(TE.Scalars.front()->getType(),
                                          2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    // Check if the reuse mask can be improved by grouping the lanes that
    // select the same source cluster.
    if (!isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of VF size; the reuse shuffles are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    TE.buildAltOpShuffleMask(
        [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(),
                                        *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (unsigned I : seq<unsigned>(VF)) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      ResOrder[Mask[I] % VF] = I;
    }
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    // For phis, find the head of the buildvector each scalar feeds, if any.
    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      if (V1->use_empty())
        return false;
      if (V2->use_empty())
        return true;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(TE.getMainOp())) &&
      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that the gather of extractelements can be represented as just
      // a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison>, check whether
    //   insertelement poison, v, 0 [+ permute]
    // is cheaper than
    //   insertelement poison, v, n
    // and try to reorder if so.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of the vectorized loads.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
    if (std::optional<OrdersType> CurrentOrder =
            findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
      return CurrentOrder;
  }
  return std::nullopt;
}
/// Checks if the given mask consists of the same cluster of size \p Sz
/// repeated for every group, where the first cluster is not an identity.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder the reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reused scalars, just reordering the
  // reuses mask is enough.
  if (!TE.isGather() || !TE.ReorderIndices.empty() ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
7746 "Expected same size of orders");
7747 size_t Sz = Order.
size();
7749 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
7750 if (Order[
Idx] != Sz)
7751 UsedIndices.
set(Order[
Idx]);
7753 if (SecondaryOrder.
empty()) {
7754 for (
unsigned Idx : seq<unsigned>(0, Sz))
7755 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
7758 for (
unsigned Idx : seq<unsigned>(0, Sz))
7759 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
7760 !UsedIndices.
test(SecondaryOrder[
Idx]))
7761 Order[
Idx] = SecondaryOrder[
Idx];
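// For example, combining Order = {1, S, S, 0} (S being the sentinel Sz = 4)
// with SecondaryOrder = {S, 2, 3, S} fills the free slots from the secondary
// order and yields {1, 2, 3, 0}.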
bool BoUpSLP::isProfitableToReorder() const {
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only a single store and a single (unordered)
    // load node; the other nodes are phis or geps/binops combined with phis,
    // and/or a single gather load node.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      const unsigned ReorderedSplitsCnt =
          count_if(VectorizableTree,
                   [&](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->State == TreeEntry::SplitVectorize &&
                            !TE->ReorderIndices.empty() &&
                            TE->UserTreeIndex.UserTE &&
                            TE->UserTreeIndex.UserTE->State ==
                                TreeEntry::Vectorize &&
                            isReverseOrder(TE->ReorderIndices);
                   });
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() ||
                        none_of(TE->Scalars, [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
            all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
          continue;
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
          continue;
        return true;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = true;
          continue;
        }
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return true;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
               TE->Scalars.size() / 2))
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    return !HasPhis;
  }
  return true;
}
void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                                          ArrayRef<int> MaskOrder) {
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  SmallVector<int> NewMask(getVectorFactor());
  SmallVector<int> NewMaskOrder(getVectorFactor());
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
  if (Idx == 0) {
    copy(Mask, NewMask.begin());
    copy(MaskOrder, NewMaskOrder.begin());
  } else {
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
    for (unsigned I : seq<unsigned>(Mask.size())) {
      NewMask[I + Offset] = Mask[I] + Offset;
      NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
    }
  }
  reorderScalars(Scalars, NewMask);
  reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
  if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
}
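// Note: for the second operand of a split node the masks are applied at
// Offset, i.e. shifted past the lanes owned by the first combined entry, so
// both halves keep disjoint lane ranges.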
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Phi nodes can have a preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g. the addsub instruction on x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  // Maps a TreeEntry to the reorder indices of its external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF. Currently these are
  // vectorized stores, loads, extracts and some gathers of extracts.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // x86; reordering them into [fsub,fadd] blocks this pattern, so take
    // their order into account when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target, consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
    }

    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the
      // bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      while (true) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: we are simply looking for the
    // most used order and reorder the scalar elements in the nodes
    // according to it.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // Delete VF entry upon exit.
    auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });

    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes, we still need to extend and to use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count the number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars, use the natural
        // order; this is an attempt to reorder a node with reused scalars
        // but with external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Order, 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer the identity order, but if a filled identity (non-empty
      // order) has the same number of uses as the new candidate order,
      // choose the candidate.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        combineOrders(Pair.first, BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do the actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Need to reorder the reuses masks of the operands with a smaller
          // VF to be able to find the match between the graph nodes and the
          // scalar operands of the given node during vectorization/cost
          // estimation.
          assert((!TE->UserTreeIndex ||
                  TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
                  TE->UserTreeIndex.UserTE->Scalars.size() ==
                      TE->Scalars.size() ||
                  TE->UserTreeIndex.UserTE->State ==
                      TreeEntry::SplitVectorize) &&
                 "All users must be of VF size.");
          // ShuffleVectorInst does not do reorderOperands (and it should not
          // because ShuffleVectorInst supports only a limited set of
          // patterns), so only reorder if the user is not a
          // ShuffleVectorInst.
          if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
              isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
            continue;
          // Update the ordering of the operands with the smaller VF than the
          // given one.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if (TE->UserTreeIndex &&
8133 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8134 TE->UserTreeIndex.UserTE->reorderSplitNode(
8135 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8139 if ((TE->State == TreeEntry::SplitVectorize &&
8140 TE->ReuseShuffleIndices.empty()) ||
8141 ((TE->State == TreeEntry::Vectorize ||
8142 TE->State == TreeEntry::StridedVectorize ||
8143 TE->State == TreeEntry::CompressVectorize) &&
8146 (
SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8148 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8149 TE->ReuseShuffleIndices.empty())) &&
8150 "Alternate instructions are only supported by BinaryOperator "
8155 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8156 TE->reorderOperands(Mask);
8159 TE->reorderOperands(Mask);
8160 assert(TE->ReorderIndices.empty() &&
8161 "Expected empty reorder sequence.");
8164 if (!TE->ReuseShuffleIndices.empty()) {
8171 addMask(NewReuses, TE->ReuseShuffleIndices);
8172 TE->ReuseShuffleIndices.swap(NewReuses);
8173 }
else if (TE->UserTreeIndex &&
8174 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8176 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
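/// Builds the list of operand entries of \p UserTE that may be reordered.
/// (Summary comment inferred from the partial listing below.) Operand slots
/// with fixed semantics -- the source of extractelement/extractvalue, the
/// destination vector of an insertelement, the pointer operand of vectorized
/// stores and loads -- are skipped; vectorized operands are recorded in
/// \p Edges and reorderable gathers are tracked via ReorderableGathers.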
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      Edges.emplace_back(I, TE);
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
    if (ReorderableGathers.contains(TE))
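/// Bottom-to-top reordering pass. (Summary comment inferred from the partial
/// listing below.) Entries with a computed order are visited through a
/// priority queue; TreeEntryCompare groups operands of the same user node
/// together. For each user, the most frequently used order among its operands
/// is selected and the corresponding mask is applied to the user node and its
/// reorderable operands.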
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
  while (!Queue.empty()) {
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    while (!Queue.empty()) {
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    if (Data.first->State == TreeEntry::SplitVectorize) {
          Data.second.size() <= 2 &&
          "Expected not greater than 2 operands for split vectorize node.");
          [](const auto &Op) { return !Op.second->UserTreeIndex; }))
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
        const auto BestOrder =
        const unsigned E = Order.size();
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        Queue.push(Data.first);
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
        return OpTE->ReorderIndices;
      if (Order.size() == 1)
      Value *Root = OpTE->hasState()
      auto GetSameNodesUsers = [&](Value *Root) {
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
          return CI->arg_size();
        return TE->getNumOperands();
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
        for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
        if (!RevisitedOps.insert(UTE).second)
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      for (TreeEntry *UTE : Users) {
        if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
        for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          Queue.push(const_cast<TreeEntry *>(Op));
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
        if (TE->isGather()) {
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
        if (AllowsReordering(UserTE))
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
    if (OrdersUses.empty()) {
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    for (auto &Pair : OrdersUses) {
        IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    for (TreeEntry *Gather : GatherOps) {
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    if (Data.first->State != TreeEntry::Vectorize ||
        !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
    if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        Queue.push(Data.first);
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
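/// Returns the instruction that acts as the root memory access of \p Entry:
/// for reversed strided loads/stores this is the scalar selected by the first
/// reorder index, otherwise the first scalar of the entry.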
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
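/// Walks all vectorized entries and records scalars that are used outside the
/// vectorizable tree in ExternalUses. (Summary comment inferred from the
/// partial listing below.) Scalars with many users, extra externally used
/// values, and in-tree users that still require the scalar all force an
/// extract.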
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << " for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
      for (User *U : Scalar->users()) {
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
            !UseEntries.empty()) {
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(UserInst)) ||
                isa<CallInst>(UserInst)) ||
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       Scalar, getRootEntryInstruction(*UseEntry), TLI,
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                       [](TreeEntry *UseEntry) {
                         return UseEntry->isGather();
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
                   << " from lane " << FoundLane << " from " << *Scalar
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
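/// Collects the stores that use the scalars of \p TE, bucketed by parent
/// block, stored value type, and underlying pointer (PtrToStoresMap); only
/// simple stores from the same function are considered.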
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    if (!isa<Instruction>(V))
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      if (StoresVec.size() > Lane)
      if (!StoresVec.empty()) {
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
      StoresVec.push_back(SI);
  for (auto &P : PtrToStoresMap) {
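/// Checks whether the collected stores form a consecutive sequence (each
/// pointer offset differs from the previous one by exactly 1) and, if so,
/// fills \p ReorderIndices; an identity order is encoded as an empty vector.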
                           OrdersType &ReorderIndices) const {
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    std::optional<int64_t> Diff =
        SI->getPointerOperand(), *DL, *SE,
  if (StoreOffsetVec.size() != StoresVec.size())
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  if (IsIdentity)
    ReorderIndices.clear();
  for (unsigned Idx : Order)
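/// For each group of external user stores covering all lanes of \p TE, tries
/// canFormVector() and collects the resulting reorder indices.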
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
    if (StoresVec.size() != NumLanes)
    if (!canFormVector(StoresVec, ReorderIndices))
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
  UserIgnoreList = &UserIgnoreLst;
  buildTreeRec(Roots, 0, EdgeInfo());
  buildTreeRec(Roots, 0, EdgeInfo());
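/// Clusters simple, not-yet-vectorized loads by parent block, type, and
/// constant pointer distance, then merges clusters that share enough
/// distances. (Summary comment inferred from the partial listing below.)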
                                     bool AddNew = true) {
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
  auto FindMatchingLoads =
          int64_t &Offset, unsigned &Start) {
        return GatheredLoads.end();
        std::optional<int64_t> Dist =
            Data.front().first->getType(),
            Data.front().first->getPointerOperand(), DL, SE,
        for (std::pair<LoadInst *, int64_t> P : Data) {
        unsigned NumUniques = 0;
        for (auto [Cnt, Pair] : enumerate(Loads)) {
          bool Used = DataLoads.contains(Pair.first);
          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
            Repeated.insert(Cnt);
        if (NumUniques > 0 &&
            (Loads.size() == NumUniques ||
             (Loads.size() - NumUniques >= 2 &&
              Loads.size() - NumUniques >= Loads.size() / 2 &&
          return std::next(GatheredLoads.begin(), Idx);
        return GatheredLoads.end();
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset,
      return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
    for (unsigned Idx : seq<unsigned>(Data.size())) {
        GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
          return PD.front().first->getParent() == LI->getParent() &&
                 PD.front().first->getType() == LI->getType();
    while (It != GatheredLoads.end()) {
          std::next(It), GatheredLoads.end(),
          [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
    AddNewLoads(GatheredLoads.emplace_back());
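/// Tries to build tree entries for the clustered gathered loads: slices are
/// tested with decreasing VF, masked-gather slices are tracked separately,
/// and interleaved groups are detected via the distance between matching
/// gather nodes. (Summary comment inferred from the partial listing below.)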
void BoUpSLP::tryToVectorizeGatheredLoads(
    std::tuple<BasicBlock *, Value *, Type *>,
  GatheredLoadsEntriesFirst = VectorizableTree.size();
                          LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
                                 bool Final, unsigned MaxVF) {
    unsigned StartIdx = 0;
        *TTI, Loads.front()->getType(), MaxVF);
            *TTI, Loads.front()->getType(), NumElts - 1)) {
    if (Final && CandidateVFs.empty())
    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
        bool AllowToVectorize = false;
          if (LI->hasOneUse())
          if (static_cast<unsigned int>(std::distance(
                  LI->user_begin(), LI->user_end())) != LI->getNumUses())
          if (!IsLegalBroadcastLoad)
          for (User *U : LI->users()) {
            if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
            for (const TreeEntry *UTE : getTreeEntries(U)) {
              for (int I : seq<int>(UTE->getNumOperands())) {
                return V == LI || isa<PoisonValue>(V);
          AllowToVectorize = CheckIfAllowed(Slice);
            any_of(ValueToGatherNodes.at(Slice.front()),
                   [=](const TreeEntry *TE) {
                     return TE->Scalars.size() == 2 &&
                            ((TE->Scalars.front() == Slice.front() &&
                              TE->Scalars.back() == Slice.back()) ||
                             (TE->Scalars.front() == Slice.back() &&
                              TE->Scalars.back() == Slice.front()));
        if (AllowToVectorize) {
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
              PointerOps, &BestVF);
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (MaskedGatherVectorized.empty() ||
                Cnt >= MaskedGatherVectorized.back() + NumElts)
          Results.emplace_back(Values, LS);
          VectorizedLoads.insert_range(Slice);
          if (Cnt == StartIdx)
            StartIdx += NumElts;
          if (StartIdx >= Loads.size())
          if (!MaskedGatherVectorized.empty() &&
              Cnt < MaskedGatherVectorized.back() + NumElts)
      if (!AllowToVectorize || BestVF == 0)
      for (unsigned Cnt : MaskedGatherVectorized) {
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        VectorizedLoads.insert_range(Slice);
        if (Cnt == StartIdx)
          StartIdx += NumElts;
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
  auto ProcessGatheredLoads =
          bool Final = false) {
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
          if (Loads.size() <= 1)
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
            VectorizedLoads.clear();
                GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                    UnsortedNonVectorized, Final,
                                    OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
                              << Slice.size() << ")\n");
              for (Value *L : Slice)
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              std::optional<unsigned> CommonVF = 0;
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                    UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (*CommonVF == 0) {
                    CommonVF = E->Scalars.size();
                  if (*CommonVF != E->Scalars.size())
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                          if (isa<Constant>(V))
                          if (isVectorized(V))
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                      InterleavedLoadsDistance.reset();
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
              DeinterleavedNodes.clear();
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                if (InterleaveFactor <= Slice.size() &&
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                  UserMaxVF = InterleaveFactor * VF;
                  InterleaveFactor = 0;
            unsigned ConsecutiveNodesSize = 0;
            if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                       [&, Slice = Slice](const auto &P) {
                           return std::get<1>(P).contains(V);
                         if (It == Slice.end())
                         const TreeEntry &TE =
                             *VectorizableTree[std::get<0>(P)];
                             VL, VL.front(), Order, PointerOps);
                         ConsecutiveNodesSize += VL.size();
                         size_t Start = std::distance(Slice.begin(), It);
                         size_t Sz = Slice.size() - Start;
                         return Sz < VL.size() ||
                                Slice.slice(Start, VL.size()) != VL;
            if (InterleaveFactor == 0 &&
                any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                       [&, Slice = Slice](unsigned Idx) {
                         SmallVector<Value *> PointerOps;
                         return canVectorizeLoads(
                                    Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                    Slice[Idx * UserMaxVF], Order,
                                LoadsState::ScatterVectorize;
            if (Slice.size() != ConsecutiveNodesSize)
              MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                    Slice.slice(I, std::min(VF, E - I));
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                               VectorizableTree[std::get<0>(P)]
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
        NonVectorized.append(SortedNonVectorized);
        return NonVectorized;
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    if (!Ref.empty() && !NonVectorized.empty() &&
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      for (LoadInst *LI : NonVectorized) {
      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    if (!E.ReorderIndices.empty()) {
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
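/// Computes a (key, subkey) hash pair used to bucket potentially compatible
/// values: loads hash through LoadsSubkeyGenerator, binops/casts may be
/// combined as alternate ops, commutative compares share a subkey, and GEPs
/// with a constant index hash by their pointer operand. (Summary comment
/// inferred from the partial listing below.)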
                           bool AllowAlternate) {
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    if (isa<ExtractElementInst, UndefValue>(V))
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
        !isa<UndefValue>(EI->getIndexOperand()))
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
            : cast<CastInst>(I)->getOperand(0)->getType()));
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      if (CI->isCommutative())
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
               !isa<ConstantInt>(I->getOperand(1))) {
  return std::make_pair(Key, SubKey);
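/// Estimates whether vectorizing \p VL as an alternate-opcode node is
/// profitable compared to scalarization by counting unique operand opcodes,
/// undefs, non-instruction operands, and the extra shuffles the alternate
/// form would need. (Summary comment inferred from the partial listing
/// below.)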
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
                                 Opcode1, OpcodeMask))
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
  for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
    switch (Res.value_or(0)) {
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  unsigned ExtraShuffleInsts = 0;
        return is_contained(Operands.back(), V);
      ++ExtraShuffleInsts;
    if (isa<Constant, ExtractElementInst>(V) ||
    if (isa<UndefValue>(V))
    if (!Res.second && Res.first->second == 1)
      ++ExtraShuffleInsts;
    ++Res.first->getSecond();
    if (auto *I = dyn_cast<Instruction>(V))
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
  return none_of(Uniques, [&](const auto &P) {
    return P.first->hasNUsesOrMore(P.second + 1) &&
             return isVectorized(U) || Uniques.contains(U);
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
                             const unsigned VF, unsigned MinBW,
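/// Returns the pair of (intrinsic cost, library call cost) for a vectorized
/// call, taking fast-math flags from the FPMathOperator if present. (Summary
/// comment inferred from the partial listing below.)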
static std::pair<InstructionCost, InstructionCost>
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
                         LibCost.isValid() ? LibCost : ScalarLimit);
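/// Per-opcode legality dispatch: decides for a bundle whether it becomes a
/// vectorized node (possibly strided/compressed/scatter for loads) or has to
/// be gathered, and computes CurrentOrder where reordering is required.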
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
         "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (Term && Term->isTerminator()) {
                   << "SLP: Need to swizzle PHINodes (terminator use).\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ExtractElement:
      auto *EI = dyn_cast<ExtractElementInst>(V);
    return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::InsertElement: {
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
           "Non-constant or undef index?");
          return !SourceVectors.contains(V);
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
          return SourceVectors.contains(V) && !V->hasOneUse();
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Load: {
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        if (isa<PoisonValue>(V))
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
      return TreeEntry::Vectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        auto *LI = dyn_cast<LoadInst>(V);
        return !LI || !LI->isSimple();
    return TreeEntry::NeedToGather;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Store: {
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      std::optional<int64_t> Dist =
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
      return TreeEntry::NeedToGather;
    CallInst *CI = cast<CallInst>(VL0);
      return TreeEntry::NeedToGather;
    unsigned NumArgs = CI->arg_size();
    for (unsigned J = 0; J != NumArgs; ++J)
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
        return TreeEntry::NeedToGather;
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J] != A1J) {
                     << "SLP: mismatched arguments in call:" << *CI
                     << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
          return TreeEntry::NeedToGather;
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
        return TreeEntry::Vectorize;
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
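/// Helper that collects the incoming operands of a bundle of phis. For small
/// bundles every incoming value is taken directly; otherwise the incoming
/// blocks are grouped (Blocks) so that phis sharing an incoming block reuse
/// one operand vector. (Summary comment inferred from the partial listing
/// below.)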
  PHIHandler() = delete;
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
        auto *P = dyn_cast<PHINode>(V);
          assert(isa<PoisonValue>(V) &&
                 "Expected isa instruction or poison value.");
          if (P->getIncomingBlock(I) == InBB)
      Blocks.try_emplace(InBB).first->second.push_back(I);
      if (isa<PoisonValue>(V)) {
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        auto *It = Blocks.find(InBB);
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
    for (const auto &P : Blocks) {
      if (IncomingValues.size() <= 1)
      for (unsigned I : IncomingValues) {
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
               "Expected empty operands list.");
static std::pair<Instruction *, Instruction *>
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
    auto *I = dyn_cast<Instruction>(V);
    if (MainOp->getOpcode() == I->getOpcode()) {
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
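/// Deduplicates the scalars of the bundle and computes ReuseShuffleIndices;
/// when allowed, the unique scalars are padded with extra values up to a
/// whole register size so the padded bundle can still be vectorized. (Summary
/// comment inferred from the partial listing below.)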
                                  const InstructionsState &S,
                                  bool TryPad = false) {
  for (Value *V : VL) {
  size_t NumUniqueScalarValues = UniqueValues.size();
  if (NumUniqueScalarValues == VL.size() &&
    ReuseShuffleIndices.clear();
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          return isa<UndefValue>(V) || !isConstant(V);
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
           all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          ReuseShuffleIndices.clear();
                                                 UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
          if (!S.areInstructionsWithCopyableElements() &&
            ReuseShuffleIndices.clear();
          VL = std::move(PaddedUniqueValues);
      ReuseShuffleIndices.clear();
    VL = std::move(UniqueValues);
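/// Checks whether \p VL can be split into two homogeneous sub-bundles (one
/// per opcode of \p LocalState), builds the corresponding ReorderIndices, and
/// accepts the split only if the estimated shuffle cost of the split form
/// does not exceed the cost of the original alternate node. (Summary comment
/// inferred from the partial listing below.)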
                                 const InstructionsState &LocalState,
                                 OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
                 << *LocalState.getMainOp() << ".\n");
          return isa<PoisonValue>(V) || Values.contains(V);
  ReorderIndices.assign(VL.size(), VL.size());
    auto *I = dyn_cast<Instruction>(V);
      Op1Indices.set(Idx);
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
                             LocalState.getAltOp(), *TLI))) {
      Op1Indices.set(Idx);
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ReorderIndices[Op2Cnt] = Idx;
    ReorderIndices.clear();
  if (!ReorderIndices.empty())
  if (NumParts >= VL.size())
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
        VecTy, OriginalMask, Kind);
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
  if (NewCost >= OriginalCost)
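/// Analyzes a bundle for "copyable elements" vectorization: picks a main
/// instruction/opcode (currently only Add and LShr are supported), models
/// non-matching scalars via selectBestIdempotentValue(), builds the operand
/// vectors, and validates profitability before producing an
/// InstructionsState. (Summary comment partly inferred from the partial
/// listing below.)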
class InstructionsCompatibilityAnalysis {
  unsigned MainOpcode = 0;
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
    return I && isSupportedOpcode(I->getOpcode()) &&
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
        if (IsSupportedInstruction(I) && !Operands.contains(I)) {
          BestOpcodeNum = P.second.size();
      auto *I = dyn_cast<Instruction>(V);
      return I && I->getParent() == MainOp->getParent() &&
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    if (isa<PoisonValue>(V))
    if (!S.isCopyableElement(V))
      return convertTo(cast<Instruction>(V), S).second;
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
    case Instruction::InsertElement:
        auto *IE = cast<InsertElementInst>(V);
    case Instruction::Load:
        auto *LI = dyn_cast<LoadInst>(V);
          Op = LI->getPointerOperand();
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
        auto *I = dyn_cast<Instruction>(V);
        auto [Op, ConvertedOps] = convertTo(I, S);
    case Instruction::GetElementPtr: {
      const unsigned IndexIdx = 1;
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
          : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                ->getPointerOperandType()
                                ->getScalarType());
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
        auto *Op = GEP->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
                  CI, Ty, CI->getValue().isSignBitSet(), DL)
    case Instruction::Call: {
      auto *CI = cast<CallInst>(VL0);
      for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
        auto &Ops = Operands.emplace_back();
        for (Value *V : VL) {
          auto *I = dyn_cast<Instruction>(V);
          Ops.push_back(I ? I->getOperand(Idx)
                              bool TryCopyableElementsVectorization,
                              bool WithProfitabilityCheck = false,
                              bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
      findAndSetMainInstruction(VL, R);
        return InstructionsState::invalid();
      S = InstructionsState(MainOp, MainOp, true);
    if (!WithProfitabilityCheck)
    auto BuildCandidates =
          if (V1 != V2 && isa<PHINode>(V1))
          auto *I1 = dyn_cast<Instruction>(V1);
          auto *I2 = dyn_cast<Instruction>(V2);
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
    if (VL.size() == 2) {
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
        Candidates1.clear();
        Candidates2.clear();
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
        return InstructionsState::invalid();
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::LShr:
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
          return isa<PHINode>(V) || !S.isCopyableElement(V);
      return InstructionsState::invalid();
      if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
      if (isa<Constant>(Ops.front())) {
        return InstructionsState::invalid();
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        for (Value *V : Ops) {
          if (isa<UndefValue>(V))
            return C.second == 1;
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
      unsigned CopyableNum =
          count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    assert(S && "Invalid state!");
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
    buildOriginalOperands(S, VL, Operands);
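/// Legality gate run before creating a tree node: rejects bundles in
/// catchswitch blocks, perfect diamond merges already in the tree, bundles
/// beyond the recursion depth limit, scalable-vector extracts, ephemeral or
/// explicitly ignored values, and unprofitable alternate nodes.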
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    bool TryCopyableElementsVectorization) const {
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    return ScalarsVectorizationLegality(S, false,
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
      return ScalarsVectorizationLegality(S, false);
        return isa<PoisonValue>(V) || Values.contains(V) ||
               (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
                LI->getLoopFor(S.getMainOp()->getParent()) &&
    return ScalarsVectorizationLegality(S, false);
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
          cast<Instruction>(I)->getOpcode() == S.getOpcode();
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
    return ScalarsVectorizationLegality(S, false,
  if (!S || !S.isAltShuffle() || VL.size() > 2)
  for (Value *V : VL) {
    auto *I = cast<Instruction>(V);
      return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
  bool IsCommutative =
  if ((IsCommutative &&
       std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
       all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
  assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
  auto *I1 = cast<Instruction>(VL.front());
  auto *I2 = cast<Instruction>(VL.back());
  for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
                            I2->getOperand(Op));
  if (static_cast<unsigned>(count_if(
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          })) >= S.getMainOp()->getNumOperands() / 2)
  if (S.getMainOp()->getNumOperands() > 2)
  if (IsCommutative) {
    Candidates.clear();
                              I2->getOperand((Op + 1) % E));
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
         auto *I = dyn_cast<GetElementPtrInst>(V);
           BB = I->getParent();
         return BB == I->getParent() && I->getNumOperands() == 2;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
       NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                         "C,S,B,O, small shuffle. \n";
    return ScalarsVectorizationLegality(S, false,
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
    return ScalarsVectorizationLegality(S, false);
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
                   << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, false,
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
        auto *I = dyn_cast<Instruction>(V);
              return isa<ExtractElementInst>(U.get());
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
      return std::make_pair(Vectorized, Extracted);
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      Type *ScalarTy = VL.front()->getType();
          false, true, Kind);
          *TTI, ScalarTy, VecTy, Vectorized,
          true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    return ScalarsVectorizationLegality(S, false);
  return ScalarsVectorizationLegality(S, true);
void BoUpSLP::buildTreeRec(/* ... */ const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  // ...
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    // ...
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;
    // ...
    copy(Op1, NewVL.begin());
    copy(Op2, std::next(NewVL.begin(), Op1.size()));
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    // ...
    if (S && (isa<LoadInst>(S.getMainOp()) ||
              getSameValuesTreeEntry(S.getMainOp(), Op, true))) {
      // ...
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  // ...
      (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      // ...
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  // ...
    }
    // ...
  };
  bool AreConsts = false;
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      // ...
    if (isa<Constant>(V)) {
      // ...
    }
    if (!isa<PHINode>(V))
      // ...
  }
  // ...
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      // ...
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    // ...
    Legality = getScalarsVectorizationLegality(VL, Depth, UserTreeIdx, true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        // ...
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }
  // ...
  if (S.isAltShuffle() && TrySplitNode(S))
    return;
  // ...
  newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  // ...
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }
  // ...
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;
  // ...
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // ...
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      // ...
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  // ...
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  // ...
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // ...
    for (unsigned I : seq<unsigned>(Operands.size())) {
      // ...
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        // ...
    }
    for (unsigned I : PHIOps)
      // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    break;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
               // ...
        for (unsigned Idx : CurrentOrder)
          // ...
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n");
    // ...
    break;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    std::priority_queue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                        decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      // ...
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      // ...
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 // ...
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n");
    // ...
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    break;
  }
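// Illustrative standalone sketch (separate from the pass code above): the
// InsertElementInst case derives CurrentOrder by pushing (insert index, lane)
// pairs into a heap whose comparator "P1.first > P2.first" makes top() the
// smallest insert index, then popping in order. Minimal model over plain
// ints; the real code reads each index from the insertelement's operand.
#include <queue>
#include <utility>
#include <vector>

static std::vector<int> buildInsertOrder(const std::vector<int> &InsertIdx) {
  auto OrdCompare = [](const std::pair<int, int> &P1,
                       const std::pair<int, int> &P2) {
    return P1.first > P2.first; // min-heap on the insert index
  };
  std::priority_queue<std::pair<int, int>, std::vector<std::pair<int, int>>,
                      decltype(OrdCompare)>
      Indices(OrdCompare);
  for (int I = 0, E = (int)InsertIdx.size(); I < E; ++I)
    Indices.emplace(InsertIdx[I], I);
  std::vector<int> CurrentOrder(InsertIdx.size());
  bool IsIdentity = true;
  for (int I = 0, E = (int)InsertIdx.size(); I < E; ++I) {
    CurrentOrder[Indices.top().second] = I;
    IsIdentity &= Indices.top().second == I;
    Indices.pop();
  }
  if (IsIdentity)
    CurrentOrder.clear(); // empty order means "already in place"
  return CurrentOrder;
}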
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    // ...
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n");
      else
        LLVM_DEBUG(dbgs()
                   << "SLP: added a new TreeEntry (jumbled LoadInst).\n");
      break;
    case TreeEntry::CompressVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n");
      break;
    case TreeEntry::StridedVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n");
      break;
    case TreeEntry::ScatterVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n");
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    // ...
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      // ...
    }
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    break;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             // ...
          std::min<unsigned>(
              // ...
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              // ...
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             // ...
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n");
    // ...
    buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          // ...
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        // ...
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    break;
  }
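// Illustrative standalone sketch (separate from the pass code above): the
// ZExt/SExt/Trunc bookkeeping keeps one running (max, min) pair of bit widths
// across all cast nodes so later bit-width-shrinking analysis can consult it.
// Minimal model with std::optional; widths are plain unsigned values standing
// in for DataLayout type sizes.
#include <algorithm>
#include <limits>
#include <optional>
#include <utility>

static void noteCast(std::optional<std::pair<unsigned, unsigned>> &MaxMin,
                     unsigned SrcBits, unsigned DstBits, bool IsExt) {
  auto [PrevMax, PrevMin] = MaxMin.value_or(
      std::make_pair(std::numeric_limits<unsigned>::min(),
                     std::numeric_limits<unsigned>::max()));
  if (IsExt) // zext/sext widen: the destination is the large side
    MaxMin = std::make_pair(std::max(PrevMax, DstBits),
                            std::min(PrevMin, SrcBits));
  else // trunc narrows: the source is the large side
    MaxMin = std::make_pair(std::max(PrevMax, SrcBits),
                            std::min(PrevMin, DstBits));
}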
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    // ...
    VLOperands Ops(VL, Operands, S, *this);
    // ...
    assert(/* ... */ "Commutative Predicate mismatch");
    // ...
    if (isa<PoisonValue>(V))
      // ...
    auto *Cmp = cast<CmpInst>(V);
    if (Cmp->getPredicate() != P0)
      // ...
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          // ...
      if (NumSignBits0 * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          // ...
      if (NumSignBits1 * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    break;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n");
    // ...
    VLOperands Ops(VL, Operands, S, *this);
    // ...
    buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    break;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n");
    // ...
    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      // ...
    break;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n");
    else
      LLVM_DEBUG(dbgs()
                 << "SLP: added a new TreeEntry (jumbled StoreInst).\n");
    // ...
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    break;
  }
  case Instruction::Call: {
    // ...
    CallInst *CI = cast<CallInst>(VL0);
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n");
    // ...
    VLOperands Ops(VL, Operands, S, *this);
    // ...
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    break;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n");
    } else {
      LLVM_DEBUG(dbgs()
                 << "SLP: added a new TreeEntry (ShuffleVectorInst).\n");
    }
    // ...
    auto *CI = dyn_cast<CmpInst>(VL0);
    // ...
    return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
    // ...
    auto *MainCI = cast<CmpInst>(S.getMainOp());
    auto *AltCI = cast<CmpInst>(S.getAltOp());
    // ...
    assert(MainCI->getPredicate() != AltCI->getPredicate() &&
           "Expected different main/alternate predicates.");
    // ...
    if (isa<PoisonValue>(V))
      // ...
    auto *Cmp = cast<CmpInst>(V);
    // ...
    if (isa<BinaryOperator>(VL0) || CI) {
      VLOperands Ops(VL, Operands, S, *this);
      // ...
    }
    // ...
    buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    break;
  }
  }
}
  // ...
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    // ...
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // ...
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
  // ...
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  // ...
bool BoUpSLP::canReuseExtract(/* ... */ bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      // ...
  Value *Vec = E0->getOperand(0);
  // ...
  CurrentOrder.clear();
  // ...
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }
  // ...
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // ...
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    // ...
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;
  // ...
  bool ShouldKeepOrder = true;
  // ...
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    // ...
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
}
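// Illustrative standalone sketch (separate from the pass code above): the
// tail of canReuseExtract as a self-contained helper. Given one extract index
// per scalar (already rebased by MinIdx), it builds the permutation
// CurrentOrder, rejects duplicated lanes, and reports whether the extracts
// are already in identity order (in which case no reorder is needed).
#include <vector>

static bool orderExtracts(const std::vector<unsigned> &Indices,
                          std::vector<unsigned> &CurrentOrder) {
  const unsigned E = (unsigned)Indices.size();
  CurrentOrder.assign(E, E); // E is the "unassigned" sentinel
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    const unsigned ExtIdx = Indices[I];
    if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear(); // out of range or duplicated lane: no reuse
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear(); // identity order needs no shuffle at all
  return ShouldKeepOrder;
}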
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    /* ... */) {
  // ...
  unsigned Sz = Scalars.size();
  // ...
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      // ...
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    // ...
    return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    // ...
    Mask.swap(NewMask);
  }
}

// ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
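// Illustrative standalone sketch (separate from the pass code above):
// buildAltOpShuffleMask blends two vectorized opcode streams with a single
// shufflevector mask, where lane I reads from the "main" vector when the
// scalar matches the main opcode and from the "alternate" vector (offset by
// the vector length) otherwise. Minimal model over a per-lane predicate.
#include <vector>

static std::vector<int> buildAltMask(const std::vector<bool> &IsAltOp) {
  const int Sz = (int)IsAltOp.size();
  std::vector<int> Mask(Sz);
  for (int I = 0; I < Sz; ++I)
    Mask[I] = IsAltOp[I] ? Sz + I : I; // alt lanes index the second vector
  return Mask;
}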
  // ...
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    // ...
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ...
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           // ...
    return MainP != P && MainP != SwappedP;
  }
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
  // ...
  const auto *Op0 = Ops.front();
  // ...
  const bool IsConstant = all_of(Ops, [](Value *V) {
    // Undefs are not treated as constants here.
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // ...
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });
  // ...
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
  // ...
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// Returns the number of ScalarTy-sized "lanes" in the vector value \p V.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    // ...
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
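  // Illustrative standalone sketch (separate from the class above): when
  // revectorizing, a value is viewed as a vector of ScalarTy "lanes", so the
  // effective VF is the element-count ratio. For example, a 16 x i32 value
  // over a 4 x i32 ScalarTy has VF = 4. Minimal model over plain unsigneds.
  //
  //   #include <cassert>
  //
  //   static unsigned effectiveVF(unsigned VNumElements,
  //                               unsigned ScalarTyNumElements) {
  //     assert(ScalarTyNumElements != 0 &&
  //            VNumElements > ScalarTyNumElements &&
  //            "value must be wider than the scalar type");
  //     assert(VNumElements % ScalarTyNumElements == 0 &&
  //            "value is not a whole number of ScalarTy lanes");
  //     return VNumElements / ScalarTyNumElements;
  //   }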
  // ...
  int Limit = Mask.size();
  // ...
  if (Limit % VF == 0 &&
      all_of(seq<int>(0, Limit / VF), [=](int Idx) {
        // ...
      }))
    // ...
  // ...
  unsigned VF = Mask.size();
  // ...
  for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
    // ...
    int MaskedIdx = Mask[ExtMask[I] % VF];
    // ...
  }
  Mask.swap(NewMask);
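// Illustrative standalone sketch (separate from the pass code above): mask
// combination composes an outer (ext) mask with an inner mask, so that each
// surviving lane of ExtMask selects Mask[ExtMask[I] % VF]. Minimal model
// using -1 for the poison lane, as shufflevector masks do.
#include <vector>

static std::vector<int> composeMasks(const std::vector<int> &Mask,
                                     const std::vector<int> &ExtMask) {
  const int VF = (int)Mask.size();
  std::vector<int> NewMask(ExtMask.size(), -1);
  for (int I = 0, Sz = (int)ExtMask.size(); I < Sz; ++I) {
    if (ExtMask[I] < 0)
      continue; // poison stays poison
    NewMask[I] = Mask[ExtMask[I] % VF];
  }
  return NewMask;
}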
  static bool peekThroughShuffles(Value *&Op, MutableArrayRef<int> Mask,
                                  bool SinglePermute) {
    // ...
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      // ...
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             // ...
             IdentityMask.size()))) {
          // ...
          IdentityMask.assign(Mask);
        }
        // ...
      }
      if (SV->isZeroEltSplat()) {
        // ...
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // ...
      if (/* ... */ static_cast<unsigned>(I) >= SV->getShuffleMask().size())
        // ...
      ExtMask[Idx] = SV->getMaskValue(I);
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // Both operands of the shuffle are in use.
        for (int &I : Mask) {
          // ...
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              // ...
        }
        // ...
        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);
        // ...
      }
      // ...
      Op = SV->getOperand(0);
      // ...
      Op = SV->getOperand(1);
      if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
          // ...
        assert(/* ... */ "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               // ...
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 // ...
                 Shuffle->getShuffleMask()[P.index()] == 0;
        // ...
      }
    }
    // ...
  }
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    // ...
    if (ScalarTyNumElements != 1) {
      // ...
    }
    // ...
    Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                  // ...
      // ...
      cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      // ...
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
        CombinedMask1[I] = Mask[I];
        // ...
        CombinedMask2[I] = Mask[I] - VF;
      }
      // ...
      do {
        // ...
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // ...
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            // ...
            ExtMask1[Idx] = SV1->getMaskValue(I);
            // ...
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            // ...
            ExtMask2[Idx] = SV2->getMaskValue(I);
            // ...
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                // ...
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              // ...
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              // ...
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
        assert(/* ... */ "Expected undefined mask element");
        CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
      }
      // ...
      if (/* ... */ isa<ShuffleVectorInst>(Op1) &&
          cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
              // ...
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          // ...
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    // ...
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
  // ...
  for (unsigned I : seq<unsigned>(CommonMask.size()))
    // ...
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(/* ... */) {
  // ...
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // ...
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        // ...
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        // ...
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // ...
      if (!Ptr || !Ptr->hasOneUse())
        // ...
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // ...
    }
    // ...
        TTI::PointersChainInfo::getKnownStride(),
        // ...
  } else {
    // ...
        any_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    // ...
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    // ...
        BaseGEP->getPointerOperand(), Indices, VecTy,
        // ...
  }
  return std::make_pair(ScalarCost, VecCost);
}
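// Illustrative standalone sketch (separate from the helper above): the cost
// query classifies a pointer chain as known- or unknown-stride by asking
// whether any non-base pointer is a GEP with a non-constant index. Minimal
// model over a tiny Ptr record; the real code inspects GetElementPtrInst
// operands through TTI::PointersChainInfo.
#include <vector>

struct PtrDesc { bool IsGEP = false; bool AllConstantIndices = true; };

enum class StrideKind { Known, Unknown };

static StrideKind classifyChain(const std::vector<PtrDesc> &Ptrs) {
  for (const PtrDesc &P : Ptrs)
    if (P.IsGEP && !P.AllConstantIndices)
      return StrideKind::Unknown; // a runtime index makes the stride unknown
  return StrideKind::Known;
}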
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // ...
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      // ...
    return;
  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        // ...
      }
      for (LoadInst *RLI : LIt->second) {
        // ...
      }
      if (LIt->second.size() > 2) {
        // ...
            hash_value(LIt->second.back()->getPointerOperand());
        // ...
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    // ...
  };
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // ...
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        // ...
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    // ...
    Container[Idx].push_back(V);
  }
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    // ...
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        // ...
        for (Value *V : P.second) {
          // ...
          TE.ReorderIndices[Cnt + K] = Idx;
          TE.Scalars[Cnt + K] = V;
        }
        Sz += Indices.size();
        Cnt += Indices.size();
        // ...
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          // ...
              *TTI, TE.Scalars.front()->getType(), Sz);
          // ...
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            // ...
        }
      }
    }
  }
  // ...
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // ...
  auto *ScalarTy = TE.Scalars.front()->getType();
  // ...
  for (auto [Idx, Sz] : SubVectors) {
    // ...
  }
  // ...
  int Sz = TE.Scalars.size();
  // ...
      TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    // ...
    if (isa<PoisonValue>(V)) {
      // ...
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  // ...
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
  // ...
      VecTy, ReorderMask);
  // ...
  for (unsigned I : seq<unsigned>(Sz)) {
    // ...
    if (!isa<PoisonValue>(V))
      ReorderMask[I] = I;
    else
      ReorderMask[I] = I + Sz;
  }
  // ...
  if (Cost >= BVCost) {
    // Reordering is not profitable: keep the original gather order.
    TE.ReorderIndices.clear();
  }
}
// ... (signature truncated: takes the bundle and its InstructionsState S)
  assert(all_of(VL, [](Value *V) {
           return V->getType()->getScalarType()->isFloatingPointTy();
         }) && "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
  // ...
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    // ...
    if (S.isCopyableElement(I))
      continue;
    Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
    if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
      // ...
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    // ...
  }
  if (!CheckForContractable(VL))
    // ...
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    // ...
  if (!CheckForContractable(Operands.front()))
    // ...
  // Intersect the fast-math flags of the non-copyable elements.
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    // ...
    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
  }
  // ...
  unsigned NumOps = 0;
  // ...
  if (S.isCopyableElement(V))
    // ...
  auto *I = dyn_cast<Instruction>(Op);
  if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
    if (auto *OpI = dyn_cast<Instruction>(V))
      // ...
  }
  // ...
  if (auto *FPCI = dyn_cast<FPMathOperator>(I))
    FMF &= FPCI->getFastMathFlags();
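// Illustrative standalone sketch (separate from the pass code above): the
// FMA-contraction check intersects fast-math flags across every fadd/fsub
// and its fmul operand; contraction is only legal when the intersection
// still allows it. Minimal model with a bitmask standing in for
// llvm::FastMathFlags (the real type exposes allowContract()); the bit names
// here are assumptions made for illustration.
#include <cstdint>
#include <vector>

enum FMFBits : uint8_t { AllowContract = 1 << 0, AllowReassoc = 1 << 1 };

static bool canContractToFMA(const std::vector<uint8_t> &PerInstFMF) {
  uint8_t FMF = 0xFF; // start fully permissive, then intersect
  for (uint8_t F : PerInstFMF)
    FMF &= F;
  return (FMF & AllowContract) != 0; // every instruction must allow it
}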
  BaseGraphSize = VectorizableTree.size();
  // ...
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
  auto CheckOperandsProfitability = [this](/* ... */
                                           const InstructionsState &S) {
    // ...
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back(I1->getOperand(Op), I2->getOperand(Op));
    // ...
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          // ...
          return all_of(Cand, [](const std::pair<Value *, Value *> &P) {
            return isa<Constant>(P.first) || isa<Constant>(P.second) ||
                   P.first == P.second;
          });
          // ...
        });
  };
  // ...
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
    reorderGatherNode(E);
  }
  // ...
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) // ...
  // ...
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return true;
             auto *I = dyn_cast<Instruction>(V);
             // ...
           });
  // ...
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      // ...
      if (!TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        // ...
      if (!TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        // ...
    } else {
      // ...
      auto *It = find_if(E.Scalars, IsaPred<Instruction>);
      if (It != E.Scalars.end()) {
        // ...
        if (!TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                  return is_contained(TEs, TE);
                });
              });
            }))
          // ...
      }
    }
    // ...
  };
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      // ...
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // ...
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            // ...
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // ...
      if (CheckForSameVectorNodes(E))
        continue;
      // ...
      unsigned StartIdx = 0;
      // ... (iterate candidate VFs, recomputed as
      //      ... *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        // ...
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          // ...
          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, true))
            continue;
          // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              // ...
              static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                               // ...
              (S.getOpcode() == Instruction::Load &&
               areKnownNonVectorizableLoads(Slice)) ||
              (S.getOpcode() != Instruction::Load &&
               // ...
          if ((!UserIgnoreList || E.Idx != 0) &&
              // ...
              if (isa<PoisonValue>(V))
                return true;
              return areAllUsersVectorized(cast<Instruction>(V),
                                           // ...
          if (S.getOpcode() == Instruction::Load) {
            // ...
                canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
            AllStrided &= Res == LoadsState::StridedVectorize ||
                          Res == LoadsState::ScatterVectorize ||
                          Res == LoadsState::Gather;
            // ...
            if (Res == LoadsState::ScatterVectorize ||
                Res == LoadsState::Gather) {
              if (Res == LoadsState::Gather) {
                registerNonVectorizableLoads(Slice);
                // ...
                if (UserIgnoreList && E.Idx == 0)
                  analyzedReductionVals(Slice);
              }
              continue;
            }
          } else if (S.getOpcode() == Instruction::ExtractElement ||
                     // ...
                     !CheckOperandsProfitability(
                         // ...
                         IsaPred<Instruction>)),
                     // ...
        }
        // ...
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          // ...
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            // ...
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              // ...
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // ...
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        // ...
        E.ReorderIndices.clear();
      }
    }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // Loads: try to demote a reversed vector load to a strided load.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      // ...
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // ...
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          // ...
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        // ...
            BaseLI->getPointerAddressSpace(), CostKind,
            // ...
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // The strided load is cheaper: use it instead.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      // ...
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // ...
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          // ...
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        // ...
            BaseSI->getPointerAddressSpace(), CostKind,
            // ...
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for an interleaved store pattern.
        auto *BaseSI = cast<StoreInst>(E.Scalars.front());
        assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
        if (Mask.size() < 4)
          break;
        for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
          // ...
              VecTy, Factor, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace()))
            // ...
        }
        // ...
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
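// Illustrative standalone sketch (separate from the pass code above): the
// interleave-mask probe asks, for each candidate factor F, whether the store
// reorder mask matches an interleaving pattern. One possible formulation
// (the exact orientation used by the pass is an assumption here) is the
// transpose permutation M[I] = (I % F) * (N / F) + I / F over N lanes.
// Returns the first matching factor in [2, N/2], or 0 when none matches.
#include <vector>

static unsigned findInterleaveFactor(const std::vector<int> &Mask) {
  const unsigned N = (unsigned)Mask.size();
  for (unsigned F = 2; F * 2 <= N; ++F) {
    if (N % F != 0)
      continue; // the factor must evenly divide the lane count
    bool Match = true;
    for (unsigned I = 0; I < N && Match; ++I)
      Match = Mask[I] == (int)((I % F) * (N / F) + I / F);
    if (Match)
      return F;
  }
  return 0;
}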
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // ...
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The fmul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
  // ...
  if (LoadEntriesToVectorize.empty()) {
    // ...
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // ...
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;
    // ...
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        // ...
        [](const std::unique_ptr<TreeEntry> &TE) {
          return TE->isGather() && TE->hasState() &&
                 TE->getOpcode() == Instruction::Load &&
                 // ...
        })
      // ...
  }
  // ...
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars, [&](Value *V) {
            return isa<LoadInst>(V) && !isVectorized(V) &&
                   !isDeleted(cast<Instruction>(V));
          }))) // ...
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        // ...
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                // ...
      }
  }
  // ...
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
  // ...
  bool IsFinalized = false;
  // ...
  bool SameNodesEstimated = true;
  // ...
  if (auto *VTy = dyn_cast<VectorType>(Ty))
    // ...
  // ...
  const auto *It = find_if_not(VL, IsaPred<UndefValue>);
  assert(It != VL.end() && "Expected at least one non-undef value.");
  // ...
      count(VL, *It) > 1 &&
  // ...
  if (!NeedShuffle) {
    if (isa<FixedVectorType>(ScalarTy)) {
      // ...
          cast<FixedVectorType>(ScalarTy));
      // ...
    }
    // ...
        CostKind, std::distance(VL.begin(), It),
        // ...
  }
  // ...
  return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
  // ...
      VecTy, ShuffleMask, CostKind,
      // ...
  return GatherCost +
         (all_of(Gathers, IsaPred<UndefValue>)
              ? TTI::TCC_Free
              : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                // ...
  InstructionCost computeExtractCost(
      ArrayRef<Value *> VL, ArrayRef<int> Mask,
      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
      unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // The widest source vector among the extracts.
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        /* ... */)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    // ...
      int OffsetReg1 = OffsetReg0;
      // ...
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      // ...
      int Idx = I - OffsetReg0;
      int RegId =
          (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
      if (FirstRegId < 0)
        FirstRegId = RegId;
      RegIndices.insert(RegId);
      if (RegIndices.size() > 2)
        return std::nullopt;
      if (RegIndices.size() == 2) {
        // ...
        if (Indices.size() == 1) {
          // ...
              std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
              [&](int S, int I) {
                if (I == PoisonMaskElem)
                  return S;
                int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                            ((I - OffsetReg0) % NumElts) / EltsPerVector;
                if (RegId == FirstRegId)
                  return S;
                return std::min(S, I);
              }
              // ...
          unsigned Index = OffsetReg1 % NumElts;
          Indices.push_back(Index);
          SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
        }
        Idx = I - OffsetReg1;
      }
      I = (Idx % NumElts) % EltsPerVector +
          (RegId == FirstRegId ? 0 : EltsPerVector);
      // ...
      return ShuffleKind;
    };
    // ...
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      // ...
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // ...
            MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          // ...
        continue;
      }
      // ...
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert(/* ... */ "SK_ExtractSubvector index out of range");
        // ...
      }
    }
    // ...
    if (OriginalCost < Cost)
      Cost = OriginalCost;
    // ...
  }
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // ...
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(/* ... */ "Expected all poisoned elements.");
        // ...
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // ...
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = dyn_cast<Value *>(P)) {
        // ...
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    // ...
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      // ...
      return Mask.empty() ||
             (VF == Mask.size() &&
              // ...
    }

  public:
    // ...
    ~ShuffleCostBuilder() = default;
    // Two-operand shuffle cost: an empty or identity mask is free.
    // ...
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
          cast<VectorType>(V1->getType()), Mask);
    // Single-operand shuffle cost.
    // ...
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
          cast<VectorType>(V1->getType()), Mask);
    // ...
    void resizeToMatch(Value *&, Value *&) const {}
    // ...
    ShuffleCostBuilder Builder(TTI);
    // ...
    unsigned CommonVF = Mask.size();
    // ...
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        // ...
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        // ...
      }
      // ...
    };
    // ...
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      // ...
      auto *VecTy = cast<VectorType>(V->getType());
      // ...
      if (EScalarTy != ScalarTy) {
        // ...
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        // ...
      }
      // ...
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle of two tree entries.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask, [=](int Idx) {
               return Idx < 2 * static_cast<int>(CommonVF);
             }) && "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // ...
        for (int &Idx : CommonMask) {
          // ...
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
            Idx = EMask[Idx];
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                  E->Scalars.size();
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle of a single tree entry.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      // ...
      assert(all_of(Mask,
                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        // ...
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 // ...
        // ...
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      // ...
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          // ...
          [](const auto &&P) {
            // ...
            return static_cast<unsigned>(P.value()) != P.index();
          }
          // ...
    } else if (V1 && P2.isNull()) {
      // Shuffle of a single vector value.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      // ...
      assert(all_of(Mask,
                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle of a vector value and a tree entry.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask, [=](int Idx) {
               return Idx < 2 * static_cast<int>(CommonVF);
             }) && "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        // ...
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle of a tree entry and a vector value.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask, [=](int Idx) {
               return Idx < 2 * static_cast<int>(CommonVF);
             }) && "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
        }
        // ...
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask, [=](int Idx) {
               return Idx < 2 * static_cast<int>(CommonVF);
             }) && "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    }
    if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
      // ...
    if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    // ...
    InVectors.front() =
        // ...
    if (InVectors.size() == 2)
      // ...
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
          E->ReorderIndices.end());
      // ...
    }
    // ...
    bool PrevNodeFound = any_of(
        // ...
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  // ...
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    // ...
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      // ...
      if (isa<UndefValue>(V) ||
          // ...
      auto *EE = cast<ExtractElementInst>(V);
      VecBase = EE->getVectorOperand();
      UniqueBases.insert(VecBase);
      // ...
      if (!CheckedExtracts.insert(V).second ||
          !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
          // ...
          return isa<GetElementPtrInst>(U) &&
                 !R.areAllUsersVectorized(cast<Instruction>(U),
                                          // ...
      unsigned Idx = *EEIdx;
      // ...
      if (EE->hasOneUse() || !PrevNodeFound) {
        // ...
        if (isa<SExtInst, ZExtInst>(Ext) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // ...
              Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
              // ...
              Ext->getOpcode(), Ext->getType(), EE->getType(),
              // ...
        }
      }
      // ...
      APInt &DemandedElts =
          VectorOpsToExtracts
              // ...
              .first->getSecond();
      // ...
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      // ...
          DemandedElts, false,
          // ...
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    // ...
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  // ...
  IsFinalized = false;
  CommonMask.clear();
  // ...
  VectorizedVals.clear();
  SameNodesEstimated = true;
  // ...
  assert(all_of(Mask, [&](int Idx) {
           return Idx < static_cast<int>(E1.getVectorFactor());
         }) && "Expected single vector shuffle mask.");
  // ...
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      // ...
  }
    // ...
    auto *EI = cast<ExtractElementInst>(
        cast<const TreeEntry *>(InVectors.front())
            ->getOrdered(P.index()));
    return EI->getVectorOperand() == V1 ||
           EI->getVectorOperand() == V2;
    // ...
    "Expected extractelement vectors.");
  // ...
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // ...
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             // ...
      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                          ->getOrdered(P.index());
      // ...
      return P.value() == Mask[P.index()] ||
             isa<UndefValue>(Scalar);
      // ...
      if (isa<Constant>(V1))
        return true;
      auto *EI = cast<ExtractElementInst>(Scalar);
      return EI->getVectorOperand() == V1;
      // ...
      "Expected only tree entry for extractelement vectors.");
    // ...
    "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      // ...
      VF = std::max(
          VF,
          cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
              ->getNumElements());
    }
    // ...
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
    // ...
  }
  void add(ArrayRef<Value *> VL, Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    // ...
    unsigned VF = VL.size();
    // ...
    VF = std::min(VF, MaskVF);
    Type *VLScalarTy = VL.front()->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        // ...
        continue;
      }
      if (isa<UndefValue>(V)) {
        // ...
      }
      // ...
    }
    if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
      // ...
    }
    // ...
        cast<FixedVectorType>(Root->getType())->getNumElements()),
        // ...
  }
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask,
           /* ... */) {
    IsFinalized = true;
    // ...
    if (InVectors.size() == 2)
      Cost += createShuffle(Vec, InVectors.back(), CommonMask);
    else
      Cost += createShuffle(Vec, nullptr, CommonMask);
    transformMaskAfterShuffle(CommonMask, CommonMask);
    assert(/* ... */
           "Expected vector length for the final value before action.");
    Value *V = cast<Value *>(Vec);
    // ...
    Cost += createShuffle(V1, V2, Mask);
    // ...
    InVectors.front() = V;
    // ...
    if (!SubVectors.empty()) {
      // ...
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // ...
      if (!SubVectorsMask.empty()) {
        assert(/* ... */
               "Expected same size of masks for subvectors and common mask.");
        // ...
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          // ...
          I1 = I2 + CommonMask.size();
        }
      }
      // ...
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          // ...
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
        // ...
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
          // ...
        }
      }
    }
    // ...
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        // ...
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          // ...
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    // ...
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      // ...
    }
    // ...
        createShuffle(InVectors.front(),
                      InVectors.size() == 2 ? InVectors.back() : nullptr,
                      CommonMask);
    // ...
  }
  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

// ...
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    // ...
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // ...
  }
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    // ...
  }
  unsigned EntryVF = E->getVectorFactor();
  // ...
  if (E->isGather()) {
    // ...
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    // ...
    if (E->ReorderIndices.empty()) {
      // ... (shuffle cost over the two sub-entries, using
      //      E->CombinedEntriesWithIndices.back().second and
      //      VectorizableTree[E->CombinedEntriesWithIndices.back().first]
      //          ->getVectorFactor())
    }
    unsigned CommonVF =
        std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                     ->getVectorFactor(),
                 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                     ->getVectorFactor());
    // ...
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0,
                             "Calculated costs for Tree"));
    return VectorCost;
  }
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       /* ... */)) {
    // ...
    if (E->getOpcode() == Instruction::Store) {
      // ...
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
      // ...
    }
    // ...
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         (/* ... */ ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         /* ... */);
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  // ...
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff = [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
                         function_ref<InstructionCost(InstructionCost)>
                             GetVectorCost) {
    // Calculate the cost of this instruction when it is not vectorized.
    InstructionCost ScalarCost = 0;
    if (isa<CastInst, CallInst>(VL0)) {
      // For casts/calls the per-lane cost is uniform, so one lane's cost can
      // be multiplied by the number of scalarized lanes.
      ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
    } else {
      for (unsigned I = 0; I < Sz; ++I) {
        if (UsedScalars.test(I))
          continue;
        ScalarCost += ScalarEltCost(I);
      }
    }
    // ...
    if (/* ... */ &&
        (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
      const EdgeInfo &EI = E->UserTreeIndex;
      if (!EI.UserTE->hasState() ||
          EI.UserTE->getOpcode() != Instruction::Select ||
          /* ... */) {
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            (EI.UserTE->isGather() ||
             EI.UserTE->State == TreeEntry::SplitVectorize)
                ? EI.UserTE->Scalars.front()->getType()
                : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
          UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                          UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
    }
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    // ...
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
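
// Helper lambdas for the per-opcode cost cases below; the lambda heads are
// inferred from the surviving call sites. GetMinMaxCost prices a min/max
// intrinsic over a canonicalized (pointer-free) type, and when the selects
// are the only users of the compares, the compare cost is subtracted because
// those compares become dead after vectorization. GetFMulAddCost (body
// elided) prices a scalar fmuladd for the combined FMulAdd node.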
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL0);
    if (MinMaxID == Intrinsic::not_intrinsic)
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, the compares become
    // dead, so subtract their cost.
    if (VI && SelectOnly) {
      assert(/* ... */ "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    InstructionCost ScalarCost = 0;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
        if (const TreeEntry *OpTE = /* ... */)
          if (CountedOps.insert(OpTE).second &&
              !OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
      }
    }
    return CommonCost - ScalarCost;
  }
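
  // ExtractValue/ExtractElement: the scalar cost is what the extracts would
  // cost if left in place; DemandedElts records the lanes that the vectorized
  // tree makes free. An extract whose single user is a sign/zero-extend used
  // only by GEPs is costed through the cast instead, since the extend folds
  // into the address computation.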
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        // ...
      }
      // ...
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // ...
          InstructionCost ExtCost = TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              /* ... */);
          // ...
        }
      }
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      if (DemandedElts.isZero())
        return CommonCost;
      // ...
      return CommonCost - (DemandedElts.isZero()
                               ? InstructionCost(TTI::TCC_Free)
                               : getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /* ... */));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
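
  // InsertElement: model a buildvector-like chain of inserts as one subvector
  // insert. [OffsetBeg, OffsetEnd] is the span of touched lanes, VecScalarsSz
  // splits a wide destination into per-register chunks, and IsIdentity tracks
  // whether the inserted lanes form an identity mask (no extra shuffle
  // needed).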
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    //     InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
    //     buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    // ...
  }
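
  // Casts: when minimum-bitwidth analysis (MinBWs) demoted this node or its
  // source, the vector opcode may differ from the scalar one: equal widths
  // degenerate to a free BitCast, a narrower destination to Trunc, a wider
  // one to SExt/ZExt chosen by the recorded signedness; SIToFP becomes UIToFP
  // if the demoted source is known unsigned. Extensions feeding an arithmetic
  // extended reduction at the root are treated as free, since the reduction
  // itself absorbs them.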
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (/* ... */ &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      // ...
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
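
  // FCmp/ICmp/Select: the scalar cost uses each instruction's own predicate,
  // the vector cost a common predicate over the widened type. For selects, a
  // narrower condition vector may need to be duplicated to match the value
  // vector width; the assert checks the condition count divides the vector
  // element count.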
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // ...
    if (/* ... */ match(VL0, MatchCmp))
      // ...
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if ((/* ... */ && !match(VI, MatchCmp)) ||
          /* ... */)
        // ...
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred,
          CostKind, getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // The condition must be duplicated to match the value vector width.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF = /* ... */;
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes CostAttrs(Intrinsic::fmuladd, VecTy,
                                        {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      Value *Op1 = E->getOperand(0)[Idx];
      Value *Op2;
      if (isa<UnaryOperator>(UniqueValues[Idx])) {
        Op2 = Op1;
        // ...
      } else {
        Op2 = E->getOperand(1)[Idx];
        // ...
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ...
      }
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          // ... all operands must be constants with enough trailing ones:
          //     [&](Value *Op) {
          //       auto *CI = dyn_cast<ConstantInt>(Op);
          //       return CI && CI->getValue().countr_one() >= It->second.first;
          //     }
        }
        // ...
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
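
  // Load: the vector-side cost depends on how the node was vectorized: a
  // plain wide load (or an interleaved load when an interleave factor was
  // detected), a strided load intrinsic, a masked/compressed load plus a
  // decompress shuffle, or a gather for scattered pointers. The pointer
  // operands also contribute a GEP cost difference, except for gathers,
  // which consume a vector of pointers directly.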
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, /* ... */, VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind, /* ... */);
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        // ...
        if (!E->ReorderIndices.empty()) {
          // ... (ordered copy of E->ReorderIndices.begin(),
          //      E->ReorderIndices.end())
        }
        // ...
        PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          // Decompress shuffle after the masked load.
          VecLdCost += ::getShuffleCost(/* ... */,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              /* ... */);
          VecLdCost += ::getShuffleCost(/* ... */,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    // ...
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    // ...
    PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
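
  // Store: mirrors the load case for the supported states: a strided store
  // intrinsic, an interleaved store, or a single consecutive wide store. The
  // base pointer is taken from the first store in program order when the
  // bundle was reordered.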
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, /* ... */, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          // ...
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          // ...
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    // ...
    unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
    PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    // ...
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
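
  // Call: the vector cost is the cheaper of a vector intrinsic and a vector
  // library call (getVectorCallCosts returns both), so a vector math library
  // can beat the generic intrinsic lowering where available.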
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // ...
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
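
  // ShuffleVector here means an alternate-opcode node (e.g. interleaved
  // add/sub): the vector cost is both opcodes over the full width plus the
  // blending shuffle built by buildAltOpShuffleMask. Two shortcuts apply: a
  // "diamond match" reuses the vector operations of an earlier node with
  // equal operands, and targets with a legal combined instruction (such as
  // X86 addsub) are costed via isLegalAltInstr/getAltInstrCost instead.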
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             (/* ... */ ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      // ...
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost CommonCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          // ...
        });
        // No extra vector cost: the earlier node's vector ops are reused and
        // only the shuffling differs.
      } else if (/* ... */) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        // ...
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        /* ... */);
          LLVM_DEBUG({
            dbgs()
                << "SLP: alternate extension, which should be truncated.\n";
            // ...
          });
          return VecCost;
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          /* ... */);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    /* ... */);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            // ...
          },
          Mask);
      // ...
      // Check if the tree can be represented as a legal alternate-opcode
      // vector instruction (e.g. X86 addsub).
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(/* ... */);
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // ...
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            assert(/* ... */
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              // ...
              assert(isa<ShuffleVectorInst>(V) &&
                     "Not supported shufflevector usage.");
              auto *SV = cast<ShuffleVectorInst>(V);
              int Index;
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index)
                return /* ... */;
              NextIndex += SV->getShuffleMask().size();
            }
            return ::getShuffleCost(/* ... */);
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/* ... */ ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             /* ... */) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-consts stores. Also try to vectorize tiny trees with
  // the second gather node if it can be vectorized from extractelements.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
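
// The matcher below walks an or/shl/zext chain from the root, looking for a
// "load combine" pattern, schematically (widths illustrative only):
//
//   %z0 = zext i8 %b0 to i32
//   %z1 = zext i8 %b1 to i32
//   %s1 = shl i32 %z1, 8        ; shift amounts are multiples of 8
//   %r  = or i32 %z0, %s1       ; repeated once per byte
//
// Such chains are better served by a single wide scalar load (load combining
// in the backend), so the vectorizer backs off when the total width would be
// a legal integer type.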
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'; also peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  // ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /* ... */)))
    return true;
  // ...
  constexpr int Limit = 4;
  if (/* ... */
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;
  // ...
  if (/* ... */ VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
                           /* ... */) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) || MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // ...
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (/* ... */
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (/* ... */ &&
                       none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
                      (/* ... */ &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         count_if(TE->Scalars, IsaPred<PHINode, Constant>) >=
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;
  // ...
  if (/* ... */
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      all_of(/* ... */,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                      TE->UserTreeIndex.UserTE->Idx == 0;
             }))
    return true;
  // ...
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // ...
  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /* ... */);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;
  // ...
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      /* ... */
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      /* ... */(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          /* ... */))
    return false;
  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      /* ... */
      any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               /* ... */;
      }))
    return false;
  // ...
  for (/* ... */) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      continue;
    // ...
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        (/* ... */
         all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
        (isa<ExtractElementInst>(E.Scalars.front()) &&
         /* ... */))
      return false;
    // ...
  }
  return true;
}
InstructionCost BoUpSLP::getSpillCost() {
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;
  // ...
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
  // An intrinsic that is cheaper as an intrinsic than as a call does not
  // force a spill.
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };
  // ...
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First /* ... */)
        return It->second.getInt() != 0;
      // ...
    }
    SmallVector<const Instruction *> LastInstsInRange(1, Last);
    BasicBlock::const_reverse_iterator
        InstIt = ++First->getIterator().getReverse(),
        PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // A non-vectorized, non-intrinsic call forces a spill of live vectors.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && !NoCallIntrinsic(CB) /* ... */) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    // ...
  };
  // ...
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>,
                SmallVector<const BasicBlock *>>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return /* ... */;
    // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      // ...
    }
    // ...
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
        ParentsPairsToAdd.insert(Pair);
        continue;
      }
      // ...
      if (Budget > BudgetLimit)
        return false;
      // ...
    }
    // ...
  };
  // ...
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    // ...
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    for (const TreeEntry *Op : EntriesToOperands.lookup(Entry)) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          /* ... */)
        continue;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        // ...
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(V);
          if (!Inst)
            continue;
          // ...
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        // ...
      }
      // ...
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // ...
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // ...
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          /* ... */)) {
        AddCosts(Op);
        continue;
      }
      // ...
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }
  return Cost;
}
  // ...
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        /* ... */)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        /* ... */)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
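
// The helpers below drive the final shuffle estimation/emission for
// insertelement users. ValueSelect returns the incoming Value * when the
// template parameter is Value * (codegen) and a default-constructed value
// otherwise (cost estimation). performExtractsShuffleAction then folds a
// list of (vector, mask) pairs into a chain of one- and two-source shuffles,
// resizing mismatched vectors via the supplied ResizeAction callback.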
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, MutableArrayRef<int>, bool)>
        ResizeAction,
    function_ref<T *(MutableArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining shuffle between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size,
      // we can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
        } else if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
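
// getTreeCost: sum getEntryCost over all nodes (skipping combined nodes and
// gathers that duplicate an existing entry with the same vector factor), add
// the cost of extracting externally used scalars (or of keeping them scalar
// when that is cheaper), subtract inserts covered by detected buildvector
// shuffles, account for a possible resize of the reduced value, and finally
// add the estimated spill cost.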
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // ...
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs()
                   << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(dbgs()
               << "SLP: Current total cost = " << Cost << "\n");
  }

  if (/* ... */
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    // ...
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  // ...
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) /* ... */;
               else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");

    // Uses by ephemeral values are free.
    if (EphValues.count(EU.User))
      continue;
    // Check if the scalar for this user was already accounted for.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;
    // No extract cost if the user is in an unreachable block.
    BasicBlock *UserParent =
        EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
    if (UserParent &&
        (/* ... */
         isa_and_present<UnreachableInst>(UserParent->getTerminator())))
      continue;
    // ...
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // No extract cost for vector "scalar".
    if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
      continue;
    // If the found user is an insertelement, do not calculate extract cost
    // but try to detect it as a final shuffled vector.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU /* ... */) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // ...
        const TreeEntry *ScalarTE = &EU.E;
        auto *It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // ...
              Value *Op0 = II->getOperand(0);
              // ...
            });
        int VecId = -1;
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          // ...
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            // ... (cast cost over getWidenedType(FTy->getElementType(),
            //      FTy->getNumElements()))
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " /* ... */
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
          }
        } else {
          // ...
          It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        // ...
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    // ...
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    // ...
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        // ...
      }
      unsigned Extend =
          /* ... */ ? Instruction::ZExt : Instruction::SExt;
      // ...
      LLVM_DEBUG(dbgs() << /* ... */ << ExtraCost << "\n");
    } else {
      ExtraCost =
          getVectorInstrCost(/* ... */,
                             CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // Can use the original instruction if no operands were vectorized or
      // they are already marked as externally used.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (/* ... */) {
          // Some extractelements might be not vectorized, but transformed
          // into a shuffle and removed.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost = /* ... */;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if the user is a phi from the same
        // block as the currently vectorized root phis; it preserves better
        // ordering info for the vectorized PHIs.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            // ...
            all_of(Inst->users(), [&](User *U) {
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          cast<Instruction>(
                              VectorizableTree.front()->getMainOp())
                              ->getParent()) ||
                     /* ... */ any_of(U->operands(), [&](Value *V) {
                       return ValueToExtUses->contains(V);
                     });
            });
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (/* ... */
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2; it may enable
          // some extra vectorization.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // ...
                  ExternalUses[It->second].User = nullptr;
                }
              }
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externally used scalar operands of casts as new external uses.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
    }
  }
  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      // ...
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          // ...
        }
        // ...
      }
    }
  }

  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        /* ... */) {
      if (HasLargeIndex) {
        // ...
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  /* ... */);
        // ...
      }
      // ...
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem)
          ResizeMask[Mask[I]] = Mask[I];
      }
      // ...
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      // ...
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        unsigned VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          // ...
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        unsigned VF = 0;
        if (/* ... */
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as a free extend.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

  std::optional<InstructionCost> SpillCost;
  if (/* ... */) {
    SpillCost = getSpillCost();
    Cost += *SpillCost;
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    // ...
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
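
// tryToGatherSingleRegisterExtractElements: for one register-wide slice of a
// gather node, collect the extractelement sources, keep the one or two source
// vectors covering the most lanes, and build the corresponding shuffle mask;
// on success the gather can be modeled as a shuffle of existing vectors
// instead of scalar inserts.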
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    // ...
      ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res /* ... */) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    // ...
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some extractelements were not
  // selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        /* ... */) {
      // ...
    }
  }
  return Res;
}

SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars per register slice.
    // ...
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
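
// isGatherShuffledSingleRegisterEntry: try to serve a gather node from
// already-vectorized tree entries. For every scalar it intersects the sets of
// candidate entries (at most two sets, otherwise the gather is not a
// permutation), with extensive ordering checks ensuring each candidate's last
// instruction is available at the gather's insertion point; on success
// Entries/Mask describe the single- or two-source shuffle.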
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // ...
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Check that the vector code for the other entry, generated at InsertPt,
    // is available at the insertion point of the current gather node.
    auto *InsertBlock = InsertPt->getParent();
    if (!DT->isReachableFromEntry(InsertBlock))
      return false;
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        /* ... */)
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        /* ... */)
      return false;
    return true;
  };
  // ...
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // The node is reused - exit.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    // ...
    for (Value *V : VL) {
      // ...
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      // ...
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  DenseMap<Value *, int> UsedValuesEntry;
  for (Value *V : VL) {
    if (isConstant(V))
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? /* ... */
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If two gathers are operands of the same entry, compare operand
        // indices and use the earlier one as the base.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            /* ... */) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              /* ... */)
            continue;
        }
        // If the user instruction is used in several different vectorized
        // nodes - make it depend on index.
        if (/* ... */
            (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
          continue;
      }
      // ...
      if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
          TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
          UseEI.UserTE->State == TreeEntry::Vectorize &&
          UseEI.UserTE->getOpcode() == Instruction::PHI &&
          TEUseEI.UserTE != UseEI.UserTE)
        continue;
      // ...
      if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        continue;
      // ...
      if (TEUseEI.UserTE != UseEI.UserTE &&
          (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
           HasGatherUser(TEUseEI.UserTE)))
        continue;
      // ...
      if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
        continue;
      // ...
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          /* ... */)
        continue;
      // Check if the user node of TE comes after the user node of TEPtr;
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        return /* ... */;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
            continue;
        }
        // The node is reused - exit.
        if (CheckAndUseSameNode(VTE))
          return /* ... */;
        VToTEs.insert(VTE);
      }
      // ...
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      // The node is reused - exit.
      if (CheckAndUseSameNode(VTE))
        return /* ... */;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration, just insert the list of nodes to the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Check if there are any previously used tree nodes that also use V;
      // if not, consider that we have another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Non-empty intersection: narrow the set and continue with the next
          // scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found: need a second set of possible source
      // vectors.
      if (Idx == UsedTEs.size()) {
        // More than 2 input vectors is not a permutation; fall back to the
        // regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }
  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        // ...
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match; just shuffle, choosing the first tree node.
    Entries.push_back(FirstEntries.front());
    for (auto &P : UsedValuesEntry)
      P.second = 0;
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Keep the order of tree nodes to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor; pick 2 with max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
    // ...
    SmallVector<SmallPtrSet<Value *, 4>> ValuesToEntries;
    for (const TreeEntry *E : Entries)
      ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
                                            E->Scalars.end());
    for (auto &P : UsedValuesEntry) {
      for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
        if (ValuesToEntries[Idx].contains(P.first)) {
          P.second = Idx;
          break;
        }
    }
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if two PHIs are compatible, i.e. highly likely to be vectorized
  // together.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    // ...
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis of shuffled gathers:
  // prefer to ignore instructions that do not form splats, are not
  // vectorized/extractelements, or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && /* ... */
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V: same/alternate opcode and same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           /* ... */
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not shuffle scalars that are constants, or instructions that can be
    // vectorized as part of a future buildvector.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries that can be used
  // for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar: first entry gets 0,
    // otherwise 1 (at most 2 nodes are selected).
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // If the number of scalars equals the number of entries, the analysis is
    // unlikely to be profitable; since VL is not the same as TE->Scalars,
    // shuffles already exist before this point. Cut off the unprofitable case.
    Entries.clear();
    return std::nullopt;
  }
17010 bool IsIdentity = Entries.size() == 1;
17013 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
17014 unsigned Idx = Part * VL.size() + Pair.second;
17017 (ForOrder ? std::distance(
17018 Entries[Pair.first]->Scalars.begin(),
17019 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17020 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17021 IsIdentity &=
Mask[
Idx] == Pair.second;
17023 if (ForOrder || IsIdentity || Entries.empty()) {
17024 switch (Entries.size()) {
17026 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17030 if (EntryLanes.size() > 2 || VL.size() <= 2)
17036 }
else if (!isa<VectorType>(VL.front()->getType()) &&
17037 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17040 std::next(
Mask.begin(), (Part + 1) * VL.size()));
17041 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
17042 for (
int Idx : SubMask) {
17050 assert(MaxElement >= 0 && MinElement >= 0 &&
17051 MaxElement % VF >= MinElement % VF &&
17052 "Expected at least single element.");
17053 unsigned NewVF = std::max<unsigned>(
17055 (MaxElement % VF) -
17056 (MinElement % VF) + 1));
17058 for (
int &
Idx : SubMask) {
17061 Idx = ((
Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17062 (
Idx >=
static_cast<int>(VF) ? NewVF : 0);
17070 auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
17071 auto GetShuffleCost = [&,
17075 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17077 Mask, Entries.front()->getInterleaveFactor()))
17079 return ::getShuffleCost(
TTI,
17084 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17087 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17088 FirstShuffleCost = ShuffleCost;
17092 bool IsIdentity =
true;
17094 if (
Idx >=
static_cast<int>(NewVF)) {
17099 IsIdentity &=
static_cast<int>(
I) ==
Idx;
17103 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17105 *
TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
17110 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17111 SecondShuffleCost = ShuffleCost;
17115 bool IsIdentity =
true;
17117 if (
Idx <
static_cast<int>(NewVF) &&
Idx >= 0) {
17123 IsIdentity &=
static_cast<int>(
I) ==
Idx;
17128 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17130 *
TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
17138 *
TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
17140 const TreeEntry *BestEntry =
nullptr;
17141 if (FirstShuffleCost < ShuffleCost) {
17142 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
17143 std::next(
Mask.begin(), (Part + 1) * VL.size()),
17145 if (Idx >= static_cast<int>(VF))
17146 Idx = PoisonMaskElem;
17148 BestEntry = Entries.front();
17149 ShuffleCost = FirstShuffleCost;
17151 if (SecondShuffleCost < ShuffleCost) {
17152 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
17153 std::next(
Mask.begin(), (Part + 1) * VL.size()),
17155 if (Idx < static_cast<int>(VF))
17156 Idx = PoisonMaskElem;
17160 BestEntry = Entries[1];
17161 ShuffleCost = SecondShuffleCost;
17163 if (BuildVectorCost >= ShuffleCost) {
17166 Entries.push_back(BestEntry);
17174 std::fill(std::next(
Mask.begin(), Part * VL.size()),
17176 return std::nullopt;
17180BoUpSLP::isGatherShuffledEntry(
17184 assert(NumParts > 0 && NumParts < VL.
size() &&
17185 "Expected positive number of registers.");
17188 if (TE == VectorizableTree.front().get() &&
17189 (!GatheredLoadsEntriesFirst.has_value() ||
17191 [](
const std::unique_ptr<TreeEntry> &TE) {
17192 return !
TE->isGather();
17197 if (
TE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI))
17200 assert((
TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17201 "Expected only single user of the gather node.");
17203 "Number of scalars must be divisible by NumParts.");
17204 if (
TE->UserTreeIndex &&
TE->UserTreeIndex.UserTE->isGather() &&
17205 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17207 (
TE->hasState() &&
TE->getOpcode() == Instruction::ExtractElement) ||
17210 getSameValuesTreeEntry(
TE->getMainOp(),
TE->Scalars))))
17214 for (
unsigned Part : seq<unsigned>(NumParts)) {
17218 std::optional<TTI::ShuffleKind> SubRes =
17219 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17222 SubEntries.
clear();
17225 SubEntries.
front()->getVectorFactor() == VL.
size() &&
17226 (SubEntries.
front()->isSame(
TE->Scalars) ||
17227 SubEntries.
front()->isSame(VL))) {
17229 LocalSubEntries.
swap(SubEntries);
17232 std::iota(
Mask.begin(),
Mask.end(), 0);
17234 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
17235 if (isa<PoisonValue>(VL[
I]))
17237 Entries.emplace_back(1, LocalSubEntries.
front());
17243 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
17251 Type *ScalarTy)
const {
17252 const unsigned VF = VL.
size();
17260 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
17262 if (
V->getType() != ScalarTy)
17267 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17270 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V))
17274 ConstantShuffleMask[
I] =
I + VF;
17277 EstimateInsertCost(
I, V);
17280 bool IsAnyNonUndefConst =
17283 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17285 ConstantShuffleMask);
17289 if (!DemandedElements.
isZero())
17293 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17297Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *E) {
17298 auto It = EntryToLastInstruction.
find(E);
17299 if (It != EntryToLastInstruction.
end())
17300 return *cast<Instruction>(It->second);
17307 if (E->hasState()) {
17308 Front = E->getMainOp();
17309 Opcode = E->getOpcode();
17311 Front = cast<Instruction>(*
find_if(E->Scalars, IsaPred<Instruction>));
17316 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17317 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17318 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17320 [=](
Value *V) ->
bool {
17321 if (Opcode == Instruction::GetElementPtr &&
17322 !isa<GetElementPtrInst>(V))
17324 auto *I = dyn_cast<Instruction>(V);
17325 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17326 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17328 "Expected gathered loads or GEPs or instructions from same basic "
17331 auto FindLastInst = [&]() {
17333 for (
Value *V : E->Scalars) {
17334 auto *
I = dyn_cast<Instruction>(V);
17337 if (E->isCopyableElement(
I))
17339 if (LastInst->
getParent() ==
I->getParent()) {
17344 assert(((Opcode == Instruction::GetElementPtr &&
17345 !isa<GetElementPtrInst>(
I)) ||
17346 E->State == TreeEntry::SplitVectorize ||
17349 (GatheredLoadsEntriesFirst.has_value() &&
17350 Opcode == Instruction::Load && E->isGather() &&
17351 E->Idx < *GatheredLoadsEntriesFirst)) &&
17352 "Expected vector-like or non-GEP in GEP node insts only.");
17360 auto *NodeB = DT->
getNode(
I->getParent());
17361 assert(NodeA &&
"Should only process reachable instructions");
17362 assert(NodeB &&
"Should only process reachable instructions");
17363 assert((NodeA == NodeB) ==
17364 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17365 "Different nodes should have different DFS numbers");
17366 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17373 auto FindFirstInst = [&]() {
17375 for (
Value *V : E->Scalars) {
17376 auto *
I = dyn_cast<Instruction>(V);
17379 if (E->isCopyableElement(
I))
17381 if (FirstInst->
getParent() ==
I->getParent()) {
17382 if (
I->comesBefore(FirstInst))
17386 assert(((Opcode == Instruction::GetElementPtr &&
17387 !isa<GetElementPtrInst>(
I)) ||
17390 "Expected vector-like or non-GEP in GEP node insts only.");
17398 auto *NodeB = DT->
getNode(
I->getParent());
17399 assert(NodeA &&
"Should only process reachable instructions");
17400 assert(NodeB &&
"Should only process reachable instructions");
17401 assert((NodeA == NodeB) ==
17402 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17403 "Different nodes should have different DFS numbers");
17404 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17410 if (E->State == TreeEntry::SplitVectorize) {
17411 Res = FindLastInst();
17413 for (
auto *E : Entries) {
17414 auto *
I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17416 I = &getLastInstructionInBundle(E);
17426 if (GatheredLoadsEntriesFirst.has_value() &&
17427 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17428 Opcode == Instruction::Load) {
17429 Res = FindFirstInst();
17436 auto FindScheduleBundle = [&](
const TreeEntry *E) ->
const ScheduleBundle * {
17440 const auto *It = BlocksSchedules.find(BB);
17441 if (It == BlocksSchedules.end())
17443 for (
Value *V : E->Scalars) {
17444 auto *
I = dyn_cast<Instruction>(V);
17445 if (!
I || isa<PHINode>(
I) ||
17449 if (Bundles.
empty())
17452 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() == E; });
17453 if (It != Bundles.
end())
17458 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17459 if (!E->isGather() && !Bundle) {
17460 if ((Opcode == Instruction::GetElementPtr &&
17463 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17466 return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
17467 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17469 Res = FindLastInst();
17471 Res = FindFirstInst();
17481 assert(!E->isGather() &&
"Gathered instructions should not be scheduled");
17482 Res = Bundle->getBundle().back()->getInst();
17506 Res = FindLastInst();
17507 assert(Res &&
"Failed to find last instruction in bundle");
17512void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *E) {
17513 auto *Front = E->getMainOp();
17514 Instruction *LastInst = &getLastInstructionInBundle(E);
17515 assert(LastInst &&
"Failed to find last instruction in bundle");
17518 bool IsPHI = isa<PHINode>(LastInst);
17520 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
17521 if (LastInstIt != LastInst->
getParent()->end() &&
17522 LastInstIt->getParent()->isLandingPad())
17523 LastInstIt = std::next(LastInstIt);
17526 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17527 E->doesNotNeedToSchedule()) ||
17528 (GatheredLoadsEntriesFirst.has_value() &&
17529 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17530 E->getOpcode() == Instruction::Load)) {
17531 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
17535 Builder.SetInsertPoint(
17539 Builder.SetCurrentDebugLocation(Front->
getDebugLoc());
17542Value *BoUpSLP::gather(
17551 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
17554 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
17555 InsertBB = InsertBB->getSinglePredecessor();
17556 return InsertBB && InsertBB == InstBB;
17558 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
17559 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
17560 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17562 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
17563 PostponedIndices.
insert(
I).second)
17567 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
17570 if (Scalar->getType() != Ty) {
17571 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17574 if (
auto *CI = dyn_cast<CastInst>(Scalar);
17575 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
17577 if (
auto *IOp = dyn_cast<Instruction>(
Op);
17581 Scalar = Builder.CreateIntCast(
17586 if (
auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17590 auto *
II = dyn_cast<Instruction>(Vec);
17595 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17596 InsElt = dyn_cast<InsertElementInst>(Vec);
17600 GatherShuffleExtractSeq.
insert(InsElt);
17603 if (isa<Instruction>(V)) {
17606 User *UserOp =
nullptr;
17608 if (
auto *SI = dyn_cast<Instruction>(Scalar))
17611 if (
V->getType()->isVectorTy()) {
17612 if (
auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17613 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17616 if (
auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17617 if (SV->getOperand(0) ==
V)
17619 if (SV->getOperand(1) ==
V)
17630 "Failed to find shufflevector, caused by resize.");
17636 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17637 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17647 std::iota(
Mask.begin(),
Mask.end(), 0);
17648 Value *OriginalRoot = Root;
17649 if (
auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17650 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17651 SV->getOperand(0)->getType() == VecTy) {
17652 Root = SV->getOperand(0);
17653 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17656 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
17663 if (isa<PoisonValue>(VL[
I]))
17665 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
17669 if (isa<PoisonValue>(Vec)) {
17670 Vec = OriginalRoot;
17672 Vec = CreateShuffle(Root, Vec, Mask);
17673 if (
auto *OI = dyn_cast<Instruction>(OriginalRoot);
17674 OI && OI->use_empty() &&
17675 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
17676 return TE->VectorizedValue == OI;
17682 for (
int I : NonConsts)
17683 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
17686 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17687 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17725 bool IsFinalized =
false;
17738 class ShuffleIRBuilder {
17751 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17752 CSEBlocks(CSEBlocks),
DL(
DL) {}
17753 ~ShuffleIRBuilder() =
default;
17759 "Expected integer vector types only.");
17761 if (cast<VectorType>(V2->
getType())
17763 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
17765 ->getIntegerBitWidth())
17774 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
17775 GatherShuffleExtractSeq.
insert(
I);
17776 CSEBlocks.
insert(
I->getParent());
17785 unsigned VF = Mask.size();
17786 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
17790 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
17791 GatherShuffleExtractSeq.
insert(
I);
17792 CSEBlocks.
insert(
I->getParent());
17796 Value *createIdentity(
Value *V) {
return V; }
17797 Value *createPoison(
Type *Ty,
unsigned VF) {
17802 void resizeToMatch(
Value *&V1,
Value *&V2) {
17805 int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
17806 int V2VF = cast<FixedVectorType>(V2->
getType())->getNumElements();
17807 int VF = std::max(V1VF, V2VF);
17808 int MinVF = std::min(V1VF, V2VF);
17810 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
17812 Value *&
Op = MinVF == V1VF ? V1 : V2;
17814 if (
auto *
I = dyn_cast<Instruction>(
Op)) {
17815 GatherShuffleExtractSeq.
insert(
I);
17816 CSEBlocks.
insert(
I->getParent());
17829 assert(V1 &&
"Expected at least one vector value.");
17830 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17831 R.CSEBlocks, *R.DL);
17832 return BaseShuffleAnalysis::createShuffle<Value *>(
17833 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17839 std::optional<bool> IsSigned = std::nullopt) {
17840 auto *VecTy = cast<VectorType>(V->getType());
17849 Value *getVectorizedValue(
const TreeEntry &E) {
17850 Value *Vec = E.VectorizedValue;
17853 return castToScalarTyElem(Vec,
any_of(E.Scalars, [&](
Value *V) {
17854 return !isa<PoisonValue>(V) &&
17855 !isKnownNonNegative(
17856 V, SimplifyQuery(*R.DL));
17862 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17866 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17867 unsigned NumParts,
bool &UseVecBaseAsInput) {
17868 UseVecBaseAsInput =
false;
17870 Value *VecBase =
nullptr;
17872 if (!E->ReorderIndices.empty()) {
17874 E->ReorderIndices.end());
17877 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
17881 auto *EI = cast<ExtractElementInst>(VL[
I]);
17882 VecBase = EI->getVectorOperand();
17884 VecBase = TEs.front()->VectorizedValue;
17885 assert(VecBase &&
"Expected vectorized value.");
17886 UniqueBases.
insert(VecBase);
17889 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17890 (NumParts != 1 &&
count(VL, EI) > 1) ||
17892 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17893 return UTEs.empty() || UTEs.size() > 1 ||
17894 (isa<GetElementPtrInst>(U) &&
17895 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17897 count_if(R.VectorizableTree,
17898 [&](const std::unique_ptr<TreeEntry> &TE) {
17899 return TE->UserTreeIndex.UserTE ==
17901 is_contained(VL, EI);
17905 R.eraseInstruction(EI);
17907 if (NumParts == 1 || UniqueBases.
size() == 1) {
17908 assert(VecBase &&
"Expected vectorized value.");
17909 return castToScalarTyElem(VecBase);
17911 UseVecBaseAsInput =
true;
17921 Value *Vec =
nullptr;
17924 for (
unsigned Part : seq<unsigned>(NumParts)) {
17928 constexpr int MaxBases = 2;
17930 auto VLMask =
zip(SubVL, SubMask);
17931 const unsigned VF = std::accumulate(
17932 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
const auto &
D) {
17933 if (std::get<1>(D) == PoisonMaskElem)
17936 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
17937 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
17939 VecOp = TEs.front()->VectorizedValue;
17940 assert(VecOp &&
"Expected vectorized value.");
17941 const unsigned Size =
17942 cast<FixedVectorType>(VecOp->getType())->getNumElements();
17943 return std::max(S, Size);
17945 for (
const auto [V,
I] : VLMask) {
17948 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
17950 VecOp = TEs.front()->VectorizedValue;
17951 assert(VecOp &&
"Expected vectorized value.");
17952 VecOp = castToScalarTyElem(VecOp);
17953 Bases[
I / VF] = VecOp;
17955 if (!Bases.front())
17958 if (Bases.back()) {
17959 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
17960 TransformToIdentity(SubMask);
17962 SubVec = Bases.front();
17969 Mask.slice(
P * SliceSize,
17976 "Expected first part or all previous parts masked.");
17977 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17980 cast<FixedVectorType>(Vec->
getType())->getNumElements();
17982 unsigned SubVecVF =
17983 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
17984 NewVF = std::max(NewVF, SubVecVF);
17987 for (
int &
Idx : SubMask)
17990 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17991 Vec = createShuffle(Vec, SubVec, VecMask);
17992 TransformToIdentity(VecMask);
18000 std::optional<Value *>
18006 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
18008 return std::nullopt;
18011 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
18019 IsFinalized =
false;
18020 CommonMask.
clear();
18026 Value *V1 = getVectorizedValue(E1);
18027 Value *V2 = getVectorizedValue(E2);
18033 Value *V1 = getVectorizedValue(E1);
18038 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
18040 isa<FixedVectorType>(V2->
getType()) &&
18041 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18042 V1 = castToScalarTyElem(V1);
18043 V2 = castToScalarTyElem(V2);
18044 if (InVectors.
empty()) {
18047 CommonMask.
assign(Mask.begin(), Mask.end());
18051 if (InVectors.
size() == 2) {
18052 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
18053 transformMaskAfterShuffle(CommonMask, CommonMask);
18054 }
else if (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
18056 Vec = createShuffle(Vec,
nullptr, CommonMask);
18057 transformMaskAfterShuffle(CommonMask, CommonMask);
18059 V1 = createShuffle(V1, V2, Mask);
18060 unsigned VF = std::max(getVF(V1), getVF(Vec));
18061 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
18063 CommonMask[
Idx] =
Idx + VF;
18064 InVectors.
front() = Vec;
18065 if (InVectors.
size() == 2)
18066 InVectors.
back() = V1;
18073 "castToScalarTyElem expects V1 to be FixedVectorType");
18074 V1 = castToScalarTyElem(V1);
18075 if (InVectors.
empty()) {
18077 CommonMask.
assign(Mask.begin(), Mask.end());
18080 const auto *It =
find(InVectors, V1);
18081 if (It == InVectors.
end()) {
18082 if (InVectors.
size() == 2 ||
18085 if (InVectors.
size() == 2) {
18086 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
18087 transformMaskAfterShuffle(CommonMask, CommonMask);
18088 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18089 CommonMask.
size()) {
18090 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
18091 transformMaskAfterShuffle(CommonMask, CommonMask);
18093 unsigned VF = std::max(CommonMask.
size(), Mask.size());
18094 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
18096 CommonMask[
Idx] = V->getType() != V1->
getType()
18098 : Mask[
Idx] + getVF(V1);
18099 if (V->getType() != V1->
getType())
18100 V1 = createShuffle(V1,
nullptr, Mask);
18101 InVectors.
front() = V;
18102 if (InVectors.
size() == 2)
18103 InVectors.
back() = V1;
18110 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
18117 for (
Value *V : InVectors)
18118 VF = std::max(VF, getVF(V));
18119 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
18121 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.begin() ? 0 : VF);
18130 Value *Root =
nullptr) {
18131 return R.gather(VL, Root, ScalarTy,
18133 return createShuffle(V1, V2, Mask);
18142 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18147 IsFinalized =
true;
18150 if (InVectors.
size() == 2) {
18151 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
18154 Vec = createShuffle(Vec,
nullptr, CommonMask);
18156 transformMaskAfterShuffle(CommonMask, CommonMask);
18158 "Expected vector length for the final value before action.");
18159 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
18162 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18163 Vec = createShuffle(Vec,
nullptr, ResizeMask);
18166 return createShuffle(V1, V2, Mask);
18168 InVectors.
front() = Vec;
18170 if (!SubVectors.empty()) {
18172 if (InVectors.
size() == 2) {
18173 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
18176 Vec = createShuffle(Vec,
nullptr, CommonMask);
18178 transformMaskAfterShuffle(CommonMask, CommonMask);
18179 auto CreateSubVectors = [&](
Value *Vec,
18181 for (
auto [E,
Idx] : SubVectors) {
18182 Value *
V = getVectorizedValue(*E);
18189 Type *OrigScalarTy = ScalarTy;
18192 Builder, Vec, V, InsertionIndex,
18193 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
18195 ScalarTy = OrigScalarTy;
18196 if (!CommonMask.
empty()) {
18197 std::iota(std::next(CommonMask.
begin(),
Idx),
18198 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
18204 if (SubVectorsMask.
empty()) {
18205 Vec = CreateSubVectors(Vec, CommonMask);
18208 copy(SubVectorsMask, SVMask.begin());
18209 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
18212 I1 = I2 + CommonMask.
size();
18217 Vec = createShuffle(InsertVec, Vec, SVMask);
18218 transformMaskAfterShuffle(CommonMask, SVMask);
18220 InVectors.
front() = Vec;
18223 if (!ExtMask.
empty()) {
18224 if (CommonMask.
empty()) {
18228 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
18231 NewMask[
I] = CommonMask[ExtMask[
I]];
18233 CommonMask.
swap(NewMask);
18236 if (CommonMask.
empty()) {
18237 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
18238 return InVectors.
front();
18240 if (InVectors.
size() == 2)
18241 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
18242 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
18247 "Shuffle construction must be finalized.");
18251Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx) {
18255template <
typename BVTy,
typename ResTy,
typename... Args>
18256ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
18258 assert(E->isGather() &&
"Expected gather node.");
18259 unsigned VF = E->getVectorFactor();
18261 bool NeedFreeze =
false;
18264 for (
auto [EIdx,
Idx] : E->CombinedEntriesWithIndices)
18266 .slice(
Idx, VectorizableTree[EIdx]->getVectorFactor()),
18269 E->CombinedEntriesWithIndices.size());
18270 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18271 [&](
const auto &
P) {
18272 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18277 E->ReorderIndices.end());
18278 if (!ReorderMask.empty())
18284 if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
18285 for (
unsigned I : seq<unsigned>(GatheredScalars.size()))
18286 if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
18289 SubVectorsMask.
clear();
18293 unsigned I,
unsigned SliceSize,
18294 bool IsNotPoisonous) {
18296 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18299 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18300 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18301 if (UserTE->getNumOperands() != 2)
18303 if (!IsNotPoisonous) {
18304 auto *It =
find_if(
ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18305 [=](
const std::unique_ptr<TreeEntry> &TE) {
18306 return TE->UserTreeIndex.UserTE == UserTE &&
18307 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18309 if (It == VectorizableTree.end())
18312 if (!(*It)->ReorderIndices.empty()) {
18316 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
18317 Value *V0 = std::get<0>(
P);
18318 Value *V1 = std::get<1>(
P);
18319 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18320 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18326 if ((
Mask.size() < InputVF &&
18329 (
Mask.size() == InputVF &&
18332 std::next(
Mask.begin(),
I * SliceSize),
18333 std::next(
Mask.begin(),
18340 std::next(
Mask.begin(),
I * SliceSize),
18341 std::next(
Mask.begin(),
18347 BVTy ShuffleBuilder(ScalarTy, Params...);
18348 ResTy Res = ResTy();
18352 Value *ExtractVecBase =
nullptr;
18353 bool UseVecBaseAsInput =
false;
18356 Type *OrigScalarTy = GatheredScalars.front()->getType();
18359 if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
18361 bool Resized =
false;
18363 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18364 if (!ExtractShuffles.
empty()) {
18370 cast<ExtractElementInst>(StoredGS[
Idx])->getVectorOperand());
18372 ExtractEntries.
append(TEs.begin(), TEs.end());
18374 if (std::optional<ResTy> Delayed =
18375 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18377 PostponedGathers.
insert(E);
18382 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
18383 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18384 ExtractVecBase = VecBase;
18385 if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18386 if (VF == VecBaseTy->getNumElements() &&
18387 GatheredScalars.size() != VF) {
18389 GatheredScalars.append(VF - GatheredScalars.size(),
18397 if (!ExtractShuffles.
empty() || !E->hasState() ||
18398 E->getOpcode() != Instruction::Load ||
18399 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18400 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18403 return isa<LoadInst>(V) && isVectorized(V);
18405 (E->hasState() && E->isAltShuffle()) ||
18406 all_of(E->Scalars, [
this](
Value *V) { return isVectorized(V); }) ||
18408 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18410 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18412 if (!GatherShuffles.
empty()) {
18413 if (std::optional<ResTy> Delayed =
18414 ShuffleBuilder.needToDelay(E, Entries)) {
18416 PostponedGathers.
insert(E);
18421 if (GatherShuffles.
size() == 1 &&
18423 Entries.front().front()->isSame(E->Scalars)) {
18426 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
18429 Mask.resize(E->Scalars.size());
18430 const TreeEntry *FrontTE = Entries.front().front();
18431 if (FrontTE->ReorderIndices.empty() &&
18432 ((FrontTE->ReuseShuffleIndices.empty() &&
18433 E->Scalars.size() == FrontTE->Scalars.size()) ||
18434 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18435 std::iota(
Mask.begin(),
Mask.end(), 0);
18438 if (isa<PoisonValue>(V)) {
18442 Mask[
I] = FrontTE->findLaneForValue(V);
18447 ShuffleBuilder.resetForSameNode();
18448 ShuffleBuilder.add(*FrontTE, Mask);
18450 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18454 if (GatheredScalars.size() != VF &&
18456 return any_of(TEs, [&](
const TreeEntry *TE) {
18457 return TE->getVectorFactor() == VF;
18460 GatheredScalars.append(VF - GatheredScalars.size(),
18464 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
18472 bool IsRootPoison) {
18475 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
18482 int NumNonConsts = 0;
18485 if (isa<UndefValue>(V)) {
18486 if (!isa<PoisonValue>(V)) {
18501 Scalars.
front() = OrigV;
18504 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
18505 Scalars[Res.first->second] = OrigV;
18506 ReuseMask[
I] = Res.first->second;
18509 if (NumNonConsts == 1) {
18514 if (!UndefPos.
empty() && UndefPos.
front() == 0)
18517 ReuseMask[SinglePos] = SinglePos;
18518 }
else if (!UndefPos.
empty() && IsSplat) {
18523 return !isa<UndefValue>(V) &&
18525 (E->UserTreeIndex &&
any_of(
V->uses(), [E](
const Use &U) {
18528 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18529 is_contained(E->UserTreeIndex.UserTE->Scalars,
18533 if (It != Scalars.
end()) {
18535 int Pos = std::distance(Scalars.
begin(), It);
18536 for (
int I : UndefPos) {
18538 ReuseMask[
I] = Pos;
18547 for (
int I : UndefPos) {
18549 if (isa<UndefValue>(Scalars[
I]))
18556 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
18557 bool IsNonPoisoned =
true;
18558 bool IsUsedInExpr =
true;
18559 Value *Vec1 =
nullptr;
18560 if (!ExtractShuffles.
empty()) {
18564 Value *Vec2 =
nullptr;
18565 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
18569 if (UseVecBaseAsInput) {
18570 Vec1 = ExtractVecBase;
18572 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
18575 if (isa<UndefValue>(StoredGS[
I]))
18577 auto *EI = cast<ExtractElementInst>(StoredGS[
I]);
18578 Value *VecOp = EI->getVectorOperand();
18580 !TEs.
empty() && TEs.
front()->VectorizedValue)
18581 VecOp = TEs.
front()->VectorizedValue;
18584 }
else if (Vec1 != VecOp) {
18585 assert((!Vec2 || Vec2 == VecOp) &&
18586 "Expected only 1 or 2 vectors shuffle.");
18592 IsUsedInExpr =
false;
18595 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18598 IsUsedInExpr &= FindReusedSplat(
18600 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
18601 ExtractMask.size(), IsNotPoisonedVec);
18602 ShuffleBuilder.add(Vec1, ExtractMask,
true);
18603 IsNonPoisoned &= IsNotPoisonedVec;
18605 IsUsedInExpr =
false;
18610 if (!GatherShuffles.
empty()) {
18611 unsigned SliceSize =
18615 for (
const auto [
I, TEs] :
enumerate(Entries)) {
18618 "No shuffles with empty entries list expected.");
18622 "Expected shuffle of 1 or 2 entries.");
18626 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
18627 if (TEs.
size() == 1) {
18628 bool IsNotPoisonedVec =
18629 TEs.
front()->VectorizedValue
18633 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
18634 SliceSize, IsNotPoisonedVec);
18635 ShuffleBuilder.add(*TEs.
front(), VecMask);
18636 IsNonPoisoned &= IsNotPoisonedVec;
18638 IsUsedInExpr =
false;
18639 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
18640 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
18651 int EMSz = ExtractMask.size();
18652 int MSz =
Mask.size();
18655 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
18656 bool IsIdentityShuffle =
18657 ((UseVecBaseAsInput ||
18659 [](
const std::optional<TTI::ShuffleKind> &SK) {
18663 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
18665 (!GatherShuffles.
empty() &&
18667 [](
const std::optional<TTI::ShuffleKind> &SK) {
18671 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
18673 bool EnoughConstsForShuffle =
18677 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18681 return isa<Constant>(V) && !isa<UndefValue>(V);
18683 (!IsIdentityShuffle ||
18684 (GatheredScalars.size() == 2 &&
18686 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
18688 return isa<Constant>(V) && !isa<PoisonValue>(V);
18692 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
18693 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
18699 if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18701 TryPackScalars(GatheredScalars, BVMask,
true);
18702 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18703 ShuffleBuilder.add(BV, BVMask);
18706 return isa<PoisonValue>(V) ||
18707 (IsSingleShuffle && ((IsIdentityShuffle &&
18708 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18710 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18713 Res = ShuffleBuilder.finalize(
18714 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18716 bool IsSplat =
isSplat(NonConstants);
18718 TryPackScalars(NonConstants, BVMask,
false);
18719 auto CheckIfSplatIsProfitable = [&]() {
18727 Instruction::InsertElement, VecTy,
CostKind, 0,
18736 Instruction::InsertElement, VecTy,
CostKind,
18741 static_cast<int>(BVMask.size() - 1)) {
18749 return SplatCost <= BVCost;
18751 if (!IsSplat ||
Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18755 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
18761 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18763 transform(BVMask, SplatMask.begin(), [](
int I) {
18764 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18767 BV = CreateShuffle(BV,
nullptr, SplatMask);
18771 Vec = CreateShuffle(Vec, BV, Mask);
18780 TryPackScalars(GatheredScalars, ReuseMask,
true);
18781 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
18782 ShuffleBuilder.add(BV, ReuseMask);
18783 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18788 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
18789 if (!isa<PoisonValue>(V))
18792 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18793 ShuffleBuilder.add(BV, Mask);
18794 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18799 Res = ShuffleBuilder.createFreeze(Res);
18803Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy) {
18804 for (
auto [EIdx,
_] : E->CombinedEntriesWithIndices)
18806 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18814 for (
Value *V : VL)
18815 if (isa<Instruction>(V))
18829 Value *
V = E->Scalars.front();
18830 Type *ScalarTy =
V->getType();
18831 if (!isa<CmpInst>(V))
18833 auto It = MinBWs.
find(E);
18834 if (It != MinBWs.
end()) {
18835 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18840 if (E->VectorizedValue)
18841 return E->VectorizedValue;
18843 if (E->isGather()) {
18845 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18846 setInsertPointAfterBundle(E);
18847 Value *Vec = createBuildVector(E, ScalarTy);
18848 E->VectorizedValue = Vec;
18851 if (E->State == TreeEntry::SplitVectorize) {
18852 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18853 "Expected exactly 2 combined entries.");
18854 setInsertPointAfterBundle(E);
18856 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18859 "Expected same first part of scalars.");
18862 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18865 "Expected same second part of scalars.");
18867 auto GetOperandSignedness = [&](
const TreeEntry *OpE) {
18868 bool IsSigned =
false;
18869 auto It = MinBWs.
find(OpE);
18870 if (It != MinBWs.
end())
18871 IsSigned = It->second.second;
18874 if (isa<PoisonValue>(V))
18880 if (cast<VectorType>(Op1->
getType())->getElementType() !=
18887 cast<FixedVectorType>(Op1->
getType())->getNumElements()),
18888 GetOperandSignedness(&OpTE1));
18890 if (cast<VectorType>(Op2->
getType())->getElementType() !=
18897 cast<FixedVectorType>(Op2->
getType())->getNumElements()),
18898 GetOperandSignedness(&OpTE2));
18900 if (E->ReorderIndices.empty()) {
18904 std::next(
Mask.begin(), E->CombinedEntriesWithIndices.back().second),
18907 if (ScalarTyNumElements != 1) {
18913 E->CombinedEntriesWithIndices.back().second *
18914 ScalarTyNumElements);
18915 E->VectorizedValue = Vec;
18918 unsigned CommonVF =
18919 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18922 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE1.getVectorFactor()),
18928 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE2.getVectorFactor()),
18933 E->VectorizedValue = Vec;
18937 bool IsReverseOrder =
18938 !E->ReorderIndices.empty() &&
isReverseOrder(E->ReorderIndices);
18939 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E) {
18940 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
18941 if (E->getOpcode() == Instruction::Store &&
18942 E->State == TreeEntry::Vectorize) {
18944 ArrayRef(
reinterpret_cast<const int *
>(E->ReorderIndices.begin()),
18945 E->ReorderIndices.size());
18946 ShuffleBuilder.add(V, Mask);
18947 }
else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
18948 E->State == TreeEntry::CompressVectorize) {
18949 ShuffleBuilder.addOrdered(V, {});
18951 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
18954 E->CombinedEntriesWithIndices.size());
18956 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
18957 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18960 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
18961 "Expected either combined subnodes or reordering");
18962 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
18965 assert(!E->isGather() &&
"Unhandled state");
18966 unsigned ShuffleOrOp =
18967 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
18969 auto GetOperandSignedness = [&](
unsigned Idx) {
18970 const TreeEntry *OpE = getOperandEntry(E,
Idx);
18971 bool IsSigned =
false;
18972 auto It = MinBWs.
find(OpE);
18973 if (It != MinBWs.
end())
18974 IsSigned = It->second.second;
18977 if (isa<PoisonValue>(V))
18979 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18983 switch (ShuffleOrOp) {
18984 case Instruction::PHI: {
18985 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
18986 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
18987 "PHI reordering is free.");
18988 auto *PH = cast<PHINode>(VL0);
18990 PH->getParent()->getFirstNonPHIIt());
18997 PH->getParent()->getFirstInsertionPt());
19000 V = FinalShuffle(V, E);
19002 E->VectorizedValue =
V;
19011 for (
unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19016 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
19020 if (!VisitedBBs.
insert(IBB).second) {
19023 TreeEntry *OpTE = getOperandEntry(E,
I);
19024 assert(!OpTE->VectorizedValue &&
"Expected no vectorized value.");
19025 OpTE->VectorizedValue = VecOp;
19031 Value *Vec = vectorizeOperand(E,
I);
19032 if (VecTy != Vec->
getType()) {
19034 MinBWs.
contains(getOperandEntry(E,
I))) &&
19035 "Expected item in MinBWs.");
19036 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
19042 "Invalid number of incoming values");
19043 assert(E->VectorizedValue &&
"Expected vectorized value.");
19044 return E->VectorizedValue;
19047 case Instruction::ExtractElement: {
19048 Value *
V = E->getSingleOperand(0);
19049 setInsertPointAfterBundle(E);
19050 V = FinalShuffle(V, E);
19051 E->VectorizedValue =
V;
19054 case Instruction::ExtractValue: {
19055 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19060 NewV = FinalShuffle(NewV, E);
19061 E->VectorizedValue = NewV;
19064 case Instruction::InsertElement: {
19065 assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
19067 Value *
V = vectorizeOperand(E, 1);
19069 Type *ScalarTy =
Op.front()->getType();
19070 if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
19072 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
19073 assert(Res.first > 0 &&
"Expected item in MinBWs.");
19078 cast<FixedVectorType>(
V->getType())->getNumElements()),
19083 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
19084 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19086 const unsigned NumElts =
19087 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19088 const unsigned NumScalars = E->Scalars.size();
19091 assert(
Offset < NumElts &&
"Failed to find vector index offset");
19095 if (!E->ReorderIndices.empty()) {
19100 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
19103 bool IsIdentity =
true;
19105 Mask.swap(PrevMask);
19106 for (
unsigned I = 0;
I < NumScalars; ++
I) {
19107 Value *Scalar = E->Scalars[PrevMask[
I]];
19109 IsIdentity &= InsertIdx -
Offset ==
I;
19112 if (!IsIdentity || NumElts != NumScalars) {
19113 Value *V2 =
nullptr;
19114 bool IsVNonPoisonous =
19117 if (NumElts != NumScalars &&
Offset == 0) {
19126 InsertMask[*InsertIdx] = *InsertIdx;
19127 if (!
Ins->hasOneUse())
19129 Ins = dyn_cast_or_null<InsertElementInst>(
19130 Ins->getUniqueUndroppableUser());
19133 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19135 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19138 if (!IsFirstPoison.
all()) {
19140 for (
unsigned I = 0;
I < NumElts;
I++) {
19142 IsFirstUndef.
test(
I)) {
19143 if (IsVNonPoisonous) {
19144 InsertMask[
I] =
I < NumScalars ?
I : 0;
19149 if (
Idx >= NumScalars)
19150 Idx = NumScalars - 1;
19151 InsertMask[
I] = NumScalars +
Idx;
19165 if (
auto *
I = dyn_cast<Instruction>(V)) {
19166 GatherShuffleExtractSeq.
insert(
I);
19167 CSEBlocks.
insert(
I->getParent());
19172 for (
unsigned I = 0;
I < NumElts;
I++) {
19177 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19180 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
19181 NumElts != NumScalars) {
19182 if (IsFirstUndef.
all()) {
19185 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19186 if (!IsFirstPoison.
all()) {
19187 for (
unsigned I = 0;
I < NumElts;
I++) {
19189 InsertMask[
I] =
I + NumElts;
19196 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
19197 if (
auto *
I = dyn_cast<Instruction>(V)) {
19198 GatherShuffleExtractSeq.
insert(
I);
19199 CSEBlocks.
insert(
I->getParent());
19204 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19205 for (
unsigned I = 0;
I < NumElts;
I++) {
19209 InsertMask[
I] += NumElts;
19212 FirstInsert->getOperand(0), V, InsertMask,
19213 cast<Instruction>(E->Scalars.back())->getName());
19214 if (
auto *
I = dyn_cast<Instruction>(V)) {
19215 GatherShuffleExtractSeq.
insert(
I);
19216 CSEBlocks.
insert(
I->getParent());
19221 ++NumVectorInstructions;
19222 E->VectorizedValue =
V;
19225 case Instruction::ZExt:
19226 case Instruction::SExt:
19227 case Instruction::FPToUI:
19228 case Instruction::FPToSI:
19229 case Instruction::FPExt:
19230 case Instruction::PtrToInt:
19231 case Instruction::IntToPtr:
19232 case Instruction::SIToFP:
19233 case Instruction::UIToFP:
19234 case Instruction::Trunc:
19235 case Instruction::FPTrunc:
19236 case Instruction::BitCast: {
19237 setInsertPointAfterBundle(E);
19239 Value *InVec = vectorizeOperand(E, 0);
19241 auto *CI = cast<CastInst>(VL0);
19243 Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
19244 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
19246 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
19249 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
19250 if (SrcIt != MinBWs.
end())
19251 SrcBWSz = SrcIt->second.first;
19253 if (BWSz == SrcBWSz) {
19254 VecOpcode = Instruction::BitCast;
19255 }
else if (BWSz < SrcBWSz) {
19256 VecOpcode = Instruction::Trunc;
19257 }
else if (It != MinBWs.
end()) {
19258 assert(BWSz > SrcBWSz &&
"Invalid cast!");
19259 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19260 }
else if (SrcIt != MinBWs.
end()) {
19261 assert(BWSz > SrcBWSz &&
"Invalid cast!");
19263 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19265 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
19266 !SrcIt->second.second) {
19267 VecOpcode = Instruction::UIToFP;
19269 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19271 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
19272 V = FinalShuffle(V, E);
19274 E->VectorizedValue =
V;
19275 ++NumVectorInstructions;
19278 case Instruction::FCmp:
19279 case Instruction::ICmp: {
19280 setInsertPointAfterBundle(E);
19282 Value *
L = vectorizeOperand(E, 0);
19283 Value *
R = vectorizeOperand(E, 1);
19284 if (
L->getType() !=
R->getType()) {
19286 getOperandEntry(E, 1)->
isGather() ||
19287 MinBWs.
contains(getOperandEntry(E, 0)) ||
19288 MinBWs.
contains(getOperandEntry(E, 1))) &&
19289 "Expected item in MinBWs.");
19290 if (cast<VectorType>(
L->getType())
19292 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
19294 ->getIntegerBitWidth()) {
19295 Type *CastTy =
R->getType();
19298 Type *CastTy =
L->getType();
19306 if (
auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.
end())
19307 ICmp->setSameSign(
false);
19309 VecTy = cast<FixedVectorType>(
V->getType());
19310 V = FinalShuffle(V, E);
19312 E->VectorizedValue =
V;
19313 ++NumVectorInstructions;
19316 case Instruction::Select: {
19317 setInsertPointAfterBundle(E);
19320 Value *True = vectorizeOperand(E, 1);
19321 Value *False = vectorizeOperand(E, 2);
19324 getOperandEntry(E, 2)->
isGather() ||
19325 MinBWs.
contains(getOperandEntry(E, 1)) ||
19326 MinBWs.
contains(getOperandEntry(E, 2))) &&
19327 "Expected item in MinBWs.");
19328 if (True->
getType() != VecTy)
19329 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
19330 if (False->
getType() != VecTy)
19331 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
19336 assert(TrueNumElements >= CondNumElements &&
19337 TrueNumElements % CondNumElements == 0 &&
19338 "Cannot vectorize Instruction::Select");
19340 "Cannot vectorize Instruction::Select");
19341 if (CondNumElements != TrueNumElements) {
19349 "Cannot vectorize Instruction::Select");
19351 V = FinalShuffle(V, E);
19353 E->VectorizedValue =
V;
19354 ++NumVectorInstructions;
19357 case Instruction::FNeg: {
19358 setInsertPointAfterBundle(E);
19360 Value *
Op = vectorizeOperand(E, 0);
19365 if (
auto *
I = dyn_cast<Instruction>(V))
19368 V = FinalShuffle(V, E);
19370 E->VectorizedValue =
V;
19371 ++NumVectorInstructions;
19375 case Instruction::Freeze: {
19376 setInsertPointAfterBundle(E);
19378 Value *
Op = vectorizeOperand(E, 0);
19380 if (
Op->getType() != VecTy) {
19382 MinBWs.
contains(getOperandEntry(E, 0))) &&
19383 "Expected item in MinBWs.");
19387 V = FinalShuffle(V, E);
19389 E->VectorizedValue =
V;
19390 ++NumVectorInstructions;
19394 case Instruction::Add:
19395 case Instruction::FAdd:
19396 case Instruction::Sub:
19397 case Instruction::FSub:
19398 case Instruction::Mul:
19399 case Instruction::FMul:
19400 case Instruction::UDiv:
19401 case Instruction::SDiv:
19402 case Instruction::FDiv:
19403 case Instruction::URem:
19404 case Instruction::SRem:
19405 case Instruction::FRem:
19406 case Instruction::Shl:
19407 case Instruction::LShr:
19408 case Instruction::AShr:
19409 case Instruction::And:
19410 case Instruction::Or:
19411 case Instruction::Xor: {
19412 setInsertPointAfterBundle(E);
19414 Value *
LHS = vectorizeOperand(E, 0);
19415 Value *
RHS = vectorizeOperand(E, 1);
19416 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
19417 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19420 auto *CI = dyn_cast<ConstantInt>(
Op);
19421 return CI && CI->getValue().countr_one() >= It->second.first;
19423 V = FinalShuffle(
I == 0 ? RHS : LHS, E);
19424 E->VectorizedValue =
V;
19425 ++NumVectorInstructions;
19432 getOperandEntry(E, 1)->
isGather() ||
19433 MinBWs.
contains(getOperandEntry(E, 0)) ||
19434 MinBWs.
contains(getOperandEntry(E, 1))) &&
19435 "Expected item in MinBWs.");
19446 if (
auto *
I = dyn_cast<Instruction>(V)) {
19449 if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
19451 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19453 I->setHasNoUnsignedWrap(
false);
19456 V = FinalShuffle(V, E);
19458 E->VectorizedValue =
V;
19459 ++NumVectorInstructions;
19463 case Instruction::Load: {
19466 setInsertPointAfterBundle(E);
19468 LoadInst *LI = cast<LoadInst>(VL0);
19471 if (E->State == TreeEntry::Vectorize) {
19473 }
else if (E->State == TreeEntry::CompressVectorize) {
19474 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19475 CompressEntryToData.at(E);
19482 for (
int I : CompressMask)
19484 if (
auto *VecTy = dyn_cast<FixedVectorType>(LI->
getType())) {
19496 if (
auto *VecTy = dyn_cast<FixedVectorType>(LI->
getType())) {
19503 }
else if (E->State == TreeEntry::StridedVectorize) {
19504 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19505 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19506 PO = IsReverseOrder ? PtrN : Ptr0;
19513 *Diff / (
static_cast<int64_t
>(E->Scalars.size()) - 1);
19515 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
19516 DL->getTypeAllocSize(ScalarTy));
19520 return cast<LoadInst>(V)->getPointerOperand();
19523 std::optional<Value *> Stride =
19532 (IsReverseOrder ? -1 : 1) *
19533 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
19535 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19537 Intrinsic::experimental_vp_strided_load,
19538 {VecTy, PO->
getType(), StrideTy},
19540 Builder.
getInt32(E->Scalars.size())});
19546 assert(E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
19547 Value *VecPtr = vectorizeOperand(E, 0);
19548 if (isa<FixedVectorType>(ScalarTy)) {
19552 unsigned ScalarTyNumElements =
19553 cast<FixedVectorType>(ScalarTy)->getNumElements();
19554 unsigned VecTyNumElements =
19555 cast<FixedVectorType>(VecTy)->getNumElements();
19556 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19557 "Cannot expand getelementptr.");
19558 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19561 return Builder.getInt64(I % ScalarTyNumElements);
19570 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19573 Value *
V = E->State == TreeEntry::CompressVectorize
19577 V = FinalShuffle(V, E);
19578 E->VectorizedValue =
V;
19579 ++NumVectorInstructions;
19582 case Instruction::Store: {
19583 auto *
SI = cast<StoreInst>(VL0);
19585 setInsertPointAfterBundle(E);
19587 Value *VecValue = vectorizeOperand(E, 0);
19588 if (VecValue->
getType() != VecTy)
19590 Builder.
CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19591 VecValue = FinalShuffle(VecValue, E);
19595 if (E->State == TreeEntry::Vectorize) {
19598 assert(E->State == TreeEntry::StridedVectorize &&
19599 "Expected either strided or consecutive stores.");
19600 if (!E->ReorderIndices.empty()) {
19601 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19602 Ptr =
SI->getPointerOperand();
19604 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19605 Type *StrideTy =
DL->getIndexType(
SI->getPointerOperandType());
19607 Intrinsic::experimental_vp_strided_store,
19608 {VecTy,
Ptr->getType(), StrideTy},
19611 StrideTy, -
static_cast<int>(
DL->getTypeAllocSize(ScalarTy))),
19613 Builder.
getInt32(E->Scalars.size())});
19622 E->VectorizedValue =
V;
19623 ++NumVectorInstructions;
19626 case Instruction::GetElementPtr: {
19627 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19628 setInsertPointAfterBundle(E);
19630 Value *Op0 = vectorizeOperand(E, 0);
19633 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
19634 Value *OpVec = vectorizeOperand(E, J);
19638 Value *
V = Builder.
CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19639 if (
Instruction *
I = dyn_cast<GetElementPtrInst>(V)) {
19641 for (
Value *V : E->Scalars) {
19642 if (isa<GetElementPtrInst>(V))
19648 V = FinalShuffle(V, E);
19650 E->VectorizedValue =
V;
19651 ++NumVectorInstructions;
19655 case Instruction::Call: {
19656 CallInst *CI = cast<CallInst>(VL0);
19657 setInsertPointAfterBundle(E);
19663 It != MinBWs.
end() ? It->second.first : 0,
TTI);
19666 VecCallCosts.first <= VecCallCosts.second;
19668 Value *ScalarArg =
nullptr;
19674 auto *CEI = cast<CallInst>(VL0);
19675 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
19679 ScalarArg = CEI->getArgOperand(
I);
19682 if (
ID == Intrinsic::abs && It != MinBWs.
end() &&
19683 It->second.first <
DL->getTypeSizeInBits(CEI->getType()))
19691 Value *OpVec = vectorizeOperand(E,
I);
19692 ScalarArg = CEI->getArgOperand(
I);
19693 if (cast<VectorType>(OpVec->
getType())->getElementType() !=
19695 It == MinBWs.
end()) {
19698 OpVec = Builder.
CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
19699 }
else if (It != MinBWs.
end()) {
19700 OpVec = Builder.
CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
19709 if (!UseIntrinsic) {
19724 V = FinalShuffle(V, E);
19726 E->VectorizedValue =
V;
19727 ++NumVectorInstructions;
19730 case Instruction::ShuffleVector: {
19732 if (
SLPReVec && !E->isAltShuffle()) {
19733 setInsertPointAfterBundle(E);
19734 Value *Src = vectorizeOperand(E, 0);
19736 if (
auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19739 return SVSrc->getShuffleMask()[Mask];
19742 SVSrc->getOperand(1), NewMask);
19747 if (
auto *
I = dyn_cast<Instruction>(V))
19749 V = FinalShuffle(V, E);
19751 assert(E->isAltShuffle() &&
19756 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19757 "Invalid Shuffle Vector Operand");
19761 setInsertPointAfterBundle(E);
19762 LHS = vectorizeOperand(E, 0);
19763 RHS = vectorizeOperand(E, 1);
19765 setInsertPointAfterBundle(E);
19766 LHS = vectorizeOperand(E, 0);
19773 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19774 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19775 MinBWs.
contains(getOperandEntry(E, 0)) ||
19776 MinBWs.
contains(getOperandEntry(E, 1))) &&
19777 "Expected item in MinBWs.");
19778 Type *CastTy = VecTy;
19782 ->getIntegerBitWidth() < cast<VectorType>(
RHS->
getType())
19784 ->getIntegerBitWidth())
19801 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19802 V0 = Builder.
CreateCmp(CI0->getPredicate(), LHS, RHS);
19803 auto *AltCI = cast<CmpInst>(E->getAltOp());
19805 V1 = Builder.
CreateCmp(AltPred, LHS, RHS);
19808 unsigned SrcBWSz =
DL->getTypeSizeInBits(
19809 cast<VectorType>(
LHS->
getType())->getElementType());
19810 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
19811 if (BWSz <= SrcBWSz) {
19812 if (BWSz < SrcBWSz)
19815 "Expected same type as operand.");
19816 if (
auto *
I = dyn_cast<Instruction>(LHS))
19818 LHS = FinalShuffle(LHS, E);
19819 E->VectorizedValue =
LHS;
19820 ++NumVectorInstructions;
19831 for (
Value *V : {V0, V1}) {
19832 if (
auto *
I = dyn_cast<Instruction>(V)) {
19833 GatherShuffleExtractSeq.
insert(
I);
19834 CSEBlocks.
insert(
I->getParent());
19843 E->buildAltOpShuffleMask(
19845 assert(E->getMatchingMainOpOrAltOp(
I) &&
19846 "Unexpected main/alternate opcode");
19850 Mask, &OpScalars, &AltScalars);
19854 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
19856 if (
auto *
I = dyn_cast<Instruction>(Vec);
19857 I && Opcode == Instruction::Sub && !MinBWs.
contains(E) &&
19859 if (isa<PoisonValue>(V))
19861 auto *IV = cast<Instruction>(V);
19862 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19864 I->setHasNoUnsignedWrap(
false);
19866 DropNuwFlag(V0, E->getOpcode());
19867 DropNuwFlag(V1, E->getAltOpcode());
19869 if (
auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19874 if (
auto *
I = dyn_cast<Instruction>(V)) {
19876 GatherShuffleExtractSeq.
insert(
I);
19877 CSEBlocks.
insert(
I->getParent());
19881 E->VectorizedValue =
V;
19882 ++NumVectorInstructions;
Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // ...
  EntryToLastInstruction.clear();
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache the last instruction of every bundle up front, before vectorization
  // introduces side effects such as extra uses.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }
  // Pre-cache the insertion points for gather nodes whose user node is fully
  // used outside of its own block.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); }))
      (void)getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
  }
  // ...
  for (auto &Entry : GatherEntries) {
    // ...
  }
  // Emit gathered loads first to emit better code for their users.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      // ...
    }
  }
  // Vectorize the postponed (gather) nodes, replacing the stub instructions
  // emitted for them earlier.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
    // If the user is a PHI node, the stub may have been emitted at a position
    // that is no longer valid; re-anchor the insertion point before all users
    // in the block.
    if (isa<PHINode>(UserI)) {
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    // ...
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        // Scan through the gather nodes as well.
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
          IsSigned = IsSigned.value_or(false) ||
                     !isKnownNonNegative(EE->getVectorOperand(),
                                         SimplifyQuery(*DL));
          continue;
        }
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for postponed nodes.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ...
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Avoid extracting GEPs that were not turned into real instructions.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just reuse (and possibly move) the
          // one already created in the relevant block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve the final codegen.
          if (ReplaceInst) {
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, ExternalUse.Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // REVEC: the external "scalar" is itself a vector; extract a
            // subvector instead of a single element.
            unsigned VecTyNumElements = VecTy->getNumElements();
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, ExternalUse.Lane);
          }
          // If necessary, sign-extend or zero-extend the extract back to the
          // original scalar type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(),
                !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      // ...
      return Vec;
    };
    // If User == nullptr, the Scalar remains as a scalar in the vectorized
    // instructions or is used as an extra argument.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesWithNonUsers.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(Scalar->users(),
                  [&](llvm::User *U) {
                    if (ExternalUsesAsOriginalScalar.contains(U))
                      return true;
                    ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                    return !UseEntries.empty() &&
                           (E->State == TreeEntry::Vectorize ||
                            E->State == TreeEntry::StridedVectorize ||
                            E->State == TreeEntry::CompressVectorize) &&
                           any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                             return (UseEntry->State == TreeEntry::Vectorize ||
                                     UseEntry->State ==
                                         TreeEntry::StridedVectorize ||
                                     UseEntry->State ==
                                         TreeEntry::CompressVectorize) &&
                                    doesInTreeUserNeedToExtract(
                                        Scalar,
                                        getRootEntryInstruction(*UseEntry),
                                        TLI, TTI);
                           });
                  })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }
    // Check if this value is inserted into an insertelement instruction that
    // is part of a buildvector sequence.
    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  // ...
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users and find the insertion point.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator))
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            else
              Builder.SetInsertPoint(IncomingTerminator);
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // ...
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the original chain of insertelements on top of the new value.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    // ...
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      // ...
    }
  }
  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
        continue;

      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((isVectorized(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  // If the vectorized instructions are reduction ops in the ignore list,
  // replace their non-reduction uses with poison.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex /* ... */) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() /* ... */) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition of a poison-propagating logical op.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // ...
    }
  }
  removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);

  // ...
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    // ...
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the gather sequences out of loops where possible.
  for (Instruction *I : GatherShuffleExtractSeq) {
    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or any element we insert into it is defined inside the
    // loop, we cannot hoist the gather sequence.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Sort blocks by domination, so a block is visited after all blocks
  // dominating it.
  // ...
  stable_sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });
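  // Hedged example (hypothetical IR) of the "identical or less defined"
  // relation checked by the lambda below: two shuffles over the same operands
  // whose masks differ only in poison lanes can be merged, e.g. roughly
  //   %s1 = shufflevector <4 x i32> %a, <4 x i32> %b, <i32 0, i32 poison, i32 2, i32 3>
  //   %s2 = shufflevector <4 x i32> %a, <4 x i32> %b, <i32 0, i32 1, i32 2, i32 3>
  // Here %s1 is identical-or-less-defined than %s2, so uses of %s1 can be
  // redirected to %s2; the combined mask is accumulated in NewMask.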
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the trailing undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Scan the blocks in dominance order and CSE the gather sequences.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // Copyable elements get extra schedule data of their own.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      // Group the instructions to a bundle.
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    // Group the instructions to a bundle.
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) ||
      (!HasCopyables && doesNotNeedToSchedule(VL)) ||
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    SmallVector<ScheduleData *> ControlDependentMembers;
    // Instructions whose operands were replaced by copyable data need their
    // dependencies recalculated.
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
          if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
              BundleMember && BundleMember->hasValidDependencies()) {
            BundleMember->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(
                    BundleMember->getInst()))
              ControlDependentMembers.push_back(BundleMember);
          }
          continue;
        }
        auto *SD = cast<ScheduleData>(SE);
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes all dependencies invalid
    // for the whole region.
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it is important
    // that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // The region would grow past its size limit; recalculate what we must
      // and abort this bundle.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }
  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    SmallVector<ScheduleCopyableData *> CopyableData =
        getScheduleCopyableData(cast<Instruction>(V));
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (!BundleMember)
      continue;

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
      continue;
    }

    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // Copyable data scheduling is simply removed.
      if (isa<ScheduleCopyableData>(BD))
        continue;
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the extra schedule data that was allocated for the copyable
        // element.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, I);
        assert(It != Op.end() && "Lane not set");
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (!Visited.insert(In).second) {
            It = find(std::next(It), Op.end(), I);
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                  .pop_back();
          It = find(std::next(It), Op.end(), I);
        } while (It != Op.end());
        EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // The remaining plain schedule data may need its dependencies
        // recalculated.
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
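// Note on the allocator above: ScheduleData objects are handed out from
// fixed-size chunks rather than allocated individually, so pointers stored in
// ScheduleDataMap stay stable (only whole chunks are appended, nothing is
// reallocated). A rough usage sketch, with names as used in this file:
//
//   ScheduleData *SD = allocateScheduleDataChunks();
//   ScheduleDataMap[I] = SD;
//   SD->init(SchedulingRegionID, I);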
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info and other "assume-like" intrinsics so they don't count against
  // the budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (isa<PHINode>(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
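// calculateDependencies() below records three kinds of edges: def-use
// dependencies (users inside the region), control dependencies (instructions
// that must not move across a call that may not return, a stacksave/
// stackrestore, or an alloca), and memory dependencies. Hedged example
// (hypothetical IR) for the memory case, roughly:
//
//   store i32 %x, ptr %p        ; BundleMember
//   %v = load i32, ptr %q       ; later access; if %p and %q may alias,
//                               ; the load gets a memory dependency on the
//                               ; store so the scheduler never reorders them.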
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      // Walk the lanes of the user entry to find the in-tree users of the
      // copyable element.
      ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
      const auto *It = find(Op, CD->getInst());
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
            !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
        if (EI.UserTE->isCopyableElement(In)) {
          // The in-tree user is itself a copyable element.
          if (ScheduleCopyableData *UseSD =
                  getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        } else if (Visited.insert(In).second) {
          if (ScheduleData *UseSD = getScheduleData(In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        }
        It = find(std::next(It), Op.end(), CD->getInst());
      } while (It != Op.end());
      // Keep copyable elements with cross-block (or heavily used PHI) users
      // from being considered ready too early, via an artificial dependency.
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (isa<PHINode>(EI.UserTE->getMainOp()) &&
             (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
              any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
                auto *IU = dyn_cast<Instruction>(U);
                if (!IU)
                  return true;
                return IU->getParent() == EI.UserTE->getMainOp()->getParent();
              })))))) {
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
                      << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<User *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (isa<PHINode>(U))
        continue;
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        // If all operands of the user were replaced by copyable data, there
        // is no direct dependency left.
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which precedes it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
      }
    }
    if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave; likewise, allocas must not be
      // reordered above a preceding stackrestore.
      if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
              match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            break;

          if (!isa<AllocaInst>(I))
            continue;

          // Add the dependency.
          MakeControlDependent(I);
        }
      }

      // We also need to prevent allocas and loads/stores from moving below a
      // stacksave or a stackrestore.
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;

          // Add the dependency.
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // Two limits bound the complexity: AliasedCheckLimit caps the number of
      // expensive SLP->isAliased calls, and MaxMemDepDistance aborts the walk
      // for very large blocks.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        ++NumAliased;
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      ++DistToSrc;
    }
  };
21365 "expected at least one instruction to schedule");
21367 WorkList.
push_back(Bundle.getBundle().front());
21370 while (!WorkList.
empty()) {
21374 if (
auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21375 CopyableBundle.
push_back(&CD->getBundle());
21376 Bundles = CopyableBundle;
21378 Bundles = getScheduleBundles(SD->getInst());
21380 if (Bundles.
empty()) {
21381 if (!SD->hasValidDependencies())
21383 if (InsertInReadyList && SD->isReady()) {
21384 ReadyInsts.insert(SD);
21385 LLVM_DEBUG(
dbgs() <<
"SLP: gets ready on update: " << *SD <<
"\n");
21389 for (ScheduleBundle *Bundle : Bundles) {
21390 if (Bundle->hasValidDependencies() || !Visited.
insert(Bundle).second)
21392 assert(isInSchedulingRegion(*Bundle) &&
21393 "ScheduleData not in scheduling region");
21394 for_each(Bundle->getBundle(), ProcessNode);
21396 if (InsertInReadyList && SD->isReady()) {
21397 for (ScheduleBundle *Bundle : Bundles) {
21398 assert(isInSchedulingRegion(*Bundle) &&
21399 "ScheduleData not in scheduling region");
21400 if (!Bundle->isReady())
21402 ReadyInsts.insert(Bundle);
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
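// scheduleBlock() below performs classic list scheduling: every schedulable
// entity gets a priority equal to its original position in the block, the
// ready list is a std::set ordered by that priority, and picking an entity
// moves its instruction(s) directly in front of LastScheduledInst. A rough
// (hypothetical) trace for one two-element bundle {%a, %b}:
//
//   pick bundle -> move %a before LastScheduledInst
//               -> move %b before %a's new position; LastScheduledInst = %b
//               -> schedule() releases entities that just became ready.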
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // If we got here, pre-scheduling found a valid scheduling of the sub-graph
  // consisting of all vector bundles and their transitive users.
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              SDTEs.front()->isCopyableElement(I)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // Instructions used by copyable elements need their dependencies
        // calculated even though they are not members of any bundle.
        ScheduleBundle Bundle;
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);
  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // A copyable element may still own ordinary schedule data; do not
        // move such an instruction twice.
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
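// getVectorElementSize() below prefers the width of the memory operations
// feeding a value over the width of the value's own type. Hedged example
// (hypothetical IR), roughly:
//
//   %l = load i64, ptr %p
//   %t = trunc i64 %l to i32
//
// Asking for the element size of %t walks back to the load and returns 64;
// that width then determines how many lanes fit in one vector register.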
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or the value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, traverse the expression tree looking for loads that
  // feed it: the width of memory operations is a better basis for the vector
  // element size than V's own type.
  // ...
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent(), 0);
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, give up.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent, Level] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip it.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load (or extract), update Width to
    // reflect the width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    else {
      // Otherwise visit the operands, but only the interesting cases handled
      // by buildTree.
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent(), Level + 1);
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
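// collectValuesToDemote() below drives the bitwidth-demotion (minimum bit
// width) analysis. Hedged example (hypothetical IR) of what a successful
// demotion enables, roughly:
//
//   %z0 = zext i8 %a to i32
//   %z1 = zext i8 %b to i32
//   %s  = add i32 %z0, %z1
//   %t  = trunc i32 %s to i8
//
// The add can be performed entirely in i8; MinBWs then records bitwidth 8 for
// the add node and the zext/trunc pair disappears from the vector code.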
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...

  // Check if the node was analyzed already and must be kept original.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedVal)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // For gather nodes, demotion is only profitable in specific cases.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
            SameTE)
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(E.Idx);
            return true;
          }
      }
      // Check if it is a gather of extractelements with profitable bases.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          return false;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          // ... same number of vector registers before and after demotion
          true)
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths smaller than OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          NeedToExit = true;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise
          // exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit are in the
    // demoted type.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }
  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    // Pick the candidate bitwidth with the cheapest vector intrinsic cost
    // (the per-width cost computation is elided in this listing).
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    auto Checker = [&](unsigned BitWidth, unsigned) {
      InstructionCost Cost = /* intrinsic cost at this width, elided */ 0;
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
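// computeMinimumValueSizes() below drives the demotion analysis above over
// the whole vectorizable tree: for each candidate root it calls
// ComputeMaxBitWidth (defined inside) and records the result in MinBWs so
// that vectorizeTree() can emit the narrow operations plus a single final
// extension. For the hypothetical i8 add example shown earlier, MinBWs would
// map the add node to a width of 8 together with its signedness flag.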
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // If the first value node is sext/zext/trunc, skip it and resize to the
  // final type instead.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // The reduction was analyzed already and found not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type
    // rather than sign-extended: determine whether the sign bit of all roots
    // is known to be zero.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // Check which bits of the roots are actually demanded; if not all bits
    // are, we can truncate the roots to a narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, add one to the maximum
      // bit width to preserve the sign bit for later sign-extension.
      if (!IsKnownPositive)
        ++BitWidth1;

      auto *I = dyn_cast<Instruction>(Root);
      if (!I) {
        MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
        continue;
      }
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the demoted graph still needs the same number of vector registers,
    // demotion is not profitable.
    if (NumParts > 1 &&
        NumParts == ::getNumberOfParts(
                        *TTI, getWidenedType(IntegerType::get(
                                                 F->getContext(),
                                                 bit_ceil(MaxBitWidth)),
                                             VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };
  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
        return match(V, m_SMin(m_Value(), m_Value())) ||
               match(V, m_SMax(m_Value(), m_Value()));
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC &&
                          (IC->isSigned() ||
                           !isKnownNonNegative(IC->getOperand(0),
                                               SimplifyQuery(*DL)) ||
                           !isKnownNonNegative(IC->getOperand(1),
                                               SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is not narrower than the roots'
    // type, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert_range(ToDemote);
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
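// A minimal sketch (assumed pipeline, not from this file) of how the pass is
// typically reached through the new pass manager; FPM and FAM are
// hypothetical local names:
//
//   FunctionPassManager FPM;
//   FPM.addPass(SLPVectorizerPass());
//   FPM.run(F, FAM); // dispatches to SLPVectorizerPass::run() below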
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // (analysis results are obtained from AM; elided in this listing)
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, /* analyses elided */ ...) {
  // ...
  DL = &F.getDataLayout();
  // ...
  bool Changed = false;

  // If the target claims to have no vector registers, don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom-up SLP vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    // ...
  }
  return Changed;
}
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(), [&](Value *V) {
            return !isa<ExtractElementInst>(V) &&
                   (V->getNumUses() > Chain.size() ||
                    any_of(V->users(), [&](User *U) {
                      return !Stores.contains(U);
                    }));
          }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree is tiny and the store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  if (Cost < -SLPCostThreshold) {
    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
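// Worked example for the deviation test above (illustrative numbers only):
// for Sizes = {4, 4, 4, 4} the mean is 4 and the squared-deviation sum is 0,
// so Dev == 0 and the predicate holds; for Sizes = {2, 8} the mean is 5 and
// Dev = ((2-5)^2 + (8-5)^2) / 2 = 9, so Dev * 96 / (Mean * Mean) = 864 / 25
// != 0 in integer arithmetic, and the trees are considered too dissimilar to
// share a single vectorization factor.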
/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using their signed distance to the address of this
/// group's BaseInstr.
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that
  /// \p PtrDist, in which case the existing store's index is returned.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one the group's BaseInstr.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx are dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try and vectorize them
    // again. Their distance needs to be updated.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have been vectorized from this group.
  void clearVectorizedStores(const DenseSet<StoreInst *> &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // All stores before the last vectorized one are no longer candidates.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the Base instruction, i.e. the one with a 0 pointer
  /// distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
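// How the grouping above is used (illustrative sketch, not a verbatim trace):
// for stores to p+0, p+1, and p+3 with base index 0, the DistToInstMap would
// hold {0 -> 0, 1 -> 1, 3 -> 2}. vectorizeStores() below walks such maps and
// only treats runs whose distances increase by exactly 1 as one chain, so
// {p+0, p+1} form a candidate chain while the store to p+3 starts a new run.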
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // ...
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    SmallVector<Value *> Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // Restart the chain from the current store (details elided).
      // ...
      Operands.push_back(Stores[InstIdx]);
      // ...
      if (!Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      // ...
      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueTy->getScalarType()));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < MinVF (" << MinVF << ")\n");
        continue;
      }
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(VF, 2))
        CandidateVFs.push_back(VF);

      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore =
              std::distance(RangeSizes.begin(),
                            find_if(RangeSizes, std::bind(IsNotVectorized,
                                                          VF >= MaxRegVF, _1)));
          unsigned End = Operands.size();
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // Skip the slices known to be non-schedulable at this VF.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
              if (!Res) {
                // Remember the slice as non-schedulable for this VF.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding same
              // trees, just with larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
  // Stores pair (first: index of the store into Stores array ref, address of
  // which taken as base, second: sorted set of pairs {index, dist}, which are
  // indices of stores in the set and their store location distances relative
  // to the base address).
  SmallVector<RelatedStoreInsts> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store in the group with the same distance, try to
    // vectorize the existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
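// The seed collection below is a single linear scan of the block. Only simple
// (non-volatile, non-atomic) stores and single-index, non-constant-index GEPs
// are kept, bucketed by underlying object so that later phases only compare
// pointers that can plausibly be related to each other.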
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      // ...
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      // ...
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  Type *ScalarTy = getValueType(VL[0]);
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VF - 1)) {
    // ...
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough candidates - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
  /// The operation data of the reduction operation.
  ReductionOpsListType ReductionOps;
  // ...
  /// Checks if horizontal reduction operations can be optimized for reused
  /// scalars.
  bool IsSupportedHorRdxIdentityOp = false;
  // ...

  static bool isCmpSelMinMax(Instruction *I) {
    return isa<SelectInst>(I) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;
    // ...
    // No need to check for associativity, if there are only 2 elements.
    if (TwoElementReduction)
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y
    // To make that work with the normal operand processing, we skip the
    // true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::And: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max patterns based on a select whose compare
      // operands only match the select operands up to identical
      // extractelements.
      CmpPredicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))))
          return RecurKind::None;
        if (!L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

private:
  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
public:
  HorizontalReduction() = default;
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    assert(ReductionRoot && "Reduction root is not set!");
    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
                        /*TwoElementReduction=*/true))
      return false;
    ArrayRef<Value *> Ops = ReducedVals.front();
    return Ops.size() == 2;
  }

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // ...
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by their
    // pointer operands.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            // Reuse the pointer of an already seen, consecutive load as the
            // subkey, if possible (details elided).
            // ...
          }
          for (LoadInst *RLI : LIt->second) {
            // ...
          }
          if (LIt->second.size() > 2) {
            // ...
            hash_value(LIt->second.back()->getPointerOperand());
          }
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization.
      for (Value *V : PossibleRedVals) {
        // ... (key/subkey generation via GenerateLoadsSubkey elided)
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (ReducedVals.empty()) {
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          auto *LI = dyn_cast<LoadInst>(Data.front());
          auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
          if (!LI || !LastLI)
            ReducedVals.emplace_back();
          // ...
        }
        ReducedVals.back().append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by number of same/alternate opcode and/or
    // pointer operand.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
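// tryToReduce() below proceeds in rounds: it groups compatible reduced
// values, picks a vector factor via GetVectorFactor, builds and costs a tree
// for each window [Pos, Pos + ReduxWidth), and on success records the vector
// value plus its reuse scale in VectorValuesAndScales; whatever could not be
// vectorized is folded back in scalar form by the FinalGen step at the end.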
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
                     DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce
    // to a nearby power-of-2. We can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit /* && ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    // ...
    // Track the reduced values; they may be replaced by extractelement
    // instructions because of the vectorization.
    DenseMap<Value *, Value *> TrackedVals(ReducedVals.size() *
                                           ReducedVals.front().size());

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });

    // Return new VectorizedTree, based on previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree, AC) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // Do nothing.
          } else if (isGuaranteedNotToBePoison(Res, AC) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };

    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math-flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };
    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus requiring extract if fully vectorized in other trees.
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // Check if the reduction value was not overriden by the
        // extractelement instruction because of the vectorization and exclude
        // it, if it is not compatible with other values.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            (S && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // Check if the reduction value was not overriden by the
          // extractelement instruction because of the vectorization and
          // exclude it, if it is not compatible with other values.
          auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
          if (!Inst)
            continue;
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (Candidates.size() > 1 && allConstant(Candidates)) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Used to check if the reduced values used same number of times. In
      // this case the compiler may produce better code. E.g. if reused 4
      // times the compiler may generate ext x count x 2 + <4 x ...> reduction.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }
      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth =
            getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = TTI.getNumberOfParts(Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
        ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with less number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              return RedValI && V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/
                             VL.front()->getType()->isIntOrIntVectorTy() ||
                             ReductionLimit > 2);
        // Keep extracted other reduction values, if they are used in other
        // reduction operations.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        // The reduction root is used as the insertion point for new
        // instructions, so set it as externally used to prevent it from being
        // deleted.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
        InstructionCost Cost = V.getTreeCost(VL) + ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            unsigned Offset = Pos == Start ? Pos : Pos - 1;
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        // Emit a reduction. If the root is a select (min/max idiom), the
        // insert point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update TrackedToOrig mapping, since the tracked values might be
        // updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Type *ScalarTy = VL.front()->getType();
        // ...
        unsigned Scale = OptReusedScalars && SameScaleFactor
                             ? SameValuesCounter.front().second
                             : 1;
        // Signedness is taken from the minimum-bitwidth analysis of the root
        // node (details elided in this excerpt).
        bool IsSigned = V.isSignedMinBitwidthRootNode();
        VectorValuesAndScales.emplace_back(VectorizedRoot, Scale, IsSigned);

        // Count vectorized reduced values to exclude them from final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problem with poison propagation. If not possible to reorder
      // (both operands are originally RHS), emit an extra freeze instruction
      // for the LHS operand.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                      getRdxOperand(RedOp1, 0) == LHS ||
                                      isGuaranteedNotToBePoison(LHS, AC)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS, AC))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction.
      // Need to add extra arguments and not vectorized possible reduction
      // values.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(
                Sz / 2 + Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Instruction *RedOp = InstVals[I + 1].first;
              Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                                RedOp, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx", ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (ArrayRef<Value *> Candidates : ReducedVals) {
        for (Value *RdxVal : Candidates) {
          if (!Visited.insert(RdxVal).second)
            continue;
          unsigned NumOps = VectorizedVals.lookup(RdxVal);
          for (Instruction *RedOp :
               ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
            ExtraReductions.emplace_back(RedOp, RdxVal);
        }
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining uses
      // outside the reduction tree; replace any leftover uses with poison and
      // delete the ops.
      for (Value *Ignore : IgnoreList) {
#ifndef NDEBUG
        for (auto *U : Ignore->users()) {
          assert(IgnoreList.contains(U) &&
                 "All users must be either in the reduction ops list.");
        }
#endif
        if (!Ignore->use_empty()) {
          Value *P = PoisonValue::get(Ignore->getType());
          Ignore->replaceAllUsesWith(P);
        }
      }
      V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
private:
  /// Creates the reduction from the given vector value \p Vec, scaled by
  /// \p Scale.
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    Value *Rdx;
    if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
      unsigned DestTyNumElements = getNumElements(VecTy);
      // Revectorized case: reduce each lane group separately and reassemble
      // the result vector (lane extraction elided in this excerpt).
      Rdx = PoisonValue::get(DestTy);
      for (unsigned I : seq<unsigned>(DestTyNumElements)) {
        // ...
        Rdx = Builder.CreateInsertElement(
            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
      }
    } else {
      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    }
    if (Rdx->getType() != DestTy)
      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
    // Improved analysis for add/fadd/xor reductions with same scale factor.
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
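  // Cost model note: the function below compares the scalar chain (N - 1
  // scalar ops for N reduced values) against one vector reduction plus any
  // extend/truncate implied by the minimum-bitwidth analysis. For example,
  // reducing eight i8 values that were demoted from i32 is costed as an
  // extended reduction (getExtendedReductionCost) rather than as a plain
  // reduction plus a separate cast.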
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // If the scalar op folds into an FMA, charge the fma cost minus
              // the fmul it absorbs (details elided in this excerpt).
              InstructionCost FMACost = GenCostFn();
              if (auto *I = dyn_cast<Instruction>(RdxVal)) {
                // ...
                InstructionCost FMulCost =
                    TTI->getInstructionCost(I, CostKind);
                FMACost -= FMulCost;
              }
              ScalarCost += FMACost;
              continue;
            }
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // The reduction op itself is only needed if no partial reductions were
    // emitted yet.
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      Type *RedTy = VectorTy->getElementType();
      if (DoesRequireReductionOp) {
        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          // Revectorized (vector-of-vector) reduction: cost each stride
          // separately (shuffle-cost details elided).
          unsigned ScalarTyNumElements = VecTy->getNumElements();
          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
            // ... per-stride shuffle cost with
            //     createStrideMask(I, ScalarTyNumElements, ReducedVals.size())
            VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
                                                          FMF, CostKind);
          }
        } else {
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          if (RType == RedTy) {
            VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                         FMF, CostKind);
          } else {
            VectorCost = TTI->getExtendedReductionCost(
                RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
                FMF, CostKind);
          }
        }
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        auto *RVecTy = getWidenedType(RType, ReduxWidth);
        VectorCost += TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
        if (RdxKind == RecurKind::FAdd) {
          // Check if the reduction operands can fold into an fmuladd; if so,
          // charge the intrinsic minus the fmul it replaces.
          FastMathFlags FMF;
          SmallVector<Value *> Ops;
          for (Value *RdxVal : ReducedVals) {
            // ...
            if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
              FMF &= FPCI->getFastMathFlags();
            // ...
          }
          if (!Ops.empty()) {
            IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                        {RVecTy, RVecTy, RVecTy}, FMF);
            InstructionCost FMACost =
                TTI->getIntrinsicInstrCost(ICA, CostKind);
            InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                Instruction::FMul, RVecTy, CostKind);
            LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost
                              << "\n");
            FMACost -= FMulCost;
            if (FMACost.isValid())
              VectorCost += FMACost;
          }
        }
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          VectorCost += TTI->getCastInstrCost(
              Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RdxKind);
      Type *RedTy = VectorTy->getElementType();
      if (DoesRequireReductionOp) {
        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        auto *RVecTy = getWidenedType(RType, ReduxWidth);
        IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
        VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          VectorCost += TTI->getCastInstrCost(
              Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction\n"
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
  /// Emit the final reduction from the partial reductions collected in
  /// VectorValuesAndScales.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates reduction and returns the resulting value.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // Scales Vec using given Cnt scale factor and then performs vector
    // combine with previous value of VecRes.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // All bit-vector lanes are repeated Cnt times: widen via shuffle
          // instead of multiplying.
          unsigned VF = getNumElements(Vec->getType());
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
          for (unsigned I : seq<unsigned>(Cnt))
            std::iota(std::next(Mask.begin(), VF * I),
                      std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          Vec = Builder.CreateShuffleVector(Vec, Mask);
        } else if (Cnt > 1) {
          // res = mul vv, n
          Value *Scale = ConstantVector::getSplat(
              cast<FixedVectorType>(Vec->getType())->getElementCount(),
              ConstantInt::get(ScalarTy, Cnt));
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateMul(Vec, Scale);
        }
        break;
      }
      case RecurKind::Xor: {
        // res = n % 2 ? 0 : vv
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        if (Cnt % 2 == 0)
          Vec = Constant::getNullValue(Vec->getType());
        break;
      }
      case RecurKind::FAdd: {
        if (Cnt > 1) {
          // res = fmul v, n
          Value *Scale = ConstantVector::getSplat(
              cast<FixedVectorType>(Vec->getType())->getElementCount(),
              ConstantFP::get(ScalarTy, Cnt));
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateFMul(Vec, Scale);
        }
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // res = vv
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
      }
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      ++NumVectorInstructions;
      // If the operands have different vector factors (possible for the i1
      // case above), the narrower operand is widened with an identity shuffle
      // mask before the combine; the resizing details are elided in this
      // excerpt.
      unsigned VecResVF = getNumElements(VecRes->getType());
      unsigned VecVF = getNumElements(Vec->getType());
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          VecResVF != VecVF) {
        SmallVector<int> Mask(std::max(VecResVF, VecVF), PoisonMaskElem);
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
      }
      if (VecResVF != VecVF) {
        SmallVector<int> ResizeMask(std::max(VecResVF, VecVF), PoisonMaskElem);
        std::iota(ResizeMask.begin(),
                  std::next(ResizeMask.begin(), std::min(VecResVF, VecVF)), 0);
        // ...
      }
      Value *Op = VecRes;
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      VecRes = Op;
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);

    return ReducedSubTree;
  }
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
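  // Example of the i1 special case above (illustrative only): reducing an
  // <8 x i1> with RecurKind::Add into an i32 becomes
  //   %int = bitcast <8 x i1> %v to i8
  //   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
  // followed by a zext/trunc to the destination type, which is cheaper than a
  // generic vector_reduce_add on most targets.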
  /// Emits optimized code for unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: SMax/SMin/UMax/UMin/FMax/FMin of same "
                        << VectorizedValue << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                         R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
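// Example of what findBuildAggregate() matches (illustrative IR only):
//
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0, float %b, i32 1
//   %v2 = insertelement <4 x float> %v1, float %c, i32 2
//   %v3 = insertelement <4 x float> %v2, float %d, i32 3
//
// Walking from %v3 through the chained first operands fills
// BuildVectorOpds = {%a, %b, %c, %d}, which is then handed to
// tryToVectorizeList() as a buildvector seed.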
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // TODO: Currently, we only handle cases where the PHI feeds a reduction in
  // its own block or the loop latch block.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \p Returns the first operand of \p I that does not match \p Phi. If
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
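// The PHI helpers above feed the reduction driver below: a two-incoming-value
// PHI in a loop is assumed to be the accumulator, the in-block (or loop-latch)
// incoming value is taken as the reduction root candidate, and
// vectorizeHorReduction() then explores that root with a bounded-depth
// traversal of its operands.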
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands.
    // Continue analysis for the instruction from the same basic block only to
    // save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  if (I->getOpcode() == Instruction::FAdd ||
      I->getOpcode() == Instruction::FSub) {
    // Skip candidates that are likely to fold into an FMA (details of this
    // check are elided in this excerpt).
    // ...
  }

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [&](Instruction *Inst, ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of the 2-element reduction against the plain scalar
    // operation before committing (cost computation elided in this excerpt).
    FastMathFlags FMF;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
      FMF = FPCI->getFastMathFlags();
    // ...
    if (RedCost >= ScalarCost)
      return false;
    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
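// tryToVectorizeSequence() below is shared by the PHI and compare paths: the
// caller supplies a strict-weak-order Comparator for the initial stable_sort,
// an AreCompatible predicate that decides whether the next element may join
// the current run, and a TryToVectorizeHelper callback that performs the
// actual vectorization attempt on each maximal compatible run.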
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps are compatible (have equal predicate, successors or their
/// inverted/swapped variants), otherwise returns "less" relation.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands()) {
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
    }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
    if (VL.empty() || VL.back() == V1)
      return true;
    return compareCmp<true>(V1, VL.back(), *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
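// vectorizeChainsInBlock() below orders PHIs so that compatible ones become
// adjacent. As a sketch (illustrative IR only), for
//   %phi1 = phi i32 [ %add1, %bb ], ...
//   %phi2 = phi i32 [ %add2, %bb ], ...
// both PHIs map to the non-PHI operand lists {%add1} and {%add2}; PHICompare
// sorts by type, bit width, operand-list length, and then per-operand
// properties, and AreCompatiblePHIs accepts the pair because the operands
// share a parent block and an opcode.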
  bool Changed = false;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
           "Expected vectorizable types only.");
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
    if (Opcodes1.size() < Opcodes2.size())
    if (Opcodes1.size() > Opcodes2.size())
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
      auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        return NodeI2 != nullptr;
      assert((NodeI1 == NodeI2) ==
                 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeI1 != NodeI2)
      if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
        const auto *E1 = dyn_cast<ExtractElementInst>(I1);
        const auto *E2 = dyn_cast<ExtractElementInst>(I2);
        const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
        const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
        DT->getNode(V1->getParent());
        DT->getNode(V2->getParent());
          return NodeI2 != nullptr;
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
        return V1->comesBefore(V2);
        return *Id1 < *Id2;
      if (I1->getOpcode() == I2->getOpcode())
      return I1->getOpcode() < I2->getOpcode();
      bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
      bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
      if (ValID1 < ValID2)
      if (ValID1 > ValID2)
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    if (VL.empty() || V1 == VL.back())
    Value *V2 = VL.back();
    if (Opcodes1.size() != Opcodes2.size())
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
          if (I1->getParent() != I2->getParent())
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
  bool HaveVectorizedPhiNodes = false;
      auto *P = dyn_cast<PHINode>(&I);
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
      if (!Opcodes.empty())
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
      PHIToOpcodes.clear();
  } while (HaveVectorizedPhiNodes);
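// The do/while above is a fixed-point driver: PHIs are re-collected and
// re-vectorized until a whole sweep makes no further change, after which the
// visited/opcode caches are cleared. The generic shape, as a sketch with a
// hypothetical per-sweep callback:
#include <functional>

bool runToFixedPoint(const std::function<bool()> &Sweep) {
  bool Changed = false;
  bool LocalChanged;
  do {
    LocalChanged = Sweep(); // one pass over the block's PHIs
    Changed |= LocalChanged;
  } while (LocalChanged);   // stop once a full pass is a no-op
  return Changed;
}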
  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    PostProcessInserts.clear();
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
    if (isa<ScalableVectorType>(It->getType()))
    if (R.isDeleted(&*It))
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(It->isTerminator())) {
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      if (P->getNumIncomingValues() == 2) {
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        if (BB == P->getIncomingBlock(I) ||
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
            vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          if (Res && R.isDeleted(P)) {
    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                            SI->getValueOperand()->hasOneUse();
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
      VectorizeInsertsAndCmps(It->isTerminator());
    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
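// The scan above defers inserts and compares into side sets
// (PostProcessInserts / PostProcessCmps) and only vectorizes them at flush
// points such as block terminators, so whole chains are visible at once.
// A stand-alone sketch of that bookkeeping with stand-in types:
#include <cstddef>
#include <unordered_set>
#include <vector>

enum class Kind { Insert, Cmp, Other };
struct Instr { Kind K; };

void scanBlock(const std::vector<Instr *> &Block) {
  std::unordered_set<Instr *> PendingInserts, PendingCmps;
  auto Flush = [&](bool FlushCmps) {
    // ...vectorize PendingInserts, and PendingCmps when FlushCmps...
    PendingInserts.clear();
    if (FlushCmps)
      PendingCmps.clear();
  };
  for (std::size_t I = 0; I < Block.size(); ++I) {
    if (Block[I]->K == Kind::Insert)
      PendingInserts.insert(Block[I]); // park for later
    else if (Block[I]->K == Kind::Cmp)
      PendingCmps.insert(Block[I]);
    if (I + 1 == Block.size())         // terminator: flush everything
      Flush(/*FlushCmps=*/true);
  }
}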
  auto Changed = false;
  for (auto &Entry : GEPs) {
    if (Entry.second.size() < 2)
    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");
      return !R.isDeleted(GEP);
    if (It == Entry.second.end())
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
      if (Candidates.size() < 2)
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      Changed |= tryToVectorizeList(Bundle, R);
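// The inner pruning above drops any pair of GEPs whose index SCEVs differ by
// a compile-time constant: the pointer difference folds away, so there is no
// index arithmetic left to vectorize. A sketch where a struct stands in for
// a SCEV ("symbolic base + constant offset"); all names are illustrative:
#include <cstddef>
#include <set>
#include <vector>

struct Idx { int SymBase; int Off; }; // stand-in for a SCEV expression

// Mirrors isa<SCEVConstant>(SE->getMinusSCEV(A, B)) for this toy model:
// the difference is constant exactly when the symbolic bases coincide.
static bool constantDiff(const Idx &A, const Idx &B) {
  return A.SymBase == B.SymBase;
}

void pruneConstantDiffs(const std::vector<Idx *> &List,
                        std::set<Idx *> &Candidates) {
  for (std::size_t I = 0; I < List.size() && Candidates.size() > 1; ++I) {
    if (!Candidates.count(List[I]))
      continue;
    for (std::size_t J = I + 1; J < List.size() && Candidates.size() > 1; ++J)
      if (constantDiff(*List[I], *List[J])) {
        Candidates.erase(List[I]); // nothing to widen for either index
        Candidates.erase(List[J]);
      }
  }
}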
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
    if (V->getValueOperand()->getType()->getTypeID() <
    if (V->getValueOperand()->getType()->getTypeID() >
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
        return I1->getOpcode() < I2->getOpcode();
    return V->getValueOperand()->getValueID() <
    bool SameParent = true;
    if (isa<UndefValue>(V1->getValueOperand()) ||
    if (isa<Constant>(V1->getValueOperand()) &&
    auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    for (auto [SI, V] : zip(VL, NewVL))
      V = SI->getValueOperand();
    NewVL.back() = V1->getValueOperand();
    InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
    InstructionsState S = Analysis.buildInstructionsState(
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
        << Pair.second.size() << ".\n");
        Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
          return vectorizeStores(Candidates, R, Attempted);
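// Store chains are attempted in reverse collection order (bottom-up), again
// through the sort-then-group sequence driver sketched earlier. A final
// sketch of that per-chain entry point, with a hypothetical Store type and
// callback (not the LLVM classes):
#include <algorithm>
#include <vector>

struct Store { long Addr; };

bool vectorizeChain(std::vector<Store *> Chain,
                    bool (*TryStores)(const std::vector<Store *> &)) {
  if (Chain.size() < 2)
    return false;                            // nothing to widen
  std::reverse(Chain.begin(), Chain.end());  // bottom-up order
  return TryStores(Chain);
}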
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
A private abstract base class describing the concept of an individual alias analysis implementation.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
ArrayRef< T > take_back(size_t N=1) const
Return a copy of *this with only the last N elements.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
LLVM_ABI CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
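A short sketch contrasting the two "small" set flavors above; the values and the demo function are hypothetical:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include <cassert>

// Hypothetical demo: SmallPtrSet for pointer keys, SmallSet for small
// value sets; both avoid heap allocation while under their inline size.
void smallSetsDemo() {
  int X = 0, Y = 0;
  llvm::SmallPtrSet<int *, 4> Ptrs;  // inline storage for 4 pointers
  assert(Ptrs.insert(&X).second);    // first insertion succeeds
  assert(!Ptrs.insert(&X).second);   // duplicate is rejected
  assert(Ptrs.count(&X) == 1 && !Ptrs.contains(&Y));
  Ptrs.erase(&X);

  llvm::SmallSet<int, 8> Vals;       // linear scan while small
  assert(Vals.insert(42).second);
  assert(Vals.contains(42) && Vals.count(7) == 0);
}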
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc.).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
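A brief sketch of how these StoreInst accessors are typically used; storedValueType is a hypothetical helper, not an API from this file:

#include "llvm/IR/Instructions.h"

// Hypothetical helper: a store itself has void type, so analyses key
// stores by the type of the value operand instead.
llvm::Type *storedValueType(llvm::StoreInst *SI) {
  llvm::Value *Stored = SI->getValueOperand();  // the value written
  llvm::Value *Addr = SI->getPointerOperand();  // the address written to
  (void)Addr;                                   // unused in this sketch
  return Stored->getType();
}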
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
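A small sketch combining the scalar-type queries above; withWidenedElements is a hypothetical helper:

#include "llvm/IR/DerivedTypes.h"

// Hypothetical helper: for <4 x i16>, getScalarType() yields i16 and
// getWithNewType(i32) yields <4 x i32>; scalars pass through unchanged.
llvm::Type *withWidenedElements(llvm::Type *Ty, llvm::Type *NewEltTy) {
  if (Ty->isVectorTy())
    return Ty->getWithNewType(NewEltTy);  // keep the element count
  return NewEltTy;                        // scalar: just the new type
}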
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
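A minimal sketch of VectorType construction; makeV4I32 is a hypothetical helper:

#include "llvm/IR/DerivedTypes.h"

// Hypothetical helper building a fixed-width <4 x i32> type;
// ElementCount::getFixed would become getScalable for scalable vectors.
llvm::VectorType *makeV4I32(llvm::LLVMContext &Ctx) {
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  return llvm::VectorType::get(I32, llvm::ElementCount::getFixed(4));
}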
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness, if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
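Taken together, the BoUpSLP methods above form a driver sequence. The sketch below mirrors the usual order of calls but compresses details; since BoUpSLP is local to SLPVectorizer.cpp, this only illustrates in-file usage, and the exact sequence and the negative-cost threshold are approximations, not the pass's literal code:

// Hedged sketch of the typical call order on a BoUpSLP instance R;
// error paths, scheduling limits, and remark emission are omitted.
void sketchVectorizeBundle(BoUpSLP &R, llvm::ArrayRef<llvm::Value *> Roots,
                           const llvm::SmallDenseSet<llvm::Value *> &Ignore) {
  R.buildTree(Roots, Ignore);                // grow the SLP graph
  if (R.isTreeTinyAndNotFullyVectorizable())
    return;                                  // too small to pay off
  R.reorderTopToBottom();                    // profitability-driven
  R.reorderBottomToTop();                    //   reordering passes
  R.transformNodes();                        // target-specific tweaks
  R.buildExternalUses();                     // record extract points
  R.computeMinimumValueSizes();              // min-bitwidth analysis
  llvm::InstructionCost Cost = R.getTreeCost();
  if (Cost.isValid() && Cost < 0)            // negative cost = gain
    R.vectorizeTree();                       // emit the vector code
}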
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
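A compact sketch of the PatternMatch combinators listed above; the matcher function and the pattern are hypothetical:

#include "llvm/IR/PatternMatch.h"

// Hypothetical matcher: recognize (X + Y) * Z where the add has a single
// use; on success the operands are bound to the captured references.
bool matchMulOfSingleUseAdd(llvm::Value *V, llvm::Value *&X,
                            llvm::Value *&Y, llvm::Value *&Z) {
  using namespace llvm::PatternMatch;
  return match(V, m_Mul(m_OneUse(m_Add(m_Value(X), m_Value(Y))),
                        m_Value(Z)));
}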
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, ...), where A is the 0-based index of the item and the rest are the corresponding values from the input ranges.
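A minimal sketch of the range helpers scattered through this list; the container, lambdas, and demo function are illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

// Hypothetical demo: predicates, enumerate, and sorting, all without
// spelling out begin/end iterator pairs.
void rangeHelpersDemo() {
  llvm::SmallVector<int, 4> A = {3, 1, 2};
  assert(llvm::any_of(A, [](int X) { return X > 2; }));
  assert(llvm::none_of(A, [](int X) { return X > 3; }));
  for (auto E : llvm::enumerate(A))  // (index, value) pairs
    assert(E.value() == A[E.index()]);
  llvm::sort(A.begin(), A.end());
  assert(llvm::is_sorted(A));
}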
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
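A quick sketch of the power-of-two helpers that recur throughout VF and cost computations; the demo function is hypothetical:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

// Hypothetical demo of rounding to powers of two.
void powerOfTwoDemo() {
  assert(llvm::bit_floor(5u) == 4u);   // round down to a power of two
  assert(llvm::bit_ceil(5u) == 8u);    // round up to a power of two
  assert(llvm::PowerOf2Ceil(5) == 8);  // 64-bit variant of bit_ceil
  assert(llvm::has_single_bit(8u));    // exact power-of-two test
}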
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
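For orientation: inversePermutation (file-local) builds Mask such that Mask[Indices[I]] = I, so an order {2, 0, 1} yields the mask {1, 2, 0}. The sketch below shows the related createStrideMask helper; the demo function is hypothetical:

#include "llvm/Analysis/VectorUtils.h"
#include <cassert>

// Hypothetical demo: a stride-2 mask over 4 lanes starting at lane 0
// selects elements {0, 2, 4, 6}, the classic de-interleaving pattern.
void strideMaskDemo() {
  llvm::SmallVector<int, 16> Mask =
      llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  assert(Mask.size() == 4 && Mask[0] == 0 && Mask[3] == 6);
}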
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
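A hedged sketch of the consecutive-access check this function enables; areConsecutivePointers is a hypothetical helper, and StrictCheck=true is one plausible setting, not the only one:

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include <optional>

// Hypothetical helper: two same-typed pointers are consecutive when
// their distance, measured in elements of EltTy, is exactly one.
bool areConsecutivePointers(llvm::Type *EltTy, llvm::Value *PtrA,
                            llvm::Value *PtrB, const llvm::DataLayout &DL,
                            llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      EltTy, PtrA, EltTy, PtrB, DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}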
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that is going to be vectorized.
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
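A one-liner sketch of the hashing idiom behind DenseMapInfo::getHashValue specializations such as the EdgeInfo one later in this listing; hashPair is a hypothetical helper:

#include "llvm/ADT/Hashing.h"

// Hypothetical helper: fold several fields into one hash_code.
llvm::hash_code hashPair(const void *Ptr, unsigned Idx) {
  return llvm::hash_combine(Ptr, Idx);
}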
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts, or llvm.threadlocal.address intrinsic uses from the specified value, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type T.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; the incoming register Reg and incoming block Block are taken from the machine instruction.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair or std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair or std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.