1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// is found, the SLP vectorizer performs vectorization on the tree.
13//
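// For example (illustrative only, not taken from the source tree), four
// adjacent scalar stores such as
//
//   C[0] = A[0] * B[0];
//   C[1] = A[1] * B[1];
//   C[2] = A[2] * B[2];
//   C[3] = A[3] * B[3];
//
// become the root of a tree that follows the use-def chains through the
// multiplies and loads; if the cost model approves and A, B and C point to
// doubles, the whole group is replaced by a single <4 x double>
// load/multiply/store sequence.
//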
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107using namespace std::placeholders;
108
109#define SV_NAME "slp-vectorizer"
110#define DEBUG_TYPE "SLP"
111
112STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
113
114DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
115 "Controls which SLP graphs should be vectorized.");
116
117static cl::opt<bool>
118 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
119 cl::desc("Run the SLP vectorization passes"));
120
121static cl::opt<bool>
122 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
123 cl::desc("Enable vectorization for wider vector utilization"));
124
125static cl::opt<int>
126    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
127                     cl::desc("Only vectorize if you gain more than this "
128                              "number "));
129
131 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
132 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
134
135static cl::opt<bool>
136ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
137 cl::desc("Attempt to vectorize horizontal reductions"));
138
140 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
141 cl::desc(
142 "Attempt to vectorize horizontal reductions feeding into a store"));
143
144static cl::opt<int>
145MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
146    cl::desc("Attempt to vectorize for this register size in bits"));
147
150 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
151
152/// Limits the size of scheduling regions in a block.
153/// It avoids long compile times for _very_ large blocks where vector
154/// instructions are spread over a wide range.
155/// This limit is way higher than needed by real-world functions.
156static cl::opt<int>
157ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
158 cl::desc("Limit the size of the SLP scheduling region per block"));
159
161 "slp-min-reg-size", cl::init(128), cl::Hidden,
162 cl::desc("Attempt to vectorize for this register size in bits"));
163
165 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
166 cl::desc("Limit the recursion depth when building a vectorizable tree"));
167
169 "slp-min-tree-size", cl::init(3), cl::Hidden,
170 cl::desc("Only vectorize small trees if they are fully vectorizable"));
171
172// The maximum depth that the look-ahead score heuristic will explore.
173// The higher this value, the higher the compilation time overhead.
175 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for operand reordering scores"));
177
178// The maximum depth that the look-ahead score heuristic will explore
179// when it is probing among candidates for vectorization tree roots.
180// The higher this value, the higher the compilation time overhead, but unlike
181// the similar limit for operand ordering this is used less frequently, so the
182// impact of a higher value is less noticeable.
184 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
185 cl::desc("The maximum look-ahead depth for searching best rooting option"));
186
188 "slp-min-strided-loads", cl::init(2), cl::Hidden,
189 cl::desc("The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
191
193 "slp-max-stride", cl::init(8), cl::Hidden,
194 cl::desc("The maximum stride, considered to be profitable."));
195
196static cl::opt<bool>
197 ViewSLPTree("view-slp-tree", cl::Hidden,
198 cl::desc("Display the SLP trees with Graphviz"));
199
201 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
202 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
203
204// Limit the number of alias checks. The limit is chosen so that
205// it has no negative effect on the llvm benchmarks.
206static const unsigned AliasedCheckLimit = 10;
207
208// Limit of the number of uses for potentially transformed instructions/values,
209// used in checks to avoid compile-time explosion.
210static constexpr int UsesLimit = 64;
211
212// Another limit for the alias checks: The maximum distance between load/store
213// instructions where alias checks are done.
214// This limit is useful for very large basic blocks.
215static const unsigned MaxMemDepDistance = 160;
216
217/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
218/// regions to be handled.
219static const int MinScheduleRegionSize = 16;
220
221/// Maximum allowed number of operands in the PHI nodes.
222static const unsigned MaxPHINumOperands = 128;
223
224/// Predicate for the element types that the SLP vectorizer supports.
225///
226/// The most important things to filter here are types which are invalid in LLVM
227/// vectors. We also filter target specific types which have absolutely no
228/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
229/// avoids spending time checking the cost model and realizing that they will
230/// be inevitably scalarized.
231static bool isValidElementType(Type *Ty) {
232 // TODO: Support ScalableVectorType.
233 if (SLPReVec && isa<FixedVectorType>(Ty))
234 Ty = Ty->getScalarType();
235 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
236 !Ty->isPPC_FP128Ty();
237}
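// Illustrative sketch (hypothetical helper, not used by the pass): the
// verdicts isValidElementType is expected to give for a few common types.
[[maybe_unused]] static void exampleValidElementTypes(LLVMContext &Ctx) {
  assert(isValidElementType(Type::getInt32Ty(Ctx)) && "i32 is vectorizable");
  assert(isValidElementType(Type::getFloatTy(Ctx)) && "float is vectorizable");
  // Target-specific FP types are filtered out up front instead of being
  // rejected later by the cost model.
  assert(!isValidElementType(Type::getX86_FP80Ty(Ctx)) &&
         "x86_fp80 has no meaningful vectorization path");
}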
238
239/// Returns the type of the given value/instruction \p V. If it is a store,
240/// returns the type of its value operand; for Cmp - the type of the compare
241/// operands; and for insertelement - the type of the inserted operand.
242/// Otherwise, just the type of the value is returned.
243static Type *getValueType(Value *V) {
244  if (auto *SI = dyn_cast<StoreInst>(V))
245 return SI->getValueOperand()->getType();
246 if (auto *CI = dyn_cast<CmpInst>(V))
247 return CI->getOperand(0)->getType();
248 if (auto *IE = dyn_cast<InsertElementInst>(V))
249 return IE->getOperand(1)->getType();
250 return V->getType();
251}
252
253/// \returns the number of elements for Ty.
254static unsigned getNumElements(Type *Ty) {
255 assert(!isa<ScalableVectorType>(Ty) &&
256 "ScalableVectorType is not supported.");
257 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
258 return VecTy->getNumElements();
259 return 1;
260}
261
262/// \returns the vector type of ScalarTy based on vectorization factor.
263static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
264 return FixedVectorType::get(ScalarTy->getScalarType(),
265 VF * getNumElements(ScalarTy));
266}
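// Illustrative sketch (hypothetical helper, not used by the pass): under REVEC
// the "scalar" type may itself be a fixed vector, so widening multiplies the
// element counts instead of nesting vectors.
[[maybe_unused]] static void exampleGetWidenedType(LLVMContext &Ctx) {
  // Plain scalar: i32 with VF = 4 widens to <4 x i32>.
  [[maybe_unused]] FixedVectorType *WideInt =
      getWidenedType(Type::getInt32Ty(Ctx), 4);
  assert(WideInt->getNumElements() == 4 && "expected <4 x i32>");
  // REVEC "scalar": <2 x float> with VF = 4 widens to <8 x float>.
  [[maybe_unused]] FixedVectorType *WideFP =
      getWidenedType(FixedVectorType::get(Type::getFloatTy(Ctx), 2), 4);
  assert(WideFP->getNumElements() == 8 &&
         WideFP->getElementType()->isFloatTy() && "expected <8 x float>");
}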
267
268/// Returns the number of elements of the given type \p Ty, not less than \p
269/// Sz, which forms a type that \p TTI splits into whole vector types during
270/// legalization.
271static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
272                                              Type *Ty, unsigned Sz) {
273 if (!isValidElementType(Ty))
274 return bit_ceil(Sz);
275 // Find the number of elements, which forms full vectors.
276 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
277 if (NumParts == 0 || NumParts >= Sz)
278 return bit_ceil(Sz);
279 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
280}
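// Worked example (assuming a target where one 128-bit register holds
// <4 x i32>, so TTI reports 2 parts for a 6-element i32 vector): Sz = 6 is
// rounded up to bit_ceil(divideCeil(6, 2)) * 2 = 4 * 2 = 8, i.e. two whole
// <4 x i32> registers rather than an awkward 6-element type.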
281
282/// Returns the number of elements of the given type \p Ty, not greater than \p
283/// Sz, which forms a type that \p TTI splits into whole vector types during
284/// legalization.
285static unsigned
286getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
287                                   unsigned Sz) {
288 if (!isValidElementType(Ty))
289 return bit_floor(Sz);
290 // Find the number of elements, which forms full vectors.
291 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
292 if (NumParts == 0 || NumParts >= Sz)
293 return bit_floor(Sz);
294 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
295 if (RegVF > Sz)
296 return bit_floor(Sz);
297 return (Sz / RegVF) * RegVF;
298}
299
300static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
301 SmallVectorImpl<int> &Mask) {
302  // The ShuffleBuilder implementation uses shufflevector to splat an "element".
303  // But the element has a different meaning for SLP (scalar) and REVEC
304 // (vector). We need to expand Mask into masks which shufflevector can use
305 // directly.
306 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
307 for (unsigned I : seq<unsigned>(Mask.size()))
308 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
309 I * VecTyNumElements, VecTyNumElements)))
310 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
311 : Mask[I] * VecTyNumElements + J;
312 Mask.swap(NewMask);
313}
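// Worked example: with VecTyNumElements = 2, the scalar mask {1, 0} expands to
// the per-element mask {2, 3, 0, 1}; a PoisonMaskElem scalar entry expands to
// a run of PoisonMaskElem entries.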
314
315/// \returns the number of groups of shufflevectors.
316/// A group has the following features:
317/// 1. All of the values in a group are shufflevectors.
318/// 2. The mask of each shufflevector is an extract-subvector mask
///    (isExtractSubvectorMask).
319/// 3. Together, the masks of a group use all of the elements of the source.
320/// e.g., it is 1 group (%0)
321/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
322/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
323/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
324/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
325/// it is 2 groups (%3 and %4)
326/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
327/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
328/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
329/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
330/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
331/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
332/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
333/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
334/// it is 0 group
335/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
336/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
337/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
340 if (VL.empty())
341 return 0;
342 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
343 return 0;
344 auto *SV = cast<ShuffleVectorInst>(VL.front());
345 unsigned SVNumElements =
346 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
347 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
348 if (SVNumElements % ShuffleMaskSize != 0)
349 return 0;
350 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
351 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
352 return 0;
353 unsigned NumGroup = 0;
354 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
355 auto *SV = cast<ShuffleVectorInst>(VL[I]);
356 Value *Src = SV->getOperand(0);
357 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
358 SmallBitVector ExpectedIndex(GroupSize);
359 if (!all_of(Group, [&](Value *V) {
360 auto *SV = cast<ShuffleVectorInst>(V);
361 // From the same source.
362 if (SV->getOperand(0) != Src)
363 return false;
364 int Index;
365 if (!SV->isExtractSubvectorMask(Index))
366 return false;
367 ExpectedIndex.set(Index / ShuffleMaskSize);
368 return true;
369 }))
370 return 0;
371 if (!ExpectedIndex.all())
372 return 0;
373 ++NumGroup;
374 }
375 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
376 return NumGroup;
377}
378
379/// \returns a shufflevector mask which is used to vectorize shufflevectors
380/// e.g.,
381/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
382/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
383/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
384/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
385/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
386/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
387/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
388/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
389/// the result is
390/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
392 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
393 auto *SV = cast<ShuffleVectorInst>(VL.front());
394 unsigned SVNumElements =
395 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
396 SmallVector<int> Mask;
397 unsigned AccumulateLength = 0;
398 for (Value *V : VL) {
399 auto *SV = cast<ShuffleVectorInst>(V);
400 for (int M : SV->getShuffleMask())
401 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
402 : AccumulateLength + M);
403 AccumulateLength += SVNumElements;
404 }
405 return Mask;
406}
407
408/// \returns True if the value is a constant (but not globals/constant
409/// expressions).
410static bool isConstant(Value *V) {
411 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
412}
413
414/// Checks if \p V is one of vector-like instructions, i.e. undef,
415/// insertelement/extractelement with constant indices for fixed vector type or
416/// extractvalue instruction.
418 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
419 !isa<ExtractValueInst, UndefValue>(V))
420 return false;
421 auto *I = dyn_cast<Instruction>(V);
422 if (!I || isa<ExtractValueInst>(I))
423 return true;
424 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
425 return false;
426 if (isa<ExtractElementInst>(I))
427 return isConstant(I->getOperand(1));
428 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
429 return isConstant(I->getOperand(2));
430}
431
432/// Returns power-of-2 number of elements in a single register (part), given the
433/// total number of elements \p Size and number of registers (parts) \p
434/// NumParts.
435static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
436 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
437}
438
439/// Returns correct remaining number of elements, considering total amount \p
440/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
441/// and current register (part) \p Part.
442static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
443 unsigned Part) {
444 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
445}
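// Worked example: for Size = 6 and NumParts = 2, getPartNumElems returns
// min(6, bit_ceil(divideCeil(6, 2))) = 4, so part 0 holds
// getNumElems(6, 4, 0) = 4 elements and part 1 holds the remaining
// getNumElems(6, 4, 1) = 2 elements.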
446
447#if !defined(NDEBUG)
448/// Print a short descriptor of the instruction bundle suitable for debug output.
449static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
450 std::string Result;
451 raw_string_ostream OS(Result);
452 if (Idx >= 0)
453 OS << "Idx: " << Idx << ", ";
454 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
455 return Result;
456}
457#endif
458
459/// \returns true if all of the instructions in \p VL are in the same block or
460/// false otherwise.
462 auto *It = find_if(VL, IsaPred<Instruction>);
463 if (It == VL.end())
464 return false;
465 Instruction *I0 = cast<Instruction>(*It);
466  if (all_of(VL, isVectorLikeInstWithConstOps))
467    return true;
468
469 BasicBlock *BB = I0->getParent();
470 for (Value *V : iterator_range(It, VL.end())) {
471 if (isa<PoisonValue>(V))
472 continue;
473 auto *II = dyn_cast<Instruction>(V);
474 if (!II)
475 return false;
476
477 if (BB != II->getParent())
478 return false;
479 }
480 return true;
481}
482
483/// \returns True if all of the values in \p VL are constants (but not
484/// globals/constant expressions).
486 // Constant expressions and globals can't be vectorized like normal integer/FP
487 // constants.
488 return all_of(VL, isConstant);
489}
490
491/// \returns True if all of the values in \p VL are identical or some of them
492/// are UndefValue.
493static bool isSplat(ArrayRef<Value *> VL) {
494 Value *FirstNonUndef = nullptr;
495 for (Value *V : VL) {
496 if (isa<UndefValue>(V))
497 continue;
498 if (!FirstNonUndef) {
499 FirstNonUndef = V;
500 continue;
501 }
502 if (V != FirstNonUndef)
503 return false;
504 }
505 return FirstNonUndef != nullptr;
506}
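// Illustrative example: {%a, undef, %a, %a} is a splat of %a, {%a, %b, %a, %a}
// is not, and an all-undef list is not a splat either because there is no
// defined value to broadcast.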
507
508/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
509static bool isCommutative(Instruction *I) {
510  if (auto *Cmp = dyn_cast<CmpInst>(I))
511 return Cmp->isCommutative();
512 if (auto *BO = dyn_cast<BinaryOperator>(I))
513 return BO->isCommutative() ||
514 (BO->getOpcode() == Instruction::Sub &&
515 !BO->hasNUsesOrMore(UsesLimit) &&
516 all_of(
517 BO->uses(),
518 [](const Use &U) {
519 // Commutative, if icmp eq/ne sub, 0
520 CmpPredicate Pred;
521 if (match(U.getUser(),
522 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
523 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
524 return true;
525 // Commutative, if abs(sub nsw, true) or abs(sub, false).
526 ConstantInt *Flag;
527 return match(U.getUser(),
528 m_Intrinsic<Intrinsic::abs>(
529 m_Specific(U.get()), m_ConstantInt(Flag))) &&
530 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
531 Flag->isOne());
532 })) ||
533 (BO->getOpcode() == Instruction::FSub &&
534 !BO->hasNUsesOrMore(UsesLimit) &&
535 all_of(BO->uses(), [](const Use &U) {
536 return match(U.getUser(),
537 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
538 }));
539 return I->isCommutative();
540}
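// Illustrative example: a sub is treated as commutative here when all of its
// users are insensitive to the operand order, e.g.
//   %d = sub i32 %x, %y
//   %c = icmp eq i32 %d, 0
// since swapping %x and %y cannot change %c.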
541
542template <typename T>
543static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
544 unsigned Offset) {
545 static_assert(std::is_same_v<T, InsertElementInst> ||
546 std::is_same_v<T, ExtractElementInst>,
547 "unsupported T");
548 int Index = Offset;
549 if (const auto *IE = dyn_cast<T>(Inst)) {
550 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
551 if (!VT)
552 return std::nullopt;
553 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
554 if (!CI)
555 return std::nullopt;
556 if (CI->getValue().uge(VT->getNumElements()))
557 return std::nullopt;
558 Index *= VT->getNumElements();
559 Index += CI->getZExtValue();
560 return Index;
561 }
562 return std::nullopt;
563}
564
565/// \returns inserting or extracting index of InsertElement, ExtractElement or
566/// InsertValue instruction, using Offset as base offset for index.
567/// \returns std::nullopt if the index is not an immediate.
568static std::optional<unsigned> getElementIndex(const Value *Inst,
569 unsigned Offset = 0) {
570 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
571 return Index;
572 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
573 return Index;
574
575 int Index = Offset;
576
577 const auto *IV = dyn_cast<InsertValueInst>(Inst);
578 if (!IV)
579 return std::nullopt;
580
581 Type *CurrentType = IV->getType();
582 for (unsigned I : IV->indices()) {
583 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
584 Index *= ST->getNumElements();
585 CurrentType = ST->getElementType(I);
586 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
587 Index *= AT->getNumElements();
588 CurrentType = AT->getElementType();
589 } else {
590 return std::nullopt;
591 }
592 Index += I;
593 }
594 return Index;
595}
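// Worked example: for
//   %r = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the aggregate is flattened row-major: the struct index gives 0 * 2 + 1 = 1,
// the array index refines that to 1 * 2 + 0 = 2, so the returned index is 2.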
596
597namespace {
598/// Specifies the way the mask should be analyzed for undefs/poisonous elements
599/// in the shuffle mask.
600enum class UseMask {
601 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
602 ///< check for the mask elements for the first argument (mask
603 ///< indices are in range [0:VF)).
604 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
605 ///< for the mask elements for the second argument (mask indices
606 ///< are in range [VF:2*VF))
607 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
608 ///< future shuffle elements and mark them as ones as being used
609 ///< in future. Non-undef elements are considered as unused since
610 ///< they're already marked as used in the mask.
611};
612} // namespace
613
614/// Prepares a use bitset for the given mask either for the first argument or
615/// for the second.
616static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
617                                   UseMask MaskArg) {
618 SmallBitVector UseMask(VF, true);
619 for (auto [Idx, Value] : enumerate(Mask)) {
620 if (Value == PoisonMaskElem) {
621 if (MaskArg == UseMask::UndefsAsMask)
622 UseMask.reset(Idx);
623 continue;
624 }
625 if (MaskArg == UseMask::FirstArg && Value < VF)
626 UseMask.reset(Value);
627 else if (MaskArg == UseMask::SecondArg && Value >= VF)
628 UseMask.reset(Value - VF);
629 }
630 return UseMask;
631}
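// Worked example: for VF = 4 and Mask = {0, 5, PoisonMaskElem, 1},
// UseMask::FirstArg clears bits 0 and 1 (those lanes of the first operand are
// consumed by the mask), leaving {0, 0, 1, 1}; UseMask::SecondArg clears only
// bit 5 - VF = 1, leaving {1, 0, 1, 1}.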
632
633/// Checks if the given value is actually an undefined constant vector.
634/// Also, if the \p UseMask is not empty, tries to check if the non-masked
635/// elements actually mask the insertelement buildvector, if any.
636template <bool IsPoisonOnly = false>
638 const SmallBitVector &UseMask = {}) {
639 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
640 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
641 if (isa<T>(V))
642 return Res;
643 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
644 if (!VecTy)
645 return Res.reset();
646 auto *C = dyn_cast<Constant>(V);
647 if (!C) {
648 if (!UseMask.empty()) {
649 const Value *Base = V;
650 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
651 Base = II->getOperand(0);
652 if (isa<T>(II->getOperand(1)))
653 continue;
654 std::optional<unsigned> Idx = getElementIndex(II);
655 if (!Idx) {
656 Res.reset();
657 return Res;
658 }
659 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
660 Res.reset(*Idx);
661 }
662 // TODO: Add analysis for shuffles here too.
663 if (V == Base) {
664 Res.reset();
665 } else {
666 SmallBitVector SubMask(UseMask.size(), false);
667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
668 }
669 } else {
670 Res.reset();
671 }
672 return Res;
673 }
674 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
675 if (Constant *Elem = C->getAggregateElement(I))
676 if (!isa<T>(Elem) &&
677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
678 Res.reset(I);
679 }
680 return Res;
681}
682
683/// Checks if the vector of instructions can be represented as a shuffle, like:
684/// %x0 = extractelement <4 x i8> %x, i32 0
685/// %x3 = extractelement <4 x i8> %x, i32 3
686/// %y1 = extractelement <4 x i8> %y, i32 1
687/// %y2 = extractelement <4 x i8> %y, i32 2
688/// %x0x0 = mul i8 %x0, %x0
689/// %x3x3 = mul i8 %x3, %x3
690/// %y1y1 = mul i8 %y1, %y1
691/// %y2y2 = mul i8 %y2, %y2
692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
696/// ret <4 x i8> %ins4
697/// can be transformed into:
698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
699/// i32 6>
700/// %2 = mul <4 x i8> %1, %1
701/// ret <4 x i8> %2
702/// Mask will return the Shuffle Mask equivalent to the extracted elements.
703/// TODO: Can we split off and reuse the shuffle mask detection from
704/// ShuffleVectorInst/getShuffleCost?
705static std::optional<TargetTransformInfo::ShuffleKind>
707 AssumptionCache *AC) {
708 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
709 if (It == VL.end())
710 return std::nullopt;
711 unsigned Size =
712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
714 if (!EI)
715 return S;
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
717 if (!VTy)
718 return S;
719 return std::max(S, VTy->getNumElements());
720 });
721
722 Value *Vec1 = nullptr;
723 Value *Vec2 = nullptr;
724 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
725 auto *EE = dyn_cast<ExtractElementInst>(V);
726 if (!EE)
727 return false;
728 Value *Vec = EE->getVectorOperand();
729 if (isa<UndefValue>(Vec))
730 return false;
731 return isGuaranteedNotToBePoison(Vec, AC);
732 });
733 enum ShuffleMode { Unknown, Select, Permute };
734 ShuffleMode CommonShuffleMode = Unknown;
735 Mask.assign(VL.size(), PoisonMaskElem);
736 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
737 // Undef can be represented as an undef element in a vector.
738 if (isa<UndefValue>(VL[I]))
739 continue;
740 auto *EI = cast<ExtractElementInst>(VL[I]);
741 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742 return std::nullopt;
743 auto *Vec = EI->getVectorOperand();
744 // We can extractelement from undef or poison vector.
745 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
746 continue;
747 // All vector operands must have the same number of vector elements.
748 if (isa<UndefValue>(Vec)) {
749 Mask[I] = I;
750 } else {
751 if (isa<UndefValue>(EI->getIndexOperand()))
752 continue;
753 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
754 if (!Idx)
755 return std::nullopt;
756 // Undefined behavior if Idx is negative or >= Size.
757 if (Idx->getValue().uge(Size))
758 continue;
759 unsigned IntIdx = Idx->getValue().getZExtValue();
760 Mask[I] = IntIdx;
761 }
762 if (isUndefVector(Vec).all() && HasNonUndefVec)
763 continue;
764 // For correct shuffling we have to have at most 2 different vector operands
765 // in all extractelement instructions.
766 if (!Vec1 || Vec1 == Vec) {
767 Vec1 = Vec;
768 } else if (!Vec2 || Vec2 == Vec) {
769 Vec2 = Vec;
770 Mask[I] += Size;
771 } else {
772 return std::nullopt;
773 }
774 if (CommonShuffleMode == Permute)
775 continue;
776 // If the extract index is not the same as the operation number, it is a
777 // permutation.
778 if (Mask[I] % Size != I) {
779 CommonShuffleMode = Permute;
780 continue;
781 }
782 CommonShuffleMode = Select;
783 }
784 // If we're not crossing lanes in different vectors, consider it as blending.
785  if (CommonShuffleMode == Select && Vec2)
786    return TargetTransformInfo::SK_Select;
787  // If Vec2 was never used, we have a permutation of a single vector, otherwise
788  // we have permutation of 2 vectors.
789  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
790              : TargetTransformInfo::SK_PermuteSingleSrc;
791}
792
793/// \returns True if Extract{Value,Element} instruction extracts element Idx.
794static std::optional<unsigned> getExtractIndex(Instruction *E) {
795 unsigned Opcode = E->getOpcode();
796 assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798 "Expected extractelement or extractvalue instruction.");
799 if (Opcode == Instruction::ExtractElement) {
800 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
801 if (!CI)
802 return std::nullopt;
803 return CI->getZExtValue();
804 }
805 auto *EI = cast<ExtractValueInst>(E);
806 if (EI->getNumIndices() != 1)
807 return std::nullopt;
808 return *EI->idx_begin();
809}
810
811namespace {
812
813/// Main data required for vectorization of instructions.
814class InstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0.
816 Instruction *MainOp = nullptr;
817 Instruction *AltOp = nullptr;
818
819public:
820 Instruction *getMainOp() const {
821 assert(valid() && "InstructionsState is invalid.");
822 return MainOp;
823 }
824
825 Instruction *getAltOp() const {
826 assert(valid() && "InstructionsState is invalid.");
827 return AltOp;
828 }
829
830 /// The main/alternate opcodes for the list of instructions.
831 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
832
833 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
834
835 /// Some of the instructions in the list have alternate opcodes.
836 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
837
838 bool isOpcodeOrAlt(Instruction *I) const {
839 unsigned CheckedOpcode = I->getOpcode();
840 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
841 }
842
843 /// Checks if the current state is valid, i.e. has non-null MainOp
844 bool valid() const { return MainOp && AltOp; }
845
846 explicit operator bool() const { return valid(); }
847
848 InstructionsState() = delete;
849 InstructionsState(Instruction *MainOp, Instruction *AltOp)
850 : MainOp(MainOp), AltOp(AltOp) {}
851 static InstructionsState invalid() { return {nullptr, nullptr}; }
852};
853
854} // end anonymous namespace
855
856/// \returns true if \p Opcode is allowed as part of the main/alternate
857/// instruction for SLP vectorization.
858///
859/// Example of unsupported opcode is SDIV that can potentially cause UB if the
860/// "shuffled out" lane would result in division by zero.
861static bool isValidForAlternation(unsigned Opcode) {
862 if (Instruction::isIntDivRem(Opcode))
863 return false;
864
865 return true;
866}
867
868static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
869 const TargetLibraryInfo &TLI);
870
871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
872/// compatible instructions or constants, or just some other regular values.
873static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
874 Value *Op1, const TargetLibraryInfo &TLI) {
875 return (isConstant(BaseOp0) && isConstant(Op0)) ||
876 (isConstant(BaseOp1) && isConstant(Op1)) ||
877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
880 getSameOpcode({BaseOp0, Op0}, TLI) ||
881 getSameOpcode({BaseOp1, Op1}, TLI);
882}
883
884/// \returns true if a compare instruction \p CI has similar "look" and
885/// same predicate as \p BaseCI, "as is" or with its operands and predicate
886/// swapped, false otherwise.
887static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
888 const TargetLibraryInfo &TLI) {
889 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
890 "Assessing comparisons of different types?");
891 CmpInst::Predicate BasePred = BaseCI->getPredicate();
892 CmpInst::Predicate Pred = CI->getPredicate();
894
895 Value *BaseOp0 = BaseCI->getOperand(0);
896 Value *BaseOp1 = BaseCI->getOperand(1);
897 Value *Op0 = CI->getOperand(0);
898 Value *Op1 = CI->getOperand(1);
899
900 return (BasePred == Pred &&
901 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
902 (BasePred == SwappedPred &&
903 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
904}
905
906/// \returns analysis of the Instructions in \p VL described in
907/// InstructionsState, the Opcode that we suppose the whole list
908/// could be vectorized even if its structure is diverse.
909static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
910 const TargetLibraryInfo &TLI) {
911 // Make sure these are all Instructions.
912 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
913 return InstructionsState::invalid();
914
915 auto *It = find_if(VL, IsaPred<Instruction>);
916 if (It == VL.end())
917 return InstructionsState::invalid();
918
919 Instruction *MainOp = cast<Instruction>(*It);
920 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
921 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
922 (VL.size() == 2 && InstCnt < 2))
923 return InstructionsState::invalid();
924
925 bool IsCastOp = isa<CastInst>(MainOp);
926 bool IsBinOp = isa<BinaryOperator>(MainOp);
927 bool IsCmpOp = isa<CmpInst>(MainOp);
928 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
930 Instruction *AltOp = MainOp;
931 unsigned Opcode = MainOp->getOpcode();
932 unsigned AltOpcode = Opcode;
933
934 bool SwappedPredsCompatible = IsCmpOp && [&]() {
935 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
936 UniquePreds.insert(BasePred);
937 UniqueNonSwappedPreds.insert(BasePred);
938 for (Value *V : VL) {
939 auto *I = dyn_cast<CmpInst>(V);
940 if (!I)
941 return false;
942 CmpInst::Predicate CurrentPred = I->getPredicate();
943 CmpInst::Predicate SwappedCurrentPred =
944 CmpInst::getSwappedPredicate(CurrentPred);
945 UniqueNonSwappedPreds.insert(CurrentPred);
946 if (!UniquePreds.contains(CurrentPred) &&
947 !UniquePreds.contains(SwappedCurrentPred))
948 UniquePreds.insert(CurrentPred);
949 }
950 // Total number of predicates > 2, but if consider swapped predicates
951 // compatible only 2, consider swappable predicates as compatible opcodes,
952 // not alternate.
953 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
954 }();
955 // Check for one alternate opcode from another BinaryOperator.
956 // TODO - generalize to support all operators (types, calls etc.).
957 Intrinsic::ID BaseID = 0;
958 SmallVector<VFInfo> BaseMappings;
959 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
961 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
962 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
963 return InstructionsState::invalid();
964 }
965 bool AnyPoison = InstCnt != VL.size();
966 // Check MainOp too to be sure that it matches the requirements for the
967 // instructions.
968 for (Value *V : iterator_range(It, VL.end())) {
969 auto *I = dyn_cast<Instruction>(V);
970 if (!I)
971 continue;
972
973 // Cannot combine poison and divisions.
974 // TODO: do some smart analysis of the CallInsts to exclude divide-like
975 // intrinsics/functions only.
976 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
977 return InstructionsState::invalid();
978 unsigned InstOpcode = I->getOpcode();
979 if (IsBinOp && isa<BinaryOperator>(I)) {
980 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
981 continue;
982 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
983 isValidForAlternation(Opcode)) {
984 AltOpcode = InstOpcode;
985 AltOp = I;
986 continue;
987 }
988 } else if (IsCastOp && isa<CastInst>(I)) {
989 Value *Op0 = MainOp->getOperand(0);
990 Type *Ty0 = Op0->getType();
991 Value *Op1 = I->getOperand(0);
992 Type *Ty1 = Op1->getType();
993 if (Ty0 == Ty1) {
994 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
995 continue;
996 if (Opcode == AltOpcode) {
998 isValidForAlternation(InstOpcode) &&
999 "Cast isn't safe for alternation, logic needs to be updated!");
1000 AltOpcode = InstOpcode;
1001 AltOp = I;
1002 continue;
1003 }
1004 }
1005 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1006 auto *BaseInst = cast<CmpInst>(MainOp);
1007 Type *Ty0 = BaseInst->getOperand(0)->getType();
1008 Type *Ty1 = Inst->getOperand(0)->getType();
1009 if (Ty0 == Ty1) {
1010 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1011 assert(InstOpcode == AltOpcode &&
1012 "Alternate instructions are only supported by BinaryOperator "
1013 "and CastInst.");
1014 // Check for compatible operands. If the corresponding operands are not
1015 // compatible - need to perform alternate vectorization.
1016 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1017 CmpInst::Predicate SwappedCurrentPred =
1018 CmpInst::getSwappedPredicate(CurrentPred);
1019
1020 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1021 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1022 continue;
1023
1024 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1025 continue;
1026 auto *AltInst = cast<CmpInst>(AltOp);
1027 if (MainOp != AltOp) {
1028 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1029 continue;
1030 } else if (BasePred != CurrentPred) {
1031 assert(
1032 isValidForAlternation(InstOpcode) &&
1033 "CmpInst isn't safe for alternation, logic needs to be updated!");
1034 AltOp = I;
1035 continue;
1036 }
1037 CmpInst::Predicate AltPred = AltInst->getPredicate();
1038 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1039 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1040 continue;
1041 }
1042 } else if (InstOpcode == Opcode) {
1043 assert(InstOpcode == AltOpcode &&
1044 "Alternate instructions are only supported by BinaryOperator and "
1045 "CastInst.");
1046 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1047 if (Gep->getNumOperands() != 2 ||
1048 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1049 return InstructionsState::invalid();
1050 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1052 return InstructionsState::invalid();
1053 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1054 auto *BaseLI = cast<LoadInst>(MainOp);
1055 if (!LI->isSimple() || !BaseLI->isSimple())
1056 return InstructionsState::invalid();
1057 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1058 auto *CallBase = cast<CallInst>(MainOp);
1059 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1060 return InstructionsState::invalid();
1061 if (Call->hasOperandBundles() &&
1063 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1064 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1065 CallBase->op_begin() +
1067 return InstructionsState::invalid();
1069 if (ID != BaseID)
1070 return InstructionsState::invalid();
1071 if (!ID) {
1072 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1073 if (Mappings.size() != BaseMappings.size() ||
1074 Mappings.front().ISA != BaseMappings.front().ISA ||
1075 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1076 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1077 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1078 Mappings.front().Shape.Parameters !=
1079 BaseMappings.front().Shape.Parameters)
1080 return InstructionsState::invalid();
1081 }
1082 }
1083 continue;
1084 }
1085 return InstructionsState::invalid();
1086 }
1087
1088 return InstructionsState(MainOp, AltOp);
1089}
1090
1091/// \returns true if all of the values in \p VL have the same type or false
1092/// otherwise.
1094 Type *Ty = VL.front()->getType();
1095 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1096}
1097
1098/// \returns True if an in-tree use also needs an extract. This refers to a
1099/// possible scalar operand in a vectorized instruction.
1100static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1101 TargetLibraryInfo *TLI,
1102 const TargetTransformInfo *TTI) {
1103 if (!UserInst)
1104 return false;
1105 unsigned Opcode = UserInst->getOpcode();
1106 switch (Opcode) {
1107 case Instruction::Load: {
1108 LoadInst *LI = cast<LoadInst>(UserInst);
1109 return (LI->getPointerOperand() == Scalar);
1110 }
1111 case Instruction::Store: {
1112 StoreInst *SI = cast<StoreInst>(UserInst);
1113 return (SI->getPointerOperand() == Scalar);
1114 }
1115 case Instruction::Call: {
1116 CallInst *CI = cast<CallInst>(UserInst);
1118 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1119 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1120 Arg.value().get() == Scalar;
1121 });
1122 }
1123 default:
1124 return false;
1125 }
1126}
1127
1128/// \returns the AA location that is being accessed by the instruction.
1130 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1131 return MemoryLocation::get(SI);
1132 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1133 return MemoryLocation::get(LI);
1134 return MemoryLocation();
1135}
1136
1137/// \returns True if the instruction is not a volatile or atomic load/store.
1138static bool isSimple(Instruction *I) {
1139 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1140 return LI->isSimple();
1141 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1142 return SI->isSimple();
1143 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1144 return !MI->isVolatile();
1145 return true;
1146}
1147
1148/// Shuffles \p Mask in accordance with the given \p SubMask.
1149/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1150/// one but two input vectors.
1151static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1152 bool ExtendingManyInputs = false) {
1153 if (SubMask.empty())
1154 return;
1155 assert(
1156 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1157 // Check if input scalars were extended to match the size of other node.
1158 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1159 "SubMask with many inputs support must be larger than the mask.");
1160 if (Mask.empty()) {
1161 Mask.append(SubMask.begin(), SubMask.end());
1162 return;
1163 }
1164 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1165 int TermValue = std::min(Mask.size(), SubMask.size());
1166 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1167 if (SubMask[I] == PoisonMaskElem ||
1168 (!ExtendingManyInputs &&
1169 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1170 continue;
1171 NewMask[I] = Mask[SubMask[I]];
1172 }
1173 Mask.swap(NewMask);
1174}
1175
1176/// Order may have elements assigned the special value (size) which is out of
1177/// bounds. Such indices only appear in places which correspond to undef values
1178/// (see canReuseExtract for details) and are used to prevent undef values from
1179/// affecting the ordering of the operands.
1180/// The first loop below simply finds all unused indices and then the next loop
1181/// nest assigns these indices for undef values positions.
1182/// As an example below Order has two undef positions and they have assigned
1183/// values 3 and 7 respectively:
1184/// before: 6 9 5 4 9 2 1 0
1185/// after: 6 3 5 4 7 2 1 0
1187 const unsigned Sz = Order.size();
1188 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1189 SmallBitVector MaskedIndices(Sz);
1190 for (unsigned I = 0; I < Sz; ++I) {
1191 if (Order[I] < Sz)
1192 UnusedIndices.reset(Order[I]);
1193 else
1194 MaskedIndices.set(I);
1195 }
1196 if (MaskedIndices.none())
1197 return;
1198 assert(UnusedIndices.count() == MaskedIndices.count() &&
1199 "Non-synced masked/available indices.");
1200 int Idx = UnusedIndices.find_first();
1201 int MIdx = MaskedIndices.find_first();
1202 while (MIdx >= 0) {
1203 assert(Idx >= 0 && "Indices must be synced.");
1204 Order[MIdx] = Idx;
1205 Idx = UnusedIndices.find_next(Idx);
1206 MIdx = MaskedIndices.find_next(MIdx);
1207 }
1208}
1209
1210/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1211/// Opcode1.
1213 unsigned Opcode1) {
1214 Type *ScalarTy = VL[0]->getType();
1215 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1216 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1217 for (unsigned Lane : seq<unsigned>(VL.size())) {
1218 if (isa<PoisonValue>(VL[Lane]))
1219 continue;
1220 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1221 OpcodeMask.set(Lane * ScalarTyNumElements,
1222 Lane * ScalarTyNumElements + ScalarTyNumElements);
1223 }
1224 return OpcodeMask;
1225}
1226
1227namespace llvm {
1228
1229static void inversePermutation(ArrayRef<unsigned> Indices,
1230                               SmallVectorImpl<int> &Mask) {
1231 Mask.clear();
1232 const unsigned E = Indices.size();
1233 Mask.resize(E, PoisonMaskElem);
1234 for (unsigned I = 0; I < E; ++I)
1235 Mask[Indices[I]] = I;
1236}
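// Illustrative sketch (hypothetical helper, not used by the pass): the mask
// produced by inversePermutation routes element I into slot Indices[I].
[[maybe_unused]] static void exampleInversePermutation() {
  SmallVector<unsigned> Order = {2, 0, 1};
  SmallVector<int> Mask;
  inversePermutation(Order, Mask);
  assert(Mask[0] == 1 && Mask[1] == 2 && Mask[2] == 0 &&
         "inverse of {2, 0, 1} is {1, 2, 0}");
}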
1237
1238/// Reorders the list of scalars in accordance with the given \p Mask.
1239static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1240                           ArrayRef<int> Mask) {
1241 assert(!Mask.empty() && "Expected non-empty mask.");
1242 SmallVector<Value *> Prev(Scalars.size(),
1243 PoisonValue::get(Scalars.front()->getType()));
1244 Prev.swap(Scalars);
1245 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1246 if (Mask[I] != PoisonMaskElem)
1247 Scalars[Mask[I]] = Prev[I];
1248}
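// Worked example: reordering Scalars = {a, b, c} with Mask = {2, 0, 1} writes
// each old element Prev[I] into slot Mask[I], producing {b, c, a}; elements
// whose mask entry is PoisonMaskElem are dropped, and untargeted slots stay
// poison.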
1249
1250/// Checks if the provided value does not require scheduling. It does not
1251/// require scheduling if this is not an instruction or it is an instruction
1252/// that does not read/write memory and all operands are either not instructions
1253/// or phi nodes or instructions from different blocks.
1255 auto *I = dyn_cast<Instruction>(V);
1256 if (!I)
1257 return true;
1258 return !mayHaveNonDefUseDependency(*I) &&
1259 all_of(I->operands(), [I](Value *V) {
1260 auto *IO = dyn_cast<Instruction>(V);
1261 if (!IO)
1262 return true;
1263 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1264 });
1265}
1266
1267/// Checks if the provided value does not require scheduling. It does not
1268/// require scheduling if this is not an instruction or it is an instruction
1269/// that does not read/write memory and all users are phi nodes or instructions
1270/// from the different blocks.
1271static bool isUsedOutsideBlock(Value *V) {
1272 auto *I = dyn_cast<Instruction>(V);
1273 if (!I)
1274 return true;
1275 // Limits the number of uses to save compile time.
1276 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1277 all_of(I->users(), [I](User *U) {
1278 auto *IU = dyn_cast<Instruction>(U);
1279 if (!IU)
1280 return true;
1281 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1282 });
1283}
1284
1285/// Checks if the specified value does not require scheduling. It does not
1286/// require scheduling if all operands and all users do not need to be scheduled
1287/// in the current basic block.
1290}
1291
1292/// Checks if the specified array of instructions does not require scheduling.
1293/// This is the case if, for every instruction, either its operands do not
1294/// require scheduling or its users do not require scheduling because they are
1295/// PHIs or live in other basic blocks.
1297 return !VL.empty() &&
1299}
1300
1301/// Returns true if the widened type of \p Ty elements with size \p Sz represents
1302/// a full vector type, i.e. adding an extra element results in extra parts upon
1303/// type legalization.
1305 unsigned Sz) {
1306 if (Sz <= 1)
1307 return false;
1308 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1309 return false;
1310 if (has_single_bit(Sz))
1311 return true;
1312 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1313 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1314 Sz % NumParts == 0;
1315}
1316
1317/// Returns the number of parts that the type \p VecTy will be split into at the
1318/// codegen phase. If the type is going to be scalarized or does not use whole
1319/// registers, returns 1.
1320static unsigned
1322 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1323 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1324 if (NumParts == 0 || NumParts >= Limit)
1325 return 1;
1326 unsigned Sz = getNumElements(VecTy);
1327 if (NumParts >= Sz || Sz % NumParts != 0 ||
1328 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1329 return 1;
1330 return NumParts;
1331}
1332
1333namespace slpvectorizer {
1334
1335/// Bottom Up SLP Vectorizer.
1336class BoUpSLP {
1337 struct TreeEntry;
1338 struct ScheduleData;
1341
1342public:
1343 /// Tracks the state we can represent the loads in the given sequence.
1344 enum class LoadsState {
1345 Gather,
1346 Vectorize,
1349 };
1350
1357
1359 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1362 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1363 AC(AC), DB(DB), DL(DL), ORE(ORE),
1364 Builder(Se->getContext(), TargetFolder(*DL)) {
1365 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1366 // Use the vector register size specified by the target unless overridden
1367 // by a command-line option.
1368 // TODO: It would be better to limit the vectorization factor based on
1369 // data type rather than just register size. For example, x86 AVX has
1370 // 256-bit registers, but it does not support integer operations
1371 // at that width (that requires AVX2).
1372 if (MaxVectorRegSizeOption.getNumOccurrences())
1373 MaxVecRegSize = MaxVectorRegSizeOption;
1374 else
1375 MaxVecRegSize =
1377 .getFixedValue();
1378
1379 if (MinVectorRegSizeOption.getNumOccurrences())
1380 MinVecRegSize = MinVectorRegSizeOption;
1381 else
1382 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1383 }
1384
1385 /// Vectorize the tree that starts with the elements in \p VL.
1386 /// Returns the vectorized root.
1388
1389 /// Vectorize the tree but with the list of externally used values \p
1390  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1391  /// generated extractvalue instructions.
1392 Value *
1393 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1394 Instruction *ReductionRoot = nullptr);
1395
1396 /// \returns the cost incurred by unwanted spills and fills, caused by
1397 /// holding live values over call sites.
1399
1400 /// \returns the vectorization cost of the subtree that starts at \p VL.
1401 /// A negative number means that this is profitable.
1402 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1403
1404 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1405 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1406 void buildTree(ArrayRef<Value *> Roots,
1407 const SmallDenseSet<Value *> &UserIgnoreLst);
1408
1409 /// Construct a vectorizable tree that starts at \p Roots.
1410 void buildTree(ArrayRef<Value *> Roots);
1411
1412 /// Returns whether the root node has in-tree uses.
1414 return !VectorizableTree.empty() &&
1415 !VectorizableTree.front()->UserTreeIndices.empty();
1416 }
1417
1418 /// Return the scalars of the root node.
1420 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1421 return VectorizableTree.front()->Scalars;
1422 }
1423
1424 /// Returns the type/is-signed info for the root node in the graph without
1425 /// casting.
1426 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1427 const TreeEntry &Root = *VectorizableTree.front().get();
1428 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1429 !Root.Scalars.front()->getType()->isIntegerTy())
1430 return std::nullopt;
1431 auto It = MinBWs.find(&Root);
1432 if (It != MinBWs.end())
1433 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1434 It->second.first),
1435 It->second.second);
1436 if (Root.getOpcode() == Instruction::ZExt ||
1437 Root.getOpcode() == Instruction::SExt)
1438 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1439 Root.getOpcode() == Instruction::SExt);
1440 return std::nullopt;
1441 }
1442
1443 /// Checks if the root graph node can be emitted with narrower bitwidth at
1444  /// codegen and returns its signedness, if so.
1446 return MinBWs.at(VectorizableTree.front().get()).second;
1447 }
1448
1449  /// Returns the reduction type after minbitwidth analysis.
1451 if (ReductionBitWidth == 0 ||
1452 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1453 ReductionBitWidth >=
1454 DL->getTypeSizeInBits(
1455 VectorizableTree.front()->Scalars.front()->getType()))
1456 return getWidenedType(
1457 VectorizableTree.front()->Scalars.front()->getType(),
1458 VectorizableTree.front()->getVectorFactor());
1459 return getWidenedType(
1461 VectorizableTree.front()->Scalars.front()->getContext(),
1462 ReductionBitWidth),
1463 VectorizableTree.front()->getVectorFactor());
1464 }
1465
1466 /// Builds external uses of the vectorized scalars, i.e. the list of
1467 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1468 /// ExternallyUsedValues contains additional list of external uses to handle
1469 /// vectorization of reductions.
1470 void
1471 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1472
1473 /// Transforms graph nodes to target specific representations, if profitable.
1474 void transformNodes();
1475
1476 /// Clear the internal data structures that are created by 'buildTree'.
1477 void deleteTree() {
1478 VectorizableTree.clear();
1479 ScalarToTreeEntries.clear();
1480 MustGather.clear();
1481 NonScheduledFirst.clear();
1482 EntryToLastInstruction.clear();
1483 LoadEntriesToVectorize.clear();
1484 IsGraphTransformMode = false;
1485 GatheredLoadsEntriesFirst.reset();
1486 ExternalUses.clear();
1487 ExternalUsesAsOriginalScalar.clear();
1488 for (auto &Iter : BlocksSchedules) {
1489 BlockScheduling *BS = Iter.second.get();
1490 BS->clear();
1491 }
1492 MinBWs.clear();
1493 ReductionBitWidth = 0;
1494 BaseGraphSize = 1;
1495 CastMaxMinBWSizes.reset();
1496 ExtraBitWidthNodes.clear();
1497 InstrElementSize.clear();
1498 UserIgnoreList = nullptr;
1499 PostponedGathers.clear();
1500 ValueToGatherNodes.clear();
1501 }
1502
1503 unsigned getTreeSize() const { return VectorizableTree.size(); }
1504
1505 /// Returns the base graph size, before any transformations.
1506 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1507
1508 /// Perform LICM and CSE on the newly generated gather sequences.
1510
1511 /// Does this non-empty order represent an identity order? Identity
1512 /// should be represented as an empty order, so this is used to
1513 /// decide if we can canonicalize a computed order. Undef elements
1514 /// (represented as size) are ignored.
1516 assert(!Order.empty() && "expected non-empty order");
1517 const unsigned Sz = Order.size();
1518 return all_of(enumerate(Order), [&](const auto &P) {
1519 return P.value() == P.index() || P.value() == Sz;
1520 });
1521 }
1522
1523 /// Checks if the specified gather tree entry \p TE can be represented as a
1524 /// shuffled vector entry + (possibly) permutation with other gathers. It
1525 /// implements the checks only for possibly ordered scalars (Loads,
1526 /// ExtractElement, ExtractValue), which can be part of the graph.
1527 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1528
1529 /// Sort loads into increasing pointers offsets to allow greater clustering.
1530 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1531
1532 /// Gets reordering data for the given tree entry. If the entry is vectorized
1533 /// - just return ReorderIndices, otherwise check if the scalars can be
1534 /// reordered and return the most optimal order.
1535 /// \return std::nullopt if ordering is not important, empty order, if
1536 /// identity order is important, or the actual order.
1537 /// \param TopToBottom If true, include the order of vectorized stores and
1538 /// insertelement nodes, otherwise skip them.
1539 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1540 bool TopToBottom);
1541
1542 /// Reorders the current graph to the most profitable order starting from the
1543 /// root node to the leaf nodes. The best order is chosen only from the nodes
1544 /// of the same size (vectorization factor). Smaller nodes are considered
1545  /// parts of a subgraph with a smaller VF and they are reordered independently.
1546  /// We can do this because we still need to extend smaller nodes to the wider VF
1547  /// and we can merge the reordering shuffles with the widening shuffles.
1548 void reorderTopToBottom();
1549
1550 /// Reorders the current graph to the most profitable order starting from
1551  /// leaves to the root. It allows rotating small subgraphs and reducing the
1552  /// number of reshuffles if the leaf nodes use the same order. In this case we
1553  /// can merge the orders and just shuffle the user node instead of shuffling its
1554  /// operands. Plus, even if the leaf nodes have different orders, it allows
1555  /// sinking the reordering in the graph closer to the root node and merging it
1556  /// later during analysis.
1557 void reorderBottomToTop(bool IgnoreReorder = false);
1558
1559 /// \return The vector element size in bits to use when vectorizing the
1560 /// expression tree ending at \p V. If V is a store, the size is the width of
1561 /// the stored value. Otherwise, the size is the width of the largest loaded
1562 /// value reaching V. This method is used by the vectorizer to calculate
1563 /// vectorization factors.
1564 unsigned getVectorElementSize(Value *V);
1565
1566 /// Compute the minimum type sizes required to represent the entries in a
1567 /// vectorizable tree.
1569
1570 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1571 unsigned getMaxVecRegSize() const {
1572 return MaxVecRegSize;
1573 }
1574
1575 // \returns minimum vector register size as set by cl::opt.
1576 unsigned getMinVecRegSize() const {
1577 return MinVecRegSize;
1578 }
1579
1580 unsigned getMinVF(unsigned Sz) const {
1581 return std::max(2U, getMinVecRegSize() / Sz);
1582 }
1583
1584 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1585 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1586 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1587 return MaxVF ? MaxVF : UINT_MAX;
1588 }
1589
1590 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1591 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1592 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1593 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1594 ///
1595 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1596 unsigned canMapToVector(Type *T) const;
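// For illustration: a homogeneous aggregate such as {[4 x i16], [4 x i16]}
// maps to <8 x i16> and canMapToVector() would report 8 elements, while a
// mixed aggregate such as {i16, i32} is not isomorphic to any VectorType and
// yields 0.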
1597
1598 /// \returns True if the VectorizableTree is both tiny and not fully
1599 /// vectorizable. We do not vectorize such trees.
1600 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1601
1602 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1603 /// It may happen if all gather nodes are loads that cannot be
1604 /// "clusterized". In this case even subgraphs cannot be vectorized more
1605 /// effectively than the base graph.
1606 bool isTreeNotExtendable() const;
1607
1608 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1609 /// can be load combined in the backend. Load combining may not be allowed in
1610 /// the IR optimizer, so we do not want to alter the pattern. For example,
1611 /// partially transforming a scalar bswap() pattern into vector code is
1612 /// effectively impossible for the backend to undo.
1613 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1614 /// may not be necessary.
1615 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1616
1617 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1618 /// can be load combined in the backend. Load combining may not be allowed in
1619 /// the IR optimizer, so we do not want to alter the pattern. For example,
1620 /// partially transforming a scalar bswap() pattern into vector code is
1621 /// effectively impossible for the backend to undo.
1622 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1623 /// may not be necessary.
1624 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1625
1626 /// Checks if the given array of loads can be represented as a vectorized
1627 /// load, a (masked) scatter/gather, or just a simple gather.
1628 /// \param VL list of loads.
1629 /// \param VL0 main load value.
1630 /// \param Order returned order of load instructions.
1631 /// \param PointerOps returned list of pointer operands.
1632 /// \param BestVF returns the best vectorization factor, if the recursive
1633 /// check found better vectorization sequences than a masked gather.
1634 /// \param TryRecursiveCheck used to check if a long masked gather can be
1635 /// represented as a series of loads/insert-subvector sequences, if profitable.
1636 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1637 SmallVectorImpl<unsigned> &Order,
1638 SmallVectorImpl<Value *> &PointerOps,
1639 unsigned *BestVF = nullptr,
1640 bool TryRecursiveCheck = true) const;
1641
1642 /// Registers non-vectorizable sequence of loads
1643 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1644 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1645 }
1646
1647 /// Checks if the given sequence of loads is known to be non-vectorizable.
1648 template <typename T>
1649 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
1650 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1651 }
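// Typical pairing (sketch, not a prescription): when load analysis decides
// that a chunk of loads can only be gathered, the caller records it via
// registerNonVectorizableLoads(VL) so that later attempts over the same
// sequence can bail out early via areKnownNonVectorizableLoads(VL).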
1652
1654
1655 /// This structure holds any data we need about the edges being traversed
1656 /// during buildTree_rec(). We keep track of:
1657 /// (i) the user TreeEntry index, and
1658 /// (ii) the index of the edge.
1659 struct EdgeInfo {
1660 EdgeInfo() = default;
1661 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1662 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1663 /// The user TreeEntry.
1664 TreeEntry *UserTE = nullptr;
1665 /// The operand index of the use.
1666 unsigned EdgeIdx = UINT_MAX;
1667#ifndef NDEBUG
1668 friend inline raw_ostream &operator<<(raw_ostream &OS,
1669 const BoUpSLP::EdgeInfo &EI) {
1670 EI.dump(OS);
1671 return OS;
1672 }
1673 /// Debug print.
1674 void dump(raw_ostream &OS) const {
1675 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1676 << " EdgeIdx:" << EdgeIdx << "}";
1677 }
1678 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1679#endif
1680 bool operator == (const EdgeInfo &Other) const {
1681 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1682 }
1683 };
1684
1685 /// A helper class used for scoring candidates for two consecutive lanes.
1686 class LookAheadHeuristics {
1687 const TargetLibraryInfo &TLI;
1688 const DataLayout &DL;
1689 ScalarEvolution &SE;
1690 const BoUpSLP &R;
1691 int NumLanes; // Total number of lanes (aka vectorization factor).
1692 int MaxLevel; // The maximum recursion depth for accumulating score.
1693
1694 public:
1695 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1696 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1697 int MaxLevel)
1698 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1699 MaxLevel(MaxLevel) {}
1700
1701 // The hard-coded scores listed here are not very important, though better
1702 // matches should get higher scores to improve the resulting cost. When
1703 // computing the scores of matching one sub-tree with another, we are
1704 // basically counting the number of values that are matching. So even if all
1705 // scores are set to 1, we would still get a decent matching result.
1706 // However, sometimes we have to break ties. For example we may have to
1707 // choose between matching loads vs matching opcodes. This is what these
1708 // scores are helping us with: they provide the order of preference. Also,
1709 // this is important if the scalar is externally used or used in another
1710 // tree entry node in a different lane.
1711
1712 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1713 static const int ScoreConsecutiveLoads = 4;
1714 /// The same load multiple times. This should have a better score than
1715 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1716 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
1717 /// for a vector load plus 1.0 for a broadcast.
1718 static const int ScoreSplatLoads = 3;
1719 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1720 static const int ScoreReversedLoads = 3;
1721 /// A load candidate for masked gather.
1722 static const int ScoreMaskedGatherCandidate = 1;
1723 /// ExtractElementInst from same vector and consecutive indexes.
1724 static const int ScoreConsecutiveExtracts = 4;
1725 /// ExtractElementInst from same vector and reversed indices.
1726 static const int ScoreReversedExtracts = 3;
1727 /// Constants.
1728 static const int ScoreConstants = 2;
1729 /// Instructions with the same opcode.
1730 static const int ScoreSameOpcode = 2;
1731 /// Instructions with alt opcodes (e.g., add + sub).
1732 static const int ScoreAltOpcodes = 1;
1733 /// Identical instructions (a.k.a. splat or broadcast).
1734 static const int ScoreSplat = 1;
1735 /// Matching with an undef is preferable to failing.
1736 static const int ScoreUndef = 1;
1737 /// Score for failing to find a decent match.
1738 static const int ScoreFail = 0;
1739 /// Score if all users are vectorized.
1740 static const int ScoreAllUserVectorized = 1;
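// Taken together, these constants define the preference order used when
// breaking ties: consecutive loads/extracts (4) beat reversed ones (3),
// which beat plain same-opcode or constant matches (2), which beat
// alt-opcode pairs, splats, undefs and masked-gather candidates (1);
// 0 means no usable match at all.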
1741
1742 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1743 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1744 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1745 /// MainAltOps.
1746 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1747 ArrayRef<Value *> MainAltOps) const {
1748 if (!isValidElementType(V1->getType()) ||
1749 !isValidElementType(V2->getType()))
1750 return LookAheadHeuristics::ScoreFail;
1751
1752 if (V1 == V2) {
1753 if (isa<LoadInst>(V1)) {
1754 // Returns true if the users of V1 and V2 won't need to be extracted.
1755 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1756 // Bail out if we have too many uses to save compilation time.
1757 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1758 return false;
1759
1760 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1761 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1762 return U == U1 || U == U2 || R.isVectorized(U);
1763 });
1764 };
1765 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1766 };
1767 // A broadcast of a load can be cheaper on some targets.
1768 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1769 ElementCount::getFixed(NumLanes)) &&
1770 ((int)V1->getNumUses() == NumLanes ||
1771 AllUsersAreInternal(V1, V2)))
1772 return LookAheadHeuristics::ScoreSplatLoads;
1773 }
1774 return LookAheadHeuristics::ScoreSplat;
1775 }
1776
1777 auto CheckSameEntryOrFail = [&]() {
1778 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
1779 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
1780 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
1781 !TEs2.empty() &&
1782 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
1783 return LookAheadHeuristics::ScoreSplatLoads;
1784 }
1785 return LookAheadHeuristics::ScoreFail;
1786 };
1787
1788 auto *LI1 = dyn_cast<LoadInst>(V1);
1789 auto *LI2 = dyn_cast<LoadInst>(V2);
1790 if (LI1 && LI2) {
1791 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1792 !LI2->isSimple())
1793 return CheckSameEntryOrFail();
1794
1795 std::optional<int> Dist = getPointersDiff(
1796 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1797 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1798 if (!Dist || *Dist == 0) {
1799 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1800 getUnderlyingObject(LI2->getPointerOperand()) &&
1801 R.TTI->isLegalMaskedGather(
1802 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1803 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1804 return CheckSameEntryOrFail();
1805 }
1806 // The distance is too large - still may be profitable to use masked
1807 // loads/gathers.
1808 if (std::abs(*Dist) > NumLanes / 2)
1809 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1810 // This still will detect consecutive loads, but we might have "holes"
1811 // in some cases. It is ok for non-power-2 vectorization and may produce
1812 // better results. It should not affect current vectorization.
1813 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1814 : LookAheadHeuristics::ScoreReversedLoads;
1815 }
1816
1817 auto *C1 = dyn_cast<Constant>(V1);
1818 auto *C2 = dyn_cast<Constant>(V2);
1819 if (C1 && C2)
1820 return LookAheadHeuristics::ScoreConstants;
1821
1822 // Extracts from consecutive indexes of the same vector better score as
1823 // the extracts could be optimized away.
1824 Value *EV1;
1825 ConstantInt *Ex1Idx;
1826 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1827 // Undefs are always profitable for extractelements.
1828 // Compiler can easily combine poison and extractelement <non-poison> or
1829 // undef and extractelement <poison>. But combining undef +
1830 // extractelement <non-poison-but-may-produce-poison> requires some
1831 // extra operations.
1832 if (isa<UndefValue>(V2))
1833 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1834 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1835 : LookAheadHeuristics::ScoreSameOpcode;
1836 Value *EV2 = nullptr;
1837 ConstantInt *Ex2Idx = nullptr;
1838 if (match(V2,
1839 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1840 m_Undef())))) {
1841 // Undefs are always profitable for extractelements.
1842 if (!Ex2Idx)
1843 return LookAheadHeuristics::ScoreUndef;
1844 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1845 return LookAheadHeuristics::ScoreUndef;
1846 if (EV2 == EV1) {
1847 int Idx1 = Ex1Idx->getZExtValue();
1848 int Idx2 = Ex2Idx->getZExtValue();
1849 int Dist = Idx2 - Idx1;
1850 // The distance is too large - still may be profitable to use
1851 // shuffles.
1852 if (std::abs(Dist) == 0)
1853 return LookAheadHeuristics::ScoreSplat;
1854 if (std::abs(Dist) > NumLanes / 2)
1855 return LookAheadHeuristics::ScoreSameOpcode;
1856 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1857 : LookAheadHeuristics::ScoreReversedExtracts;
1858 }
1859 return LookAheadHeuristics::ScoreAltOpcodes;
1860 }
1861 return CheckSameEntryOrFail();
1862 }
1863
1864 auto *I1 = dyn_cast<Instruction>(V1);
1865 auto *I2 = dyn_cast<Instruction>(V2);
1866 if (I1 && I2) {
1867 if (I1->getParent() != I2->getParent())
1868 return CheckSameEntryOrFail();
1869 SmallVector<Value *, 4> Ops(MainAltOps);
1870 Ops.push_back(I1);
1871 Ops.push_back(I2);
1872 InstructionsState S = getSameOpcode(Ops, TLI);
1873 // Note: Only consider instructions with <= 2 operands to avoid
1874 // complexity explosion.
1875 if (S &&
1876 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1877 !S.isAltShuffle()) &&
1878 all_of(Ops, [&S](Value *V) {
1879 return isa<PoisonValue>(V) ||
1880 cast<Instruction>(V)->getNumOperands() ==
1881 S.getMainOp()->getNumOperands();
1882 }))
1883 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1884 : LookAheadHeuristics::ScoreSameOpcode;
1885 }
1886
1887 if (I1 && isa<PoisonValue>(V2))
1888 return LookAheadHeuristics::ScoreSameOpcode;
1889
1890 if (isa<UndefValue>(V2))
1891 return LookAheadHeuristics::ScoreUndef;
1892
1893 return CheckSameEntryOrFail();
1894 }
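// A few concrete outcomes of the shallow score, assuming simple IR such as
//   %a = load i32, ptr %p
//   %b = load i32, ptr %q   ; where %q is %p + 4 bytes
// the pair (%a, %b) scores ScoreConsecutiveLoads, two arbitrary constants
// score ScoreConstants, and values that cannot be matched at all fall back
// to CheckSameEntryOrFail(), i.e. usually ScoreFail.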
1895
1896 /// Go through the operands of \p LHS and \p RHS recursively until
1897 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1898 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1899 /// of \p U1 and \p U2), except at the beginning of the recursion where
1900 /// these are set to nullptr.
1901 ///
1902 /// For example:
1903 /// \verbatim
1904 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1905 /// \ / \ / \ / \ /
1906 /// + + + +
1907 /// G1 G2 G3 G4
1908 /// \endverbatim
1909 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1910 /// each level recursively, accumulating the score. It starts from matching
1911 /// the additions at level 0, then moves on to the loads (level 1). The
1912 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1913 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1914 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1915 /// Please note that the order of the operands does not matter, as we
1916 /// evaluate the score of all profitable combinations of operands. In
1917 /// other words the score of G1 and G4 is the same as G1 and G2. This
1918 /// heuristic is based on ideas described in:
1919 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1920 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1921 /// Luís F. W. Góes
1922 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1923 Instruction *U2, int CurrLevel,
1924 ArrayRef<Value *> MainAltOps) const {
1925
1926 // Get the shallow score of V1 and V2.
1927 int ShallowScoreAtThisLevel =
1928 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1929
1930 // If reached MaxLevel,
1931 // or if V1 and V2 are not instructions,
1932 // or if they are SPLAT,
1933 // or if they are not consecutive,
1934 // or if profitable to vectorize loads or extractelements, early return
1935 // the current cost.
1936 auto *I1 = dyn_cast<Instruction>(LHS);
1937 auto *I2 = dyn_cast<Instruction>(RHS);
1938 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1939 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1940 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1941 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1942 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1943 ShallowScoreAtThisLevel))
1944 return ShallowScoreAtThisLevel;
1945 assert(I1 && I2 && "Should have early exited.");
1946
1947 // Contains the I2 operand indexes that got matched with I1 operands.
1948 SmallSet<unsigned, 4> Op2Used;
1949
1950 // Recursion towards the operands of I1 and I2. We are trying all possible
1951 // operand pairs, and keeping track of the best score.
1952 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1953 OpIdx1 != NumOperands1; ++OpIdx1) {
1954 // Try to pair the operand of I1 at OpIdx1 with the best operand of I2.
1955 int MaxTmpScore = 0;
1956 unsigned MaxOpIdx2 = 0;
1957 bool FoundBest = false;
1958 // If I2 is commutative try all combinations.
1959 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1960 unsigned ToIdx = isCommutative(I2)
1961 ? I2->getNumOperands()
1962 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1963 assert(FromIdx <= ToIdx && "Bad index");
1964 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1965 // Skip operands already paired with OpIdx1.
1966 if (Op2Used.count(OpIdx2))
1967 continue;
1968 // Recursively calculate the cost at each level
1969 int TmpScore =
1970 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1971 I1, I2, CurrLevel + 1, {});
1972 // Look for the best score.
1973 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1974 TmpScore > MaxTmpScore) {
1975 MaxTmpScore = TmpScore;
1976 MaxOpIdx2 = OpIdx2;
1977 FoundBest = true;
1978 }
1979 }
1980 if (FoundBest) {
1981 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1982 Op2Used.insert(MaxOpIdx2);
1983 ShallowScoreAtThisLevel += MaxTmpScore;
1984 }
1985 }
1986 return ShallowScoreAtThisLevel;
1987 }
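// Usage sketch for the G1/G2 example from the comment above (the numbers
// follow from the default score constants): the level-1 match of the two
// additions contributes ScoreSameOpcode (2) and the level-2 matches of
// {A[0],A[1]} and {B[0],B[1]} each contribute ScoreConsecutiveLoads (4),
// so getScoreAtLevelRec(G1, G2, ...) accumulates 2 + 4 + 4 = 10.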
1988 };
1989 /// A helper data structure to hold the operands of a vector of instructions.
1990 /// This supports a fixed vector length for all operand vectors.
1991 class VLOperands {
1992 /// For each operand we need (i) the value, and (ii) the opcode that it
1993 /// would be attached to if the expression was in a left-linearized form.
1994 /// This is required to avoid illegal operand reordering.
1995 /// For example:
1996 /// \verbatim
1997 /// 0 Op1
1998 /// |/
1999 /// Op1 Op2 Linearized + Op2
2000 /// \ / ----------> |/
2001 /// - -
2002 ///
2003 /// Op1 - Op2 (0 + Op1) - Op2
2004 /// \endverbatim
2005 ///
2006 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2007 ///
2008 /// Another way to think of this is to track all the operations across the
2009 /// path from the operand all the way to the root of the tree and to
2010 /// calculate the operation that corresponds to this path. For example, the
2011 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2012 /// corresponding operation is a '-' (which matches the one in the
2013 /// linearized tree, as shown above).
2014 ///
2015 /// For lack of a better term, we refer to this operation as Accumulated
2016 /// Path Operation (APO).
2017 struct OperandData {
2018 OperandData() = default;
2019 OperandData(Value *V, bool APO, bool IsUsed)
2020 : V(V), APO(APO), IsUsed(IsUsed) {}
2021 /// The operand value.
2022 Value *V = nullptr;
2023 /// TreeEntries only allow a single opcode, or an alternate sequence of
2024 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2025 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2026 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2027 /// (e.g., Add/Mul)
2028 bool APO = false;
2029 /// Helper data for the reordering function.
2030 bool IsUsed = false;
2031 };
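// For illustration: in a lane computing "X - Y", operand X (OpIdx 0) gets
// APO == false and Y (OpIdx 1) gets APO == true, because Y sits on the
// inverse side of the left-linearized form; the reordering code below only
// exchanges operands whose APO flags match, so X is never swapped with Y.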
2032
2033 /// During operand reordering, we are trying to select the operand at lane
2034 /// that matches best with the operand at the neighboring lane. Our
2035 /// selection is based on the type of value we are looking for. For example,
2036 /// if the neighboring lane has a load, we need to look for a load that is
2037 /// accessing a consecutive address. These strategies are summarized in the
2038 /// 'ReorderingMode' enumerator.
2039 enum class ReorderingMode {
2040 Load, ///< Matching loads to consecutive memory addresses
2041 Opcode, ///< Matching instructions based on opcode (same or alternate)
2042 Constant, ///< Matching constants
2043 Splat, ///< Matching the same instruction multiple times (broadcast)
2044 Failed, ///< We failed to create a vectorizable group
2045 };
2046
2047 using OperandDataVec = SmallVector<OperandData, 2>;
2048
2049 /// A vector of operand vectors.
2050 SmallVector<OperandDataVec, 4> OpsVec;
2051 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2052 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2053 unsigned ArgSize = 0;
2054
2055 const TargetLibraryInfo &TLI;
2056 const DataLayout &DL;
2057 ScalarEvolution &SE;
2058 const BoUpSLP &R;
2059 const Loop *L = nullptr;
2060
2061 /// \returns the operand data at \p OpIdx and \p Lane.
2062 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2063 return OpsVec[OpIdx][Lane];
2064 }
2065
2066 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2067 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2068 return OpsVec[OpIdx][Lane];
2069 }
2070
2071 /// Clears the used flag for all entries.
2072 void clearUsed() {
2073 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2074 OpIdx != NumOperands; ++OpIdx)
2075 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2076 ++Lane)
2077 OpsVec[OpIdx][Lane].IsUsed = false;
2078 }
2079
2080 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2081 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2082 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2083 }
2084
2085 /// \param Lane lane of the operands under analysis.
2086 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2087 /// the best candidate.
2088 /// \param Idx operand index of the current candidate value.
2089 /// \returns The additional score due to possible broadcasting of the
2090 /// elements in the lane. It is more profitable to have a power-of-2 number
2091 /// of unique elements in the lane; it will be vectorized with higher
2092 /// probability after removing duplicates. Currently the SLP vectorizer
2093 /// supports only vectorization of a power-of-2 number of unique scalars.
2094 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2095 const SmallBitVector &UsedLanes) const {
2096 Value *IdxLaneV = getData(Idx, Lane).V;
2097 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2098 isa<ExtractElementInst>(IdxLaneV))
2099 return 0;
2100 SmallDenseMap<Value *, unsigned, 4> Uniques;
2101 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2102 if (Ln == Lane)
2103 continue;
2104 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2105 if (!isa<Instruction>(OpIdxLnV))
2106 return 0;
2107 Uniques.try_emplace(OpIdxLnV, Ln);
2108 }
2109 unsigned UniquesCount = Uniques.size();
2110 auto IdxIt = Uniques.find(IdxLaneV);
2111 unsigned UniquesCntWithIdxLaneV =
2112 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2113 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2114 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2115 unsigned UniquesCntWithOpIdxLaneV =
2116 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2117 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2118 return 0;
2119 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2120 UniquesCntWithOpIdxLaneV,
2121 UniquesCntWithOpIdxLaneV -
2122 bit_floor(UniquesCntWithOpIdxLaneV)) -
2123 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2124 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2125 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2126 }
2127
2128 /// \param Lane lane of the operands under analysis.
2129 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2130 /// the best candidate.
2131 /// \param Idx operand index of the current candidate value.
2132 /// \returns The additional score for the scalar which users are all
2133 /// vectorized.
2134 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2135 Value *IdxLaneV = getData(Idx, Lane).V;
2136 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2137 // Do not care about number of uses for vector-like instructions
2138 // (extractelement/extractvalue with constant indices), they are extracts
2139 // themselves and already externally used. Vectorization of such
2140 // instructions does not add extra extractelement instruction, just may
2141 // remove it.
2142 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2143 isVectorLikeInstWithConstOps(OpIdxLaneV))
2144 return LookAheadHeuristics::ScoreAllUserVectorized;
2145 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2146 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2147 return 0;
2148 return R.areAllUsersVectorized(IdxLaneI)
2149 ? LookAheadHeuristics::ScoreAllUserVectorized
2150 : 0;
2151 }
2152
2153 /// Score scaling factor for fully compatible instructions but with
2154 /// different number of external uses. Allows better selection of the
2155 /// instructions with less external uses.
2156 static const int ScoreScaleFactor = 10;
2157
2158 /// \Returns the look-ahead score, which tells us how much the sub-trees
2159 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2160 /// score. This helps break ties in an informed way when we cannot decide on
2161 /// the order of the operands by just considering the immediate
2162 /// predecessors.
2163 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2164 int Lane, unsigned OpIdx, unsigned Idx,
2165 bool &IsUsed, const SmallBitVector &UsedLanes) {
2166 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2167 LookAheadMaxDepth);
2168 // Keep track of the instruction stack as we recurse into the operands
2169 // during the look-ahead score exploration.
2170 int Score =
2171 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2172 /*CurrLevel=*/1, MainAltOps);
2173 if (Score) {
2174 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2175 if (Score <= -SplatScore) {
2176 // Failed score.
2177 Score = 0;
2178 } else {
2179 Score += SplatScore;
2180 // Scale score to see the difference between different operands
2181 // and similar operands but all vectorized/not all vectorized
2182 // uses. It does not affect actual selection of the best
2183 // compatible operand in general, just allows to select the
2184 // operand with all vectorized uses.
2185 Score *= ScoreScaleFactor;
2186 Score += getExternalUseScore(Lane, OpIdx, Idx);
2187 IsUsed = true;
2188 }
2189 }
2190 return Score;
2191 }
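// Numeric sketch (following directly from the constants above): a raw
// look-ahead score of 4 (e.g. consecutive loads) with a splat score of 0
// becomes 4 * ScoreScaleFactor == 40, plus 1 more if all users of the
// candidate are already vectorized; the same raw score with a splat score
// of -4 or lower is collapsed to 0 (treated as a failed match).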
2192
2193 /// Best defined scores per lanes between the passes. Used to choose the
2194 /// best operand (with the highest score) between the passes.
2195 /// The key - {Operand Index, Lane}.
2196 /// The value - the best score between the passes for the lane and the
2197 /// operand.
2198 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2199 BestScoresPerLanes;
2200
2201 // Search all operands in Ops[*][Lane] for the one that matches best
2202 // Ops[OpIdx][LastLane] and return its operand index.
2203 // If no good match can be found, return std::nullopt.
2204 std::optional<unsigned>
2205 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2206 ArrayRef<ReorderingMode> ReorderingModes,
2207 ArrayRef<Value *> MainAltOps,
2208 const SmallBitVector &UsedLanes) {
2209 unsigned NumOperands = getNumOperands();
2210
2211 // The operand of the previous lane at OpIdx.
2212 Value *OpLastLane = getData(OpIdx, LastLane).V;
2213
2214 // Our strategy mode for OpIdx.
2215 ReorderingMode RMode = ReorderingModes[OpIdx];
2216 if (RMode == ReorderingMode::Failed)
2217 return std::nullopt;
2218
2219 // The linearized opcode of the operand at OpIdx, Lane.
2220 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2221
2222 // The best operand index and its score.
2223 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2224 // are using the score to differentiate between the two.
2225 struct BestOpData {
2226 std::optional<unsigned> Idx;
2227 unsigned Score = 0;
2228 } BestOp;
2229 BestOp.Score =
2230 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2231 .first->second;
2232
2233 // Track if the operand must be marked as used. If the operand is set to
2234 // Score 1 explicitly (because of a non-power-of-2 number of unique
2235 // scalars), we may want to re-estimate the operands on later iterations.
2236 bool IsUsed = RMode == ReorderingMode::Splat ||
2237 RMode == ReorderingMode::Constant ||
2238 RMode == ReorderingMode::Load;
2239 // Iterate through all unused operands and look for the best.
2240 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2241 // Get the operand at Idx and Lane.
2242 OperandData &OpData = getData(Idx, Lane);
2243 Value *Op = OpData.V;
2244 bool OpAPO = OpData.APO;
2245
2246 // Skip already selected operands.
2247 if (OpData.IsUsed)
2248 continue;
2249
2250 // Skip if we are trying to move the operand to a position with a
2251 // different opcode in the linearized tree form. This would break the
2252 // semantics.
2253 if (OpAPO != OpIdxAPO)
2254 continue;
2255
2256 // Look for an operand that matches the current mode.
2257 switch (RMode) {
2258 case ReorderingMode::Load:
2259 case ReorderingMode::Opcode: {
2260 bool LeftToRight = Lane > LastLane;
2261 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2262 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2263 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2264 OpIdx, Idx, IsUsed, UsedLanes);
2265 if (Score > static_cast<int>(BestOp.Score) ||
2266 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2267 Idx == OpIdx)) {
2268 BestOp.Idx = Idx;
2269 BestOp.Score = Score;
2270 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2271 }
2272 break;
2273 }
2274 case ReorderingMode::Constant:
2275 if (isa<Constant>(Op) ||
2276 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2277 BestOp.Idx = Idx;
2278 if (isa<Constant>(Op)) {
2279 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2280 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2281 LookAheadHeuristics::ScoreConstants;
2282 }
2283 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2284 IsUsed = false;
2285 }
2286 break;
2287 case ReorderingMode::Splat:
2288 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2289 IsUsed = Op == OpLastLane;
2290 if (Op == OpLastLane) {
2291 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2292 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2293 LookAheadHeuristics::ScoreSplat;
2294 }
2295 BestOp.Idx = Idx;
2296 }
2297 break;
2298 case ReorderingMode::Failed:
2299 llvm_unreachable("Not expected Failed reordering mode.");
2300 }
2301 }
2302
2303 if (BestOp.Idx) {
2304 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2305 return BestOp.Idx;
2306 }
2307 // If we could not find a good match return std::nullopt.
2308 return std::nullopt;
2309 }
2310
2311 /// Helper for reorderOperandVecs.
2312 /// \returns the lane that we should start reordering from. This is the one
2313 /// which has the least number of operands that can freely move about, or is
2314 /// less profitable because it already has the most optimal set of operands.
2315 unsigned getBestLaneToStartReordering() const {
2316 unsigned Min = UINT_MAX;
2317 unsigned SameOpNumber = 0;
2318 // std::pair<unsigned, unsigned> is used to implement a simple voting
2319 // algorithm and choose the lane with the least number of operands that
2320 // can freely move about or less profitable because it already has the
2321 // most optimal set of operands. The first unsigned is a counter for
2322 // voting, the second unsigned is the counter of lanes with instructions
2323 // with same/alternate opcodes and same parent basic block.
2324 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2325 // Try to be closer to the original results, if we have multiple lanes
2326 // with same cost. If 2 lanes have the same cost, use the one with the
2327 // highest index.
2328 for (int I = getNumLanes(); I > 0; --I) {
2329 unsigned Lane = I - 1;
2330 OperandsOrderData NumFreeOpsHash =
2331 getMaxNumOperandsThatCanBeReordered(Lane);
2332 // Compare the number of operands that can move and choose the one with
2333 // the least number.
2334 if (NumFreeOpsHash.NumOfAPOs < Min) {
2335 Min = NumFreeOpsHash.NumOfAPOs;
2336 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2337 HashMap.clear();
2338 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2339 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2340 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2341 // Select the most optimal lane in terms of number of operands that
2342 // should be moved around.
2343 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2344 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2345 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2346 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2347 auto [It, Inserted] =
2348 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2349 if (!Inserted)
2350 ++It->second.first;
2351 }
2352 }
2353 // Select the lane with the minimum counter.
2354 unsigned BestLane = 0;
2355 unsigned CntMin = UINT_MAX;
2356 for (const auto &Data : reverse(HashMap)) {
2357 if (Data.second.first < CntMin) {
2358 CntMin = Data.second.first;
2359 BestLane = Data.second.second;
2360 }
2361 }
2362 return BestLane;
2363 }
2364
2365 /// Data structure that helps to reorder operands.
2366 struct OperandsOrderData {
2367 /// The best number of operands with the same APOs, which can be
2368 /// reordered.
2369 unsigned NumOfAPOs = UINT_MAX;
2370 /// Number of operands with the same/alternate instruction opcode and
2371 /// parent.
2372 unsigned NumOpsWithSameOpcodeParent = 0;
2373 /// Hash for the actual operands ordering.
2374 /// Used to count operands, actually their position id and opcode
2375 /// value. It is used in the voting mechanism to find the lane with the
2376 /// least number of operands that can freely move about or less profitable
2377 /// because it already has the most optimal set of operands. Can be
2378 /// replaced with SmallVector<unsigned> instead but hash code is faster
2379 /// and requires less memory.
2380 unsigned Hash = 0;
2381 };
2382 /// \returns the maximum number of operands that are allowed to be reordered
2383 /// for \p Lane and the number of compatible instructions (with the same
2384 /// parent/opcode). This is used as a heuristic for selecting the first lane
2385 /// to start operand reordering.
2386 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2387 unsigned CntTrue = 0;
2388 unsigned NumOperands = getNumOperands();
2389 // Operands with the same APO can be reordered. We therefore need to count
2390 // how many of them we have for each APO, like this: Cnt[APO] = x.
2391 // Since we only have two APOs, namely true and false, we can avoid using
2392 // a map. Instead we can simply count the number of operands that
2393 // correspond to one of them (in this case the 'true' APO), and calculate
2394 // the other by subtracting it from the total number of operands.
2395 // Operands with the same instruction opcode and parent are more
2396 // profitable since we don't need to move them in many cases, with a high
2397 // probability such lane already can be vectorized effectively.
2398 bool AllUndefs = true;
2399 unsigned NumOpsWithSameOpcodeParent = 0;
2400 Instruction *OpcodeI = nullptr;
2401 BasicBlock *Parent = nullptr;
2402 unsigned Hash = 0;
2403 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2404 const OperandData &OpData = getData(OpIdx, Lane);
2405 if (OpData.APO)
2406 ++CntTrue;
2407 // Use Boyer-Moore majority voting for finding the majority opcode and
2408 // the number of times it occurs.
2409 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2410 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
2411 I->getParent() != Parent) {
2412 if (NumOpsWithSameOpcodeParent == 0) {
2413 NumOpsWithSameOpcodeParent = 1;
2414 OpcodeI = I;
2415 Parent = I->getParent();
2416 } else {
2417 --NumOpsWithSameOpcodeParent;
2418 }
2419 } else {
2420 ++NumOpsWithSameOpcodeParent;
2421 }
2422 }
2423 Hash = hash_combine(
2424 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2425 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2426 }
2427 if (AllUndefs)
2428 return {};
2429 OperandsOrderData Data;
2430 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2431 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2432 Data.Hash = Hash;
2433 return Data;
2434 }
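// For illustration, consider a (hypothetical) 3-operand lane whose operands
// are {add, add, load}: the Boyer-Moore vote above ends with the add opcode
// and NumOpsWithSameOpcodeParent == 1 (two votes for add minus one for the
// load), and with one inverse operand out of three NumOfAPOs becomes
// max(1, 3 - 1) == 2.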
2435
2436 /// Go through the instructions in VL and append their operands.
2437 void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
2438 assert(!VL.empty() && "Bad VL");
2439 assert((empty() || VL.size() == getNumLanes()) &&
2440 "Expected same number of lanes");
2441 assert(S.valid() && "InstructionsState is invalid.");
2442 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2443 // arguments to the intrinsic produces the same result.
2444 constexpr unsigned IntrinsicNumOperands = 2;
2445 Instruction *MainOp = S.getMainOp();
2446 unsigned NumOperands = MainOp->getNumOperands();
2447 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2448 OpsVec.resize(NumOperands);
2449 unsigned NumLanes = VL.size();
2450 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2451 OpsVec[OpIdx].resize(NumLanes);
2452 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2453 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2454 "Expected instruction or poison value");
2455 // Our tree has just 3 nodes: the root and two operands.
2456 // It is therefore trivial to get the APO. We only need to check the
2457 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2458 // RHS operand. The LHS operand of both add and sub is never attached
2459 // to an inversese operation in the linearized form, therefore its APO
2460 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2461
2462 // Since operand reordering is performed on groups of commutative
2463 // operations or alternating sequences (e.g., +, -), we can safely
2464 // tell the inverse operations by checking commutativity.
2465 if (isa<PoisonValue>(VL[Lane])) {
2466 if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2467 if (OpIdx == 0) {
2468 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
2469 continue;
2470 }
2471 } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2472 if (OpIdx == 0) {
2473 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
2474 continue;
2475 }
2476 }
2477 OpsVec[OpIdx][Lane] = {
2478 PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
2479 false};
2480 continue;
2481 }
2482 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2483 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2484 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2485 APO, false};
2486 }
2487 }
2488 }
2489
2490 /// \returns the number of operands.
2491 unsigned getNumOperands() const { return ArgSize; }
2492
2493 /// \returns the number of lanes.
2494 unsigned getNumLanes() const { return OpsVec[0].size(); }
2495
2496 /// \returns the operand value at \p OpIdx and \p Lane.
2497 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2498 return getData(OpIdx, Lane).V;
2499 }
2500
2501 /// \returns true if the data structure is empty.
2502 bool empty() const { return OpsVec.empty(); }
2503
2504 /// Clears the data.
2505 void clear() { OpsVec.clear(); }
2506
2507 /// \Returns true if there are enough operands identical to \p Op to fill
2508 /// the whole vector (possibly mixed with constants or loop-invariant values).
2509 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2510 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2511 assert(Op == getValue(OpIdx, Lane) &&
2512 "Op is expected to be getValue(OpIdx, Lane).");
2513 // Small number of loads - try load matching.
2514 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2515 return false;
2516 bool OpAPO = getData(OpIdx, Lane).APO;
2517 bool IsInvariant = L && L->isLoopInvariant(Op);
2518 unsigned Cnt = 0;
2519 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2520 if (Ln == Lane)
2521 continue;
2522 // This is set to true if we found a candidate for broadcast at Lane.
2523 bool FoundCandidate = false;
2524 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2525 OperandData &Data = getData(OpI, Ln);
2526 if (Data.APO != OpAPO || Data.IsUsed)
2527 continue;
2528 Value *OpILane = getValue(OpI, Lane);
2529 bool IsConstantOp = isa<Constant>(OpILane);
2530 // Consider the broadcast candidate if:
2531 // 1. Same value is found in one of the operands.
2532 if (Data.V == Op ||
2533 // 2. The operand in the given lane is not constant but there is a
2534 // constant operand in another lane (which can be moved to the
2535 // given lane). In this case we can represent it as a simple
2536 // permutation of constant and broadcast.
2537 (!IsConstantOp &&
2538 ((Lns > 2 && isa<Constant>(Data.V)) ||
2539 // 2.1. If we have only 2 lanes, need to check that value in the
2540 // next lane does not build same opcode sequence.
2541 (Lns == 2 &&
2542 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2543 isa<Constant>(Data.V)))) ||
2544 // 3. The operand in the current lane is loop invariant (can be
2545 // hoisted out) and another operand is also a loop invariant
2546 // (though not a constant). In this case the whole vector can be
2547 // hoisted out.
2548 // FIXME: need to teach the cost model about this case for better
2549 // estimation.
2550 (IsInvariant && !isa<Constant>(Data.V) &&
2551 !getSameOpcode({Op, Data.V}, TLI) &&
2552 L->isLoopInvariant(Data.V))) {
2553 FoundCandidate = true;
2554 Data.IsUsed = Data.V == Op;
2555 if (Data.V == Op)
2556 ++Cnt;
2557 break;
2558 }
2559 }
2560 if (!FoundCandidate)
2561 return false;
2562 }
2563 return getNumLanes() == 2 || Cnt > 1;
2564 }
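// For illustration (hypothetical operands, all lanes using the same
// commutative opcode): with three lanes whose operands at some index are
// {%x, 1}, {%x, 2} and {%x, %y}, the value %x has an identical match in
// every other lane, so shouldBroadcast(%x, ...) returns true and the
// operand can be emitted as a broadcast of %x plus a cheap companion
// operand of constants.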
2565
2566 /// Checks if there is at least one operand in lanes other than \p Lane
2567 /// that is compatible with the operand \p Op.
2568 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2569 assert(Op == getValue(OpIdx, Lane) &&
2570 "Op is expected to be getValue(OpIdx, Lane).");
2571 bool OpAPO = getData(OpIdx, Lane).APO;
2572 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2573 if (Ln == Lane)
2574 continue;
2575 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2576 const OperandData &Data = getData(OpI, Ln);
2577 if (Data.APO != OpAPO || Data.IsUsed)
2578 return true;
2579 Value *OpILn = getValue(OpI, Ln);
2580 return (L && L->isLoopInvariant(OpILn)) ||
2581 (getSameOpcode({Op, OpILn}, TLI) &&
2582 allSameBlock({Op, OpILn}));
2583 }))
2584 return true;
2585 }
2586 return false;
2587 }
2588
2589 public:
2590 /// Initialize with all the operands of the instruction vector \p RootVL.
2591 VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
2592 const BoUpSLP &R)
2593 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2594 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
2595 // Append all the operands of RootVL.
2596 appendOperandsOfVL(RootVL, S);
2597 }
2598
2599 /// \Returns a value vector with the operands across all lanes for the
2600 /// operand at \p OpIdx.
2601 ValueList getVL(unsigned OpIdx) const {
2602 ValueList OpVL(OpsVec[OpIdx].size());
2603 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2604 "Expected same num of lanes across all operands");
2605 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2606 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2607 return OpVL;
2608 }
2609
2610 // Performs operand reordering for 2 or more operands.
2611 // The original operands are in OrigOps[OpIdx][Lane].
2612 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2613 void reorder() {
2614 unsigned NumOperands = getNumOperands();
2615 unsigned NumLanes = getNumLanes();
2616 // Each operand has its own mode. We are using this mode to help us select
2617 // the instructions for each lane, so that they match best with the ones
2618 // we have selected so far.
2619 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2620
2621 // This is a greedy single-pass algorithm. We are going over each lane
2622 // once and deciding on the best order right away with no back-tracking.
2623 // However, in order to increase its effectiveness, we start with the lane
2624 // that has operands that can move the least. For example, given the
2625 // following lanes:
2626 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2627 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2628 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2629 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2630 // we will start at Lane 1, since the operands of the subtraction cannot
2631 // be reordered. Then we will visit the rest of the lanes in a circular
2632 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2633
2634 // Find the first lane that we will start our search from.
2635 unsigned FirstLane = getBestLaneToStartReordering();
2636
2637 // Initialize the modes.
2638 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2639 Value *OpLane0 = getValue(OpIdx, FirstLane);
2640 // Keep track if we have instructions with all the same opcode on one
2641 // side.
2642 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2643 // Check if OpLane0 should be broadcast.
2644 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2645 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2646 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2647 else if (isa<LoadInst>(OpILane0))
2648 ReorderingModes[OpIdx] = ReorderingMode::Load;
2649 else
2650 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2651 } else if (isa<Constant>(OpLane0)) {
2652 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2653 } else if (isa<Argument>(OpLane0)) {
2654 // Our best hope is a Splat. It may save some cost in some cases.
2655 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2656 } else {
2657 llvm_unreachable("Unexpected value kind.");
2658 }
2659 }
2660
2661 // Check that we don't have the same operands. No need to reorder if the
2662 // operands are just a perfect or shuffled diamond match. The only
2663 // exceptions (just for now) are possible broadcasts and a non-power-of-2
2664 // number of scalars, for which we still reorder.
2665 auto &&SkipReordering = [this]() {
2666 SmallPtrSet<Value *, 4> UniqueValues;
2667 ArrayRef<OperandData> Op0 = OpsVec.front();
2668 for (const OperandData &Data : Op0)
2669 UniqueValues.insert(Data.V);
2670 for (ArrayRef<OperandData> Op :
2671 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2672 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2673 return !UniqueValues.contains(Data.V);
2674 }))
2675 return false;
2676 }
2677 // TODO: Check if we can remove a check for non-power-2 number of
2678 // scalars after full support of non-power-2 vectorization.
2679 return UniqueValues.size() != 2 &&
2680 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2681 UniqueValues.size());
2682 };
2683
2684 // If the initial strategy fails for any of the operand indexes, then we
2685 // perform reordering again in a second pass. This helps avoid assigning
2686 // high priority to the failed strategy, and should improve reordering for
2687 // the non-failed operand indexes.
2688 for (int Pass = 0; Pass != 2; ++Pass) {
2689 // Check if there is no need to reorder operands since they are a perfect
2690 // or shuffled diamond match.
2691 // Need to do it to avoid extra external use cost counting for
2692 // shuffled matches, which may cause regressions.
2693 if (SkipReordering())
2694 break;
2695 // Skip the second pass if the first pass did not fail.
2696 bool StrategyFailed = false;
2697 // Mark all operand data as free to use.
2698 clearUsed();
2699 // We keep the original operand order for the FirstLane, so reorder the
2700 // rest of the lanes. We are visiting the nodes in a circular fashion,
2701 // using FirstLane as the center point and increasing the radius
2702 // distance.
2703 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2704 for (unsigned I = 0; I < NumOperands; ++I)
2705 MainAltOps[I].push_back(getData(I, FirstLane).V);
2706
2707 SmallBitVector UsedLanes(NumLanes);
2708 UsedLanes.set(FirstLane);
2709 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2710 // Visit the lane on the right and then the lane on the left.
2711 for (int Direction : {+1, -1}) {
2712 int Lane = FirstLane + Direction * Distance;
2713 if (Lane < 0 || Lane >= (int)NumLanes)
2714 continue;
2715 UsedLanes.set(Lane);
2716 int LastLane = Lane - Direction;
2717 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2718 "Out of bounds");
2719 // Look for a good match for each operand.
2720 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2721 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2722 std::optional<unsigned> BestIdx =
2723 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2724 MainAltOps[OpIdx], UsedLanes);
2725 // By not selecting a value, we allow the operands that follow to
2726 // select a better matching value. We will get a non-null value in
2727 // the next run of getBestOperand().
2728 if (BestIdx) {
2729 // Swap the current operand with the one returned by
2730 // getBestOperand().
2731 swap(OpIdx, *BestIdx, Lane);
2732 } else {
2733 // Enable the second pass.
2734 StrategyFailed = true;
2735 }
2736 // Try to get the alternate opcode and follow it during analysis.
2737 if (MainAltOps[OpIdx].size() != 2) {
2738 OperandData &AltOp = getData(OpIdx, Lane);
2739 InstructionsState OpS =
2740 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2741 if (OpS && OpS.isAltShuffle())
2742 MainAltOps[OpIdx].push_back(AltOp.V);
2743 }
2744 }
2745 }
2746 }
2747 // Skip second pass if the strategy did not fail.
2748 if (!StrategyFailed)
2749 break;
2750 }
2751 }
2752
2753#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2754 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2755 switch (RMode) {
2756 case ReorderingMode::Load:
2757 return "Load";
2758 case ReorderingMode::Opcode:
2759 return "Opcode";
2760 case ReorderingMode::Constant:
2761 return "Constant";
2762 case ReorderingMode::Splat:
2763 return "Splat";
2764 case ReorderingMode::Failed:
2765 return "Failed";
2766 }
2767 llvm_unreachable("Unimplemented Reordering Type");
2768 }
2769
2770 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2771 raw_ostream &OS) {
2772 return OS << getModeStr(RMode);
2773 }
2774
2775 /// Debug print.
2776 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2777 printMode(RMode, dbgs());
2778 }
2779
2780 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2781 return printMode(RMode, OS);
2782 }
2783
2784 raw_ostream &print(raw_ostream &OS) const {
2785 const unsigned Indent = 2;
2786 unsigned Cnt = 0;
2787 for (const OperandDataVec &OpDataVec : OpsVec) {
2788 OS << "Operand " << Cnt++ << "\n";
2789 for (const OperandData &OpData : OpDataVec) {
2790 OS.indent(Indent) << "{";
2791 if (Value *V = OpData.V)
2792 OS << *V;
2793 else
2794 OS << "null";
2795 OS << ", APO:" << OpData.APO << "}\n";
2796 }
2797 OS << "\n";
2798 }
2799 return OS;
2800 }
2801
2802 /// Debug print.
2803 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2804#endif
2805 };
2806
2807 /// Evaluate each pair in \p Candidates and return the index into
2808 /// \p Candidates of the pair with the highest score, deemed to have the best
2809 /// chance to form the root of a profitable tree to vectorize. Return
2810 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2811 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2812 std::optional<int>
2813 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2814 int Limit = LookAheadHeuristics::ScoreFail) const {
2815 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2816 RootLookAheadMaxDepth);
2817 int BestScore = Limit;
2818 std::optional<int> Index;
2819 for (int I : seq<int>(0, Candidates.size())) {
2820 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2821 Candidates[I].second,
2822 /*U1=*/nullptr, /*U2=*/nullptr,
2823 /*CurrLevel=*/1, {});
2824 if (Score > BestScore) {
2825 BestScore = Score;
2826 Index = I;
2827 }
2828 }
2829 return Index;
2830 }
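// Sketch of the intended use (the candidate values are placeholders): given
// root candidates {(%a[0], %a[1]), (%b, %c)} where the first pair are
// consecutive loads and the second pair is unrelated, the first pair scores
// above ScoreFail and findBestRootPair returns index 0; if nothing beats
// \p Limit, std::nullopt is returned.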
2831
2832 /// Checks if the instruction is marked for deletion.
2833 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2834
2835 /// Removes an instruction from its block and eventually deletes it.
2836 /// It's like Instruction::eraseFromParent() except that the actual deletion
2837 /// is delayed until BoUpSLP is destructed.
2838 void eraseInstruction(Instruction *I) {
2839 DeletedInstructions.insert(I);
2840 }
2841
2842 /// Remove instructions from the parent function and clear the operands of \p
2843 /// DeadVals instructions, marking for deletion trivially dead operands.
2844 template <typename T>
2847 for (T *V : DeadVals) {
2848 auto *I = cast<Instruction>(V);
2849 DeletedInstructions.insert(I);
2850 }
2851 DenseSet<Value *> Processed;
2852 for (T *V : DeadVals) {
2853 if (!V || !Processed.insert(V).second)
2854 continue;
2855 auto *I = cast<Instruction>(V);
2857 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
2858 for (Use &U : I->operands()) {
2859 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2860 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2862 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2863 return Entry->VectorizedValue == OpI;
2864 })))
2865 DeadInsts.push_back(OpI);
2866 }
2867 I->dropAllReferences();
2868 }
2869 for (T *V : DeadVals) {
2870 auto *I = cast<Instruction>(V);
2871 if (!I->getParent())
2872 continue;
2873 assert((I->use_empty() || all_of(I->uses(),
2874 [&](Use &U) {
2875 return isDeleted(
2876 cast<Instruction>(U.getUser()));
2877 })) &&
2878 "trying to erase instruction with users.");
2879 I->removeFromParent();
2880 SE->forgetValue(I);
2881 }
2882 // Process the dead instruction list until empty.
2883 while (!DeadInsts.empty()) {
2884 Value *V = DeadInsts.pop_back_val();
2885 Instruction *VI = cast_or_null<Instruction>(V);
2886 if (!VI || !VI->getParent())
2887 continue;
2888 assert(isInstructionTriviallyDead(VI, TLI) &&
2889 "Live instruction found in dead worklist!");
2890 assert(VI->use_empty() && "Instructions with uses are not dead.");
2891
2892 // Don't lose the debug info while deleting the instructions.
2893 salvageDebugInfo(*VI);
2894
2895 // Null out all of the instruction's operands to see if any operand
2896 // becomes dead as we go.
2897 for (Use &OpU : VI->operands()) {
2898 Value *OpV = OpU.get();
2899 if (!OpV)
2900 continue;
2901 OpU.set(nullptr);
2902
2903 if (!OpV->use_empty())
2904 continue;
2905
2906 // If the operand is an instruction that became dead as we nulled out
2907 // the operand, and if it is 'trivially' dead, delete it in a future
2908 // loop iteration.
2909 if (auto *OpI = dyn_cast<Instruction>(OpV))
2910 if (!DeletedInstructions.contains(OpI) &&
2911 isInstructionTriviallyDead(OpI, TLI))
2912 DeadInsts.push_back(OpI);
2913 }
2914
2915 VI->removeFromParent();
2916 DeletedInstructions.insert(VI);
2917 SE->forgetValue(VI);
2918 }
2919 }
2920
2921 /// Checks if the instruction was already analyzed for being possible
2922 /// reduction root.
2923 bool isAnalyzedReductionRoot(Instruction *I) const {
2924 return AnalyzedReductionsRoots.count(I);
2925 }
2926 /// Register given instruction as already analyzed for being possible
2927 /// reduction root.
2928 void analyzedReductionRoot(Instruction *I) {
2929 AnalyzedReductionsRoots.insert(I);
2930 }
2931 /// Checks if the provided list of reduced values was checked already for
2932 /// vectorization.
2933 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2934 return AnalyzedReductionVals.contains(hash_value(VL));
2935 }
2936 /// Adds the list of reduced values to list of already checked values for the
2937 /// vectorization.
2938 void analyzedReductionVals(ArrayRef<Value *> VL) {
2939 AnalyzedReductionVals.insert(hash_value(VL));
2940 }
2941 /// Clear the list of the analyzed reduction root instructions.
2942 void clearReductionData() {
2943 AnalyzedReductionsRoots.clear();
2944 AnalyzedReductionVals.clear();
2945 AnalyzedMinBWVals.clear();
2946 }
2947 /// Checks if the given value is gathered in one of the nodes.
2948 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2949 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2950 }
2951 /// Checks if the given value is gathered in one of the nodes.
2952 bool isGathered(const Value *V) const {
2953 return MustGather.contains(V);
2954 }
2955 /// Checks if the specified value was not scheduled.
2956 bool isNotScheduled(const Value *V) const {
2957 return NonScheduledFirst.contains(V);
2958 }
2959
2960 /// Check if the value is vectorized in the tree.
2961 bool isVectorized(Value *V) const {
2962 assert(V && "V cannot be nullptr.");
2963 return ScalarToTreeEntries.contains(V);
2964 }
2965
2966 ~BoUpSLP();
2967
2968private:
2969 /// Determine if a node \p E can be demoted to a smaller type with a
2970 /// truncation. We collect the entries that will be demoted in ToDemote.
2971 /// \param E Node for analysis
2972 /// \param ToDemote indices of the nodes to be demoted.
2973 bool collectValuesToDemote(
2974 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2976 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2977 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2978
2979 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2980 /// reordering (i.e. the operands can be reordered because they have only one
2981 /// user and are reorderable).
2982 /// \param ReorderableGathers List of all gather nodes that require reordering
2983 /// (e.g., gathers of extractelements or partially vectorizable loads).
2984 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2985 /// reordering, subset of \p NonVectorized.
2986 bool
2987 canReorderOperands(TreeEntry *UserTE,
2988 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2989 ArrayRef<TreeEntry *> ReorderableGathers,
2990 SmallVectorImpl<TreeEntry *> &GatherOps);
2991
2992 /// Checks if the given \p TE is a gather node with clustered reused scalars
2993 /// and reorders it per given \p Mask.
2994 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2995
2996 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2997 /// if any. If it is not vectorized (gather node), returns nullptr.
2998 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2999 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
3000 TreeEntry *TE = nullptr;
3001 const auto *It = find_if(VL, [&](Value *V) {
3002 for (TreeEntry *E : getTreeEntries(V)) {
3003 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
3004 TE = E;
3005 return true;
3006 }
3007 }
3008 return false;
3009 });
3010 if (It != VL.end()) {
3011 assert(TE->isSame(VL) && "Expected same scalars.");
3012 return TE;
3013 }
3014 return nullptr;
3015 }
3016
3017 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
3018 /// if any. If it is not vectorized (gather node), returns nullptr.
3019 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
3020 unsigned OpIdx) const {
3021 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
3022 const_cast<TreeEntry *>(UserTE), OpIdx);
3023 }
3024
3025 /// Checks if all users of \p I are part of the vectorization tree.
3026 bool areAllUsersVectorized(
3027 Instruction *I,
3028 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3029
3030 /// Return information about the vector formed for the specified index
3031 /// of a vector of (the same) instruction.
3032 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3033
3034 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3035 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3036
3037 /// Gets the root instruction for the given node. If the node is a strided
3038 /// load/store node with the reverse order, the root instruction is the last
3039 /// one.
3040 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3041
3042 /// \returns Cast context for the given graph node.
3043 TargetTransformInfo::CastContextHint
3044 getCastContextHint(const TreeEntry &TE) const;
3045
3046 /// \returns the cost of the vectorizable entry.
3047 InstructionCost getEntryCost(const TreeEntry *E,
3048 ArrayRef<Value *> VectorizedVals,
3049 SmallPtrSetImpl<Value *> &CheckedExtracts);
3050
3051 /// This is the recursive part of buildTree.
3052 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3053 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3054
3055 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3056 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3057 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3058 /// returns false, setting \p CurrentOrder to either an empty vector or a
3059 /// non-identity permutation that allows reusing the extract instructions.
3060 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3061 /// extract order.
3062 bool canReuseExtract(ArrayRef<Value *> VL,
3063 SmallVectorImpl<unsigned> &CurrentOrder,
3064 bool ResizeAllowed = false) const;
3065
3066 /// Vectorize a single entry in the tree.
3067 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed
3068 /// to avoid issues with def-use order.
3069 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3070
3071 /// Returns the vectorized operand node that matches the order of the scalars
3072 /// of operand number \p NodeIdx in entry \p E.
3073 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3074 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3075 unsigned NodeIdx) const {
3076 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3077 }
3078
3079 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3080 /// \p E.
3081 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed
3082 /// to avoid issues with def-use order.
3083 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3084
3085 /// Create a new vector from a list of scalar values. Produces a sequence
3086 /// which exploits values reused across lanes, and arranges the inserts
3087 /// for ease of later optimization.
3088 template <typename BVTy, typename ResTy, typename... Args>
3089 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3090
3091 /// Create a new vector from a list of scalar values. Produces a sequence
3092 /// which exploits values reused across lanes, and arranges the inserts
3093 /// for ease of later optimization.
3094 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3095 bool PostponedPHIs);
3096
3097 /// Returns the instruction in the bundle, which can be used as a base point
3098 /// for scheduling. Usually it is the last instruction in the bundle, except
3099 /// for the case when all operands are external (in this case, it is the first
3100 /// instruction in the list).
3101 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3102
3103 /// Tries to find extractelement instructions with constant indices from a fixed
3104 /// vector type and gathers such instructions into a bunch, which is highly
3105 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3106 /// was successful, the matched scalars are replaced by poison values in \p VL
3107 /// for future analysis.
3108 std::optional<TargetTransformInfo::ShuffleKind>
3109 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3110 SmallVectorImpl<int> &Mask) const;
3111
3112 /// Tries to find extractelement instructions with constant indices from a fixed
3113 /// vector type and gathers such instructions into a bunch, which is highly
3114 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
3115 /// was successful, the matched scalars are replaced by poison values in \p VL
3116 /// for future analysis.
3117 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3118 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3119 SmallVectorImpl<int> &Mask,
3120 unsigned NumParts) const;
3121
3122 /// Checks if the gathered \p VL can be represented as a single register
3123 /// shuffle(s) of previous tree entries.
3124 /// \param TE Tree entry checked for permutation.
3125 /// \param VL List of scalars (a subset of the TE scalars), checked for
3126 /// permutations. Must form a single-register vector.
3127 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3128 /// commands to build the mask using the original vector value, without
3129 /// relying on the potential reordering.
3130 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3131 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3132 std::optional<TargetTransformInfo::ShuffleKind>
3133 isGatherShuffledSingleRegisterEntry(
3134 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3135 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3136 bool ForOrder);
3137
3138 /// Checks if the gathered \p VL can be represented as multi-register
3139 /// shuffle(s) of previous tree entries.
3140 /// \param TE Tree entry checked for permutation.
3141 /// \param VL List of scalars (a subset of the TE scalars), checked for
3142 /// permutations.
3143 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3144 /// commands to build the mask using the original vector value, without
3145 /// relying on the potential reordering.
3146 /// \returns per-register series of ShuffleKind, if gathered values can be
3147 /// represented as shuffles of previous tree entries. \p Mask is filled with
3148 /// the shuffle mask (also on per-register base).
3149 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3150 isGatherShuffledEntry(
3151 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3152 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3153 unsigned NumParts, bool ForOrder = false);
3154
3155 /// \returns the cost of gathering (inserting) the values in \p VL into a
3156 /// vector.
3157 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3158 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3159 Type *ScalarTy) const;
3160
3161 /// Set the Builder insert point to one after the last instruction in
3162 /// the bundle
3163 void setInsertPointAfterBundle(const TreeEntry *E);
3164
3165 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3166 /// specified, the starting vector value is poison.
3167 Value *
3168 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3169 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3170
3171 /// \returns whether the VectorizableTree is fully vectorizable and will
3172 /// be beneficial even if the tree height is tiny.
3173 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3174
3175 /// Run through the list of all gathered loads in the graph and try to find
3176 /// vector loads/masked gathers instead of regular gathers. Later these loads
3177 /// are reshuffled to build the final gathered nodes.
3178 void tryToVectorizeGatheredLoads(
3179 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3180 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3181 8> &GatheredLoads);
3182
3183 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3184 /// users of \p TE and collects the stores. It returns the map from the store
3185 /// pointers to the collected stores.
3187 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3188
3189 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3190 /// stores in \p StoresVec can form a vector instruction. If so it returns
3191 /// true and populates \p ReorderIndices with the shuffle indices of the
3192 /// stores when compared to the sorted vector.
3193 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3194 OrdersType &ReorderIndices) const;
3195
3196 /// Iterates through the users of \p TE, looking for scalar stores that can be
3197 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3198 /// their order and builds an order index vector for each store bundle. It
3199 /// returns all these order vectors found.
3200 /// We run this after the tree has formed, otherwise we may come across user
3201 /// instructions that are not yet in the tree.
3202 SmallVector<OrdersType, 1>
3203 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3204
3205 /// Tries to reorder the gathering node for better vectorization
3206 /// opportunities.
3207 void reorderGatherNode(TreeEntry &TE);
3208
3209 struct TreeEntry {
3210 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3211 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3212
3213 /// \returns Common mask for reorder indices and reused scalars.
3214 SmallVector<int> getCommonMask() const {
3215 SmallVector<int> Mask;
3216 inversePermutation(ReorderIndices, Mask);
3217 ::addMask(Mask, ReuseShuffleIndices);
3218 return Mask;
3219 }
3220
3221 /// \returns true if the scalars in VL are equal to this entry.
3222 bool isSame(ArrayRef<Value *> VL) const {
3223 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3224 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3225 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3226 return VL.size() == Mask.size() &&
3227 std::equal(VL.begin(), VL.end(), Mask.begin(),
3228 [Scalars](Value *V, int Idx) {
3229 return (isa<UndefValue>(V) &&
3230 Idx == PoisonMaskElem) ||
3231 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3232 });
3233 };
3234 if (!ReorderIndices.empty()) {
3235 // TODO: implement matching if the nodes are just reordered, still can
3236 // treat the vector as the same if the list of scalars matches VL
3237 // directly, without reordering.
3238 SmallVector<int> Mask;
3239 inversePermutation(ReorderIndices, Mask);
3240 if (VL.size() == Scalars.size())
3241 return IsSame(Scalars, Mask);
3242 if (VL.size() == ReuseShuffleIndices.size()) {
3243 ::addMask(Mask, ReuseShuffleIndices);
3244 return IsSame(Scalars, Mask);
3245 }
3246 return false;
3247 }
3248 return IsSame(Scalars, ReuseShuffleIndices);
3249 }
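// Worked example with hypothetical scalars: for a node with Scalars = {%a, %b}
// and ReuseShuffleIndices = {0, 1, 0, 1}, isSame({%a, %b, %a, %b}) returns
// true, since every VL element equals Scalars[Mask[i]] under the reuse mask.
// When ReorderIndices is non-empty, an inverse-reorder mask is built first and
// the reuse mask is applied on top of it before the comparison.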
3250
3251 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3252 return isGather() && !UserTreeIndices.empty() &&
3253 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3254 UserTreeIndices.front().UserTE == UserEI.UserTE;
3255 }
3256
3257 /// \returns true if current entry has same operands as \p TE.
3258 bool hasEqualOperands(const TreeEntry &TE) const {
3259 if (TE.getNumOperands() != getNumOperands())
3260 return false;
3261 SmallBitVector Used(getNumOperands());
3262 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3263 unsigned PrevCount = Used.count();
3264 for (unsigned K = 0; K < E; ++K) {
3265 if (Used.test(K))
3266 continue;
3267 if (getOperand(K) == TE.getOperand(I)) {
3268 Used.set(K);
3269 break;
3270 }
3271 }
3272 // Check if we actually found the matching operand.
3273 if (PrevCount == Used.count())
3274 return false;
3275 }
3276 return true;
3277 }
3278
3279 /// \return Final vectorization factor for the node. Defined by the total
3280 /// number of vectorized scalars, including those used several times in the
3281 /// entry and counted in the \a ReuseShuffleIndices, if any.
3282 unsigned getVectorFactor() const {
3283 if (!ReuseShuffleIndices.empty())
3284 return ReuseShuffleIndices.size();
3285 return Scalars.size();
3286 };
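// Worked example with hypothetical scalars: a node with Scalars = {%a, %b}
// that is reused as {%a, %b, %a, %b} via ReuseShuffleIndices = {0, 1, 0, 1}
// has a vector factor of 4 (the width of the emitted vector), not 2 (the
// number of unique scalars).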
3287
3288 /// Checks if the current node is a gather node.
3289 bool isGather() const { return State == NeedToGather; }
3290
3291 /// A vector of scalars.
3292 ValueList Scalars;
3293
3294 /// The Scalars are vectorized into this value. It is initialized to Null.
3295 WeakTrackingVH VectorizedValue = nullptr;
3296
3297 /// New vector phi instructions emitted for the vectorized phi nodes.
3298 PHINode *PHI = nullptr;
3299
3300 /// Do we need to gather this sequence or vectorize it
3301 /// (either with vector instruction or with scatter/gather
3302 /// intrinsics for store/load)?
3303 enum EntryState {
3304 Vectorize, ///< The node is regularly vectorized.
3305 ScatterVectorize, ///< Masked scatter/gather node.
3306 StridedVectorize, ///< Strided loads (and stores)
3307 NeedToGather, ///< Gather/buildvector node.
3308 CombinedVectorize, ///< Vectorized node, combined with its user into more
3309 ///< complex node like select/cmp to minmax, mul/add to
3310 ///< fma, etc. Must be used for the following nodes in
3311 ///< the pattern, not the very first one.
3312 };
3313 EntryState State;
3314
3315 /// List of combined opcodes supported by the vectorizer.
3316 enum CombinedOpcode {
3317 NotCombinedOp = -1,
3318 MinMax = Instruction::OtherOpsEnd + 1,
3319 };
3320 CombinedOpcode CombinedOp = NotCombinedOp;
3321
3322 /// Does this sequence require some shuffling?
3323 SmallVector<int, 4> ReuseShuffleIndices;
3324
3325 /// Does this entry require reordering?
3326 SmallVector<unsigned, 4> ReorderIndices;
3327
3328 /// Points back to the VectorizableTree.
3329 ///
3330 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3331 /// to be a pointer and needs to be able to initialize the child iterator.
3332 /// Thus we need a reference back to the container to translate the indices
3333 /// to entries.
3334 VecTreeTy &Container;
3335
3336 /// The TreeEntry index containing the user of this entry. We can actually
3337 /// have multiple users so the data structure is not truly a tree.
3338 SmallVector<EdgeInfo, 1> UserTreeIndices;
3339
3340 /// The index of this treeEntry in VectorizableTree.
3341 unsigned Idx = 0;
3342
3343 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3344 /// other nodes as a series of insertvector instructions.
3345 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3346
3347 private:
3348 /// The operands of each instruction in each lane Operands[op_index][lane].
3349 /// Note: This helps avoid the replication of the code that performs the
3350 /// reordering of operands during buildTree_rec() and vectorizeTree().
3351 SmallVector<ValueList, 2> Operands;
3352
3353 /// MainOp and AltOp are recorded inside. S should be obtained from
3354 /// newTreeEntry.
3355 InstructionsState S = InstructionsState::invalid();
3356
3357 /// Interleaving factor for interleaved loads Vectorize nodes.
3358 unsigned InterleaveFactor = 0;
3359
3360 public:
3361 /// Returns interleave factor for interleave nodes.
3362 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3363 /// Sets interleaving factor for the interleaving nodes.
3364 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3365
3366 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3367 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3368 if (Operands.size() < OpIdx + 1)
3369 Operands.resize(OpIdx + 1);
3370 assert(Operands[OpIdx].empty() && "Already resized?");
3371 assert(OpVL.size() <= Scalars.size() &&
3372 "Number of operands is greater than the number of scalars.");
3373 Operands[OpIdx].resize(OpVL.size());
3374 copy(OpVL, Operands[OpIdx].begin());
3375 }
3376
3377 /// Set this bundle's operand from Scalars.
3378 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3379 VLOperands Ops(Scalars, S, R);
3380 if (RequireReorder)
3381 Ops.reorder();
3382 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
3383 setOperand(I, Ops.getVL(I));
3384 }
3385
3386 /// Reorders operands of the node to the given mask \p Mask.
3387 void reorderOperands(ArrayRef<int> Mask) {
3388 for (ValueList &Operand : Operands)
3389 reorderScalars(Operand, Mask);
3390 }
3391
3392 /// \returns the \p OpIdx operand of this TreeEntry.
3393 ValueList &getOperand(unsigned OpIdx) {
3394 assert(OpIdx < Operands.size() && "Off bounds");
3395 return Operands[OpIdx];
3396 }
3397
3398 /// \returns the \p OpIdx operand of this TreeEntry.
3399 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3400 assert(OpIdx < Operands.size() && "Off bounds");
3401 return Operands[OpIdx];
3402 }
3403
3404 /// \returns the number of operands.
3405 unsigned getNumOperands() const { return Operands.size(); }
3406
3407 /// \return the single \p OpIdx operand.
3408 Value *getSingleOperand(unsigned OpIdx) const {
3409 assert(OpIdx < Operands.size() && "Off bounds");
3410 assert(!Operands[OpIdx].empty() && "No operand available");
3411 return Operands[OpIdx][0];
3412 }
3413
3414 /// Some of the instructions in the list have alternate opcodes.
3415 bool isAltShuffle() const { return S.isAltShuffle(); }
3416
3417 bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
3418
3419 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3420 /// alternate) opcode as the main operation of this entry, the key is \p Op.
3421 /// Otherwise the key is the main operation.
3422 Value *isOneOf(Value *Op) const {
3423 auto *I = dyn_cast<Instruction>(Op);
3424 if (I && isOpcodeOrAlt(I))
3425 return Op;
3426 return S.getMainOp();
3427 }
3428
3429 void setOperations(const InstructionsState &S) {
3430 assert(S && "InstructionsState is invalid.");
3431 this->S = S;
3432 }
3433
3434 Instruction *getMainOp() const { return S.getMainOp(); }
3435
3436 Instruction *getAltOp() const { return S.getAltOp(); }
3437
3438 /// The main/alternate opcodes for the list of instructions.
3439 unsigned getOpcode() const { return S.getOpcode(); }
3440
3441 unsigned getAltOpcode() const { return S.getAltOpcode(); }
3442
3443 bool hasState() const { return S.valid(); }
3444
3445 /// Returns the position of \p V within the vector of Scalars, remapped
3446 /// through ReorderIndices and ReuseShuffleIndices when those are not empty.
3447 int findLaneForValue(Value *V) const {
3448 unsigned FoundLane = getVectorFactor();
3449 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3450 std::advance(It, 1)) {
3451 if (*It != V)
3452 continue;
3453 FoundLane = std::distance(Scalars.begin(), It);
3454 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3455 if (!ReorderIndices.empty())
3456 FoundLane = ReorderIndices[FoundLane];
3457 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3458 if (ReuseShuffleIndices.empty())
3459 break;
3460 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3461 RIt != ReuseShuffleIndices.end()) {
3462 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3463 break;
3464 }
3465 }
3466 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3467 return FoundLane;
3468 }
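// Worked example with hypothetical scalars: for Scalars = {%a, %b}, empty
// ReorderIndices and ReuseShuffleIndices = {1, 0, 1, 0}, findLaneForValue(%b)
// first finds %b at position 1 in Scalars and then remaps it to the first
// reuse-mask slot that reads index 1, i.e. lane 0 of the final vector.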
3469
3470 /// Build a shuffle mask for graph entry which represents a merge of main
3471 /// and alternate operations.
3472 void
3473 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3474 SmallVectorImpl<int> &Mask,
3475 SmallVectorImpl<Value *> *OpScalars = nullptr,
3476 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3477
3478 /// Return true if this is a non-power-of-2 node.
3479 bool isNonPowOf2Vec() const {
3480 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3481 return IsNonPowerOf2;
3482 }
3483
3484 /// Return true if the number of elements in this node neither forms whole
3485 /// vector registers nor is a power of 2.
3486 bool
3487 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3488 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3489 TTI, getValueType(Scalars.front()), Scalars.size());
3490 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3491 "Reshuffling not supported with non-power-of-2 vectors yet.");
3492 return IsNonPowerOf2;
3493 }
3494
3495 Value *getOrdered(unsigned Idx) const {
3496 assert(isGather() && "Must be used only for buildvectors/gathers.");
3497 if (ReorderIndices.empty())
3498 return Scalars[Idx];
3499 SmallVector<int> Mask;
3500 inversePermutation(ReorderIndices, Mask);
3501 return Scalars[Mask[Idx]];
3502 }
3503
3504#ifndef NDEBUG
3505 /// Debug printer.
3506 LLVM_DUMP_METHOD void dump() const {
3507 dbgs() << Idx << ".\n";
3508 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3509 dbgs() << "Operand " << OpI << ":\n";
3510 for (const Value *V : Operands[OpI])
3511 dbgs().indent(2) << *V << "\n";
3512 }
3513 dbgs() << "Scalars: \n";
3514 for (Value *V : Scalars)
3515 dbgs().indent(2) << *V << "\n";
3516 dbgs() << "State: ";
3517 switch (State) {
3518 case Vectorize:
3519 if (InterleaveFactor > 0) {
3520 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3521 << "\n";
3522 } else {
3523 dbgs() << "Vectorize\n";
3524 }
3525 break;
3526 case ScatterVectorize:
3527 dbgs() << "ScatterVectorize\n";
3528 break;
3529 case StridedVectorize:
3530 dbgs() << "StridedVectorize\n";
3531 break;
3532 case NeedToGather:
3533 dbgs() << "NeedToGather\n";
3534 break;
3535 case CombinedVectorize:
3536 dbgs() << "CombinedVectorize\n";
3537 break;
3538 }
3539 if (S) {
3540 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
3541 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
3542 } else {
3543 dbgs() << "MainOp: NULL\n";
3544 dbgs() << "AltOp: NULL\n";
3545 }
3546 dbgs() << "VectorizedValue: ";
3547 if (VectorizedValue)
3548 dbgs() << *VectorizedValue << "\n";
3549 else
3550 dbgs() << "NULL\n";
3551 dbgs() << "ReuseShuffleIndices: ";
3552 if (ReuseShuffleIndices.empty())
3553 dbgs() << "Empty";
3554 else
3555 for (int ReuseIdx : ReuseShuffleIndices)
3556 dbgs() << ReuseIdx << ", ";
3557 dbgs() << "\n";
3558 dbgs() << "ReorderIndices: ";
3559 for (unsigned ReorderIdx : ReorderIndices)
3560 dbgs() << ReorderIdx << ", ";
3561 dbgs() << "\n";
3562 dbgs() << "UserTreeIndices: ";
3563 for (const auto &EInfo : UserTreeIndices)
3564 dbgs() << EInfo << ", ";
3565 dbgs() << "\n";
3566 if (!CombinedEntriesWithIndices.empty()) {
3567 dbgs() << "Combined entries: ";
3568 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3569 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3570 });
3571 dbgs() << "\n";
3572 }
3573 }
3574#endif
3575 };
3576
3577#ifndef NDEBUG
3578 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3579 InstructionCost VecCost, InstructionCost ScalarCost,
3580 StringRef Banner) const {
3581 dbgs() << "SLP: " << Banner << ":\n";
3582 E->dump();
3583 dbgs() << "SLP: Costs:\n";
3584 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3585 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3586 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3587 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3588 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3589 }
3590#endif
3591
3592 /// Create a new VectorizableTree entry.
3593 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3594 std::optional<ScheduleData *> Bundle,
3595 const InstructionsState &S,
3596 const EdgeInfo &UserTreeIdx,
3597 ArrayRef<int> ReuseShuffleIndices = {},
3598 ArrayRef<unsigned> ReorderIndices = {},
3599 unsigned InterleaveFactor = 0) {
3600 TreeEntry::EntryState EntryState =
3601 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3602 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3603 ReuseShuffleIndices, ReorderIndices);
3604 if (E && InterleaveFactor > 0)
3605 E->setInterleave(InterleaveFactor);
3606 return E;
3607 }
3608
3609 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3610 TreeEntry::EntryState EntryState,
3611 std::optional<ScheduleData *> Bundle,
3612 const InstructionsState &S,
3613 const EdgeInfo &UserTreeIdx,
3614 ArrayRef<int> ReuseShuffleIndices = {},
3615 ArrayRef<unsigned> ReorderIndices = {}) {
3616 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3617 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3618 "Need to vectorize gather entry?");
3619 // Gathered loads still gathered? Do not create entry, use the original one.
3620 if (GatheredLoadsEntriesFirst.has_value() &&
3621 EntryState == TreeEntry::NeedToGather && S &&
3622 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3623 !UserTreeIdx.UserTE)
3624 return nullptr;
3625 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3626 TreeEntry *Last = VectorizableTree.back().get();
3627 Last->Idx = VectorizableTree.size() - 1;
3628 Last->State = EntryState;
3629 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3630 // for non-power-of-two vectors.
3631 assert(
3632 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3633 ReuseShuffleIndices.empty()) &&
3634 "Reshuffling scalars not yet supported for nodes with padding");
3635 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3636 ReuseShuffleIndices.end());
3637 if (ReorderIndices.empty()) {
3638 Last->Scalars.assign(VL.begin(), VL.end());
3639 if (S)
3640 Last->setOperations(S);
3641 } else {
3642 // Reorder scalars and build final mask.
3643 Last->Scalars.assign(VL.size(), nullptr);
3644 transform(ReorderIndices, Last->Scalars.begin(),
3645 [VL](unsigned Idx) -> Value * {
3646 if (Idx >= VL.size())
3647 return UndefValue::get(VL.front()->getType());
3648 return VL[Idx];
3649 });
3650 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3651 if (S)
3652 Last->setOperations(S);
3653 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3654 }
3655 if (!Last->isGather()) {
3656 SmallPtrSet<Value *, 4> Processed;
3657 for (Value *V : VL) {
3658 if (isa<PoisonValue>(V))
3659 continue;
3660 auto It = ScalarToTreeEntries.find(V);
3661 assert(
3662 (It == ScalarToTreeEntries.end() ||
3663 (It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
3665 "Scalar already in tree!");
3666 if (It == ScalarToTreeEntries.end()) {
3667 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
3668 (void)Processed.insert(V);
3669 } else if (Processed.insert(V).second) {
3670 assert(!is_contained(It->getSecond(), Last) &&
3671 "Value already associated with the node.");
3672 It->getSecond().push_back(Last);
3673 }
3674 }
3675 // Update the scheduler bundle to point to this TreeEntry.
3676 ScheduleData *BundleMember = *Bundle;
3677 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3678 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3679 doesNotNeedToSchedule(VL)) &&
3680 "Bundle and VL out of sync");
3681 if (BundleMember) {
3682 for (Value *V : VL) {
3683 if (doesNotNeedToBeScheduled(V))
3684 continue;
3685 if (!BundleMember)
3686 continue;
3687 BundleMember->TE = Last;
3688 BundleMember = BundleMember->NextInBundle;
3689 }
3690 }
3691 assert(!BundleMember && "Bundle and VL out of sync");
3692 } else {
3693 // Build a map for gathered scalars to the nodes where they are used.
3694 bool AllConstsOrCasts = true;
3695 for (Value *V : VL)
3696 if (!isConstant(V)) {
3697 auto *I = dyn_cast<CastInst>(V);
3698 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3699 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3700 !UserTreeIdx.UserTE->isGather())
3701 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3702 }
3703 if (AllConstsOrCasts)
3704 CastMaxMinBWSizes =
3705 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3706 MustGather.insert(VL.begin(), VL.end());
3707 }
3708
3709 if (UserTreeIdx.UserTE)
3710 Last->UserTreeIndices.push_back(UserTreeIdx);
3711 return Last;
3712 }
3713
3714 /// -- Vectorization State --
3715 /// Holds all of the tree entries.
3716 TreeEntry::VecTreeTy VectorizableTree;
3717
3718#ifndef NDEBUG
3719 /// Debug printer.
3720 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3721 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3722 VectorizableTree[Id]->dump();
3723 dbgs() << "\n";
3724 }
3725 }
3726#endif
3727
3728 /// Get list of vector entries, associated with the value \p V.
3729 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
3730 assert(V && "V cannot be nullptr.");
3731 auto It = ScalarToTreeEntries.find(V);
3732 if (It == ScalarToTreeEntries.end())
3733 return {};
3734 return It->getSecond();
3735 }
3736
3737 /// Returns first vector node for value \p V, matching values \p VL.
3738 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
3739 bool SameVF = false) const {
3740 assert(V && "V cannot be nullptr.");
3741 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
3742 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
3743 return TE;
3744 return nullptr;
3745 }
3746
3747 /// Check that the operand node of an alternate node does not generate a
3748 /// buildvector sequence. If it does, it is probably not worth building an
3749 /// alternate shuffle when the number of buildvector operands plus the
3750 /// alternate instruction exceeds the number of buildvector instructions.
3751 /// \param S the instructions state of the analyzed values.
3752 /// \param VL list of the instructions with alternate opcodes.
3753 bool areAltOperandsProfitable(const InstructionsState &S,
3754 ArrayRef<Value *> VL) const;
3755
3756 /// Checks if the specified list of the instructions/values can be vectorized
3757 /// and fills required data before actual scheduling of the instructions.
3758 TreeEntry::EntryState
3759 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3760 bool IsScatterVectorizeUserTE,
3761 OrdersType &CurrentOrder,
3762 SmallVectorImpl<Value *> &PointerOps);
3763
3764 /// Maps a specific scalar to its tree entry(ies).
3765 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
3766
3767 /// Maps a value to the proposed vectorizable size.
3768 SmallDenseMap<Value *, unsigned> InstrElementSize;
3769
3770 /// A list of scalars that we found that we need to keep as scalars.
3771 ValueSet MustGather;
3772
3773 /// A set of first non-schedulable values.
3774 ValueSet NonScheduledFirst;
3775
3776 /// A map between the vectorized entries and the last instructions in the
3777 /// bundles. The bundles are built in use order, not in the def order of the
3778 /// instructions. So, we cannot rely directly on the last instruction in the
3779 /// bundle being the last instruction in program order during the
3780 /// vectorization process, since the basic blocks are modified; the last
3781 /// instructions need to be pre-gathered beforehand.
3782 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3783
3784 /// List of gather nodes, depending on other gather/vector nodes, which should
3785 /// be emitted after the vector instruction emission process to correctly
3786 /// handle order of the vector instructions and shuffles.
3787 SetVector<const TreeEntry *> PostponedGathers;
3788
3789 using ValueToGatherNodesMap =
3790 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3791 ValueToGatherNodesMap ValueToGatherNodes;
3792
3793 /// A list of the load entries (node indices) which could be vectorized using
3794 /// a strided or masked-gather approach, but which we first attempt to
3795 /// represent as contiguous loads.
3796 SetVector<unsigned> LoadEntriesToVectorize;
3797
3798 /// true if graph nodes transforming mode is on.
3799 bool IsGraphTransformMode = false;
3800
3801 /// The index of the first gathered load entry in the VectorizeTree.
3802 std::optional<unsigned> GatheredLoadsEntriesFirst;
3803
3804 /// This POD struct describes one external user in the vectorized tree.
3805 struct ExternalUser {
3806 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
3807 : Scalar(S), User(U), E(E), Lane(L) {}
3808
3809 /// Which scalar in our function.
3810 Value *Scalar = nullptr;
3811
3812 /// Which user that uses the scalar.
3813 llvm::User *User = nullptr;
3814
3815 /// Vector node, the value is part of.
3816 const TreeEntry &E;
3817
3818 /// Which lane does the scalar belong to.
3819 int Lane;
3820 };
3821 using UserList = SmallVector<ExternalUser, 16>;
3822
3823 /// Checks if two instructions may access the same memory.
3824 ///
3825 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3826 /// is invariant in the calling loop.
3827 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3828 Instruction *Inst2) {
3829 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3830 return true;
3831 // First check if the result is already in the cache.
3832 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3833 auto It = AliasCache.find(Key);
3834 if (It != AliasCache.end())
3835 return It->second;
3836 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3837 // Store the result in the cache.
3838 AliasCache.try_emplace(Key, Aliased);
3839 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3840 return Aliased;
3841 }
3842
3843 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3844
3845 /// Cache for alias results.
3846 /// TODO: consider moving this to the AliasAnalysis itself.
3847 DenseMap<AliasCacheKey, bool> AliasCache;
3848
3849 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3850 // globally through SLP because we don't perform any action which
3851 // invalidates capture results.
3852 BatchAAResults BatchAA;
3853
3854 /// Temporary store for deleted instructions. Instructions will be deleted
3855 /// eventually when the BoUpSLP is destructed. The deferral is required to
3856 /// ensure that there are no incorrect collisions in the AliasCache, which
3857 /// can happen if a new instruction is allocated at the same address as a
3858 /// previously deleted instruction.
3859 DenseSet<Instruction *> DeletedInstructions;
3860
3861 /// Set of the instruction, being analyzed already for reductions.
3862 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3863
3864 /// Set of hashes for the list of reduction values already being analyzed.
3865 DenseSet<size_t> AnalyzedReductionVals;
3866
3867 /// Values that have already been analyzed for minimal bitwidth and found to
3868 /// be non-profitable.
3869 DenseSet<Value *> AnalyzedMinBWVals;
3870
3871 /// A list of values that need to be extracted out of the tree.
3872 /// This list holds pairs of (Internal Scalar : External User). External User
3873 /// can be nullptr, which means that this Internal Scalar will be used later,
3874 /// after vectorization.
3875 UserList ExternalUses;
3876
3877 /// A list of GEPs which can be replaced by scalar GEPs instead of
3878 /// extractelement instructions.
3879 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3880
3881 /// Values used only by @llvm.assume calls.
3882 SmallPtrSet<const Value *, 32> EphValues;
3883
3884 /// Holds all of the instructions that we gathered, shuffle instructions and
3885 /// extractelements.
3886 SetVector<Instruction *> GatherShuffleExtractSeq;
3887
3888 /// A list of blocks that we are going to CSE.
3889 DenseSet<BasicBlock *> CSEBlocks;
3890
3891 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3892 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3893
3894 /// Contains all scheduling relevant data for an instruction.
3895 /// A ScheduleData either represents a single instruction or a member of an
3896 /// instruction bundle (= a group of instructions which is combined into a
3897 /// vector instruction).
3898 struct ScheduleData {
3899 // The initial value for the dependency counters. It means that the
3900 // dependencies are not calculated yet.
3901 enum { InvalidDeps = -1 };
3902
3903 ScheduleData() = default;
3904
3905 void init(int BlockSchedulingRegionID, Instruction *I) {
3906 FirstInBundle = this;
3907 NextInBundle = nullptr;
3908 NextLoadStore = nullptr;
3909 IsScheduled = false;
3910 SchedulingRegionID = BlockSchedulingRegionID;
3911 clearDependencies();
3912 Inst = I;
3913 TE = nullptr;
3914 }
3915
3916 /// Verify basic self consistency properties
3917 void verify() {
3918 if (hasValidDependencies()) {
3919 assert(UnscheduledDeps <= Dependencies && "invariant");
3920 } else {
3921 assert(UnscheduledDeps == Dependencies && "invariant");
3922 }
3923
3924 if (IsScheduled) {
3925 assert(isSchedulingEntity() &&
3926 "unexpected scheduled state");
3927 for (const ScheduleData *BundleMember = this; BundleMember;
3928 BundleMember = BundleMember->NextInBundle) {
3929 assert(BundleMember->hasValidDependencies() &&
3930 BundleMember->UnscheduledDeps == 0 &&
3931 "unexpected scheduled state");
3932 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3933 "only bundle is marked scheduled");
3934 }
3935 }
3936
3937 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3938 "all bundle members must be in same basic block");
3939 }
3940
3941 /// Returns true if the dependency information has been calculated.
3942 /// Note that dependency validity can vary between instructions within
3943 /// a single bundle.
3944 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3945
3946 /// Returns true for single instructions and for bundle representatives
3947 /// (= the head of a bundle).
3948 bool isSchedulingEntity() const { return FirstInBundle == this; }
3949
3950 /// Returns true if it represents an instruction bundle and not only a
3951 /// single instruction.
3952 bool isPartOfBundle() const {
3953 return NextInBundle != nullptr || FirstInBundle != this || TE;
3954 }
3955
3956 /// Returns true if it is ready for scheduling, i.e. it has no more
3957 /// unscheduled depending instructions/bundles.
3958 bool isReady() const {
3959 assert(isSchedulingEntity() &&
3960 "can't consider non-scheduling entity for ready list");
3961 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3962 }
3963
3964 /// Modifies the number of unscheduled dependencies for this instruction,
3965 /// and returns the number of remaining dependencies for the containing
3966 /// bundle.
3967 int incrementUnscheduledDeps(int Incr) {
3968 assert(hasValidDependencies() &&
3969 "increment of unscheduled deps would be meaningless");
3970 UnscheduledDeps += Incr;
3971 return FirstInBundle->unscheduledDepsInBundle();
3972 }
3973
3974 /// Sets the number of unscheduled dependencies to the number of
3975 /// dependencies.
3976 void resetUnscheduledDeps() {
3977 UnscheduledDeps = Dependencies;
3978 }
3979
3980 /// Clears all dependency information.
3981 void clearDependencies() {
3982 Dependencies = InvalidDeps;
3983 resetUnscheduledDeps();
3984 MemoryDependencies.clear();
3985 ControlDependencies.clear();
3986 }
3987
3988 int unscheduledDepsInBundle() const {
3989 assert(isSchedulingEntity() && "only meaningful on the bundle");
3990 int Sum = 0;
3991 for (const ScheduleData *BundleMember = this; BundleMember;
3992 BundleMember = BundleMember->NextInBundle) {
3993 if (BundleMember->UnscheduledDeps == InvalidDeps)
3994 return InvalidDeps;
3995 Sum += BundleMember->UnscheduledDeps;
3996 }
3997 return Sum;
3998 }
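// Worked example with a hypothetical bundle: if the three members of a bundle
// have UnscheduledDeps of 2, 0 and 1, unscheduledDepsInBundle() returns 3; if
// any member is still at InvalidDeps, the whole bundle reports InvalidDeps,
// i.e. its dependencies have not been calculated yet.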
3999
4000 void dump(raw_ostream &os) const {
4001 if (!isSchedulingEntity()) {
4002 os << "/ " << *Inst;
4003 } else if (NextInBundle) {
4004 os << '[' << *Inst;
4005 ScheduleData *SD = NextInBundle;
4006 while (SD) {
4007 os << ';' << *SD->Inst;
4008 SD = SD->NextInBundle;
4009 }
4010 os << ']';
4011 } else {
4012 os << *Inst;
4013 }
4014 }
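// Dump format, with hypothetical instructions: a bundle head with two more
// members prints as "[%x;%y;%z]", a plain single instruction prints just the
// instruction, and a non-head bundle member prints as "/ %y".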
4015
4016 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
4017
4018 Instruction *Inst = nullptr;
4019
4020 /// The TreeEntry that this instruction corresponds to.
4021 TreeEntry *TE = nullptr;
4022
4023 /// Points to the head in an instruction bundle (and always to this for
4024 /// single instructions).
4025 ScheduleData *FirstInBundle = nullptr;
4026
4027 /// Single linked list of all instructions in a bundle. Null if it is a
4028 /// single instruction.
4029 ScheduleData *NextInBundle = nullptr;
4030
4031 /// Single linked list of all memory instructions (e.g. load, store, call)
4032 /// in the block - until the end of the scheduling region.
4033 ScheduleData *NextLoadStore = nullptr;
4034
4035 /// The dependent memory instructions.
4036 /// This list is derived on demand in calculateDependencies().
4037 SmallVector<ScheduleData *, 4> MemoryDependencies;
4038
4039 /// List of instructions which this instruction could be control dependent
4040 /// on. Allowing such nodes to be scheduled below this one could introduce
4041 /// a runtime fault which didn't exist in the original program.
4042 /// e.g. this is a load or udiv following a readonly call which infinitely loops
4043 SmallVector<ScheduleData *, 4> ControlDependencies;
4044
4045 /// This ScheduleData is in the current scheduling region if this matches
4046 /// the current SchedulingRegionID of BlockScheduling.
4047 int SchedulingRegionID = 0;
4048
4049 /// Used for getting a "good" final ordering of instructions.
4050 int SchedulingPriority = 0;
4051
4052 /// The number of dependencies. Consists of the number of users of the
4053 /// instruction plus the number of dependent memory instructions (if any).
4054 /// This value is calculated on demand.
4055 /// If InvalidDeps, the number of dependencies is not calculated yet.
4056 int Dependencies = InvalidDeps;
4057
4058 /// The number of dependencies minus the number of dependencies of scheduled
4059 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4060 /// for scheduling.
4061 /// Note that this is negative as long as Dependencies is not calculated.
4062 int UnscheduledDeps = InvalidDeps;
4063
4064 /// True if this instruction is scheduled (or considered as scheduled in the
4065 /// dry-run).
4066 bool IsScheduled = false;
4067 };
4068
4069#ifndef NDEBUG
4070 friend inline raw_ostream &operator<<(raw_ostream &os,
4071 const BoUpSLP::ScheduleData &SD) {
4072 SD.dump(os);
4073 return os;
4074 }
4075#endif
4076
4077 friend struct GraphTraits<BoUpSLP *>;
4078 friend struct DOTGraphTraits<BoUpSLP *>;
4079
4080 /// Contains all scheduling data for a basic block.
4081 /// It does not schedule instructions which are not memory read/write
4082 /// instructions and whose operands are either constants, or arguments, or
4083 /// phis, or instructions from other blocks, or whose users are phis or from
4084 /// other blocks. The resulting vector instructions can be placed at the
4085 /// beginning of the basic block without scheduling (if the operands do not
4086 /// need to be scheduled) or at the end of the block (if the users are outside
4087 /// of the block). This saves some compile time and memory used by the
4088 /// compiler.
4089 /// ScheduleData is assigned for each instruction in between the boundaries of
4090 /// the tree entry, even for those which are not part of the graph. It is
4091 /// required to correctly follow the dependencies between the instructions and
4092 /// to schedule them correctly. The ScheduleData is not allocated for the
4093 /// instructions, which do not require scheduling, like phis, nodes with
4094 /// extractelements/insertelements only or nodes with instructions, with
4095 /// uses/operands outside of the block.
4096 struct BlockScheduling {
4097 BlockScheduling(BasicBlock *BB)
4098 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4099
4100 void clear() {
4101 ReadyInsts.clear();
4102 ScheduleStart = nullptr;
4103 ScheduleEnd = nullptr;
4104 FirstLoadStoreInRegion = nullptr;
4105 LastLoadStoreInRegion = nullptr;
4106 RegionHasStackSave = false;
4107
4108 // Reduce the maximum schedule region size by the size of the
4109 // previous scheduling run.
4110 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4111 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4112 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4113 ScheduleRegionSize = 0;
4114
4115 // Make a new scheduling region, i.e. all existing ScheduleData is not
4116 // in the new region yet.
4117 ++SchedulingRegionID;
4118 }
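// Numeric illustration with hypothetical sizes: if ScheduleRegionSizeLimit is
// currently 5000 and the region just cleared had ScheduleRegionSize 1200, the
// next region may grow to at most 3800 instructions, and the limit never drops
// below MinScheduleRegionSize.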
4119
4120 ScheduleData *getScheduleData(Instruction *I) {
4121 if (BB != I->getParent())
4122 // Avoid lookup if can't possibly be in map.
4123 return nullptr;
4124 ScheduleData *SD = ScheduleDataMap.lookup(I);
4125 if (SD && isInSchedulingRegion(SD))
4126 return SD;
4127 return nullptr;
4128 }
4129
4130 ScheduleData *getScheduleData(Value *V) {
4131 if (auto *I = dyn_cast<Instruction>(V))
4132 return getScheduleData(I);
4133 return nullptr;
4134 }
4135
4136 bool isInSchedulingRegion(ScheduleData *SD) const {
4137 return SD->SchedulingRegionID == SchedulingRegionID;
4138 }
4139
4140 /// Marks an instruction as scheduled and puts all dependent ready
4141 /// instructions into the ready-list.
4142 template <typename ReadyListType>
4143 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4144 SD->IsScheduled = true;
4145 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4146
4147 for (ScheduleData *BundleMember = SD; BundleMember;
4148 BundleMember = BundleMember->NextInBundle) {
4149
4150 // Handle the def-use chain dependencies.
4151
4152 // Decrement the unscheduled counter and insert to ready list if ready.
4153 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4154 ScheduleData *OpDef = getScheduleData(I);
4155 if (OpDef && OpDef->hasValidDependencies() &&
4156 OpDef->incrementUnscheduledDeps(-1) == 0) {
4157 // There are no more unscheduled dependencies after
4158 // decrementing, so we can put the dependent instruction
4159 // into the ready list.
4160 ScheduleData *DepBundle = OpDef->FirstInBundle;
4161 assert(!DepBundle->IsScheduled &&
4162 "already scheduled bundle gets ready");
4163 ReadyList.insert(DepBundle);
4164 LLVM_DEBUG(dbgs()
4165 << "SLP: gets ready (def): " << *DepBundle << "\n");
4166 }
4167 };
4168
4169 // If BundleMember is a vector bundle, its operands may have been
4170 // reordered during buildTree(). We therefore need to get its operands
4171 // through the TreeEntry.
4172 if (TreeEntry *TE = BundleMember->TE) {
4173 // Need to search for the lane since the tree entry can be reordered.
4174 auto *In = BundleMember->Inst;
4175 int Lane = std::distance(TE->Scalars.begin(),
4176 find(TE->Scalars, In));
4177 assert(Lane >= 0 && "Lane not set");
4178
4179 // Since vectorization tree is being built recursively this assertion
4180 // ensures that the tree entry has all operands set before reaching
4181 // this code. Couple of exceptions known at the moment are extracts
4182 // where their second (immediate) operand is not added. Since
4183 // immediates do not affect scheduler behavior this is considered
4184 // okay.
4185 assert(
4186 In &&
4187 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4188 In->getNumOperands() == TE->getNumOperands()) &&
4189 "Missed TreeEntry operands?");
4190
4191 for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
4192 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4193 DecrUnsched(I);
4194 } else {
4195 // If BundleMember is a stand-alone instruction, no operand reordering
4196 // has taken place, so we directly access its operands.
4197 for (Use &U : BundleMember->Inst->operands())
4198 if (auto *I = dyn_cast<Instruction>(U.get()))
4199 DecrUnsched(I);
4200 }
4201 // Handle the memory dependencies.
4202 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4203 if (MemoryDepSD->hasValidDependencies() &&
4204 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4205 // There are no more unscheduled dependencies after decrementing,
4206 // so we can put the dependent instruction into the ready list.
4207 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4208 assert(!DepBundle->IsScheduled &&
4209 "already scheduled bundle gets ready");
4210 ReadyList.insert(DepBundle);
4211 LLVM_DEBUG(dbgs()
4212 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4213 }
4214 }
4215 // Handle the control dependencies.
4216 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4217 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4218 // There are no more unscheduled dependencies after decrementing,
4219 // so we can put the dependent instruction into the ready list.
4220 ScheduleData *DepBundle = DepSD->FirstInBundle;
4221 assert(!DepBundle->IsScheduled &&
4222 "already scheduled bundle gets ready");
4223 ReadyList.insert(DepBundle);
4224 LLVM_DEBUG(dbgs()
4225 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4226 }
4227 }
4228 }
4229 }
4230
4231 /// Verify basic self consistency properties of the data structure.
4232 void verify() {
4233 if (!ScheduleStart)
4234 return;
4235
4236 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4237 ScheduleStart->comesBefore(ScheduleEnd) &&
4238 "Not a valid scheduling region?");
4239
4240 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4241 auto *SD = getScheduleData(I);
4242 if (!SD)
4243 continue;
4244 assert(isInSchedulingRegion(SD) &&
4245 "primary schedule data not in window?");
4246 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4247 "entire bundle in window!");
4248 SD->verify();
4249 }
4250
4251 for (auto *SD : ReadyInsts) {
4252 assert(SD->isSchedulingEntity() && SD->isReady() &&
4253 "item in ready list not ready?");
4254 (void)SD;
4255 }
4256 }
4257
4258 /// Put all instructions into the ReadyList which are ready for scheduling.
4259 template <typename ReadyListType>
4260 void initialFillReadyList(ReadyListType &ReadyList) {
4261 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4262 ScheduleData *SD = getScheduleData(I);
4263 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4264 SD->isReady()) {
4265 ReadyList.insert(SD);
4266 LLVM_DEBUG(dbgs()
4267 << "SLP: initially in ready list: " << *SD << "\n");
4268 }
4269 }
4270 }
4271
4272 /// Build a bundle from the ScheduleData nodes corresponding to the
4273 /// scalar instruction for each lane.
4274 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4275
4276 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4277 /// cyclic dependencies. This is only a dry-run, no instructions are
4278 /// actually moved at this stage.
4279 /// \returns the scheduling bundle. The returned Optional value is not
4280 /// std::nullopt if \p VL is allowed to be scheduled.
4281 std::optional<ScheduleData *>
4282 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4283 const InstructionsState &S);
4284
4285 /// Un-bundles a group of instructions.
4286 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4287
4288 /// Allocates schedule data chunk.
4289 ScheduleData *allocateScheduleDataChunks();
4290
4291 /// Extends the scheduling region so that V is inside the region.
4292 /// \returns true if the region size is within the limit.
4293 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4294
4295 /// Initialize the ScheduleData structures for new instructions in the
4296 /// scheduling region.
4297 void initScheduleData(Instruction *FromI, Instruction *ToI,
4298 ScheduleData *PrevLoadStore,
4299 ScheduleData *NextLoadStore);
4300
4301 /// Updates the dependency information of a bundle and of all instructions/
4302 /// bundles which depend on the original bundle.
4303 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4304 BoUpSLP *SLP);
4305
4306 /// Sets all instructions in the scheduling region to un-scheduled.
4307 void resetSchedule();
4308
4309 BasicBlock *BB;
4310
4311 /// Simple memory allocation for ScheduleData.
4312 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
4313
4314 /// The size of a ScheduleData array in ScheduleDataChunks.
4315 int ChunkSize;
4316
4317 /// The allocator position in the current chunk, which is the last entry
4318 /// of ScheduleDataChunks.
4319 int ChunkPos;
4320
4321 /// Attaches ScheduleData to Instruction.
4322 /// Note that the mapping survives during all vectorization iterations, i.e.
4323 /// ScheduleData structures are recycled.
4324 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4325
4326 /// The ready-list for scheduling (only used for the dry-run).
4327 SetVector<ScheduleData *> ReadyInsts;
4328
4329 /// The first instruction of the scheduling region.
4330 Instruction *ScheduleStart = nullptr;
4331
4332 /// The first instruction _after_ the scheduling region.
4333 Instruction *ScheduleEnd = nullptr;
4334
4335 /// The first memory accessing instruction in the scheduling region
4336 /// (can be null).
4337 ScheduleData *FirstLoadStoreInRegion = nullptr;
4338
4339 /// The last memory accessing instruction in the scheduling region
4340 /// (can be null).
4341 ScheduleData *LastLoadStoreInRegion = nullptr;
4342
4343 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4344 /// region? Used to optimize the dependence calculation for the
4345 /// common case where there isn't.
4346 bool RegionHasStackSave = false;
4347
4348 /// The current size of the scheduling region.
4349 int ScheduleRegionSize = 0;
4350
4351 /// The maximum size allowed for the scheduling region.
4352 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4353
4354 /// The ID of the scheduling region. For a new vectorization iteration this
4355 /// is incremented which "removes" all ScheduleData from the region.
4356 /// Make sure that the initial SchedulingRegionID is greater than the
4357 /// initial SchedulingRegionID in ScheduleData (which is 0).
4358 int SchedulingRegionID = 1;
4359 };
4360
4361 /// Attaches the BlockScheduling structures to basic blocks.
4362 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4363
4364 /// Performs the "real" scheduling. Done before vectorization is actually
4365 /// performed in a basic block.
4366 void scheduleBlock(BlockScheduling *BS);
4367
4368 /// List of users to ignore during scheduling and that don't need extracting.
4369 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4370
4371 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4372 /// sorted SmallVectors of unsigned.
4373 struct OrdersTypeDenseMapInfo {
4374 static OrdersType getEmptyKey() {
4375 OrdersType V;
4376 V.push_back(~1U);
4377 return V;
4378 }
4379
4380 static OrdersType getTombstoneKey() {
4381 OrdersType V;
4382 V.push_back(~2U);
4383 return V;
4384 }
4385
4386 static unsigned getHashValue(const OrdersType &V) {
4387 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4388 }
4389
4390 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4391 return LHS == RHS;
4392 }
4393 };
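// A minimal usage sketch (assuming only the key-info hooks shown above):
// containers keyed by an order vector can be declared as, e.g.,
//   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
// The sentinel keys {~1U} and {~2U} cannot collide with real orders, which
// only contain small lane indices.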
4394
4395 // Analysis and block reference.
4396 Function *F;
4397 ScalarEvolution *SE;
4398 TargetTransformInfo *TTI;
4399 TargetLibraryInfo *TLI;
4400 LoopInfo *LI;
4401 DominatorTree *DT;
4402 AssumptionCache *AC;
4403 DemandedBits *DB;
4404 const DataLayout *DL;
4405 OptimizationRemarkEmitter *ORE;
4406
4407 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4408 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4409
4410 /// Instruction builder to construct the vectorized tree.
4412
4413 /// A map of scalar integer values to the smallest bit width with which they
4414 /// can legally be represented. The values map to (width, signed) pairs,
4415 /// where "width" indicates the minimum bit width and "signed" is True if the
4416 /// value must be signed-extended, rather than zero-extended, back to its
4417 /// original width.
4418 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4419
4420 /// Final size of the reduced vector, if the current graph represents the
4421 /// input for the reduction and it was possible to narrow the size of the
4422 /// reduction.
4423 unsigned ReductionBitWidth = 0;
4424
4425 /// Canonical graph size before the transformations.
4426 unsigned BaseGraphSize = 1;
4427
4428 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4429 /// type sizes, used in the tree.
4430 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4431
4432 /// Indices of the vectorized nodes, which supposed to be the roots of the new
4433 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4434 DenseSet<unsigned> ExtraBitWidthNodes;
4435};
4436
4437} // end namespace slpvectorizer
4438
4439template <> struct GraphTraits<BoUpSLP *> {
4440 using TreeEntry = BoUpSLP::TreeEntry;
4441
4442 /// NodeRef has to be a pointer per the GraphWriter.
4443 using NodeRef = TreeEntry *;
4444
4445 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4446
4447 /// Add the VectorizableTree to the index iterator to be able to return
4448 /// TreeEntry pointers.
4449 struct ChildIteratorType
4450 : public iterator_adaptor_base<
4451 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4452 ContainerTy &VectorizableTree;
4453
4454 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4455 ContainerTy &VT)
4456 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4457
4458 NodeRef operator*() { return I->UserTE; }
4459 };
4460
4461 static NodeRef getEntryNode(BoUpSLP &R) {
4462 return R.VectorizableTree[0].get();
4463 }
4464
4465 static ChildIteratorType child_begin(NodeRef N) {
4466 return {N->UserTreeIndices.begin(), N->Container};
4467 }
4468
4469 static ChildIteratorType child_end(NodeRef N) {
4470 return {N->UserTreeIndices.end(), N->Container};
4471 }
4472
4473 /// For the node iterator we just need to turn the TreeEntry iterator into a
4474 /// TreeEntry* iterator so that it dereferences to NodeRef.
4475 class nodes_iterator {
4476 using ItTy = ContainerTy::iterator;
4477 ItTy It;
4478
4479 public:
4480 nodes_iterator(const ItTy &It2) : It(It2) {}
4481 NodeRef operator*() { return It->get(); }
4482 nodes_iterator operator++() {
4483 ++It;
4484 return *this;
4485 }
4486 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4487 };
4488
4489 static nodes_iterator nodes_begin(BoUpSLP *R) {
4490 return nodes_iterator(R->VectorizableTree.begin());
4491 }
4492
4493 static nodes_iterator nodes_end(BoUpSLP *R) {
4494 return nodes_iterator(R->VectorizableTree.end());
4495 }
4496
4497 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4498};
4499
4500template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4501 using TreeEntry = BoUpSLP::TreeEntry;
4502
4503 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4504
4505 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4506 std::string Str;
4507 raw_string_ostream OS(Str);
4508 OS << Entry->Idx << ".\n";
4509 if (isSplat(Entry->Scalars))
4510 OS << "<splat> ";
4511 for (auto *V : Entry->Scalars) {
4512 OS << *V;
4513 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4514 return EU.Scalar == V;
4515 }))
4516 OS << " <extract>";
4517 OS << "\n";
4518 }
4519 return Str;
4520 }
4521
4522 static std::string getNodeAttributes(const TreeEntry *Entry,
4523 const BoUpSLP *) {
4524 if (Entry->isGather())
4525 return "color=red";
4526 if (Entry->State == TreeEntry::ScatterVectorize ||
4527 Entry->State == TreeEntry::StridedVectorize)
4528 return "color=blue";
4529 return "";
4530 }
4531};
4532
4533} // end namespace llvm
4534
4535BoUpSLP::~BoUpSLP() {
4536 SmallVector<WeakTrackingVH> DeadInsts;
4537 for (auto *I : DeletedInstructions) {
4538 if (!I->getParent()) {
4539 // Temporarily insert instructions back to erase them from the parent and
4540 // from memory later.
4541 if (isa<PHINode>(I))
4542 // Phi nodes must be the very first instructions in the block.
4543 I->insertBefore(F->getEntryBlock(),
4544 F->getEntryBlock().getFirstNonPHIIt());
4545 else
4546 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
4547 continue;
4548 }
4549 for (Use &U : I->operands()) {
4550 auto *Op = dyn_cast<Instruction>(U.get());
4551 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4552 wouldInstructionBeTriviallyDead(Op, TLI))
4553 DeadInsts.emplace_back(Op);
4554 }
4555 I->dropAllReferences();
4556 }
4557 for (auto *I : DeletedInstructions) {
4558 assert(I->use_empty() &&
4559 "trying to erase instruction with users.");
4560 I->eraseFromParent();
4561 }
4562
4563 // Cleanup any dead scalar code feeding the vectorized instructions
4564 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
4565
4566#ifdef EXPENSIVE_CHECKS
4567 // If we could guarantee that this call is not extremely slow, we could
4568 // remove the ifdef limitation (see PR47712).
4569 assert(!verifyFunction(*F, &dbgs()));
4570#endif
4571}
4572
4573/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4574/// contains the original mask for the scalars reused in the node. The procedure
4575/// transforms this mask in accordance with the given \p Mask.
4576static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4577 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4578 "Expected non-empty mask.");
4579 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4580 Prev.swap(Reuses);
4581 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4582 if (Mask[I] != PoisonMaskElem)
4583 Reuses[Mask[I]] = Prev[I];
4584}
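// Worked example for reorderReuses (illustration only, not part of the source).
// Slots that no Mask element points to keep their previous value:
//
//   Reuses = {1, 0, 3, 2}, Mask = {2, 3, 0, 1}
//   I=0: Reuses[2] = 1;  I=1: Reuses[3] = 0;  I=2: Reuses[0] = 3;  I=3: Reuses[1] = 2
//   Result: Reuses = {3, 2, 1, 0}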
4585
4586/// Reorders the given \p Order according to the given \p Mask. \p Order is
4587/// the original order of the scalars. The procedure transforms the provided order
4588/// in accordance with the given \p Mask. If the resulting \p Order is just an
4589/// identity order, \p Order is cleared.
4590static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4591 bool BottomOrder = false) {
4592 assert(!Mask.empty() && "Expected non-empty mask.");
4593 unsigned Sz = Mask.size();
4594 if (BottomOrder) {
4595 SmallVector<unsigned> PrevOrder;
4596 if (Order.empty()) {
4597 PrevOrder.resize(Sz);
4598 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4599 } else {
4600 PrevOrder.swap(Order);
4601 }
4602 Order.assign(Sz, Sz);
4603 for (unsigned I = 0; I < Sz; ++I)
4604 if (Mask[I] != PoisonMaskElem)
4605 Order[I] = PrevOrder[Mask[I]];
4606 if (all_of(enumerate(Order), [&](const auto &Data) {
4607 return Data.value() == Sz || Data.index() == Data.value();
4608 })) {
4609 Order.clear();
4610 return;
4611 }
4612 fixupOrderingIndices(Order);
4613 return;
4614 }
4615 SmallVector<int> MaskOrder;
4616 if (Order.empty()) {
4617 MaskOrder.resize(Sz);
4618 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4619 } else {
4620 inversePermutation(Order, MaskOrder);
4621 }
4622 reorderReuses(MaskOrder, Mask);
4623 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4624 Order.clear();
4625 return;
4626 }
4627 Order.assign(Sz, Sz);
4628 for (unsigned I = 0; I < Sz; ++I)
4629 if (MaskOrder[I] != PoisonMaskElem)
4630 Order[MaskOrder[I]] = I;
4631 fixupOrderingIndices(Order);
4632}
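// Worked example for reorderOrder (illustration only, not part of the source),
// with BottomOrder == false:
//
//   Order = {} (identity), Mask = {1, 0, 3, 2}
//   MaskOrder starts as {0, 1, 2, 3}, reorderReuses turns it into {1, 0, 3, 2},
//   which is not an identity mask, so Order becomes {1, 0, 3, 2}.
//   If Mask had been the identity mask, Order would have been cleared instead.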
4633
4634std::optional<BoUpSLP::OrdersType>
4635BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4636 assert(TE.isGather() && "Expected gather node only.");
4637 // Try to find subvector extract/insert patterns and reorder only such
4638 // patterns.
4639 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4640 Type *ScalarTy = GatheredScalars.front()->getType();
4641 int NumScalars = GatheredScalars.size();
4642 if (!isValidElementType(ScalarTy))
4643 return std::nullopt;
4644 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4645 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
4646 SmallVector<int> ExtractMask;
4647 SmallVector<int> Mask;
4648 SmallVector<SmallVector<const TreeEntry *>> Entries;
4649 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4650 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4651 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4652 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4653 /*ForOrder=*/true);
4654 // No shuffled operands - ignore.
4655 if (GatherShuffles.empty() && ExtractShuffles.empty())
4656 return std::nullopt;
4657 OrdersType CurrentOrder(NumScalars, NumScalars);
4658 if (GatherShuffles.size() == 1 &&
4659 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4660 Entries.front().front()->isSame(TE.Scalars)) {
4661 // Perfect match in the graph, will reuse the previously vectorized
4662 // node. Cost is 0.
4663 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4664 return CurrentOrder;
4665 }
4666 auto IsSplatMask = [](ArrayRef<int> Mask) {
4667 int SingleElt = PoisonMaskElem;
4668 return all_of(Mask, [&](int I) {
4669 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4670 SingleElt = I;
4671 return I == PoisonMaskElem || I == SingleElt;
4672 });
4673 };
4674 // Exclusive broadcast mask - ignore.
4675 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4676 (Entries.size() != 1 ||
4677 Entries.front().front()->ReorderIndices.empty())) ||
4678 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4679 return std::nullopt;
4680 SmallBitVector ShuffledSubMasks(NumParts);
4681 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4682 ArrayRef<int> Mask, int PartSz, int NumParts,
4683 function_ref<unsigned(unsigned)> GetVF) {
4684 for (int I : seq<int>(0, NumParts)) {
4685 if (ShuffledSubMasks.test(I))
4686 continue;
4687 const int VF = GetVF(I);
4688 if (VF == 0)
4689 continue;
4690 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4691 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4692 // Shuffle of at least 2 vectors - ignore.
4693 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4694 std::fill(Slice.begin(), Slice.end(), NumScalars);
4695 ShuffledSubMasks.set(I);
4696 continue;
4697 }
4698 // Try to include as many elements from the mask as possible.
4699 int FirstMin = INT_MAX;
4700 bool SecondVecFound = false;
4701 for (int K : seq<int>(Limit)) {
4702 int Idx = Mask[I * PartSz + K];
4703 if (Idx == PoisonMaskElem) {
4704 Value *V = GatheredScalars[I * PartSz + K];
4705 if (isConstant(V) && !isa<PoisonValue>(V)) {
4706 SecondVecFound = true;
4707 break;
4708 }
4709 continue;
4710 }
4711 if (Idx < VF) {
4712 if (FirstMin > Idx)
4713 FirstMin = Idx;
4714 } else {
4715 SecondVecFound = true;
4716 break;
4717 }
4718 }
4719 FirstMin = (FirstMin / PartSz) * PartSz;
4720 // Shuffle of at least 2 vectors - ignore.
4721 if (SecondVecFound) {
4722 std::fill(Slice.begin(), Slice.end(), NumScalars);
4723 ShuffledSubMasks.set(I);
4724 continue;
4725 }
4726 for (int K : seq<int>(Limit)) {
4727 int Idx = Mask[I * PartSz + K];
4728 if (Idx == PoisonMaskElem)
4729 continue;
4730 Idx -= FirstMin;
4731 if (Idx >= PartSz) {
4732 SecondVecFound = true;
4733 break;
4734 }
4735 if (CurrentOrder[I * PartSz + Idx] >
4736 static_cast<unsigned>(I * PartSz + K) &&
4737 CurrentOrder[I * PartSz + Idx] !=
4738 static_cast<unsigned>(I * PartSz + Idx))
4739 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4740 }
4741 // Shuffle of at least 2 vectors - ignore.
4742 if (SecondVecFound) {
4743 std::fill(Slice.begin(), Slice.end(), NumScalars);
4744 ShuffledSubMasks.set(I);
4745 continue;
4746 }
4747 }
4748 };
4749 int PartSz = getPartNumElems(NumScalars, NumParts);
4750 if (!ExtractShuffles.empty())
4751 TransformMaskToOrder(
4752 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4753 if (!ExtractShuffles[I])
4754 return 0U;
4755 unsigned VF = 0;
4756 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4757 for (unsigned Idx : seq<unsigned>(Sz)) {
4758 int K = I * PartSz + Idx;
4759 if (ExtractMask[K] == PoisonMaskElem)
4760 continue;
4761 if (!TE.ReuseShuffleIndices.empty())
4762 K = TE.ReuseShuffleIndices[K];
4763 if (K == PoisonMaskElem)
4764 continue;
4765 if (!TE.ReorderIndices.empty())
4766 K = std::distance(TE.ReorderIndices.begin(),
4767 find(TE.ReorderIndices, K));
4768 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4769 if (!EI)
4770 continue;
4771 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4772 ->getElementCount()
4773 .getKnownMinValue());
4774 }
4775 return VF;
4776 });
4777 // Check special corner case - single shuffle of the same entry.
4778 if (GatherShuffles.size() == 1 && NumParts != 1) {
4779 if (ShuffledSubMasks.any())
4780 return std::nullopt;
4781 PartSz = NumScalars;
4782 NumParts = 1;
4783 }
4784 if (!Entries.empty())
4785 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4786 if (!GatherShuffles[I])
4787 return 0U;
4788 return std::max(Entries[I].front()->getVectorFactor(),
4789 Entries[I].back()->getVectorFactor());
4790 });
4791 int NumUndefs =
4792 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4793 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4794 return std::nullopt;
4795 return std::move(CurrentOrder);
4796}
4797
4798static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4799 const TargetLibraryInfo &TLI,
4800 bool CompareOpcodes = true) {
4801 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
4802 getUnderlyingObject(Ptr2, RecursionMaxDepth))
4803 return false;
4804 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4805 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4806 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4807 (!GEP2 || GEP2->getNumOperands() == 2) &&
4808 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4809 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4810 !CompareOpcodes ||
4811 (GEP1 && GEP2 &&
4812 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4813}
4814
4815/// Calculates minimal alignment as a common alignment.
4816template <typename T>
4817static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4818 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4819 for (Value *V : VL.drop_front())
4820 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4821 return CommonAlignment;
4822}
4823
4824/// Check if \p Order represents reverse order.
4825static bool isReverseOrder(ArrayRef<unsigned> Order) {
4826 assert(!Order.empty() &&
4827 "Order is empty. Please check it before using isReverseOrder.");
4828 unsigned Sz = Order.size();
4829 return all_of(enumerate(Order), [&](const auto &Pair) {
4830 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4831 });
4832}
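// Examples (illustration only, not part of the source), for Sz == 4:
//   {3, 2, 1, 0} -> true   (full reverse order)
//   {4, 2, 1, 0} -> true   (entries equal to Sz are treated as unset wildcards)
//   {0, 2, 1, 3} -> false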
4833
4834/// Checks if the provided list of pointers \p PointerOps represents strided
4835/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4836/// Otherwise, if \p Inst is not specified, a just-initialized optional value is
4837/// returned to show that the pointers represent strided pointers. If \p Inst is
4838/// specified, the runtime stride is materialized before the given \p Inst.
4839/// \returns std::nullopt if the pointers do not have a runtime stride;
4840/// otherwise nullptr or the actual stride value.
4841static std::optional<Value *>
4842calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4843 const DataLayout &DL, ScalarEvolution &SE,
4844 SmallVectorImpl<unsigned> &SortedIndices,
4845 Instruction *Inst = nullptr) {
4846 SmallVector<const SCEV *> SCEVs;
4847 const SCEV *PtrSCEVLowest = nullptr;
4848 const SCEV *PtrSCEVHighest = nullptr;
4849 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4850 // addresses).
4851 for (Value *Ptr : PointerOps) {
4852 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4853 if (!PtrSCEV)
4854 return std::nullopt;
4855 SCEVs.push_back(PtrSCEV);
4856 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4857 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4858 continue;
4859 }
4860 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4861 if (isa<SCEVCouldNotCompute>(Diff))
4862 return std::nullopt;
4863 if (Diff->isNonConstantNegative()) {
4864 PtrSCEVLowest = PtrSCEV;
4865 continue;
4866 }
4867 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4868 if (isa<SCEVCouldNotCompute>(Diff1))
4869 return std::nullopt;
4870 if (Diff1->isNonConstantNegative()) {
4871 PtrSCEVHighest = PtrSCEV;
4872 continue;
4873 }
4874 }
4875 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4876 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4877 if (isa<SCEVCouldNotCompute>(Dist))
4878 return std::nullopt;
4879 int Size = DL.getTypeStoreSize(ElemTy);
4880 auto TryGetStride = [&](const SCEV *Dist,
4881 const SCEV *Multiplier) -> const SCEV * {
4882 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4883 if (M->getOperand(0) == Multiplier)
4884 return M->getOperand(1);
4885 if (M->getOperand(1) == Multiplier)
4886 return M->getOperand(0);
4887 return nullptr;
4888 }
4889 if (Multiplier == Dist)
4890 return SE.getConstant(Dist->getType(), 1);
4891 return SE.getUDivExactExpr(Dist, Multiplier);
4892 };
4893 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4894 const SCEV *Stride = nullptr;
4895 if (Size != 1 || SCEVs.size() > 2) {
4896 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4897 Stride = TryGetStride(Dist, Sz);
4898 if (!Stride)
4899 return std::nullopt;
4900 }
4901 if (!Stride || isa<SCEVConstant>(Stride))
4902 return std::nullopt;
4903 // Iterate through all pointers and check if all distances are
4904 // unique multiples of Stride.
4905 using DistOrdPair = std::pair<int64_t, int>;
4906 auto Compare = llvm::less_first();
4907 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4908 int Cnt = 0;
4909 bool IsConsecutive = true;
4910 for (const SCEV *PtrSCEV : SCEVs) {
4911 unsigned Dist = 0;
4912 if (PtrSCEV != PtrSCEVLowest) {
4913 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4914 const SCEV *Coeff = TryGetStride(Diff, Stride);
4915 if (!Coeff)
4916 return std::nullopt;
4917 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4918 if (!SC || isa<SCEVCouldNotCompute>(SC))
4919 return std::nullopt;
4920 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4921 SE.getMulExpr(Stride, SC)))
4922 ->isZero())
4923 return std::nullopt;
4924 Dist = SC->getAPInt().getZExtValue();
4925 }
4926 // If the strides are not the same or repeated, we can't vectorize.
4927 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4928 return std::nullopt;
4929 auto Res = Offsets.emplace(Dist, Cnt);
4930 if (!Res.second)
4931 return std::nullopt;
4932 // Consecutive order if the inserted element is the last one.
4933 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4934 ++Cnt;
4935 }
4936 if (Offsets.size() != SCEVs.size())
4937 return std::nullopt;
4938 SortedIndices.clear();
4939 if (!IsConsecutive) {
4940 // Fill SortedIndices array only if it is non-consecutive.
4941 SortedIndices.resize(PointerOps.size());
4942 Cnt = 0;
4943 for (const std::pair<int64_t, int> &Pair : Offsets) {
4944 SortedIndices[Cnt] = Pair.second;
4945 ++Cnt;
4946 }
4947 }
4948 if (!Inst)
4949 return nullptr;
4950 SCEVExpander Expander(SE, DL, "strided-load-vec");
4951 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4952}
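// Illustration of the access pattern calculateRtStride recognizes (the function
// itself works on SCEVs, not on C code; the names below are hypothetical):
//
//   void copyStrided(const float *P, long S, float *Out) {
//     Out[0] = P[0 * S];
//     Out[1] = P[1 * S];
//     Out[2] = P[2 * S];
//     Out[3] = P[3 * S];
//   }
//
// Given the four pointer operands, the helper detects the common runtime stride
// S (std::nullopt if there is none), fills SortedIndices when the operands are
// not already in increasing-address order, and materializes S as a Value before
// \p Inst only when \p Inst is provided.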
4953
4954static std::pair<InstructionCost, InstructionCost>
4955getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4956 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4957 Type *ScalarTy, VectorType *VecTy);
4958
4959/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4960/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
4961/// subvector pattern.
4962static InstructionCost
4963getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4964 VectorType *Tp, ArrayRef<int> Mask = {},
4965 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4966 int Index = 0, VectorType *SubTp = nullptr,
4967 ArrayRef<Value *> Args = {}) {
4968 if (Kind != TTI::SK_PermuteTwoSrc)
4969 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4970 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4971 int NumSubElts;
4972 if (ShuffleVectorInst::isInsertSubvectorMask(
4973 Mask, NumSrcElts, NumSubElts, Index)) {
4974 if (Index + NumSubElts > NumSrcElts &&
4975 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4976 return TTI.getShuffleCost(
4977 TTI::SK_InsertSubvector,
4978 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4979 CostKind, Index, Tp);
4980 }
4981 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4982}
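// Example (illustration only, not part of the source): with Tp = <4 x float>
// and the two-source mask {0, 1, 4, 5, 6, 7, -1, -1}, the mask can be recognized
// as inserting the 4-element second source at index 2 of a widened 8-element
// result, so the wrapper above prefers the usually cheaper
// TTI::SK_InsertSubvector cost over a generic TTI::SK_PermuteTwoSrc cost.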
4983
4984/// Correctly creates insert_subvector, checking that the index is a multiple of
4985/// the subvector length. Otherwise, generates a shuffle using \p Generator or
4986/// using default shuffle.
4987static Value *createInsertVector(
4988 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
4989 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
4990 const unsigned SubVecVF = getNumElements(V->getType());
4991 if (Index % SubVecVF == 0) {
4992 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4993 Builder.getInt64(Index));
4994 } else {
4995 // Create a shuffle; insertvector requires that the index is a multiple of
4996 // the subvector length.
4997 const unsigned VecVF = getNumElements(Vec->getType());
4998 SmallVector<int> Mask(VecVF);
4999 std::iota(Mask.begin(), Mask.end(), 0);
5000 for (unsigned I : seq<unsigned>(SubVecVF))
5001 Mask[I + Index] = I + VecVF;
5002 if (Generator) {
5003 Vec = Generator(Vec, V, Mask);
5004 } else {
5005 // 1. Resize V to the size of Vec.
5006 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
5007 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
5008 V = Builder.CreateShuffleVector(V, ResizeMask);
5009 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
5010 }
5011 }
5012 return Vec;
5013}
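// Sketch of the two paths above (illustration only, not part of the source),
// inserting a <4 x float> value V into an <8 x float> vector Vec:
//
//   // Index 4 is a multiple of the subvector length, so a single
//   // llvm.vector.insert is emitted.
//   Value *A = createInsertVector(Builder, Vec, V, /*Index=*/4);
//
//   // Index 2 is not: V is first resized with a shuffle and then blended into
//   // Vec with the mask {0, 1, 8, 9, 10, 11, 6, 7}.
//   Value *B = createInsertVector(Builder, Vec, V, /*Index=*/2);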
5014
5015/// Correctly creates extract_subvector, checking that the index is a multiple of
5016/// the subvector length. Otherwise, generates a shuffle using \p Generator or
5017/// using default shuffle.
5018static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
5019 unsigned SubVecVF, unsigned Index) {
5020 if (Index % SubVecVF == 0) {
5021 VectorType *SubVecTy =
5022 getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
5023 return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
5024 }
5025 // Create a shuffle; extract_subvector requires that the index is a multiple of
5026 // the subvector length.
5027 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
5028 std::iota(Mask.begin(), Mask.end(), Index);
5029 return Builder.CreateShuffleVector(Vec, Mask);
5030}
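// Sketch (illustration only, not part of the source): extracting a 4-element
// subvector from an <8 x float> Vec. Index 4 maps to a single
// llvm.vector.extract, while Index 2 is lowered to a shuffle with the mask
// {2, 3, 4, 5}.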
5031
5035 SmallVectorImpl<Value *> &PointerOps,
5036 unsigned *BestVF, bool TryRecursiveCheck) const {
5037 // Check that a vectorized load would load the same memory as a scalar
5038 // load. For example, we don't want to vectorize loads that are smaller
5039 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5040 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
5041 // from such a struct, we read/write packed bits disagreeing with the
5042 // unvectorized version.
5043 if (BestVF)
5044 *BestVF = 0;
5046 return LoadsState::Gather;
5047 Type *ScalarTy = VL0->getType();
5048
5049 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
5050 return LoadsState::Gather;
5051
5052 // Make sure all loads in the bundle are simple - we can't vectorize
5053 // atomic or volatile loads.
5054 PointerOps.clear();
5055 const unsigned Sz = VL.size();
5056 PointerOps.resize(Sz);
5057 auto *POIter = PointerOps.begin();
5058 for (Value *V : VL) {
5059 auto *L = dyn_cast<LoadInst>(V);
5060 if (!L || !L->isSimple())
5061 return LoadsState::Gather;
5062 *POIter = L->getPointerOperand();
5063 ++POIter;
5064 }
5065
5066 Order.clear();
5067 // Check the order of pointer operands or that all pointers are the same.
5068 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5069
5070 auto *VecTy = getWidenedType(ScalarTy, Sz);
5071 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5072 if (!IsSorted) {
5073 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
5074 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5075 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5076 return LoadsState::StridedVectorize;
5077 }
5078
5079 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5080 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5081 return LoadsState::Gather;
5082
5083 if (!all_of(PointerOps, [&](Value *P) {
5084 return arePointersCompatible(P, PointerOps.front(), *TLI);
5085 }))
5086 return LoadsState::Gather;
5087
5088 } else {
5089 Value *Ptr0;
5090 Value *PtrN;
5091 if (Order.empty()) {
5092 Ptr0 = PointerOps.front();
5093 PtrN = PointerOps.back();
5094 } else {
5095 Ptr0 = PointerOps[Order.front()];
5096 PtrN = PointerOps[Order.back()];
5097 }
5098 std::optional<int> Diff =
5099 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5100 // Check that the sorted loads are consecutive.
5101 if (static_cast<unsigned>(*Diff) == Sz - 1)
5102 return LoadsState::Vectorize;
5103 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5104 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5105 return LoadsState::Gather;
5106 // Simple check if not a strided access - clear order.
5107 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5108 // Try to generate strided load node if:
5109 // 1. Target with strided load support is detected.
5110 // 2. The number of loads is greater than MinProfitableStridedLoads,
5111 // or the potential stride <= MaxProfitableLoadStride and the
5112 // potential stride is power-of-2 (to avoid perf regressions for the very
5113 // small number of loads) and max distance > number of loads, or potential
5114 // stride is -1.
5115 // 3. The loads are ordered, or number of unordered loads <=
5116 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5117 // (this check is to avoid extra costs for very expensive shuffles).
5118 // 4. Any pointer operand is an instruction with the users outside of the
5119 // current graph (for masked gathers extra extractelement instructions
5120 // might be required).
5121 auto IsAnyPointerUsedOutGraph =
5122 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5123 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5124 return !isVectorized(U) && !MustGather.contains(U);
5125 });
5126 });
5127 const unsigned AbsoluteDiff = std::abs(*Diff);
5128 if (IsPossibleStrided &&
5129 (IsAnyPointerUsedOutGraph ||
5130 (AbsoluteDiff > Sz &&
5132 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5133 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
5134 *Diff == -(static_cast<int>(Sz) - 1))) {
5135 int Stride = *Diff / static_cast<int>(Sz - 1);
5136 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5137 Align Alignment =
5138 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5139 ->getAlign();
5140 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5141 // Iterate through all pointers and check if all distances are
5142 // unique multiple of Dist.
5143 SmallSet<int, 4> Dists;
5144 for (Value *Ptr : PointerOps) {
5145 int Dist = 0;
5146 if (Ptr == PtrN)
5147 Dist = *Diff;
5148 else if (Ptr != Ptr0)
5149 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5150 // If the strides are not the same or repeated, we can't
5151 // vectorize.
5152 if (((Dist / Stride) * Stride) != Dist ||
5153 !Dists.insert(Dist).second)
5154 break;
5155 }
5156 if (Dists.size() == Sz)
5157 return LoadsState::StridedVectorize;
5158 }
5159 }
5160 }
5161 }
5162 // Correctly compare the cost of loads + shuffles against
5163 // strided/masked gather loads. Returns true if the vectorized + shuffles
5164 // representation is better than just gather.
5165 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5166 unsigned *BestVF,
5167 bool ProfitableGatherPointers) {
5168 if (BestVF)
5169 *BestVF = 0;
5170 // Compare masked gather cost and loads + insert subvector costs.
5171 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5172 auto [ScalarGEPCost, VectorGEPCost] =
5173 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5174 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5175 // Estimate the cost of masked gather GEP. If not a splat, roughly
5176 // estimate as a buildvector, otherwise estimate as splat.
5177 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5178 VectorType *PtrVecTy =
5179 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5180 VecTy->getNumElements());
5181 if (static_cast<unsigned>(count_if(
5182 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5183 any_of(PointerOps, [&](Value *V) {
5184 return getUnderlyingObject(V) !=
5185 getUnderlyingObject(PointerOps.front());
5186 }))
5187 VectorGEPCost += TTI.getScalarizationOverhead(
5188 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5189 else
5190 VectorGEPCost +=
5192 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5193 /*Insert=*/true, /*Extract=*/false, CostKind) +
5194 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
5195 // The cost of scalar loads.
5196 InstructionCost ScalarLoadsCost =
5197 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5198 [&](InstructionCost C, Value *V) {
5199 return C + TTI.getInstructionCost(
5200 cast<Instruction>(V), CostKind);
5201 }) +
5202 ScalarGEPCost;
5203 // The cost of masked gather.
5204 InstructionCost MaskedGatherCost =
5205 TTI.getGatherScatterOpCost(
5206 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5207 /*VariableMask=*/false, CommonAlignment, CostKind) +
5208 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5209 InstructionCost GatherCost =
5210 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5211 /*Extract=*/false, CostKind) +
5212 ScalarLoadsCost;
5213 // The list of loads is small, or we already performed a partial check -
5214 // directly compare the masked gather cost and the gather cost.
5215 constexpr unsigned ListLimit = 4;
5216 if (!TryRecursiveCheck || VL.size() < ListLimit)
5217 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5218
5219 // FIXME: The following code has not been updated for non-power-of-2
5220 // vectors (and not whole registers). The splitting logic here does not
5221 // cover the original vector if the vector factor is not a power of two.
5222 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
5223 return false;
5224
5225 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5226 unsigned MinVF = getMinVF(2 * Sz);
5227 DemandedElts.clearAllBits();
5228 // Iterate through possible vectorization factors and check if vectorized +
5229 // shuffles is better than just gather.
5230 for (unsigned VF =
5231 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5232 VF >= MinVF;
5233 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
5234 SmallVector<LoadsState> States;
5235 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5236 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5237 OrdersType Order;
5238 SmallVector<Value *> PointerOps;
5239 LoadsState LS =
5240 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5241 /*TryRecursiveCheck=*/false);
5242 // Check that the sorted loads are consecutive.
5243 if (LS == LoadsState::Gather) {
5244 if (BestVF) {
5245 DemandedElts.setAllBits();
5246 break;
5247 }
5248 DemandedElts.setBits(Cnt, Cnt + VF);
5249 continue;
5250 }
5251 // If need the reorder - consider as high-cost masked gather for now.
5252 if ((LS == LoadsState::Vectorize ||
5253 LS == LoadsState::StridedVectorize) &&
5254 !Order.empty() && !isReverseOrder(Order))
5255 LS = LoadsState::ScatterVectorize;
5256 States.push_back(LS);
5257 }
5258 if (DemandedElts.isAllOnes())
5259 // All loads gathered - try smaller VF.
5260 continue;
5261 // Can be vectorized later as a series of loads/insertelements.
5262 InstructionCost VecLdCost = 0;
5263 if (!DemandedElts.isZero()) {
5264 VecLdCost =
5265 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5266 /*Extract=*/false, CostKind) +
5267 ScalarGEPCost;
5268 for (unsigned Idx : seq<unsigned>(VL.size()))
5269 if (DemandedElts[Idx])
5270 VecLdCost +=
5271 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5272 }
5273 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5274 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5275 for (auto [I, LS] : enumerate(States)) {
5276 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5277 InstructionCost VectorGEPCost =
5278 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5279 ? 0
5280 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5281 LI0->getPointerOperand(),
5282 Instruction::GetElementPtr, CostKind, ScalarTy,
5283 SubVecTy)
5284 .second;
5285 if (LS == LoadsState::ScatterVectorize) {
5286 if (static_cast<unsigned>(
5287 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5288 PointerOps.size() - 1 ||
5289 any_of(PointerOps, [&](Value *V) {
5290 return getUnderlyingObject(V) !=
5291 getUnderlyingObject(PointerOps.front());
5292 }))
5293 VectorGEPCost += TTI.getScalarizationOverhead(
5294 SubVecTy, APInt::getAllOnes(VF),
5295 /*Insert=*/true, /*Extract=*/false, CostKind);
5296 else
5297 VectorGEPCost +=
5298 TTI.getScalarizationOverhead(
5299 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5300 /*Insert=*/true, /*Extract=*/false, CostKind) +
5301 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5302 CostKind);
5303 }
5304 switch (LS) {
5305 case LoadsState::Vectorize:
5306 VecLdCost +=
5307 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5308 LI0->getPointerAddressSpace(), CostKind,
5309 TTI::OperandValueInfo()) +
5310 VectorGEPCost;
5311 break;
5312 case LoadsState::StridedVectorize:
5313 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5314 LI0->getPointerOperand(),
5315 /*VariableMask=*/false,
5316 CommonAlignment, CostKind) +
5317 VectorGEPCost;
5318 break;
5319 case LoadsState::ScatterVectorize:
5320 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5321 LI0->getPointerOperand(),
5322 /*VariableMask=*/false,
5323 CommonAlignment, CostKind) +
5324 VectorGEPCost;
5325 break;
5326 case LoadsState::Gather:
5327 // Gathers are already calculated - ignore.
5328 continue;
5329 }
5330 SmallVector<int> ShuffleMask(VL.size());
5331 for (int Idx : seq<int>(0, VL.size()))
5332 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5333 if (I > 0)
5334 VecLdCost +=
5335 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5336 CostKind, I * VF, SubVecTy);
5337 }
5338 // If masked gather cost is higher - better to vectorize, so
5339 // consider it as a gather node. It will be better estimated
5340 // later.
5341 if (MaskedGatherCost >= VecLdCost &&
5342 VecLdCost - GatherCost < -SLPCostThreshold) {
5343 if (BestVF)
5344 *BestVF = VF;
5345 return true;
5346 }
5347 }
5348 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5349 };
5350 // TODO: need to improve analysis of the pointers, if not all of them are
5351 // GEPs or have > 2 operands, we end up with a gather node, which just
5352 // increases the cost.
5353 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5354 bool ProfitableGatherPointers =
5355 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5356 return L->isLoopInvariant(V);
5357 })) <= Sz / 2;
5358 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5359 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5360 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5361 (GEP && GEP->getNumOperands() == 2 &&
5362 isa<Constant, Instruction>(GEP->getOperand(1)));
5363 })) {
5364 // Check if potential masked gather can be represented as series
5365 // of loads + insertsubvectors.
5366 // If masked gather cost is higher - better to vectorize, so
5367 // consider it as a gather node. It will be better estimated
5368 // later.
5369 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5370 ProfitableGatherPointers))
5371 return LoadsState::ScatterVectorize;
5372 }
5373
5374 return LoadsState::Gather;
5375}
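// Illustration of the classification above (not part of the source), for a
// bundle of four loads from a float array P:
//   P[0], P[1], P[2], P[3]   -> LoadsState::Vectorize (consecutive, one wide load)
//   P[0], P[3], P[6], P[9]   -> may become LoadsState::StridedVectorize when the
//                               target reports strided loads as legal and the
//                               stride passes the profitability checks above
//   P[0], P[7], P[2], P[13]  -> LoadsState::ScatterVectorize or LoadsState::Gather,
//                               depending on the masked-gather cost model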
5376
5377static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
5378 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5379 const DataLayout &DL, ScalarEvolution &SE,
5380 SmallVectorImpl<unsigned> &SortedIndices) {
5381 assert(
5382 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5383 "Expected list of pointer operands.");
5384 // Map from bases to a vector of (Ptr, Offset, OrigIdx) triples; we insert each
5385 // Ptr into it, sort, and return the sorted indices with values next to one
5386 // another.
5389 Bases;
5390 Bases
5391 .try_emplace(std::make_pair(
5392 BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
5393 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5394
5395 SortedIndices.clear();
5396 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5397 auto Key = std::make_pair(BBs[Cnt + 1],
5398 getUnderlyingObject(Ptr, RecursionMaxDepth));
5399 bool Found = any_of(Bases.try_emplace(Key).first->second,
5400 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5401 std::optional<int> Diff = getPointersDiff(
5402 ElemTy, std::get<0>(Base.front()), ElemTy,
5403 Ptr, DL, SE,
5404 /*StrictCheck=*/true);
5405 if (!Diff)
5406 return false;
5407
5408 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5409 return true;
5410 });
5411
5412 if (!Found) {
5413 // If we haven't found enough to usefully cluster, return early.
5414 if (Bases.size() > VL.size() / 2 - 1)
5415 return false;
5416
5417 // Not found already - add a new Base
5418 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5419 }
5420 }
5421
5422 if (Bases.size() == VL.size())
5423 return false;
5424
5425 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5426 Bases.front().second.size() == VL.size()))
5427 return false;
5428
5429 // For each of the bases sort the pointers by Offset and check if any of the
5430 // bases become consecutively allocated.
5431 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5432 SmallPtrSet<Value *, 13> FirstPointers;
5433 SmallPtrSet<Value *, 13> SecondPointers;
5434 Value *P1 = Ptr1;
5435 Value *P2 = Ptr2;
5436 unsigned Depth = 0;
5437 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5438 if (P1 == P2 || Depth > RecursionMaxDepth)
5439 return false;
5440 FirstPointers.insert(P1);
5441 SecondPointers.insert(P2);
5442 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5443 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5444 ++Depth;
5445 }
5446 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5447 "Unable to find matching root.");
5448 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5449 };
5450 for (auto &Base : Bases) {
5451 for (auto &Vec : Base.second) {
5452 if (Vec.size() > 1) {
5453 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5454 const std::tuple<Value *, int, unsigned> &Y) {
5455 return std::get<1>(X) < std::get<1>(Y);
5456 });
5457 int InitialOffset = std::get<1>(Vec[0]);
5458 bool AnyConsecutive =
5459 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5460 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5461 });
5462 // Fill SortedIndices array only if it looks worthwhile to sort the
5463 // ptrs.
5464 if (!AnyConsecutive)
5465 return false;
5466 }
5467 }
5468 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5469 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5470 });
5471 }
5472
5473 for (auto &T : Bases)
5474 for (const auto &Vec : T.second)
5475 for (const auto &P : Vec)
5476 SortedIndices.push_back(std::get<2>(P));
5477
5478 assert(SortedIndices.size() == VL.size() &&
5479 "Expected SortedIndices to be the size of VL");
5480 return true;
5481}
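// Example (illustration only, not part of the source): with two underlying bases
// A and B and pointer operands VL = {A, B, A+1, B+1} (element offsets), the
// pointers are grouped per base, each group is sorted by offset, and
// SortedIndices comes back as a permutation such as {0, 2, 1, 3} that places the
// A-cluster and the B-cluster next to each other.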
5482
5483std::optional<BoUpSLP::OrdersType>
5484BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5485 assert(TE.isGather() && "Expected gather node only.");
5486 Type *ScalarTy = TE.Scalars[0]->getType();
5487
5488 SmallVector<Value *> Ptrs;
5489 Ptrs.reserve(TE.Scalars.size());
5490 SmallVector<BasicBlock *> BBs;
5491 BBs.reserve(TE.Scalars.size());
5492 for (Value *V : TE.Scalars) {
5493 auto *L = dyn_cast<LoadInst>(V);
5494 if (!L || !L->isSimple())
5495 return std::nullopt;
5496 Ptrs.push_back(L->getPointerOperand());
5497 BBs.push_back(L->getParent());
5498 }
5499
5500 BoUpSLP::OrdersType Order;
5501 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5502 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5503 return std::move(Order);
5504 return std::nullopt;
5505}
5506
5507/// Check if two insertelement instructions are from the same buildvector.
5508static bool areTwoInsertFromSameBuildVector(
5509 InsertElementInst *VU, InsertElementInst *V,
5510 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5511 // Instructions must be from the same basic blocks.
5512 if (VU->getParent() != V->getParent())
5513 return false;
5514 // Checks if 2 insertelements are from the same buildvector.
5515 if (VU->getType() != V->getType())
5516 return false;
5517 // Multiple used inserts are separate nodes.
5518 if (!VU->hasOneUse() && !V->hasOneUse())
5519 return false;
5520 auto *IE1 = VU;
5521 auto *IE2 = V;
5522 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5523 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5524 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5525 return false;
5526 // Go through the vector operand of insertelement instructions trying to find
5527 // either VU as the original vector for IE2 or V as the original vector for
5528 // IE1.
5529 SmallBitVector ReusedIdx(
5530 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5531 bool IsReusedIdx = false;
5532 do {
5533 if (IE2 == VU && !IE1)
5534 return VU->hasOneUse();
5535 if (IE1 == V && !IE2)
5536 return V->hasOneUse();
5537 if (IE1 && IE1 != V) {
5538 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5539 IsReusedIdx |= ReusedIdx.test(Idx1);
5540 ReusedIdx.set(Idx1);
5541 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5542 IE1 = nullptr;
5543 else
5544 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5545 }
5546 if (IE2 && IE2 != VU) {
5547 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5548 IsReusedIdx |= ReusedIdx.test(Idx2);
5549 ReusedIdx.set(Idx2);
5550 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5551 IE2 = nullptr;
5552 else
5553 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5554 }
5555 } while (!IsReusedIdx && (IE1 || IE2));
5556 return false;
5557}
5558
5559std::optional<BoUpSLP::OrdersType>
5560BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5561 // No need to reorder if we need to shuffle reuses - we still need to shuffle
5562 // the node.
5563 if (!TE.ReuseShuffleIndices.empty()) {
5564 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5565 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5566 "Reshuffling scalars not yet supported for nodes with padding");
5567
5568 if (isSplat(TE.Scalars))
5569 return std::nullopt;
5570 // Check if reuse shuffle indices can be improved by reordering.
5571 // For this, check that the reuse mask is "clustered", i.e. each scalar value
5572 // is used once in each submask of size <number_of_scalars>.
5573 // Example: 4 scalar values.
5574 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5575 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5576 // element 3 is used twice in the second submask.
5577 unsigned Sz = TE.Scalars.size();
5578 if (TE.isGather()) {
5579 if (std::optional<OrdersType> CurrentOrder =
5580 findReusedOrderedScalars(TE)) {
5581 SmallVector<int> Mask;
5582 fixupOrderingIndices(*CurrentOrder);
5583 inversePermutation(*CurrentOrder, Mask);
5584 ::addMask(Mask, TE.ReuseShuffleIndices);
5585 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5586 unsigned Sz = TE.Scalars.size();
5587 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5588 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5589 if (Idx != PoisonMaskElem)
5590 Res[Idx + K * Sz] = I + K * Sz;
5591 }
5592 return std::move(Res);
5593 }
5594 }
5595 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5596 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
5597 2 * TE.getVectorFactor())) == 1)
5598 return std::nullopt;
5599 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5600 Sz)) {
5601 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5602 if (TE.ReorderIndices.empty())
5603 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5604 else
5605 inversePermutation(TE.ReorderIndices, ReorderMask);
5606 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5607 unsigned VF = ReorderMask.size();
5608 OrdersType ResOrder(VF, VF);
5609 unsigned NumParts = divideCeil(VF, Sz);
5610 SmallBitVector UsedVals(NumParts);
5611 for (unsigned I = 0; I < VF; I += Sz) {
5612 int Val = PoisonMaskElem;
5613 unsigned UndefCnt = 0;
5614 unsigned Limit = std::min(Sz, VF - I);
5615 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5616 [&](int Idx) {
5617 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5618 Val = Idx;
5619 if (Idx == PoisonMaskElem)
5620 ++UndefCnt;
5621 return Idx != PoisonMaskElem && Idx != Val;
5622 }) ||
5623 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5624 UndefCnt > Sz / 2)
5625 return std::nullopt;
5626 UsedVals.set(Val);
5627 for (unsigned K = 0; K < NumParts; ++K) {
5628 unsigned Idx = Val + Sz * K;
5629 if (Idx < VF)
5630 ResOrder[Idx] = I + K;
5631 }
5632 }
5633 return std::move(ResOrder);
5634 }
5635 unsigned VF = TE.getVectorFactor();
5636 // Try to build the correct order for extractelement instructions.
5637 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5638 TE.ReuseShuffleIndices.end());
5639 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5640 all_of(TE.Scalars, [Sz](Value *V) {
5641 if (isa<PoisonValue>(V))
5642 return true;
5643 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5644 return Idx && *Idx < Sz;
5645 })) {
5646 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5647 "by BinaryOperator and CastInst.");
5648 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5649 if (TE.ReorderIndices.empty())
5650 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5651 else
5652 inversePermutation(TE.ReorderIndices, ReorderMask);
5653 for (unsigned I = 0; I < VF; ++I) {
5654 int &Idx = ReusedMask[I];
5655 if (Idx == PoisonMaskElem)
5656 continue;
5657 Value *V = TE.Scalars[ReorderMask[Idx]];
5658 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5659 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5660 }
5661 }
5662 // Build the order of VF size; we need to reorder the reuses shuffles, which
5663 // are always of VF size.
5664 OrdersType ResOrder(VF);
5665 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5666 auto *It = ResOrder.begin();
5667 for (unsigned K = 0; K < VF; K += Sz) {
5668 OrdersType CurrentOrder(TE.ReorderIndices);
5669 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5670 if (SubMask.front() == PoisonMaskElem)
5671 std::iota(SubMask.begin(), SubMask.end(), 0);
5672 reorderOrder(CurrentOrder, SubMask);
5673 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5674 std::advance(It, Sz);
5675 }
5676 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5677 return Data.index() == Data.value();
5678 }))
5679 return std::nullopt; // No need to reorder.
5680 return std::move(ResOrder);
5681 }
5682 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5683 any_of(TE.UserTreeIndices,
5684 [](const EdgeInfo &EI) {
5685 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5686 }) &&
5687 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5688 return std::nullopt;
5689 if ((TE.State == TreeEntry::Vectorize ||
5690 TE.State == TreeEntry::StridedVectorize) &&
5691 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5692 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5693 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5694 "BinaryOperator and CastInst.");
5695 return TE.ReorderIndices;
5696 }
5697 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5698 if (!TE.ReorderIndices.empty())
5699 return TE.ReorderIndices;
5700
5701 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5702 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5703 if (!V->hasNUsesOrMore(1))
5704 continue;
5705 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5706 if (!II)
5707 continue;
5708 Instruction *BVHead = nullptr;
5709 BasicBlock *BB = II->getParent();
5710 while (II && II->hasOneUse() && II->getParent() == BB) {
5711 BVHead = II;
5712 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5713 }
5714 I = BVHead;
5715 }
5716
5717 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5718 assert(BB1 != BB2 && "Expected different basic blocks.");
5719 auto *NodeA = DT->getNode(BB1);
5720 auto *NodeB = DT->getNode(BB2);
5721 assert(NodeA && "Should only process reachable instructions");
5722 assert(NodeB && "Should only process reachable instructions");
5723 assert((NodeA == NodeB) ==
5724 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5725 "Different nodes should have different DFS numbers");
5726 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5727 };
5728 auto PHICompare = [&](unsigned I1, unsigned I2) {
5729 Value *V1 = TE.Scalars[I1];
5730 Value *V2 = TE.Scalars[I2];
5731 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5732 return false;
5733 if (isa<PoisonValue>(V1))
5734 return true;
5735 if (isa<PoisonValue>(V2))
5736 return false;
5737 if (V1->getNumUses() < V2->getNumUses())
5738 return true;
5739 if (V1->getNumUses() > V2->getNumUses())
5740 return false;
5741 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5742 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5743 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5744 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5745 FirstUserOfPhi2->getParent());
5746 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5747 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5748 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5749 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5750 if (IE1 && !IE2)
5751 return true;
5752 if (!IE1 && IE2)
5753 return false;
5754 if (IE1 && IE2) {
5755 if (UserBVHead[I1] && !UserBVHead[I2])
5756 return true;
5757 if (!UserBVHead[I1])
5758 return false;
5759 if (UserBVHead[I1] == UserBVHead[I2])
5760 return getElementIndex(IE1) < getElementIndex(IE2);
5761 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5762 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5763 UserBVHead[I2]->getParent());
5764 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5765 }
5766 if (EE1 && !EE2)
5767 return true;
5768 if (!EE1 && EE2)
5769 return false;
5770 if (EE1 && EE2) {
5771 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5772 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5773 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5774 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5775 if (!Inst2 && !P2)
5776 return Inst1 || P1;
5777 if (EE1->getOperand(0) == EE2->getOperand(0))
5778 return getElementIndex(EE1) < getElementIndex(EE2);
5779 if (!Inst1 && Inst2)
5780 return false;
5781 if (Inst1 && Inst2) {
5782 if (Inst1->getParent() != Inst2->getParent())
5783 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5784 return Inst1->comesBefore(Inst2);
5785 }
5786 if (!P1 && P2)
5787 return false;
5788 assert(P1 && P2 &&
5789 "Expected either instructions or arguments vector operands.");
5790 return P1->getArgNo() < P2->getArgNo();
5791 }
5792 return false;
5793 };
5794 OrdersType Phis(TE.Scalars.size());
5795 std::iota(Phis.begin(), Phis.end(), 0);
5796 stable_sort(Phis, PHICompare);
5797 if (isIdentityOrder(Phis))
5798 return std::nullopt; // No need to reorder.
5799 return std::move(Phis);
5800 }
5801 if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5802 allSameType(TE.Scalars)) {
5803 // TODO: add analysis of other gather nodes with extractelement
5804 // instructions and other values/instructions, not only undefs.
5805 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5806 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5807 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5808 all_of(TE.Scalars, [](Value *V) {
5809 auto *EE = dyn_cast<ExtractElementInst>(V);
5810 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5811 })) {
5812 // Check that gather of extractelements can be represented as
5813 // just a shuffle of a single vector.
5814 OrdersType CurrentOrder;
5815 bool Reuse =
5816 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
5817 if (Reuse || !CurrentOrder.empty())
5818 return std::move(CurrentOrder);
5819 }
5820 // If the gather node is <undef, v, .., poison> and
5821 // insertelement poison, v, 0 [+ permute]
5822 // is cheaper than
5823 // insertelement poison, v, n - try to reorder.
5824 // If rotating the whole graph, exclude the permute cost, the whole graph
5825 // might be transformed.
5826 int Sz = TE.Scalars.size();
5827 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5828 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5829 const auto *It =
5830 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5831 if (It == TE.Scalars.begin())
5832 return OrdersType();
5833 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5834 if (It != TE.Scalars.end()) {
5835 OrdersType Order(Sz, Sz);
5836 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5837 Order[Idx] = 0;
5838 fixupOrderingIndices(Order);
5839 SmallVector<int> Mask;
5840 inversePermutation(Order, Mask);
5841 InstructionCost PermuteCost =
5842 TopToBottom
5843 ? 0
5844 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5845 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5846 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5847 PoisonValue::get(Ty), *It);
5848 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5849 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5850 PoisonValue::get(Ty), *It);
5851 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5852 OrdersType Order(Sz, Sz);
5853 Order[Idx] = 0;
5854 return std::move(Order);
5855 }
5856 }
5857 }
5858 if (isSplat(TE.Scalars))
5859 return std::nullopt;
5860 if (TE.Scalars.size() >= 3)
5861 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5862 return Order;
5863 // Check if we can include the order of vectorized loads. For masked gathers we
5864 // do extra analysis later, so include such nodes in a special list.
5865 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5866 SmallVector<Value *> PointerOps;
5867 OrdersType CurrentOrder;
5868 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5869 CurrentOrder, PointerOps);
5870 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
5871 return std::move(CurrentOrder);
5872 }
5873 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5874 // has been audited for correctness with non-power-of-two vectors.
5875 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5876 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5877 return CurrentOrder;
5878 }
5879 return std::nullopt;
5880}
5881
5882/// Checks if the given mask is a "clustered" mask with the same clusters of
5883/// size \p Sz, which are not identity submasks.
5884static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5885 unsigned Sz) {
5886 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5887 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5888 return false;
5889 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5890 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5891 if (Cluster != FirstCluster)
5892 return false;
5893 }
5894 return true;
5895}
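// Examples (illustration only, not part of the source), for Sz == 4:
//   {1, 0, 3, 2, 1, 0, 3, 2} -> true   (same non-identity cluster repeated)
//   {0, 1, 2, 3, 0, 1, 2, 3} -> false  (first cluster is an identity mask)
//   {1, 0, 3, 2, 2, 3, 0, 1} -> false  (clusters differ)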
5896
5897void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5898 // Reorder reuses mask.
5899 reorderReuses(TE.ReuseShuffleIndices, Mask);
5900 const unsigned Sz = TE.Scalars.size();
5901 // For vectorized nodes and non-clustered reuses there is no need to do anything else.
5902 if (!TE.isGather() ||
5903 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5904 Sz) ||
5905 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5906 return;
5907 SmallVector<int> NewMask;
5908 inversePermutation(TE.ReorderIndices, NewMask);
5909 addMask(NewMask, TE.ReuseShuffleIndices);
5910 // Clear reorder since it is going to be applied to the new mask.
5911 TE.ReorderIndices.clear();
5912 // Try to improve gathered nodes with clustered reuses, if possible.
5913 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5914 SmallVector<unsigned> NewOrder(Slice);
5915 inversePermutation(NewOrder, NewMask);
5916 reorderScalars(TE.Scalars, NewMask);
5917 // Fill the reuses mask with the identity submasks.
5918 for (auto *It = TE.ReuseShuffleIndices.begin(),
5919 *End = TE.ReuseShuffleIndices.end();
5920 It != End; std::advance(It, Sz))
5921 std::iota(It, std::next(It, Sz), 0);
5922}
5923
5924static void combineOrders(MutableArrayRef<unsigned> Order,
5925 ArrayRef<unsigned> SecondaryOrder) {
5926 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5927 "Expected same size of orders");
5928 unsigned Sz = Order.size();
5929 SmallBitVector UsedIndices(Sz);
5930 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5931 if (Order[Idx] != Sz)
5932 UsedIndices.set(Order[Idx]);
5933 }
5934 if (SecondaryOrder.empty()) {
5935 for (unsigned Idx : seq<unsigned>(0, Sz))
5936 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5937 Order[Idx] = Idx;
5938 } else {
5939 for (unsigned Idx : seq<unsigned>(0, Sz))
5940 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5941 !UsedIndices.test(SecondaryOrder[Idx]))
5942 Order[Idx] = SecondaryOrder[Idx];
5943 }
5944}
5945
5946void BoUpSLP::reorderTopToBottom() {
5947 // Maps VF to the graph nodes.
5948 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5949 // ExtractElement gather nodes which can be vectorized and need to handle
5950 // their ordering.
5951 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5952
5953 // Phi nodes can have preferred ordering based on their result users
5954 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5955
5956 // AltShuffles can also have a preferred ordering that leads to fewer
5957 // instructions, e.g., the addsub instruction in x86.
5958 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5959
5960 // Maps a TreeEntry to the reorder indices of external users.
5962 ExternalUserReorderMap;
5963 // Find all reorderable nodes with the given VF.
5964 // Currently these are vectorized stores, loads, extracts + some gathering of
5965 // extracts.
5966 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5967 const std::unique_ptr<TreeEntry> &TE) {
5968 // Look for external users that will probably be vectorized.
5969 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5970 findExternalStoreUsersReorderIndices(TE.get());
5971 if (!ExternalUserReorderIndices.empty()) {
5972 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5973 ExternalUserReorderMap.try_emplace(TE.get(),
5974 std::move(ExternalUserReorderIndices));
5975 }
5976
5977 // Patterns like [fadd,fsub] can be combined into a single instruction in
5978 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5979 // to take into account their order when looking for the most used order.
5980 if (TE->hasState() && TE->isAltShuffle()) {
5981 VectorType *VecTy =
5982 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5983 unsigned Opcode0 = TE->getOpcode();
5984 unsigned Opcode1 = TE->getAltOpcode();
5985 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5986 // If this pattern is supported by the target then we consider the order.
5987 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5988 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5989 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5990 }
5991 // TODO: Check the reverse order too.
5992 }
5993
5994 if (std::optional<OrdersType> CurrentOrder =
5995 getReorderingData(*TE, /*TopToBottom=*/true)) {
5996 // Do not include ordering for nodes used in the alt opcode vectorization;
5997 // it is better to reorder them during the bottom-to-top stage. If we follow
5998 // the order here, it causes reordering of the whole graph, though actually
5999 // it is profitable just to reorder the subgraph that starts from the
6000 // alternate opcode vectorization node. Such nodes already end up with the
6001 // shuffle instruction, and it is enough to change this shuffle rather than
6002 // rotate the scalars for the whole graph.
6003 unsigned Cnt = 0;
6004 const TreeEntry *UserTE = TE.get();
6005 while (UserTE && Cnt < RecursionMaxDepth) {
6006 if (UserTE->UserTreeIndices.size() != 1)
6007 break;
6008 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
6009 return EI.UserTE->State == TreeEntry::Vectorize &&
6010 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
6011 }))
6012 return;
6013 UserTE = UserTE->UserTreeIndices.back().UserTE;
6014 ++Cnt;
6015 }
6016 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
6017 if (!(TE->State == TreeEntry::Vectorize ||
6018 TE->State == TreeEntry::StridedVectorize) ||
6019 !TE->ReuseShuffleIndices.empty())
6020 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
6021 if (TE->State == TreeEntry::Vectorize &&
6022 TE->getOpcode() == Instruction::PHI)
6023 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
6024 }
6025 });
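// At this point every reorderable node is registered in VFToOrderedEntries,
// keyed by its vector factor, and its preferred order is kept in
// GathersToOrders, PhisToOrders, AltShufflesToOrders or ExternalUserReorderMap
// depending on how the node will be emitted; the per-VF loop below counts
// these orders and picks the most used one for each VF.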
6026
6027 // Reorder the graph nodes according to their vectorization factor.
6028 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
6029 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6030 auto It = VFToOrderedEntries.find(VF);
6031 if (It == VFToOrderedEntries.end())
6032 continue;
6033 // Try to find the most profitable order. We just are looking for the most
6034 // used order and reorder scalar elements in the nodes according to this
6035 // mostly used order.
6036 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
6037 // Delete VF entry upon exit.
6038 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
6039
6040 // All operands are reordered and used only in this node - propagate the
6041 // most used order to the user node.
6042 MapVector<OrdersType, unsigned,
6043 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6044 OrdersUses;
6046 for (const TreeEntry *OpTE : OrderedEntries) {
6047 // No need to reorder these nodes; still need to extend and to use a shuffle,
6048 // just need to merge the reordering shuffle and the reuse shuffle.
6049 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6050 continue;
6051 // Count the number of order uses.
6052 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6053 &PhisToOrders]() -> const OrdersType & {
6054 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6055 auto It = GathersToOrders.find(OpTE);
6056 if (It != GathersToOrders.end())
6057 return It->second;
6058 }
6059 if (OpTE->hasState() && OpTE->isAltShuffle()) {
6060 auto It = AltShufflesToOrders.find(OpTE);
6061 if (It != AltShufflesToOrders.end())
6062 return It->second;
6063 }
6064 if (OpTE->State == TreeEntry::Vectorize &&
6065 OpTE->getOpcode() == Instruction::PHI) {
6066 auto It = PhisToOrders.find(OpTE);
6067 if (It != PhisToOrders.end())
6068 return It->second;
6069 }
6070 return OpTE->ReorderIndices;
6071 }();
6072 // First consider the order of the external scalar users.
6073 auto It = ExternalUserReorderMap.find(OpTE);
6074 if (It != ExternalUserReorderMap.end()) {
6075 const auto &ExternalUserReorderIndices = It->second;
6076 // If the OpTE vector factor != number of scalars - use natural order,
6077 // it is an attempt to reorder node with reused scalars but with
6078 // external uses.
6079 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6080 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6081 ExternalUserReorderIndices.size();
6082 } else {
6083 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
6084 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6085 }
6086 // No other useful reorder data in this entry.
6087 if (Order.empty())
6088 continue;
6089 }
6090 // Stores actually store the mask, not the order, need to invert.
6091 if (OpTE->State == TreeEntry::Vectorize &&
6092 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6093 assert(!OpTE->isAltShuffle() &&
6094 "Alternate instructions are only supported by BinaryOperator "
6095 "and CastInst.");
6096 SmallVector<int> Mask;
6097 inversePermutation(Order, Mask);
6098 unsigned E = Order.size();
6099 OrdersType CurrentOrder(E, E);
6100 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6101 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6102 });
6103 fixupOrderingIndices(CurrentOrder);
6104 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6105 } else {
6106 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6107 }
6108 }
6109 if (OrdersUses.empty())
6110 continue;
6111 // Choose the most used order.
6112 unsigned IdentityCnt = 0;
6113 unsigned FilledIdentityCnt = 0;
6114 OrdersType IdentityOrder(VF, VF);
6115 for (auto &Pair : OrdersUses) {
6116 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6117 if (!Pair.first.empty())
6118 FilledIdentityCnt += Pair.second;
6119 IdentityCnt += Pair.second;
6120 combineOrders(IdentityOrder, Pair.first);
6121 }
6122 }
6123 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6124 unsigned Cnt = IdentityCnt;
6125 for (auto &Pair : OrdersUses) {
6126 // Prefer the identity order. But if a filled identity order (non-empty) was
6127 // found with the same number of uses as the new candidate order, we can
6128 // choose the candidate order instead.
6129 if (Cnt < Pair.second ||
6130 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6131 Cnt == Pair.second && !BestOrder.empty() &&
6132 isIdentityOrder(BestOrder))) {
6133 combineOrders(Pair.first, BestOrder);
6134 BestOrder = Pair.first;
6135 Cnt = Pair.second;
6136 } else {
6137 combineOrders(BestOrder, Pair.first);
6138 }
6139 }
6140 // Set order of the user node.
6141 if (isIdentityOrder(BestOrder))
6142 continue;
6143 fixupOrderingIndices(BestOrder);
6144 SmallVector<int> Mask;
6145 inversePermutation(BestOrder, Mask);
6146 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6147 unsigned E = BestOrder.size();
6148 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6149 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6150 });
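// Illustrative example: with BestOrder == {2, 0, 1}, inversePermutation()
// yields Mask == {1, 2, 0} (Mask[BestOrder[I]] == I), and MaskOrder is simply
// BestOrder as a signed mask, {2, 0, 1}; both are applied to the matching
// nodes below.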
6151 // Do an actual reordering, if profitable.
6152 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6153 // Just do the reordering for the nodes with the given VF.
6154 if (TE->Scalars.size() != VF) {
6155 if (TE->ReuseShuffleIndices.size() == VF) {
6156 // Need to reorder the reuses masks of the operands with smaller VF to
6157 // be able to find the match between the graph nodes and scalar
6158 // operands of the given node during vectorization/cost estimation.
6159 assert(all_of(TE->UserTreeIndices,
6160 [VF, &TE](const EdgeInfo &EI) {
6161 return EI.UserTE->Scalars.size() == VF ||
6162 EI.UserTE->Scalars.size() ==
6163 TE->Scalars.size();
6164 }) &&
6165 "All users must be of VF size.");
6166 if (SLPReVec) {
6167 assert(SLPReVec && "Only supported by REVEC.");
6168 // ShuffleVectorInst does not do reorderOperands (and it should not
6169 // because ShuffleVectorInst supports only a limited set of
6170 // patterns). Only do reorderNodeWithReuses if all of the users are
6171 // not ShuffleVectorInst.
6172 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6173 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6174 }))
6175 continue;
6176 assert(none_of(TE->UserTreeIndices,
6177 [&](const EdgeInfo &EI) {
6178 return isa<ShuffleVectorInst>(
6179 EI.UserTE->getMainOp());
6180 }) &&
6181 "Does not know how to reorder.");
6182 }
6183 // Update ordering of the operands with the smaller VF than the given
6184 // one.
6185 reorderNodeWithReuses(*TE, Mask);
6186 }
6187 continue;
6188 }
6189 if ((TE->State == TreeEntry::Vectorize ||
6190 TE->State == TreeEntry::StridedVectorize) &&
6191 (isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
6192 InsertElementInst>(TE->getMainOp()) ||
6193 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6194 assert(!TE->isAltShuffle() &&
6195 "Alternate instructions are only supported by BinaryOperator "
6196 "and CastInst.");
6197 // Build correct orders for extract{element,value}, loads and
6198 // stores.
6199 reorderOrder(TE->ReorderIndices, Mask);
6200 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6201 TE->reorderOperands(Mask);
6202 } else {
6203 // Reorder the node and its operands.
6204 TE->reorderOperands(Mask);
6205 assert(TE->ReorderIndices.empty() &&
6206 "Expected empty reorder sequence.");
6207 reorderScalars(TE->Scalars, Mask);
6208 }
6209 if (!TE->ReuseShuffleIndices.empty()) {
6210 // Apply reversed order to keep the original ordering of the reused
6211 // elements to avoid extra reorder indices shuffling.
6212 OrdersType CurrentOrder;
6213 reorderOrder(CurrentOrder, MaskOrder);
6214 SmallVector<int> NewReuses;
6215 inversePermutation(CurrentOrder, NewReuses);
6216 addMask(NewReuses, TE->ReuseShuffleIndices);
6217 TE->ReuseShuffleIndices.swap(NewReuses);
6218 }
6219 }
6220 }
6221}
6222
6223bool BoUpSLP::canReorderOperands(
6224 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6225 ArrayRef<TreeEntry *> ReorderableGathers,
6226 SmallVectorImpl<TreeEntry *> &GatherOps) {
6227 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6228 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6229 return OpData.first == I &&
6230 (OpData.second->State == TreeEntry::Vectorize ||
6231 OpData.second->State == TreeEntry::StridedVectorize);
6232 }))
6233 continue;
6234 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6235 // Do not reorder if operand node is used by many user nodes.
6236 if (any_of(TE->UserTreeIndices,
6237 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6238 return false;
6239 // Add the node to the list of the ordered nodes with the identity
6240 // order.
6241 Edges.emplace_back(I, TE);
6242 // Add ScatterVectorize nodes to the list of operands, where just
6243 // reordering of the scalars is required. Similar to the gathers, so
6244 // simply add to the list of gathered ops.
6245 // If there are reused scalars, process this node as a regular vectorize
6246 // node, just reorder reuses mask.
6247 if (TE->State != TreeEntry::Vectorize &&
6248 TE->State != TreeEntry::StridedVectorize &&
6249 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6250 GatherOps.push_back(TE);
6251 continue;
6252 }
6253 TreeEntry *Gather = nullptr;
6254 if (count_if(ReorderableGathers,
6255 [&Gather, UserTE, I](TreeEntry *TE) {
6256 assert(TE->State != TreeEntry::Vectorize &&
6257 TE->State != TreeEntry::StridedVectorize &&
6258 "Only non-vectorized nodes are expected.");
6259 if (any_of(TE->UserTreeIndices,
6260 [UserTE, I](const EdgeInfo &EI) {
6261 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6262 })) {
6263 assert(TE->isSame(UserTE->getOperand(I)) &&
6264 "Operand entry does not match operands.");
6265 Gather = TE;
6266 return true;
6267 }
6268 return false;
6269 }) > 1 &&
6270 !allConstant(UserTE->getOperand(I)))
6271 return false;
6272 if (Gather)
6273 GatherOps.push_back(Gather);
6274 }
6275 return true;
6276}
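// Note on canReorderOperands(): it fails if a vectorized operand of UserTE is
// also used by another node, or if more than one reorderable gather matches
// the same operand (unless that operand is all-constant). Otherwise vectorized
// operands are collected into Edges and gather-like operands into GatherOps.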
6277
6278void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6279 SetVector<TreeEntry *> OrderedEntries;
6280 DenseSet<const TreeEntry *> GathersToOrders;
6281 // Find all reorderable leaf nodes with the given VF.
6282 // Currently these are vectorized loads, extracts without alternate operands
6283 // + some gathering of extracts.
6284 SmallVector<TreeEntry *> NonVectorized;
6285 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6286 if (TE->State != TreeEntry::Vectorize &&
6287 TE->State != TreeEntry::StridedVectorize)
6288 NonVectorized.push_back(TE.get());
6289 if (std::optional<OrdersType> CurrentOrder =
6290 getReorderingData(*TE, /*TopToBottom=*/false)) {
6291 OrderedEntries.insert(TE.get());
6292 if (!(TE->State == TreeEntry::Vectorize ||
6293 TE->State == TreeEntry::StridedVectorize) ||
6294 !TE->ReuseShuffleIndices.empty())
6295 GathersToOrders.insert(TE.get());
6296 }
6297 }
6298
6299 // 1. Propagate the order to the graph nodes which use only reordered nodes.
6300 // I.e., if the node has operands that are reordered, try to keep at least
6301 // one operand in the natural order and reorder the others + reorder the
6302 // user node itself.
6303 SmallPtrSet<const TreeEntry *, 4> Visited;
6304 while (!OrderedEntries.empty()) {
6305 // 1. Filter out only reordered nodes.
6306 // 2. If the entry has multiple uses - skip it and jump to the next node.
6307 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
6308 SmallVector<TreeEntry *> Filtered;
6309 for (TreeEntry *TE : OrderedEntries) {
6310 if (!(TE->State == TreeEntry::Vectorize ||
6311 TE->State == TreeEntry::StridedVectorize ||
6312 (TE->isGather() && GathersToOrders.contains(TE))) ||
6313 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6314 !all_of(drop_begin(TE->UserTreeIndices),
6315 [TE](const EdgeInfo &EI) {
6316 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6317 }) ||
6318 !Visited.insert(TE).second) {
6319 Filtered.push_back(TE);
6320 continue;
6321 }
6322 // Build a map between user nodes and their operand order to speed up the
6323 // search. The graph currently does not provide this dependency directly.
6324 for (EdgeInfo &EI : TE->UserTreeIndices)
6325 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6326 }
6327 // Erase filtered entries.
6328 for (TreeEntry *TE : Filtered)
6329 OrderedEntries.remove(TE);
6330 SmallVector<
6331 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6332 UsersVec(Users.begin(), Users.end());
6333 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6334 return Data1.first->Idx > Data2.first->Idx;
6335 });
6336 for (auto &Data : UsersVec) {
6337 // Check that operands are used only in the User node.
6338 SmallVector<TreeEntry *> GatherOps;
6339 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6340 GatherOps)) {
6341 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6342 OrderedEntries.remove(Op.second);
6343 continue;
6344 }
6345 // All operands are reordered and used only in this node - propagate the
6346 // most used order to the user node.
6347 MapVector<OrdersType, unsigned,
6348 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6349 OrdersUses;
6350 // Do the analysis for each tree entry only once, otherwise the order of
6351 // the same node may be considered several times, though it might not be
6352 // profitable.
6353 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6354 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6355 for (const auto &Op : Data.second) {
6356 TreeEntry *OpTE = Op.second;
6357 if (!VisitedOps.insert(OpTE).second)
6358 continue;
6359 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6360 continue;
6361 const auto Order = [&]() -> const OrdersType {
6362 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6363 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6364 .value_or(OrdersType(1));
6365 return OpTE->ReorderIndices;
6366 }();
6367 // The order is partially ordered, skip it in favor of fully non-ordered
6368 // orders.
6369 if (Order.size() == 1)
6370 continue;
6371 unsigned NumOps = count_if(
6372 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6373 return P.second == OpTE;
6374 });
6375 // Stores actually store the mask, not the order, need to invert.
6376 if (OpTE->State == TreeEntry::Vectorize &&
6377 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6378 assert(!OpTE->isAltShuffle() &&
6379 "Alternate instructions are only supported by BinaryOperator "
6380 "and CastInst.");
6381 SmallVector<int> Mask;
6382 inversePermutation(Order, Mask);
6383 unsigned E = Order.size();
6384 OrdersType CurrentOrder(E, E);
6385 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6386 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6387 });
6388 fixupOrderingIndices(CurrentOrder);
6389 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6390 NumOps;
6391 } else {
6392 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6393 }
6394 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6395 const auto AllowsReordering = [&](const TreeEntry *TE) {
6396 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6397 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6398 (IgnoreReorder && TE->Idx == 0))
6399 return true;
6400 if (TE->isGather()) {
6401 if (GathersToOrders.contains(TE))
6402 return !getReorderingData(*TE, /*TopToBottom=*/false)
6403 .value_or(OrdersType(1))
6404 .empty();
6405 return true;
6406 }
6407 return false;
6408 };
6409 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6410 TreeEntry *UserTE = EI.UserTE;
6411 if (!VisitedUsers.insert(UserTE).second)
6412 continue;
6413 // May reorder user node if it requires reordering, has reused
6414 // scalars, is an alternate op vectorize node or its op nodes require
6415 // reordering.
6416 if (AllowsReordering(UserTE))
6417 continue;
6418 // Check if users allow reordering.
6419 // Currently look up just 1 level of operands to avoid increase of
6420 // the compile time.
6421 // Profitable to reorder if definitely more operands allow
6422 // reordering rather than those with natural order.
6423 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
6424 if (static_cast<unsigned>(count_if(
6425 Ops, [UserTE, &AllowsReordering](
6426 const std::pair<unsigned, TreeEntry *> &Op) {
6427 return AllowsReordering(Op.second) &&
6428 all_of(Op.second->UserTreeIndices,
6429 [UserTE](const EdgeInfo &EI) {
6430 return EI.UserTE == UserTE;
6431 });
6432 })) <= Ops.size() / 2)
6433 ++Res.first->second;
6434 }
6435 }
6436 if (OrdersUses.empty()) {
6437 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6438 OrderedEntries.remove(Op.second);
6439 continue;
6440 }
6441 // Choose the most used order.
6442 unsigned IdentityCnt = 0;
6443 unsigned VF = Data.second.front().second->getVectorFactor();
6444 OrdersType IdentityOrder(VF, VF);
6445 for (auto &Pair : OrdersUses) {
6446 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6447 IdentityCnt += Pair.second;
6448 combineOrders(IdentityOrder, Pair.first);
6449 }
6450 }
6451 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6452 unsigned Cnt = IdentityCnt;
6453 for (auto &Pair : OrdersUses) {
6454 // Prefer the identity order. But if a filled identity order (non-empty)
6455 // was found with the same number of uses as the new candidate order, we
6456 // can choose the candidate order instead.
6457 if (Cnt < Pair.second) {
6458 combineOrders(Pair.first, BestOrder);
6459 BestOrder = Pair.first;
6460 Cnt = Pair.second;
6461 } else {
6462 combineOrders(BestOrder, Pair.first);
6463 }
6464 }
6465 // Set order of the user node.
6466 if (isIdentityOrder(BestOrder)) {
6467 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6468 OrderedEntries.remove(Op.second);
6469 continue;
6470 }
6471 fixupOrderingIndices(BestOrder);
6472 // Erase operands from OrderedEntries list and adjust their orders.
6473 VisitedOps.clear();
6474 SmallVector<int> Mask;
6475 inversePermutation(BestOrder, Mask);
6476 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6477 unsigned E = BestOrder.size();
6478 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6479 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6480 });
6481 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6482 TreeEntry *TE = Op.second;
6483 OrderedEntries.remove(TE);
6484 if (!VisitedOps.insert(TE).second)
6485 continue;
6486 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6487 reorderNodeWithReuses(*TE, Mask);
6488 continue;
6489 }
6490 // Gathers are processed separately.
6491 if (TE->State != TreeEntry::Vectorize &&
6492 TE->State != TreeEntry::StridedVectorize &&
6493 (TE->State != TreeEntry::ScatterVectorize ||
6494 TE->ReorderIndices.empty()))
6495 continue;
6496 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6497 TE->ReorderIndices.empty()) &&
6498 "Non-matching sizes of user/operand entries.");
6499 reorderOrder(TE->ReorderIndices, Mask);
6500 if (IgnoreReorder && TE == VectorizableTree.front().get())
6501 IgnoreReorder = false;
6502 }
6503 // For gathers just need to reorder its scalars.
6504 for (TreeEntry *Gather : GatherOps) {
6505 assert(Gather->ReorderIndices.empty() &&
6506 "Unexpected reordering of gathers.");
6507 if (!Gather->ReuseShuffleIndices.empty()) {
6508 // Just reorder reuses indices.
6509 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6510 continue;
6511 }
6512 reorderScalars(Gather->Scalars, Mask);
6513 OrderedEntries.remove(Gather);
6514 }
6515 // Reorder operands of the user node and set the ordering for the user
6516 // node itself.
6517 if (Data.first->State != TreeEntry::Vectorize ||
6518 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6519 Data.first->getMainOp()) ||
6520 Data.first->isAltShuffle())
6521 Data.first->reorderOperands(Mask);
6522 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6523 Data.first->isAltShuffle() ||
6524 Data.first->State == TreeEntry::StridedVectorize) {
6525 reorderScalars(Data.first->Scalars, Mask);
6526 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6527 /*BottomOrder=*/true);
6528 if (Data.first->ReuseShuffleIndices.empty() &&
6529 !Data.first->ReorderIndices.empty() &&
6530 !Data.first->isAltShuffle()) {
6531 // Insert user node to the list to try to sink reordering deeper in
6532 // the graph.
6533 OrderedEntries.insert(Data.first);
6534 }
6535 } else {
6536 reorderOrder(Data.first->ReorderIndices, Mask);
6537 }
6538 }
6539 }
6540 // If the reordering is unnecessary, just remove the reorder.
6541 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6542 VectorizableTree.front()->ReuseShuffleIndices.empty())
6543 VectorizableTree.front()->ReorderIndices.clear();
6544}
6545
6546Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6547 if ((Entry.getOpcode() == Instruction::Store ||
6548 Entry.getOpcode() == Instruction::Load) &&
6549 Entry.State == TreeEntry::StridedVectorize &&
6550 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6551 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6552 return dyn_cast<Instruction>(Entry.Scalars.front());
6553}
6554
6555 void BoUpSLP::buildExternalUses(
6556 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6557 DenseMap<Value *, unsigned> ScalarToExtUses;
6558 // Collect the values that we need to extract from the tree.
6559 for (auto &TEPtr : VectorizableTree) {
6560 TreeEntry *Entry = TEPtr.get();
6561
6562 // No need to handle users of gathered values.
6563 if (Entry->isGather())
6564 continue;
6565
6566 // For each lane:
6567 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6568 Value *Scalar = Entry->Scalars[Lane];
6569 if (!isa<Instruction>(Scalar))
6570 continue;
6571 // All uses must be replaced already? No need to do it again.
6572 auto It = ScalarToExtUses.find(Scalar);
6573 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6574 continue;
6575
6576 // Check if the scalar is externally used as an extra arg.
6577 const auto ExtI = ExternallyUsedValues.find(Scalar);
6578 if (ExtI != ExternallyUsedValues.end()) {
6579 int FoundLane = Entry->findLaneForValue(Scalar);
6580 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6581 << FoundLane << " from " << *Scalar << ".\n");
6582 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6583 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
6584 continue;
6585 }
6586 for (User *U : Scalar->users()) {
6587 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6588
6589 Instruction *UserInst = dyn_cast<Instruction>(U);
6590 if (!UserInst || isDeleted(UserInst))
6591 continue;
6592
6593 // Ignore users in the user ignore list.
6594 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6595 continue;
6596
6597 // Skip in-tree scalars that become vectors
6598 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
6599 !UseEntries.empty()) {
6600 // Some in-tree scalars will remain as scalar in vectorized
6601 // instructions. If that is the case, the one in FoundLane will
6602 // be used.
6603 if (any_of(UseEntries, [&](TreeEntry *UseEntry) {
6604 return UseEntry->State == TreeEntry::ScatterVectorize ||
6605 !doesInTreeUserNeedToExtract(
6606 Scalar, getRootEntryInstruction(*UseEntry), TLI,
6607 TTI);
6608 })) {
6609 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6610 << ".\n");
6611 assert(none_of(UseEntries,
6612 [](TreeEntry *UseEntry) {
6613 return UseEntry->isGather();
6614 }) &&
6615 "Bad state");
6616 continue;
6617 }
6618 U = nullptr;
6619 if (It != ScalarToExtUses.end()) {
6620 ExternalUses[It->second].User = nullptr;
6621 break;
6622 }
6623 }
6624
6625 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6626 U = nullptr;
6627 int FoundLane = Entry->findLaneForValue(Scalar);
6628 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6629 << " from lane " << FoundLane << " from " << *Scalar
6630 << ".\n");
6631 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6632 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
6633 if (!U)
6634 break;
6635 }
6636 }
6637 }
6638}
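// Note: a null User in an ExternalUse record means the scalar must be
// extracted regardless of the concrete user; this is used above both when a
// scalar has UsesLimit or more users and when an in-tree user still needs the
// scalar value.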
6639
6640 SmallVector<SmallVector<StoreInst *>>
6641 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6642 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
6643 SmallVector<StoreInst *>, 8>
6644 PtrToStoresMap;
6645 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6646 Value *V = TE->Scalars[Lane];
6647 // Don't iterate over the users of constant data.
6648 if (!isa<Instruction>(V))
6649 continue;
6650 // To save compilation time we don't visit if we have too many users.
6651 if (V->hasNUsesOrMore(UsesLimit))
6652 break;
6653
6654 // Collect stores per pointer object.
6655 for (User *U : V->users()) {
6656 auto *SI = dyn_cast<StoreInst>(U);
6657 // Test whether we can handle the store. V might be a global, which could
6658 // be used in a different function.
6659 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6660 !isValidElementType(SI->getValueOperand()->getType()))
6661 continue;
6662 // Skip the store if it is already vectorized.
6663 if (isVectorized(U))
6664 continue;
6665
6666 Value *Ptr =
6667 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6668 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6669 SI->getValueOperand()->getType(), Ptr}];
6670 // For now just keep one store per pointer object per lane.
6671 // TODO: Extend this to support multiple stores per pointer per lane
6672 if (StoresVec.size() > Lane)
6673 continue;
6674 if (!StoresVec.empty()) {
6675 std::optional<int> Diff = getPointersDiff(
6676 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6677 SI->getValueOperand()->getType(),
6678 StoresVec.front()->getPointerOperand(), *DL, *SE,
6679 /*StrictCheck=*/true);
6680 // We failed to compare the pointers so just abandon this store.
6681 if (!Diff)
6682 continue;
6683 }
6684 StoresVec.push_back(SI);
6685 }
6686 }
6687 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6688 unsigned I = 0;
6689 for (auto &P : PtrToStoresMap) {
6690 Res[I].swap(P.second);
6691 ++I;
6692 }
6693 return Res;
6694}
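// The grouping key above is {parent block, stored value type, underlying
// pointer object}, and at most one store is kept per lane of the TreeEntry, so
// each resulting vector of stores has at most TE->Scalars.size() elements.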
6695
6696bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6697 OrdersType &ReorderIndices) const {
6698 // We check whether the stores in StoresVec can form a vector by sorting them
6699 // and checking whether they are consecutive.
6700
6701 // To avoid calling getPointersDiff() while sorting we create a vector of
6702 // pairs {store, offset from first} and sort this instead.
6703 SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6704 StoreInst *S0 = StoresVec[0];
6705 StoreOffsetVec.emplace_back(0, 0);
6706 Type *S0Ty = S0->getValueOperand()->getType();
6707 Value *S0Ptr = S0->getPointerOperand();
6708 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6709 StoreInst *SI = StoresVec[Idx];
6710 std::optional<int> Diff =
6711 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6712 SI->getPointerOperand(), *DL, *SE,
6713 /*StrictCheck=*/true);
6714 StoreOffsetVec.emplace_back(*Diff, Idx);
6715 }
6716
6717 // Check if the stores are consecutive by checking if their difference is 1.
6718 if (StoreOffsetVec.size() != StoresVec.size())
6719 return false;
6720 sort(StoreOffsetVec,
6721 [](const std::pair<int, unsigned> &L,
6722 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6723 unsigned Idx = 0;
6724 int PrevDist = 0;
6725 for (const auto &P : StoreOffsetVec) {
6726 if (Idx > 0 && P.first != PrevDist + 1)
6727 return false;
6728 PrevDist = P.first;
6729 ++Idx;
6730 }
6731
6732 // Calculate the shuffle indices according to their offset against the sorted
6733 // StoreOffsetVec.
6734 ReorderIndices.assign(StoresVec.size(), 0);
6735 bool IsIdentity = true;
6736 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6737 ReorderIndices[P.second] = I;
6738 IsIdentity &= P.second == I;
6739 }
6740 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6741 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6742 // same convention here.
6743 if (IsIdentity)
6744 ReorderIndices.clear();
6745
6746 return true;
6747}
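// For example, if StoresVec holds {S0, S1, S2} with element offsets from S0 of
// {0, 2, 1}, the sorted offsets {0, 1, 2} are consecutive and ReorderIndices
// becomes {0, 2, 1} (the position of each store in the sorted sequence).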
6748
6749#ifndef NDEBUG
6750 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6751 for (unsigned Idx : Order)
6752 dbgs() << Idx << ", ";
6753 dbgs() << "\n";
6754}
6755#endif
6756
6757 SmallVector<BoUpSLP::OrdersType, 1>
6758 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6759 unsigned NumLanes = TE->Scalars.size();
6760
6761 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6762
6763 // Holds the reorder indices for each candidate store vector that is a user of
6764 // the current TreeEntry.
6765 SmallVector<OrdersType, 1> ExternalReorderIndices;
6766
6767 // Now inspect the stores collected per pointer and look for vectorization
6768 // candidates. For each candidate calculate the reorder index vector and push
6769 // it into `ExternalReorderIndices`
6770 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6771 // If we have fewer than NumLanes stores, then we can't form a vector.
6772 if (StoresVec.size() != NumLanes)
6773 continue;
6774
6775 // If the stores are not consecutive then abandon this StoresVec.
6776 OrdersType ReorderIndices;
6777 if (!canFormVector(StoresVec, ReorderIndices))
6778 continue;
6779
6780 // We now know that the scalars in StoresVec can form a vector instruction,
6781 // so set the reorder indices.
6782 ExternalReorderIndices.push_back(ReorderIndices);
6783 }
6784 return ExternalReorderIndices;
6785}
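// Each returned OrdersType corresponds to one group of NumLanes external
// stores that covers every lane of TE and is provably consecutive, so the
// caller can count it as one more vote for that order.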
6786
6787 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6788 const SmallDenseSet<Value *> &UserIgnoreLst) {
6789 deleteTree();
6790 UserIgnoreList = &UserIgnoreLst;
6791 if (!allSameType(Roots))
6792 return;
6793 buildTree_rec(Roots, 0, EdgeInfo());
6794}
6795
6796 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6797 deleteTree();
6798 if (!allSameType(Roots))
6799 return;
6800 buildTree_rec(Roots, 0, EdgeInfo());
6801}
6802
6803 /// Tries to find a subvector of loads and builds a new vector of only loads
6804 /// if it can be profitable.
6805 static void gatherPossiblyVectorizableLoads(
6806 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6807 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6808 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6809 bool AddNew = true) {
6810 if (VL.empty())
6811 return;
6812 Type *ScalarTy = getValueType(VL.front());
6813 if (!isValidElementType(ScalarTy))
6814 return;
6815 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6816 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6817 for (Value *V : VL) {
6818 auto *LI = dyn_cast<LoadInst>(V);
6819 if (!LI)
6820 continue;
6821 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6822 continue;
6823 bool IsFound = false;
6824 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6825 assert(LI->getParent() == Data.front().first->getParent() &&
6826 LI->getType() == Data.front().first->getType() &&
6827 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6828 getUnderlyingObject(Data.front().first->getPointerOperand(),
6830 "Expected loads with the same type, same parent and same "
6831 "underlying pointer.");
6832 std::optional<int> Dist = getPointersDiff(
6833 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6834 Data.front().first->getPointerOperand(), DL, SE,
6835 /*StrictCheck=*/true);
6836 if (!Dist)
6837 continue;
6838 auto It = Map.find(*Dist);
6839 if (It != Map.end() && It->second != LI)
6840 continue;
6841 if (It == Map.end()) {
6842 Data.emplace_back(LI, *Dist);
6843 Map.try_emplace(*Dist, LI);
6844 }
6845 IsFound = true;
6846 break;
6847 }
6848 if (!IsFound) {
6849 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6850 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6851 }
6852 }
6853 auto FindMatchingLoads =
6854 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6855 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
6856 &GatheredLoads,
6857 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6858 int &Offset, unsigned &Start) {
6859 if (Loads.empty())
6860 return GatheredLoads.end();
6862 LoadInst *LI = Loads.front().first;
6863 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6864 if (Idx < Start)
6865 continue;
6866 ToAdd.clear();
6867 if (LI->getParent() != Data.front().first->getParent() ||
6868 LI->getType() != Data.front().first->getType())
6869 continue;
6870 std::optional<int> Dist =
6871 getPointersDiff(LI->getType(), LI->getPointerOperand(),
6872 Data.front().first->getType(),
6873 Data.front().first->getPointerOperand(), DL, SE,
6874 /*StrictCheck=*/true);
6875 if (!Dist)
6876 continue;
6877 SmallSet<int, 4> DataDists;
6878 SmallPtrSet<LoadInst *, 4> DataLoads;
6879 for (std::pair<LoadInst *, int> P : Data) {
6880 DataDists.insert(P.second);
6881 DataLoads.insert(P.first);
6882 }
6883 // Found matching gathered loads - check if all loads are unique or
6884 // can be effectively vectorized.
6885 unsigned NumUniques = 0;
6886 for (auto [Cnt, Pair] : enumerate(Loads)) {
6887 bool Used = DataLoads.contains(Pair.first);
6888 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6889 ++NumUniques;
6890 ToAdd.insert(Cnt);
6891 } else if (Used) {
6892 Repeated.insert(Cnt);
6893 }
6894 }
6895 if (NumUniques > 0 &&
6896 (Loads.size() == NumUniques ||
6897 (Loads.size() - NumUniques >= 2 &&
6898 Loads.size() - NumUniques >= Loads.size() / 2 &&
6899 (has_single_bit(Data.size() + NumUniques) ||
6900 bit_ceil(Data.size()) <
6901 bit_ceil(Data.size() + NumUniques))))) {
6902 Offset = *Dist;
6903 Start = Idx + 1;
6904 return std::next(GatheredLoads.begin(), Idx);
6905 }
6906 }
6907 ToAdd.clear();
6908 return GatheredLoads.end();
6909 };
6910 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6911 unsigned Start = 0;
6912 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6913 int Offset = 0;
6914 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6915 Offset, Start);
6916 while (It != GatheredLoads.end()) {
6917 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6918 for (unsigned Idx : LocalToAdd)
6919 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6920 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6921 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6922 Start);
6923 }
6924 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6925 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6926 })) {
6927 auto AddNewLoads =
6928 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6929 for (unsigned Idx : seq<unsigned>(Data.size())) {
6930 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6931 continue;
6932 Loads.push_back(Data[Idx]);
6933 }
6934 };
6935 if (!AddNew) {
6936 LoadInst *LI = Data.front().first;
6937 It = find_if(
6938 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6939 return PD.front().first->getParent() == LI->getParent() &&
6940 PD.front().first->getType() == LI->getType();
6941 });
6942 while (It != GatheredLoads.end()) {
6943 AddNewLoads(*It);
6944 It = std::find_if(
6945 std::next(It), GatheredLoads.end(),
6946 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6947 return PD.front().first->getParent() == LI->getParent() &&
6948 PD.front().first->getType() == LI->getType();
6949 });
6950 }
6951 }
6952 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6953 AddNewLoads(GatheredLoads.emplace_back());
6954 }
6955 }
6956}
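// Clustering sketch: loads from the same block with the same type and the same
// underlying pointer object end up in one ClusteredLoads entry, keyed by their
// element distance to the first load of that cluster (e.g., base+0, base+1 and
// base+3 produce distances 0, 1 and 3); clusters are then merged into
// GatheredLoads when enough of their loads are new or the merged size reaches
// the next power of two.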
6957
6958void BoUpSLP::tryToVectorizeGatheredLoads(
6959 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6960 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6961 8> &GatheredLoads) {
6962 GatheredLoadsEntriesFirst = VectorizableTree.size();
6963
6964 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6965 LoadEntriesToVectorize.size());
6966 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6967 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6968 VectorizableTree[Idx]->Scalars.end());
6969
6970 // Sort loads by distance.
6971 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6972 const std::pair<LoadInst *, int> &L2) {
6973 return L1.second > L2.second;
6974 };
6975
6976 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
6977 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6978 Loads.size());
6979 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6980 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6981 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6982 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6983 };
6984
6985 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6986 BoUpSLP::ValueSet &VectorizedLoads,
6987 SmallVectorImpl<LoadInst *> &NonVectorized,
6988 bool Final, unsigned MaxVF) {
6989 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
6990 unsigned StartIdx = 0;
6991 SmallVector<int> CandidateVFs;
6992 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6993 CandidateVFs.push_back(MaxVF);
6994 for (int NumElts = getFloorFullVectorNumberOfElements(
6995 *TTI, Loads.front()->getType(), MaxVF);
6996 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6997 *TTI, Loads.front()->getType(), NumElts - 1)) {
6998 CandidateVFs.push_back(NumElts);
6999 if (VectorizeNonPowerOf2 && NumElts > 2)
7000 CandidateVFs.push_back(NumElts - 1);
7001 }
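// Illustrative example (assuming a target where "full vectors" are powers of
// two): starting from MaxVF == 16 the loop above collects candidate VFs
// {16, 8, 4, 2}, stopping once fewer than two elements remain; when
// non-power-of-2 vectorization is enabled, the NumElts - 1 candidates are
// added as well.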
7002
7003 if (Final && CandidateVFs.empty())
7004 return Results;
7005
7006 unsigned BestVF = Final ? CandidateVFs.back() : 0;
7007 for (unsigned NumElts : CandidateVFs) {
7008 if (Final && NumElts > BestVF)
7009 continue;
7010 SmallVector<unsigned> MaskedGatherVectorized;
7011 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
7012 ++Cnt) {
7013 ArrayRef<LoadInst *> Slice =
7014 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
7015 if (VectorizedLoads.count(Slice.front()) ||
7016 VectorizedLoads.count(Slice.back()) ||
7018 continue;
7019 // Check if it is profitable to try vectorizing gathered loads. It is
7020 // profitable if we have more than 3 consecutive loads or if we have
7021 // fewer but all users are vectorized or deleted.
7022 bool AllowToVectorize = false;
7023 // Check if it is profitable to vectorize 2-elements loads.
7024 if (NumElts == 2) {
7025 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
7026 Slice.front()->getType(), ElementCount::getFixed(NumElts));
7027 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
7028 for (LoadInst *LI : Slice) {
7029 // If single use/user - allow to vectorize.
7030 if (LI->hasOneUse())
7031 continue;
7032 // 1. Check if number of uses equals number of users.
7033 // 2. All users are deleted.
7034 // 3. The load broadcasts are not allowed or the load is not
7035 // broadcasted.
7036 if (static_cast<unsigned int>(std::distance(
7037 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7038 return false;
7039 if (!IsLegalBroadcastLoad)
7040 continue;
7041 if (LI->hasNUsesOrMore(UsesLimit))
7042 return false;
7043 for (User *U : LI->users()) {
7044 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
7045 continue;
7046 for (const TreeEntry *UTE : getTreeEntries(U)) {
7047 for (int I : seq<int>(UTE->getNumOperands())) {
7048 if (all_of(UTE->getOperand(I), [LI](Value *V) {
7049 return V == LI || isa<PoisonValue>(V);
7050 }))
7051 // Found legal broadcast - do not vectorize.
7052 return false;
7053 }
7054 }
7055 }
7056 }
7057 return true;
7058 };
7059 AllowToVectorize = CheckIfAllowed(Slice);
7060 } else {
7061 AllowToVectorize =
7062 (NumElts >= 3 ||
7063 any_of(ValueToGatherNodes.at(Slice.front()),
7064 [=](const TreeEntry *TE) {
7065 return TE->Scalars.size() == 2 &&
7066 ((TE->Scalars.front() == Slice.front() &&
7067 TE->Scalars.back() == Slice.back()) ||
7068 (TE->Scalars.front() == Slice.back() &&
7069 TE->Scalars.back() == Slice.front()));
7070 })) &&
7071 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7072 Slice.size());
7073 }
7074 if (AllowToVectorize) {
7075 SmallVector<Value *> PointerOps;
7076 OrdersType CurrentOrder;
7077 // Try to build vector load.
7078 ArrayRef<Value *> Values(
7079 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7080 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7081 PointerOps, &BestVF);
7082 if (LS != LoadsState::Gather ||
7083 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7084 if (LS == LoadsState::ScatterVectorize) {
7085 if (MaskedGatherVectorized.empty() ||
7086 Cnt >= MaskedGatherVectorized.back() + NumElts)
7087 MaskedGatherVectorized.push_back(Cnt);
7088 continue;
7089 }
7090 if (LS != LoadsState::Gather) {
7091 Results.emplace_back(Values, LS);
7092 VectorizedLoads.insert(Slice.begin(), Slice.end());
7093 // If we vectorized initial block, no need to try to vectorize it
7094 // again.
7095 if (Cnt == StartIdx)
7096 StartIdx += NumElts;
7097 }
7098 // Check if the whole array was vectorized already - exit.
7099 if (StartIdx >= Loads.size())
7100 break;
7101 // Erase last masked gather candidate, if another candidate within
7102 // the range is found to be better.
7103 if (!MaskedGatherVectorized.empty() &&
7104 Cnt < MaskedGatherVectorized.back() + NumElts)
7105 MaskedGatherVectorized.pop_back();
7106 Cnt += NumElts - 1;
7107 continue;
7108 }
7109 }
7110 if (!AllowToVectorize || BestVF == 0)
7112 }
7113 // Mark masked gathers candidates as vectorized, if any.
7114 for (unsigned Cnt : MaskedGatherVectorized) {
7115 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7116 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7117 ArrayRef<Value *> Values(
7118 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7119 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7120 VectorizedLoads.insert(Slice.begin(), Slice.end());
7121 // If we vectorized initial block, no need to try to vectorize it again.
7122 if (Cnt == StartIdx)
7123 StartIdx += NumElts;
7124 }
7125 }
7126 for (LoadInst *LI : Loads) {
7127 if (!VectorizedLoads.contains(LI))
7128 NonVectorized.push_back(LI);
7129 }
7130 return Results;
7131 };
7132 auto ProcessGatheredLoads =
7133 [&, &TTI = *TTI](
7134 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7135 bool Final = false) {
7136 SmallVector<LoadInst *> NonVectorized;
7137 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7138 if (LoadsDists.size() <= 1) {
7139 NonVectorized.push_back(LoadsDists.back().first);
7140 continue;
7141 }
7142 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7143 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7144 transform(LoadsDists, OriginalLoads.begin(),
7145 [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
7146 return L.first;
7147 });
7148 stable_sort(LocalLoadsDists, LoadSorter);
7149 SmallVector<LoadInst *> Loads;
7150 unsigned MaxConsecutiveDistance = 0;
7151 unsigned CurrentConsecutiveDist = 1;
7152 int LastDist = LocalLoadsDists.front().second;
7153 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7154 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7155 if (isVectorized(L.first))
7156 continue;
7157 assert(LastDist >= L.second &&
7158 "Expected first distance always not less than second");
7159 if (static_cast<unsigned>(LastDist - L.second) ==
7160 CurrentConsecutiveDist) {
7161 ++CurrentConsecutiveDist;
7162 MaxConsecutiveDistance =
7163 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7164 Loads.push_back(L.first);
7165 continue;
7166 }
7167 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7168 !Loads.empty())
7169 Loads.pop_back();
7170 CurrentConsecutiveDist = 1;
7171 LastDist = L.second;
7172 Loads.push_back(L.first);
7173 }
7174 if (Loads.size() <= 1)
7175 continue;
7176 if (AllowMaskedGather)
7177 MaxConsecutiveDistance = Loads.size();
7178 else if (MaxConsecutiveDistance < 2)
7179 continue;
7180 BoUpSLP::ValueSet VectorizedLoads;
7181 SmallVector<LoadInst *> SortedNonVectorized;
7182 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
7183 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7184 Final, MaxConsecutiveDistance);
7185 if (!Results.empty() && !SortedNonVectorized.empty() &&
7186 OriginalLoads.size() == Loads.size() &&
7187 MaxConsecutiveDistance == Loads.size() &&
7188 any_of(Results,
7189 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7190 return P.second == LoadsState::ScatterVectorize;
7191 })) {
7192 VectorizedLoads.clear();
7193 SmallVector<LoadInst *> UnsortedNonVectorized;
7194 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
7195 UnsortedResults =
7196 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7197 UnsortedNonVectorized, Final,
7198 OriginalLoads.size());
7199 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7200 SortedNonVectorized.swap(UnsortedNonVectorized);
7201 Results.swap(UnsortedResults);
7202 }
7203 }
7204 for (auto [Slice, _] : Results) {
7205 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7206 << Slice.size() << ")\n");
7207 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
7208 for (Value *L : Slice)
7209 if (!isVectorized(L))
7210 SortedNonVectorized.push_back(cast<LoadInst>(L));
7211 continue;
7212 }
7213
7214 // Select maximum VF as a maximum of user gathered nodes and
7215 // distance between scalar loads in these nodes.
7216 unsigned MaxVF = Slice.size();
7217 unsigned UserMaxVF = 0;
7218 unsigned InterleaveFactor = 0;
7219 if (MaxVF == 2) {
7220 UserMaxVF = MaxVF;
7221 } else {
7222 // Found distance between segments of the interleaved loads.
7223 std::optional<unsigned> InterleavedLoadsDistance = 0;
7224 unsigned Order = 0;
7225 std::optional<unsigned> CommonVF = 0;
7226 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7227 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7228 for (auto [Idx, V] : enumerate(Slice)) {
7229 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7230 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7231 unsigned Pos =
7232 EntryToPosition.try_emplace(E, Idx).first->second;
7233 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7234 if (CommonVF) {
7235 if (*CommonVF == 0) {
7236 CommonVF = E->Scalars.size();
7237 continue;
7238 }
7239 if (*CommonVF != E->Scalars.size())
7240 CommonVF.reset();
7241 }
7242 // Check if the load is the part of the interleaved load.
7243 if (Pos != Idx && InterleavedLoadsDistance) {
7244 if (!DeinterleavedNodes.contains(E) &&
7245 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7246 if (isa<Constant>(V))
7247 return false;
7248 if (isVectorized(V))
7249 return true;
7250 const auto &Nodes = ValueToGatherNodes.at(V);
7251 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7252 !is_contained(Slice, V);
7253 })) {
7254 InterleavedLoadsDistance.reset();
7255 continue;
7256 }
7257 DeinterleavedNodes.insert(E);
7258 if (*InterleavedLoadsDistance == 0) {
7259 InterleavedLoadsDistance = Idx - Pos;
7260 continue;
7261 }
7262 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7263 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7264 InterleavedLoadsDistance.reset();
7265 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7266 }
7267 }
7268 }
7269 DeinterleavedNodes.clear();
7270 // Check if the large load represents an interleaved load operation.
7271 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7272 CommonVF.value_or(0) != 0) {
7273 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7274 unsigned VF = *CommonVF;
7275 OrdersType Order;
7276 SmallVector<Value *> PointerOps;
7277 // Segmented load detected - vectorize at maximum vector factor.
7278 if (InterleaveFactor <= Slice.size() &&
7279 TTI.isLegalInterleavedAccessType(
7280 getWidenedType(Slice.front()->getType(), VF),
7281 InterleaveFactor,
7282 cast<LoadInst>(Slice.front())->getAlign(),
7283 cast<LoadInst>(Slice.front())
7284 ->getPointerAddressSpace()) &&
7285 canVectorizeLoads(Slice, Slice.front(), Order,
7286 PointerOps) == LoadsState::Vectorize) {
7287 UserMaxVF = InterleaveFactor * VF;
7288 } else {
7289 InterleaveFactor = 0;
7290 }
7291 }
7292 // Cannot represent the loads as consecutive vectorizable nodes -
7293 // just exit.
7294 unsigned ConsecutiveNodesSize = 0;
7295 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7296 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7297 [&, Slice = Slice](const auto &P) {
7298 const auto *It = find_if(Slice, [&](Value *V) {
7299 return std::get<1>(P).contains(V);
7300 });
7301 if (It == Slice.end())
7302 return false;
7303 ArrayRef<Value *> VL =
7304 VectorizableTree[std::get<0>(P)]->Scalars;
7305 ConsecutiveNodesSize += VL.size();
7306 unsigned Start = std::distance(Slice.begin(), It);
7307 unsigned Sz = Slice.size() - Start;
7308 return Sz < VL.size() ||
7309 Slice.slice(std::distance(Slice.begin(), It),
7310 VL.size()) != VL;
7311 }))
7312 continue;
7313 // Try to build long masked gather loads.
7314 UserMaxVF = bit_ceil(UserMaxVF);
7315 if (InterleaveFactor == 0 &&
7316 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7317 [&, Slice = Slice](unsigned Idx) {
7318 OrdersType Order;
7319 SmallVector<Value *> PointerOps;
7320 return canVectorizeLoads(
7321 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7322 Slice[Idx * UserMaxVF], Order,
7323 PointerOps) ==
7324 LoadsState::ScatterVectorize;
7325 }))
7326 UserMaxVF = MaxVF;
7327 if (Slice.size() != ConsecutiveNodesSize)
7328 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7329 }
7330 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7331 bool IsVectorized = true;
7332 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7333 ArrayRef<Value *> SubSlice =
7334 Slice.slice(I, std::min(VF, E - I));
7335 if (isVectorized(SubSlice.front()))
7336 continue;
7337 // Check if the subslice belongs to a to-be-vectorized entry that is
7338 // not equal to this entry.
7339 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7340 [&](const auto &P) {
7341 return !SubSlice.equals(
7342 VectorizableTree[std::get<0>(P)]
7343 ->Scalars) &&
7344 set_is_subset(SubSlice, std::get<1>(P));
7345 }))
7346 continue;
7347 unsigned Sz = VectorizableTree.size();
7348 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7349 if (Sz == VectorizableTree.size()) {
7350 IsVectorized = false;
7351 // Try non-interleaved vectorization with smaller vector
7352 // factor.
7353 if (InterleaveFactor > 0) {
7354 VF = 2 * (MaxVF / InterleaveFactor);
7355 InterleaveFactor = 0;
7356 }
7357 continue;
7358 }
7359 }
7360 if (IsVectorized)
7361 break;
7362 }
7363 }
7364 NonVectorized.append(SortedNonVectorized);
7365 }
7366 return NonVectorized;
7367 };
7368 for (const auto &GLs : GatheredLoads) {
7369 const auto &Ref = GLs.second;
7370 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7371 if (!Ref.empty() && !NonVectorized.empty() &&
7372 std::accumulate(
7373 Ref.begin(), Ref.end(), 0u,
7374 [](unsigned S,
7375 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
7376 return S + LoadsDists.size();
7377 }) != NonVectorized.size() &&
7378 IsMaskedGatherSupported(NonVectorized)) {
7379 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7380 for (LoadInst *LI : NonVectorized) {
7381 // Reinsert non-vectorized loads into other lists of loads with the same
7382 // base pointers.
7383 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7384 FinalGatheredLoads,
7385 /*AddNew=*/false);
7386 }
7387 // Final attempt to vectorize non-vectorized loads.
7388 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7389 }
7390 }
7391 // Try to vectorize postponed load entries, previously marked as gathered.
7392 for (unsigned Idx : LoadEntriesToVectorize) {
7393 const TreeEntry &E = *VectorizableTree[Idx];
7394 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7395 // Avoid reordering, if possible.
7396 if (!E.ReorderIndices.empty()) {
7397 // Build a mask out of the reorder indices and reorder scalars per this
7398 // mask.
7399 SmallVector<int> ReorderMask;
7400 inversePermutation(E.ReorderIndices, ReorderMask);
7401 reorderScalars(GatheredScalars, ReorderMask);
7402 }
7403 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7404 }
7405 // If no new entries were created, there are no gathered-loads entries that
7406 // must be handled.
7407 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7408 VectorizableTree.size())
7409 GatheredLoadsEntriesFirst.reset();
7410}
7411
7412/// \return true if the specified list of values has only one instruction that
7413/// requires scheduling, false otherwise.
7414#ifndef NDEBUG
7415 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7416 Value *NeedsScheduling = nullptr;
7417 for (Value *V : VL) {
7418 if (doesNotNeedToBeScheduled(V))
7419 continue;
7420 if (!NeedsScheduling) {
7421 NeedsScheduling = V;
7422 continue;
7423 }
7424 return false;
7425 }
7426 return NeedsScheduling;
7427}
7428#endif
7429
7430 /// Generates a key/subkey pair for the given value to provide effective
7431 /// sorting of the values and better detection of vectorizable value
7432 /// sequences. The keys/subkeys can be used for better sorting of the values
7433 /// themselves (keys) and within value subgroups (subkeys).
7434static std::pair<size_t, size_t> generateKeySubkey(
7435 Value *V, const TargetLibraryInfo *TLI,
7436 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7437 bool AllowAlternate) {
7438 hash_code Key = hash_value(V->getValueID() + 2);
7439 hash_code SubKey = hash_value(0);
7440 // Sort the loads by the distance between the pointers.
7441 if (auto *LI = dyn_cast<LoadInst>(V)) {
7442 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7443 if (LI->isSimple())
7444 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7445 else
7446 Key = SubKey = hash_value(LI);
7447 } else if (isVectorLikeInstWithConstOps(V)) {
7448 // Sort extracts by the vector operands.
7449 if (isa<ExtractElementInst, UndefValue>(V))
7450 Key = hash_value(Value::UndefValueVal + 1);
7451 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7452 if (!isUndefVector(EI->getVectorOperand()).all() &&
7453 !isa<UndefValue>(EI->getIndexOperand()))
7454 SubKey = hash_value(EI->getVectorOperand());
7455 }
7456 } else if (auto *I = dyn_cast<Instruction>(V)) {
7457 // Sort other instructions just by the opcodes except for CMPInst.
7458 // For CMP also sort by the predicate kind.
7459 if ((isa<BinaryOperator, CastInst>(I)) &&
7460 isValidForAlternation(I->getOpcode())) {
7461 if (AllowAlternate)
7462 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7463 else
7464 Key = hash_combine(hash_value(I->getOpcode()), Key);
7465 SubKey = hash_combine(
7466 hash_value(I->getOpcode()), hash_value(I->getType()),
7467 hash_value(isa<BinaryOperator>(I)
7468 ? I->getType()
7469 : cast<CastInst>(I)->getOperand(0)->getType()));
7470 // For casts, look through the only operand to improve compile time.
7471 if (isa<CastInst>(I)) {
7472 std::pair<size_t, size_t> OpVals =
7473 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7474 /*AllowAlternate=*/true);
7475 Key = hash_combine(OpVals.first, Key);
7476 SubKey = hash_combine(OpVals.first, SubKey);
7477 }
7478 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7479 CmpInst::Predicate Pred = CI->getPredicate();
7480 if (CI->isCommutative())
7481 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7482 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
7483 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7484 hash_value(SwapPred),
7485 hash_value(CI->getOperand(0)->getType()));
7486 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7487 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
7488 if (isTriviallyVectorizable(ID)) {
7489 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7490 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7491 SubKey = hash_combine(hash_value(I->getOpcode()),
7492 hash_value(Call->getCalledFunction()));
7493 } else {
7494 Key = hash_combine(hash_value(Call), Key);
7495 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7496 }
7497 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7498 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7499 hash_value(Op.Tag), SubKey);
7500 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7501 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7502 SubKey = hash_value(Gep->getPointerOperand());
7503 else
7504 SubKey = hash_value(Gep);
7505 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7506 !isa<ConstantInt>(I->getOperand(1))) {
7507 // Do not try to vectorize instructions with potentially high cost.
7508 SubKey = hash_value(I);
7509 } else {
7510 SubKey = hash_value(I->getOpcode());
7511 }
7512 Key = hash_combine(hash_value(I->getParent()), Key);
7513 }
7514 return std::make_pair(Key, SubKey);
7515}
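// Usage note: callers first bucket values by Key (roughly the value kind,
// opcode and parent block) and then by SubKey within a bucket (pointer-based
// cluster for simple loads, vector operand for extracts, predicate for
// compares, callee or intrinsic ID for calls), which keeps likely-vectorizable
// values next to each other.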
7516
7517/// Checks if the specified instruction \p I is an alternate operation for
7518/// the given \p MainOp and \p AltOp instructions.
7519static bool isAlternateInstruction(const Instruction *I,
7520 const Instruction *MainOp,
7521 const Instruction *AltOp,
7522 const TargetLibraryInfo &TLI);
7523
7524bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7525 ArrayRef<Value *> VL) const {
7526 unsigned Opcode0 = S.getOpcode();
7527 unsigned Opcode1 = S.getAltOpcode();
7528 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7529 // If this pattern is supported by the target then consider it profitable.
7530 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7531 Opcode0, Opcode1, OpcodeMask))
7532 return true;
7533 SmallVector<SmallVector<Value *>> Operands;
7534 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7535 Operands.emplace_back();
7536 // Prepare the operand vector.
7537 for (Value *V : VL) {
7538 if (isa<PoisonValue>(V)) {
7539 Operands.back().push_back(
7540 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7541 continue;
7542 }
7543 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7544 }
7545 }
7546 if (Operands.size() == 2) {
7547 // Try to find the best operand candidates.
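// For each pair of adjacent lanes, score keeping operand 0 in both lanes against
// the two mixed pairings of operands 0 and 1, and swap a lane's operands when a
// mixed pairing scores better.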
7548 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7549 SmallVector<std::pair<Value *, Value *>> Candidates(3);
7550 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7551 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7552 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7553 std::optional<int> Res = findBestRootPair(Candidates);
7554 switch (Res.value_or(0)) {
7555 case 0:
7556 break;
7557 case 1:
7558 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7559 break;
7560 case 2:
7561 std::swap(Operands[0][I], Operands[1][I]);
7562 break;
7563 default:
7564 llvm_unreachable("Unexpected index.");
7565 }
7566 }
7567 }
7568 DenseSet<unsigned> UniqueOpcodes;
7569 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7570 unsigned NonInstCnt = 0;
7571 // Estimate number of instructions, required for the vectorized node and for
7572 // the buildvector node.
7573 unsigned UndefCnt = 0;
7574 // Count the number of extra shuffles, required for vector nodes.
7575 unsigned ExtraShuffleInsts = 0;
7576 // Check whether the operands contain the same values and form either a
7577 // perfect diamond match or a shuffled match; if so, count them only once.
7578 if (Operands.size() == 2) {
7579 // Do not count same operands twice.
7580 if (Operands.front() == Operands.back()) {
7581 Operands.erase(Operands.begin());
7582 } else if (!allConstant(Operands.front()) &&
7583 all_of(Operands.front(), [&](Value *V) {
7584 return is_contained(Operands.back(), V);
7585 })) {
7586 Operands.erase(Operands.begin());
7587 ++ExtraShuffleInsts;
7588 }
7589 }
7590 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7591 // Vectorize node, if:
7592 // 1. at least a single operand is constant or splat.
7593 // 2. Operands have many loop invariants (the instructions are not loop
7594 // invariants).
7595 // 3. At least a single unique operand is supposed to be vectorized.
7596 return none_of(Operands,
7597 [&](ArrayRef<Value *> Op) {
7598 if (allConstant(Op) ||
7599 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7600 getSameOpcode(Op, *TLI)))
7601 return false;
7602 DenseMap<Value *, unsigned> Uniques;
7603 for (Value *V : Op) {
7604 if (isa<Constant, ExtractElementInst>(V) ||
7605 isVectorized(V) || (L && L->isLoopInvariant(V))) {
7606 if (isa<UndefValue>(V))
7607 ++UndefCnt;
7608 continue;
7609 }
7610 auto Res = Uniques.try_emplace(V, 0);
7611 // Found first duplicate - need to add shuffle.
7612 if (!Res.second && Res.first->second == 1)
7613 ++ExtraShuffleInsts;
7614 ++Res.first->getSecond();
7615 if (auto *I = dyn_cast<Instruction>(V))
7616 UniqueOpcodes.insert(I->getOpcode());
7617 else if (Res.second)
7618 ++NonInstCnt;
7619 }
7620 return none_of(Uniques, [&](const auto &P) {
7621 return P.first->hasNUsesOrMore(P.second + 1) &&
7622 none_of(P.first->users(), [&](User *U) {
7623 return isVectorized(U) || Uniques.contains(U);
7624 });
7625 });
7626 }) ||
7627 // Do not vectorize node, if estimated number of vector instructions is
7628 // more than estimated number of buildvector instructions. Number of
7629 // vector operands is number of vector instructions + number of vector
7630 // instructions for operands (buildvectors). Number of buildvector
7631 // instructions is just number_of_operands * number_of_scalars.
7632 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7633 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7634 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7635}
7636
7637BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7638 const InstructionsState &S, ArrayRef<Value *> VL,
7639 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7640 SmallVectorImpl<Value *> &PointerOps) {
7641 assert(S.getMainOp() &&
7642 "Expected instructions with same/alternate opcodes only.");
7643
7644 unsigned ShuffleOrOp =
7645 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7646 Instruction *VL0 = S.getMainOp();
7647 switch (ShuffleOrOp) {
7648 case Instruction::PHI: {
7649 // Too many operands - gather, most probably won't be vectorized.
7650 if (VL0->getNumOperands() > MaxPHINumOperands)
7651 return TreeEntry::NeedToGather;
7652 // Check for terminator values (e.g. invoke).
7653 for (Value *V : VL) {
7654 auto *PHI = dyn_cast<PHINode>(V);
7655 if (!PHI)
7656 continue;
7657 for (Value *Incoming : PHI->incoming_values()) {
7658 Instruction *Term = dyn_cast<Instruction>(Incoming);
7659 if (Term && Term->isTerminator()) {
7660 LLVM_DEBUG(dbgs()
7661 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7662 return TreeEntry::NeedToGather;
7663 }
7664 }
7665 }
7666
7667 return TreeEntry::Vectorize;
7668 }
7669 case Instruction::ExtractValue:
7670 case Instruction::ExtractElement: {
7671 bool Reuse = canReuseExtract(VL, CurrentOrder);
7672 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7673 // non-full registers).
7674 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
7675 return TreeEntry::NeedToGather;
7676 if (Reuse || !CurrentOrder.empty())
7677 return TreeEntry::Vectorize;
7678 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7679 return TreeEntry::NeedToGather;
7680 }
7681 case Instruction::InsertElement: {
7682 // Check that we have a buildvector and not a shuffle of 2 or more
7683 // different vectors.
7684 ValueSet SourceVectors;
7685 for (Value *V : VL) {
7686 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7687 assert(getElementIndex(V) != std::nullopt &&
7688 "Non-constant or undef index?");
7689 }
7690
7691 if (count_if(VL, [&SourceVectors](Value *V) {
7692 return !SourceVectors.contains(V);
7693 }) >= 2) {
7694 // Found 2nd source vector - cancel.
7695 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7696 "different source vectors.\n");
7697 return TreeEntry::NeedToGather;
7698 }
7699
7700 if (any_of(VL, [&SourceVectors](Value *V) {
7701 // The last InsertElement can have multiple uses.
7702 return SourceVectors.contains(V) && !V->hasOneUse();
7703 })) {
7704 assert(SLPReVec && "Only supported by REVEC.");
7705 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7706 "multiple uses.\n");
7707 return TreeEntry::NeedToGather;
7708 }
7709
7710 return TreeEntry::Vectorize;
7711 }
7712 case Instruction::Load: {
7713 // Check that a vectorized load would load the same memory as a scalar
7714 // load. For example, we don't want to vectorize loads that are smaller
7715 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7716 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7717 // from such a struct, we read/write packed bits disagreeing with the
7718 // unvectorized version.
7719 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7720 case LoadsState::Vectorize:
7721 return TreeEntry::Vectorize;
7722 case LoadsState::ScatterVectorize:
7723 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7724 // Delay slow vectorized nodes for better vectorization attempts.
7725 LoadEntriesToVectorize.insert(VectorizableTree.size());
7726 return TreeEntry::NeedToGather;
7727 }
7728 return TreeEntry::ScatterVectorize;
7729 case LoadsState::StridedVectorize:
7730 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7731 // Delay slow vectorized nodes for better vectorization attempts.
7732 LoadEntriesToVectorize.insert(VectorizableTree.size());
7733 return TreeEntry::NeedToGather;
7734 }
7735 return TreeEntry::StridedVectorize;
7736 case LoadsState::Gather:
7737#ifndef NDEBUG
7738 Type *ScalarTy = VL0->getType();
7739 if (DL->getTypeSizeInBits(ScalarTy) !=
7740 DL->getTypeAllocSizeInBits(ScalarTy))
7741 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7742 else if (any_of(VL, [](Value *V) {
7743 auto *LI = dyn_cast<LoadInst>(V);
7744 return !LI || !LI->isSimple();
7745 }))
7746 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7747 else
7748 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7749#endif // NDEBUG
7750 registerNonVectorizableLoads(VL);
7751 return TreeEntry::NeedToGather;
7752 }
7753 llvm_unreachable("Unexpected state of loads");
7754 }
7755 case Instruction::ZExt:
7756 case Instruction::SExt:
7757 case Instruction::FPToUI:
7758 case Instruction::FPToSI:
7759 case Instruction::FPExt:
7760 case Instruction::PtrToInt:
7761 case Instruction::IntToPtr:
7762 case Instruction::SIToFP:
7763 case Instruction::UIToFP:
7764 case Instruction::Trunc:
7765 case Instruction::FPTrunc:
7766 case Instruction::BitCast: {
7767 Type *SrcTy = VL0->getOperand(0)->getType();
7768 for (Value *V : VL) {
7769 if (isa<PoisonValue>(V))
7770 continue;
7771 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7772 if (Ty != SrcTy || !isValidElementType(Ty)) {
7773 LLVM_DEBUG(
7774 dbgs() << "SLP: Gathering casts with different src types.\n");
7775 return TreeEntry::NeedToGather;
7776 }
7777 }
7778 return TreeEntry::Vectorize;
7779 }
7780 case Instruction::ICmp:
7781 case Instruction::FCmp: {
7782 // Check that all of the compares have the same predicate.
7783 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7784 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7785 Type *ComparedTy = VL0->getOperand(0)->getType();
7786 for (Value *V : VL) {
7787 if (isa<PoisonValue>(V))
7788 continue;
7789 auto *Cmp = cast<CmpInst>(V);
7790 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7791 Cmp->getOperand(0)->getType() != ComparedTy) {
7792 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7793 return TreeEntry::NeedToGather;
7794 }
7795 }
7796 return TreeEntry::Vectorize;
7797 }
7798 case Instruction::Select:
7799 case Instruction::FNeg:
7800 case Instruction::Add:
7801 case Instruction::FAdd:
7802 case Instruction::Sub:
7803 case Instruction::FSub:
7804 case Instruction::Mul:
7805 case Instruction::FMul:
7806 case Instruction::UDiv:
7807 case Instruction::SDiv:
7808 case Instruction::FDiv:
7809 case Instruction::URem:
7810 case Instruction::SRem:
7811 case Instruction::FRem:
7812 case Instruction::Shl:
7813 case Instruction::LShr:
7814 case Instruction::AShr:
7815 case Instruction::And:
7816 case Instruction::Or:
7817 case Instruction::Xor:
7818 case Instruction::Freeze:
7819 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7820 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7821 auto *I = dyn_cast<Instruction>(V);
7822 return I && I->isBinaryOp() && !I->isFast();
7823 }))
7824 return TreeEntry::NeedToGather;
7825 return TreeEntry::Vectorize;
7826 case Instruction::GetElementPtr: {
7827 // We don't combine GEPs with complicated (nested) indexing.
7828 for (Value *V : VL) {
7829 auto *I = dyn_cast<GetElementPtrInst>(V);
7830 if (!I)
7831 continue;
7832 if (I->getNumOperands() != 2) {
7833 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7834 return TreeEntry::NeedToGather;
7835 }
7836 }
7837
7838 // We can't combine several GEPs into one vector if they operate on
7839 // different types.
7840 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7841 for (Value *V : VL) {
7842 auto *GEP = dyn_cast<GEPOperator>(V);
7843 if (!GEP)
7844 continue;
7845 Type *CurTy = GEP->getSourceElementType();
7846 if (Ty0 != CurTy) {
7847 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7848 return TreeEntry::NeedToGather;
7849 }
7850 }
7851
7852 // We don't combine GEPs with non-constant indexes.
7853 Type *Ty1 = VL0->getOperand(1)->getType();
7854 for (Value *V : VL) {
7855 auto *I = dyn_cast<GetElementPtrInst>(V);
7856 if (!I)
7857 continue;
7858 auto *Op = I->getOperand(1);
7859 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7860 (Op->getType() != Ty1 &&
7861 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7862 Op->getType()->getScalarSizeInBits() >
7863 DL->getIndexSizeInBits(
7864 V->getType()->getPointerAddressSpace())))) {
7865 LLVM_DEBUG(
7866 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7867 return TreeEntry::NeedToGather;
7868 }
7869 }
7870
7871 return TreeEntry::Vectorize;
7872 }
7873 case Instruction::Store: {
7874 // Check if the stores are consecutive or if we need to swizzle them.
7875 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7876 // Avoid types that are padded when being allocated as scalars, while
7877 // being packed together in a vector (such as i1).
7878 if (DL->getTypeSizeInBits(ScalarTy) !=
7879 DL->getTypeAllocSizeInBits(ScalarTy)) {
7880 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7881 return TreeEntry::NeedToGather;
7882 }
7883 // Make sure all stores in the bundle are simple - we can't vectorize
7884 // atomic or volatile stores.
7885 for (Value *V : VL) {
7886 auto *SI = cast<StoreInst>(V);
7887 if (!SI->isSimple()) {
7888 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7889 return TreeEntry::NeedToGather;
7890 }
7891 PointerOps.push_back(SI->getPointerOperand());
7892 }
7893
7894 // Check the order of pointer operands.
7895 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7896 Value *Ptr0;
7897 Value *PtrN;
7898 if (CurrentOrder.empty()) {
7899 Ptr0 = PointerOps.front();
7900 PtrN = PointerOps.back();
7901 } else {
7902 Ptr0 = PointerOps[CurrentOrder.front()];
7903 PtrN = PointerOps[CurrentOrder.back()];
7904 }
7905 std::optional<int> Dist =
7906 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7907 // Check that the sorted pointer operands are consecutive.
7908 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7909 return TreeEntry::Vectorize;
7910 }
7911
7912 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7913 return TreeEntry::NeedToGather;
7914 }
7915 case Instruction::Call: {
7916 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7917 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7918 auto *I = dyn_cast<Instruction>(V);
7919 return I && !I->isFast();
7920 }))
7921 return TreeEntry::NeedToGather;
7922 // Check if the calls are all to the same vectorizable intrinsic or
7923 // library function.
7924 CallInst *CI = cast<CallInst>(VL0);
7925 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7926
7927 VFShape Shape = VFShape::get(
7928 CI->getFunctionType(),
7929 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7930 false /*HasGlobalPred*/);
7931 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7932
7933 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7934 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7935 return TreeEntry::NeedToGather;
7936 }
7937 Function *F = CI->getCalledFunction();
7938 unsigned NumArgs = CI->arg_size();
7939 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7940 for (unsigned J = 0; J != NumArgs; ++J)
7941 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
7942 ScalarArgs[J] = CI->getArgOperand(J);
7943 for (Value *V : VL) {
7944 CallInst *CI2 = dyn_cast<CallInst>(V);
7945 if (!CI2 || CI2->getCalledFunction() != F ||
7946 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7947 (VecFunc &&
7948 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7949 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7950 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7951 << "\n");
7952 return TreeEntry::NeedToGather;
7953 }
7954 // Some intrinsics have scalar arguments and should be same in order for
7955 // them to be vectorized.
7956 for (unsigned J = 0; J != NumArgs; ++J) {
7957 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
7958 Value *A1J = CI2->getArgOperand(J);
7959 if (ScalarArgs[J] != A1J) {
7960 LLVM_DEBUG(dbgs()
7961 << "SLP: mismatched arguments in call:" << *CI
7962 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7963 return TreeEntry::NeedToGather;
7964 }
7965 }
7966 }
7967 // Verify that the bundle operands are identical between the two calls.
7968 if (CI->hasOperandBundles() &&
7969 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7970 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7971 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7972 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7973 << "!=" << *V << '\n');
7974 return TreeEntry::NeedToGather;
7975 }
7976 }
7977
7978 return TreeEntry::Vectorize;
7979 }
7980 case Instruction::ShuffleVector: {
7981 if (!S.isAltShuffle()) {
7982 // REVEC can support non-alternate shuffles.
7983 if (SLPReVec && getShufflevectorNumGroups(VL))
7984 return TreeEntry::Vectorize;
7985 // If this is not an alternate sequence of opcodes (like add-sub),
7986 // then do not vectorize this instruction.
7987 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7988 return TreeEntry::NeedToGather;
7989 }
7990 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7991 LLVM_DEBUG(
7992 dbgs()
7993 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7994 "the whole alt sequence is not profitable.\n");
7995 return TreeEntry::NeedToGather;
7996 }
7997
7998 return TreeEntry::Vectorize;
7999 }
8000 default:
8001 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
8002 return TreeEntry::NeedToGather;
8003 }
8004}
8005
8006namespace {
8007 /// Correctly handles the operands of PHI nodes based on the \p Main
8008 /// PHINode's order of incoming basic blocks/values.
8009class PHIHandler {
8010 DominatorTree &DT;
8011 PHINode *Main = nullptr;
8012 SmallVector<Value *> Phis;
8013 SmallVector<SmallVector<Value *>> Operands;
8014
8015public:
8016 PHIHandler() = delete;
8017 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
8018 : DT(DT), Main(Main), Phis(Phis),
8019 Operands(Main->getNumIncomingValues(),
8020 SmallVector<Value *>(Phis.size(), nullptr)) {}
8021 void buildOperands() {
8022 constexpr unsigned FastLimit = 4;
8023 if (Main->getNumIncomingValues() <= FastLimit) {
8024 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
8025 BasicBlock *InBB = Main->getIncomingBlock(I);
8026 if (!DT.isReachableFromEntry(InBB)) {
8027 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
8028 continue;
8029 }
8030 // Prepare the operand vector.
8031 for (auto [Idx, V] : enumerate(Phis)) {
8032 auto *P = dyn_cast<PHINode>(V);
8033 if (!P) {
8034 assert(isa<PoisonValue>(V) &&
8035 "Expected isa instruction or poison value.");
8036 Operands[I][Idx] = V;
8037 continue;
8038 }
8039 if (P->getIncomingBlock(I) == InBB)
8040 Operands[I][Idx] = P->getIncomingValue(I);
8041 else
8042 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
8043 }
8044 }
8045 return;
8046 }
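// Slow path for wide PHIs: group incoming-value indices by their incoming block
// so duplicated predecessor blocks are processed once and their operand vectors
// are shared.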
8047 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
8048 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
8049 BasicBlock *InBB = Main->getIncomingBlock(I);
8050 if (!DT.isReachableFromEntry(InBB)) {
8051 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
8052 continue;
8053 }
8054 Blocks.try_emplace(InBB).first->second.push_back(I);
8055 }
8056 for (auto [Idx, V] : enumerate(Phis)) {
8057 if (isa<PoisonValue>(V)) {
8058 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
8059 Operands[I][Idx] = V;
8060 continue;
8061 }
8062 auto *P = cast<PHINode>(V);
8063 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
8064 BasicBlock *InBB = P->getIncomingBlock(I);
8065 if (InBB == Main->getIncomingBlock(I)) {
8066 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8067 continue;
8068 Operands[I][Idx] = P->getIncomingValue(I);
8069 continue;
8070 }
8071 auto It = Blocks.find(InBB);
8072 if (It == Blocks.end())
8073 continue;
8074 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
8075 }
8076 }
8077 for (const auto &P : Blocks) {
8078 if (P.getSecond().size() <= 1)
8079 continue;
8080 unsigned BasicI = P.getSecond().front();
8081 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
8082 assert(all_of(enumerate(Operands[I]),
8083 [&](const auto &Data) {
8084 return !Data.value() ||
8085 Data.value() == Operands[BasicI][Data.index()];
8086 }) &&
8087 "Expected empty operands list.");
8088 Operands[I] = Operands[BasicI];
8089 }
8090 }
8091 }
8092 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
8093};
8094} // namespace
8095
8096void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8097 const EdgeInfo &UserTreeIdx,
8098 unsigned InterleaveFactor) {
8099 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8100
8101 SmallVector<int> ReuseShuffleIndices;
8102 SmallVector<Value *> UniqueValues;
8103 SmallVector<Value *> NonUniqueValueVL;
8104 auto TryToFindDuplicates = [&](const InstructionsState &S,
8105 bool DoNotFail = false) {
8106 // Check that every instruction appears once in this bundle.
8107 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8108 for (Value *V : VL) {
8109 if (isConstant(V)) {
8110 ReuseShuffleIndices.emplace_back(
8111 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8112 UniqueValues.emplace_back(V);
8113 continue;
8114 }
8115 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8116 ReuseShuffleIndices.emplace_back(Res.first->second);
8117 if (Res.second)
8118 UniqueValues.emplace_back(V);
8119 }
8120 size_t NumUniqueScalarValues = UniqueValues.size();
8121 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8122 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8123 if (NumUniqueScalarValues == VL.size() &&
8124 (VectorizeNonPowerOf2 || IsFullVectors)) {
8125 ReuseShuffleIndices.clear();
8126 } else {
8127 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8128 if ((UserTreeIdx.UserTE &&
8129 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8130 !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
8131 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8132 "for nodes with padding.\n");
8133 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8134 return false;
8135 }
8136 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8137 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8138 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8139 return isa<UndefValue>(V) || !isConstant(V);
8140 }))) {
8141 if (DoNotFail && UniquePositions.size() > 1 &&
8142 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8143 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8144 // Find the number of elements, which forms full vectors.
8145 unsigned PWSz = getFullVectorNumberOfElements(
8146 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8147 if (PWSz == VL.size()) {
8148 ReuseShuffleIndices.clear();
8149 } else {
8150 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8151 NonUniqueValueVL.append(
8152 PWSz - UniqueValues.size(),
8153 PoisonValue::get(UniqueValues.front()->getType()));
8154 // Check that the operations extended with poison are still valid for
8155 // vectorization (div/rem are not allowed).
8156 if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
8157 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8158 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8159 return false;
8160 }
8161 VL = NonUniqueValueVL;
8162 }
8163 return true;
8164 }
8165 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8166 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8167 return false;
8168 }
8169 VL = UniqueValues;
8170 }
8171 return true;
8172 };
8173
8174 InstructionsState S = getSameOpcode(VL, *TLI);
8175
8176 // Don't go into catchswitch blocks, which can happen with PHIs.
8177 // Such blocks can only have PHIs and the catchswitch. There is no
8178 // place to insert a shuffle if we need to, so just avoid that issue.
8179 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8180 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8181 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8182 return;
8183 }
8184
8185 // Check if this is a duplicate of another entry.
8186 if (S) {
8187 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
8188 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
8189 if (E->isSame(VL)) {
8190 // Record the reuse of the tree node.
8191 E->UserTreeIndices.push_back(UserTreeIdx);
8192 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8193 << ".\n");
8194 return;
8195 }
8196 SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end());
8197 if (all_of(VL, [&](Value *V) {
8198 return isa<PoisonValue>(V) || Values.contains(V);
8199 })) {
8200 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8201 if (TryToFindDuplicates(S))
8202 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8203 ReuseShuffleIndices);
8204 return;
8205 }
8206 }
8207 }
8208
8209 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8210 // a load), in which case peek through to include it in the tree, without
8211 // ballooning over-budget.
8212 if (Depth >= RecursionMaxDepth &&
8213 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8214 (match(S.getMainOp(), m_Load(m_Value())) ||
8215 all_of(VL, [&S](const Value *I) {
8216 return match(I,
8217 m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
8218 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8219 })))) {
8220 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8221 if (TryToFindDuplicates(S))
8222 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8223 ReuseShuffleIndices);
8224 return;
8225 }
8226
8227 // Don't handle scalable vectors
8228 if (S && S.getOpcode() == Instruction::ExtractElement &&
8229 isa<ScalableVectorType>(
8230 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8231 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8232 if (TryToFindDuplicates(S))
8233 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8234 ReuseShuffleIndices);
8235 return;
8236 }
8237
8238 // Don't handle vectors.
8239 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8240 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8241 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8242 return;
8243 }
8244
8245 // If all of the operands are identical or constant we have a simple solution.
8246 // If we deal with insert/extract instructions, they all must have constant
8247 // indices, otherwise we should gather them, not try to vectorize.
8248 // If alternate op node with 2 elements with gathered operands - do not
8249 // vectorize.
8250 auto &&NotProfitableForVectorization = [&S, this,
8251 Depth](ArrayRef<Value *> VL) {
8252 if (!S || !S.isAltShuffle() || VL.size() > 2)
8253 return false;
8254 if (VectorizableTree.size() < MinTreeSize)
8255 return false;
8256 if (Depth >= RecursionMaxDepth - 1)
8257 return true;
8258 // Check if all operands are extracts, part of vector node or can build a
8259 // regular vectorize node.
8260 SmallVector<unsigned, 8> InstsCount;
8261 for (Value *V : VL) {
8262 auto *I = cast<Instruction>(V);
8263 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8264 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8265 }));
8266 }
8267 bool IsCommutative =
8268 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8269 if ((IsCommutative &&
8270 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8271 (!IsCommutative &&
8272 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8273 return true;
8274 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8275 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8276 auto *I1 = cast<Instruction>(VL.front());
8277 auto *I2 = cast<Instruction>(VL.back());
8278 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8279 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8280 I2->getOperand(Op));
8281 if (static_cast<unsigned>(count_if(
8282 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8283 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8284 })) >= S.getMainOp()->getNumOperands() / 2)
8285 return false;
8286 if (S.getMainOp()->getNumOperands() > 2)
8287 return true;
8288 if (IsCommutative) {
8289 // Check permuted operands.
8290 Candidates.clear();
8291 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8292 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8293 I2->getOperand((Op + 1) % E));
8294 if (any_of(
8295 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8296 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8297 }))
8298 return false;
8299 }
8300 return true;
8301 };
8302 SmallVector<unsigned> SortedIndices;
8303 BasicBlock *BB = nullptr;
8304 bool IsScatterVectorizeUserTE =
8305 UserTreeIdx.UserTE &&
8306 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8307 bool AreAllSameBlock = S && allSameBlock(VL);
8308 bool AreScatterAllGEPSameBlock =
8309 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8310 VL.size() > 2 &&
8311 all_of(VL,
8312 [&BB](Value *V) {
8313 auto *I = dyn_cast<GetElementPtrInst>(V);
8314 if (!I)
8315 return doesNotNeedToBeScheduled(V);
8316 if (!BB)
8317 BB = I->getParent();
8318 return BB == I->getParent() && I->getNumOperands() == 2;
8319 }) &&
8320 BB &&
8321 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8322 SortedIndices));
8323 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8324 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
8325 (S &&
8326 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8327 S.getMainOp()) &&
8328 !all_of(VL, isVectorLikeInstWithConstOps)) ||
8329 NotProfitableForVectorization(VL)) {
8330 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8331 if (TryToFindDuplicates(S))
8332 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8333 ReuseShuffleIndices);
8334 return;
8335 }
8336
8337 // Don't vectorize ephemeral values.
8338 if (S && !EphValues.empty()) {
8339 for (Value *V : VL) {
8340 if (EphValues.count(V)) {
8341 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8342 << ") is ephemeral.\n");
8343 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8344 return;
8345 }
8346 }
8347 }
8348
8349 // We now know that this is a vector of instructions of the same type from
8350 // the same block.
8351
8352 // Check that none of the instructions in the bundle are already in the tree.
8353 for (Value *V : VL) {
8354 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8355 doesNotNeedToBeScheduled(V))
8356 continue;
8357 if (isVectorized(V)) {
8358 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8359 << ") is already in tree.\n");
8360 if (TryToFindDuplicates(S))
8361 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8362 ReuseShuffleIndices);
8363 return;
8364 }
8365 }
8366
8367 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8368 if (UserIgnoreList && !UserIgnoreList->empty()) {
8369 for (Value *V : VL) {
8370 if (UserIgnoreList->contains(V)) {
8371 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8372 if (TryToFindDuplicates(S))
8373 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8374 ReuseShuffleIndices);
8375 return;
8376 }
8377 }
8378 }
8379
8380 // Special processing for sorted pointers for ScatterVectorize node with
8381 // constant indices only.
8382 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8383 assert(VL.front()->getType()->isPointerTy() &&
8384 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8385 "Expected pointers only.");
8386 // Reset S to make it GetElementPtr kind of node.
8387 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8388 assert(It != VL.end() && "Expected at least one GEP.");
8389 S = getSameOpcode(*It, *TLI);
8390 }
8391
8392 // Check that all of the users of the scalars that we want to vectorize are
8393 // schedulable.
8394 Instruction *VL0 = S.getMainOp();
8395 BB = VL0->getParent();
8396
8397 if (S &&
8398 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8399 !DT->isReachableFromEntry(BB))) {
8400 // Don't go into unreachable blocks. They may contain instructions with
8401 // dependency cycles which confuse the final scheduling.
8402 // Do not vectorize EH and non-returning blocks, not profitable in most
8403 // cases.
8404 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8405 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8406 return;
8407 }
8408
8409 // Check that every instruction appears once in this bundle.
8410 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8411 return;
8412
8413 // Perform specific checks for each particular instruction kind.
8414 OrdersType CurrentOrder;
8415 SmallVector<Value *> PointerOps;
8416 TreeEntry::EntryState State = getScalarsVectorizationState(
8417 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8418 if (State == TreeEntry::NeedToGather) {
8419 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8420 ReuseShuffleIndices);
8421 return;
8422 }
8423
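// Lazily create the per-basic-block scheduling state the first time a bundle
// from this block is encountered.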
8424 auto &BSRef = BlocksSchedules[BB];
8425 if (!BSRef)
8426 BSRef = std::make_unique<BlockScheduling>(BB);
8427
8428 BlockScheduling &BS = *BSRef;
8429
8430 std::optional<ScheduleData *> Bundle =
8431 BS.tryScheduleBundle(UniqueValues, this, S);
8432#ifdef EXPENSIVE_CHECKS
8433 // Make sure we didn't break any internal invariants
8434 BS.verify();
8435#endif
8436 if (!Bundle) {
8437 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8438 assert((!BS.getScheduleData(VL0) ||
8439 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8440 "tryScheduleBundle should cancelScheduling on failure");
8441 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8442 ReuseShuffleIndices);
8443 NonScheduledFirst.insert(VL.front());
8444 if (S.getOpcode() == Instruction::Load &&
8445 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8446 registerNonVectorizableLoads(VL);
8447 return;
8448 }
8449 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8450
8451 unsigned ShuffleOrOp =
8452 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8453 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8454 // Postpone creation of PHI operand nodes.
8455 SmallVector<unsigned> PHIOps;
8456 for (unsigned I : seq<unsigned>(Operands.size())) {
8457 ArrayRef<Value *> Op = Operands[I];
8458 if (Op.empty())
8459 continue;
8460 InstructionsState S = getSameOpcode(Op, *TLI);
8461 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8462 buildTree_rec(Op, Depth + 1, {TE, I});
8463 else
8464 PHIOps.push_back(I);
8465 }
8466 for (unsigned I : PHIOps)
8467 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8468 };
8469 switch (ShuffleOrOp) {
8470 case Instruction::PHI: {
8471 auto *PH = cast<PHINode>(VL0);
8472
8473 TreeEntry *TE =
8474 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8475 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8476 TE->dump());
8477
8478 // Keeps the reordered operands to avoid code duplication.
8479 PHIHandler Handler(*DT, PH, VL);
8480 Handler.buildOperands();
8481 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8482 TE->setOperand(I, Handler.getOperands(I));
8483 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8484 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8485 Operands[I] = Handler.getOperands(I);
8486 CreateOperandNodes(TE, Operands);
8487 return;
8488 }
8489 case Instruction::ExtractValue:
8490 case Instruction::ExtractElement: {
8491 if (CurrentOrder.empty()) {
8492 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8493 } else {
8494 LLVM_DEBUG({
8495 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8496 "with order";
8497 for (unsigned Idx : CurrentOrder)
8498 dbgs() << " " << Idx;
8499 dbgs() << "\n";
8500 });
8501 fixupOrderingIndices(CurrentOrder);
8502 }
8503 // Insert new order with initial value 0, if it does not exist,
8504 // otherwise return the iterator to the existing one.
8505 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8506 ReuseShuffleIndices, CurrentOrder);
8507 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8508 "(ExtractValueInst/ExtractElementInst).\n";
8509 TE->dump());
8510 // This is a special case, as it does not gather, but at the same time
8511 // we are not extending buildTree_rec() towards the operands.
8512 TE->setOperand(*this);
8513 return;
8514 }
8515 case Instruction::InsertElement: {
8516 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8517
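// Visit the inserts in increasing element-index order (min-priority queue keyed
// on the constant lane index) to recover the order in which lanes are filled.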
8518 auto OrdCompare = [](const std::pair<int, int> &P1,
8519 const std::pair<int, int> &P2) {
8520 return P1.first > P2.first;
8521 };
8522 std::priority_queue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8523 decltype(OrdCompare)>
8524 Indices(OrdCompare);
8525 for (int I = 0, E = VL.size(); I < E; ++I) {
8526 unsigned Idx = *getElementIndex(VL[I]);
8527 Indices.emplace(Idx, I);
8528 }
8529 OrdersType CurrentOrder(VL.size(), VL.size());
8530 bool IsIdentity = true;
8531 for (int I = 0, E = VL.size(); I < E; ++I) {
8532 CurrentOrder[Indices.top().second] = I;
8533 IsIdentity &= Indices.top().second == I;
8534 Indices.pop();
8535 }
8536 if (IsIdentity)
8537 CurrentOrder.clear();
8538 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8539 {}, CurrentOrder);
8540 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8541 TE->dump());
8542
8543 TE->setOperand(*this);
8544 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8545 return;
8546 }
8547 case Instruction::Load: {
8548 // Check that a vectorized load would load the same memory as a scalar
8549 // load. For example, we don't want to vectorize loads that are smaller
8550 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8551 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8552 // from such a struct, we read/write packed bits disagreeing with the
8553 // unvectorized version.
8554 TreeEntry *TE = nullptr;
8555 fixupOrderingIndices(CurrentOrder);
8556 switch (State) {
8557 case TreeEntry::Vectorize:
8558 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8559 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8560 if (CurrentOrder.empty())
8561 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8562 TE->dump());
8563 else
8564 LLVM_DEBUG(dbgs()
8565 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8566 TE->dump());
8567 break;
8568 case TreeEntry::StridedVectorize:
8569 // Vectorizing non-consecutive loads as strided loads.
8570 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8571 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8572 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8573 TE->dump());
8574 break;
8575 case TreeEntry::ScatterVectorize:
8576 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8577 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8578 UserTreeIdx, ReuseShuffleIndices);
8579 LLVM_DEBUG(
8580 dbgs()
8581 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8582 TE->dump());
8583 break;
8584 case TreeEntry::CombinedVectorize:
8585 case TreeEntry::NeedToGather:
8586 llvm_unreachable("Unexpected loads state.");
8587 }
8588 TE->setOperand(*this);
8589 if (State == TreeEntry::ScatterVectorize)
8590 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8591 return;
8592 }
8593 case Instruction::ZExt:
8594 case Instruction::SExt:
8595 case Instruction::FPToUI:
8596 case Instruction::FPToSI:
8597 case Instruction::FPExt:
8598 case Instruction::PtrToInt:
8599 case Instruction::IntToPtr:
8600 case Instruction::SIToFP:
8601 case Instruction::UIToFP:
8602 case Instruction::Trunc:
8603 case Instruction::FPTrunc:
8604 case Instruction::BitCast: {
8605 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8606 std::make_pair(std::numeric_limits<unsigned>::min(),
8607 std::numeric_limits<unsigned>::max()));
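// Record the widest/narrowest cast bit widths seen across ext/trunc nodes; the
// recorded bounds are consumed by the later bit-width minimization analysis.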
8608 if (ShuffleOrOp == Instruction::ZExt ||
8609 ShuffleOrOp == Instruction::SExt) {
8610 CastMaxMinBWSizes = std::make_pair(
8611 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8612 PrevMaxBW),
8613 std::min<unsigned>(
8614 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8615 PrevMinBW));
8616 } else if (ShuffleOrOp == Instruction::Trunc) {
8617 CastMaxMinBWSizes = std::make_pair(
8618 std::max<unsigned>(
8619 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8620 PrevMaxBW),
8621 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8622 PrevMinBW));
8623 }
8624 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8625 ReuseShuffleIndices);
8626 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8627 TE->dump());
8628
8629 TE->setOperand(*this);
8630 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8631 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8632 if (ShuffleOrOp == Instruction::Trunc) {
8633 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8634 } else if (ShuffleOrOp == Instruction::SIToFP ||
8635 ShuffleOrOp == Instruction::UIToFP) {
8636 unsigned NumSignBits =
8637 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8638 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8639 APInt Mask = DB->getDemandedBits(OpI);
8640 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8641 }
8642 if (NumSignBits * 2 >=
8643 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8644 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8645 }
8646 return;
8647 }
8648 case Instruction::ICmp:
8649 case Instruction::FCmp: {
8650 // Check that all of the compares have the same predicate.
8651 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8652 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8653 ReuseShuffleIndices);
8654 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8655 TE->dump());
8656
8657 ValueList Left, Right;
8658 VLOperands Ops(VL, S, *this);
8659 if (cast<CmpInst>(VL0)->isCommutative()) {
8660 // Commutative predicate - collect + sort operands of the instructions
8661 // so that each side is more likely to have the same opcode.
8663 "Commutative Predicate mismatch");
8664 Ops.reorder();
8665 Left = Ops.getVL(0);
8666 Right = Ops.getVL(1);
8667 } else {
8668 // Collect operands - commute if it uses the swapped predicate.
8669 for (Value *V : VL) {
8670 if (isa<PoisonValue>(V)) {
8671 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8672 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8673 continue;
8674 }
8675 auto *Cmp = cast<CmpInst>(V);
8676 Value *LHS = Cmp->getOperand(0);
8677 Value *RHS = Cmp->getOperand(1);
8678 if (Cmp->getPredicate() != P0)
8679 std::swap(LHS, RHS);
8680 Left.push_back(LHS);
8681 Right.push_back(RHS);
8682 }
8683 }
8684 TE->setOperand(0, Left);
8685 TE->setOperand(1, Right);
8686 buildTree_rec(Left, Depth + 1, {TE, 0});
8687 buildTree_rec(Right, Depth + 1, {TE, 1});
8688 if (ShuffleOrOp == Instruction::ICmp) {
8689 unsigned NumSignBits0 =
8690 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8691 if (NumSignBits0 * 2 >=
8692 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8693 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8694 unsigned NumSignBits1 =
8695 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8696 if (NumSignBits1 * 2 >=
8697 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8698 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8699 }
8700 return;
8701 }
8702 case Instruction::Select:
8703 case Instruction::FNeg:
8704 case Instruction::Add:
8705 case Instruction::FAdd:
8706 case Instruction::Sub:
8707 case Instruction::FSub:
8708 case Instruction::Mul:
8709 case Instruction::FMul:
8710 case Instruction::UDiv:
8711 case Instruction::SDiv:
8712 case Instruction::FDiv:
8713 case Instruction::URem:
8714 case Instruction::SRem:
8715 case Instruction::FRem:
8716 case Instruction::Shl:
8717 case Instruction::LShr:
8718 case Instruction::AShr:
8719 case Instruction::And:
8720 case Instruction::Or:
8721 case Instruction::Xor:
8722 case Instruction::Freeze: {
8723 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8724 ReuseShuffleIndices);
8725 LLVM_DEBUG(
8726 dbgs() << "SLP: added a new TreeEntry "
8727 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8728 TE->dump());
8729
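// For commutative binary operators, let setOperand reorder the per-lane operands
// so that similar values/opcodes line up in the same operand position.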
8730 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8731 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8732 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8733 return;
8734 }
8735 case Instruction::GetElementPtr: {
8736 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8737 ReuseShuffleIndices);
8738 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8739 TE->dump());
8740 SmallVector<ValueList, 2> Operands(2);
8741 // Prepare the operand vector for pointer operands.
8742 for (Value *V : VL) {
8743 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8744 if (!GEP) {
8745 Operands.front().push_back(V);
8746 continue;
8747 }
8748 Operands.front().push_back(GEP->getPointerOperand());
8749 }
8750 TE->setOperand(0, Operands.front());
8751 // Need to cast all indices to the same type before vectorization to
8752 // avoid crash.
8753 // Required to be able to find correct matches between different gather
8754 // nodes and reuse the vectorized values rather than trying to gather them
8755 // again.
8756 int IndexIdx = 1;
8757 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8758 Type *Ty = all_of(VL,
8759 [VL0Ty, IndexIdx](Value *V) {
8760 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8761 if (!GEP)
8762 return true;
8763 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8764 })
8765 ? VL0Ty
8766 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8767 ->getPointerOperandType()
8768 ->getScalarType());
8769 // Prepare the operand vector.
8770 for (Value *V : VL) {
8771 auto *I = dyn_cast<GetElementPtrInst>(V);
8772 if (!I) {
8773 Operands.back().push_back(
8774 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8775 continue;
8776 }
8777 auto *Op = I->getOperand(IndexIdx);
8778 auto *CI = dyn_cast<ConstantInt>(Op);
8779 if (!CI)
8780 Operands.back().push_back(Op);
8781 else
8782 Operands.back().push_back(ConstantFoldIntegerCast(
8783 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8784 }
8785 TE->setOperand(IndexIdx, Operands.back());
8786
8787 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8788 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8789 return;
8790 }
8791 case Instruction::Store: {
8792 bool Consecutive = CurrentOrder.empty();
8793 if (!Consecutive)
8794 fixupOrderingIndices(CurrentOrder);
8795 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8796 ReuseShuffleIndices, CurrentOrder);
8797 if (Consecutive)
8798 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8799 TE->dump());
8800 else
8801 LLVM_DEBUG(
8802 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8803 TE->dump());
8804 TE->setOperand(*this);
8805 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8806 return;
8807 }
8808 case Instruction::Call: {
8809 // Check if the calls are all to the same vectorizable intrinsic or
8810 // library function.
8811 CallInst *CI = cast<CallInst>(VL0);
8813
8814 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8815 ReuseShuffleIndices);
8816 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8817 TE->dump());
8818 TE->setOperand(*this, isCommutative(VL0));
8819 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8820 // For scalar operands there is no need to create an entry since they are
8821 // not vectorized.
8822 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
8823 continue;
8824 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8825 }
8826 return;
8827 }
8828 case Instruction::ShuffleVector: {
8829 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8830 ReuseShuffleIndices);
8831 if (S.isAltShuffle()) {
8832 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8833 TE->dump());
8834 } else {
8835 assert(SLPReVec && "Only supported by REVEC.");
8836 LLVM_DEBUG(
8837 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8838 TE->dump());
8839 }
8840
8841 // Reorder operands if reordering would enable vectorization.
8842 auto *CI = dyn_cast<CmpInst>(VL0);
8843 if (CI && any_of(VL, [](Value *V) {
8844 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8845 })) {
8846 auto *MainCI = cast<CmpInst>(S.getMainOp());
8847 auto *AltCI = cast<CmpInst>(S.getAltOp());
8848 CmpInst::Predicate MainP = MainCI->getPredicate();
8849 CmpInst::Predicate AltP = AltCI->getPredicate();
8850 assert(MainP != AltP &&
8851 "Expected different main/alternate predicates.");
8852 ValueList Left, Right;
8853 // Collect operands - commute if it uses the swapped predicate or
8854 // alternate operation.
8855 for (Value *V : VL) {
8856 if (isa<PoisonValue>(V)) {
8857 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8858 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8859 continue;
8860 }
8861 auto *Cmp = cast<CmpInst>(V);
8862 Value *LHS = Cmp->getOperand(0);
8863 Value *RHS = Cmp->getOperand(1);
8864
8865 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8866 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8867 std::swap(LHS, RHS);
8868 } else {
8869 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8870 std::swap(LHS, RHS);
8871 }
8872 Left.push_back(LHS);
8873 Right.push_back(RHS);
8874 }
8875 TE->setOperand(0, Left);
8876 TE->setOperand(1, Right);
8877 buildTree_rec(Left, Depth + 1, {TE, 0});
8878 buildTree_rec(Right, Depth + 1, {TE, 1});
8879 return;
8880 }
8881
8882 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8883 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8884 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8885 return;
8886 }
8887 default:
8888 break;
8889 }
8890 llvm_unreachable("Unexpected vectorization of the instructions.");
8891}
8892
8893 unsigned BoUpSLP::canMapToVector(Type *T) const {
8894 unsigned N = 1;
8895 Type *EltTy = T;
8896
8897 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8898 if (EltTy->isEmptyTy())
8899 return 0;
8900 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8901 // Check that struct is homogeneous.
8902 for (const auto *Ty : ST->elements())
8903 if (Ty != *ST->element_begin())
8904 return 0;
8905 N *= ST->getNumElements();
8906 EltTy = *ST->element_begin();
8907 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8908 N *= AT->getNumElements();
8909 EltTy = AT->getElementType();
8910 } else {
8911 auto *VT = cast<FixedVectorType>(EltTy);
8912 N *= VT->getNumElements();
8913 EltTy = VT->getElementType();
8914 }
8915 }
8916
8917 if (!isValidElementType(EltTy))
8918 return 0;
8919 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8920 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8921 VTSize != DL->getTypeStoreSizeInBits(T))
8922 return 0;
8923 return N;
8924}
8925
8926bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
8927 SmallVectorImpl<unsigned> &CurrentOrder,
8928 bool ResizeAllowed) const {
8929 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8930 assert(It != VL.end() && "Expected at least one extract instruction.");
8931 auto *E0 = cast<Instruction>(*It);
8932 assert(
8933 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8934 "Invalid opcode");
8935 // Check if all of the extracts come from the same vector and from the
8936 // correct offset.
8937 Value *Vec = E0->getOperand(0);
8938
8939 CurrentOrder.clear();
8940
8941 // We have to extract from a vector/aggregate with the same number of elements.
8942 unsigned NElts;
8943 if (E0->getOpcode() == Instruction::ExtractValue) {
8944 NElts = canMapToVector(Vec->getType());
8945 if (!NElts)
8946 return false;
8947 // Check if load can be rewritten as load of vector.
8948 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8949 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8950 return false;
8951 } else {
8952 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8953 }
8954
8955 unsigned E = VL.size();
8956 if (!ResizeAllowed && NElts != E)
8957 return false;
8958 SmallVector<int> Indices(E, PoisonMaskElem);
8959 unsigned MinIdx = NElts, MaxIdx = 0;
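// Record the extract index of each lane and track the min/max indices so we can
// later verify the extracts form a window no wider than the bundle size.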
8960 for (auto [I, V] : enumerate(VL)) {
8961 auto *Inst = dyn_cast<Instruction>(V);
8962 if (!Inst)
8963 continue;
8964 if (Inst->getOperand(0) != Vec)
8965 return false;
8966 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8967 if (isa<UndefValue>(EE->getIndexOperand()))
8968 continue;
8969 std::optional<unsigned> Idx = getExtractIndex(Inst);
8970 if (!Idx)
8971 return false;
8972 const unsigned ExtIdx = *Idx;
8973 if (ExtIdx >= NElts)
8974 continue;
8975 Indices[I] = ExtIdx;
8976 if (MinIdx > ExtIdx)
8977 MinIdx = ExtIdx;
8978 if (MaxIdx < ExtIdx)
8979 MaxIdx = ExtIdx;
8980 }
8981 if (MaxIdx - MinIdx + 1 > E)
8982 return false;
8983 if (MaxIdx + 1 <= E)
8984 MinIdx = 0;
8985
8986 // Check that all of the indices extract from the correct offset.
8987 bool ShouldKeepOrder = true;
8988 // Assign to all items the initial value E so we can check if the extract
8989 // instruction index was used already.
8990 // Also, later we can check that all the indices are used and we have a
8991 // consecutive access in the extract instructions, by checking that no
8992 // element of CurrentOrder still has value E.
8993 CurrentOrder.assign(E, E);
8994 for (unsigned I = 0; I < E; ++I) {
8995 if (Indices[I] == PoisonMaskElem)
8996 continue;
8997 const unsigned ExtIdx = Indices[I] - MinIdx;
8998 if (CurrentOrder[ExtIdx] != E) {
8999 CurrentOrder.clear();
9000 return false;
9001 }
9002 ShouldKeepOrder &= ExtIdx == I;
9003 CurrentOrder[ExtIdx] = I;
9004 }
9005 if (ShouldKeepOrder)
9006 CurrentOrder.clear();
9007
9008 return ShouldKeepOrder;
9009}
9010
9011bool BoUpSLP::areAllUsersVectorized(
9012 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
9013 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
9014 all_of(I->users(), [this](User *U) {
9015 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
9016 (isa<ExtractElementInst>(U) && MustGather.contains(U));
9017 });
9018}
9019
9020static std::pair<InstructionCost, InstructionCost>
9021 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9022 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9023 ArrayRef<Type *> ArgTys) {
9024 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9025
9026 // Calculate the cost of the scalar and vector calls.
9027 FastMathFlags FMF;
9028 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9029 FMF = FPCI->getFastMathFlags();
9030 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
9031 auto IntrinsicCost =
9032 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9033
9034 auto Shape = VFShape::get(CI->getFunctionType(),
9035 ElementCount::getFixed(VecTy->getNumElements()),
9036 false /*HasGlobalPred*/);
9037 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9038 auto LibCost = IntrinsicCost;
9039 if (!CI->isNoBuiltin() && VecFunc) {
9040 // Calculate the cost of the vector library call.
9041 // If the corresponding vector call is cheaper, return its cost.
9042 LibCost =
9043 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9044 }
9045 return {IntrinsicCost, LibCost};
9046}
9047
9048void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
9049 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
9050 SmallVectorImpl<Value *> *OpScalars,
9051 SmallVectorImpl<Value *> *AltScalars) const {
9052 unsigned Sz = Scalars.size();
9053 Mask.assign(Sz, PoisonMaskElem);
9054 SmallVector<int> OrderMask;
9055 if (!ReorderIndices.empty())
9056 inversePermutation(ReorderIndices, OrderMask);
9057 for (unsigned I = 0; I < Sz; ++I) {
9058 unsigned Idx = I;
9059 if (!ReorderIndices.empty())
9060 Idx = OrderMask[I];
9061 if (isa<PoisonValue>(Scalars[Idx]))
9062 continue;
9063 auto *OpInst = cast<Instruction>(Scalars[Idx]);
9064 if (IsAltOp(OpInst)) {
9065 Mask[I] = Sz + Idx;
9066 if (AltScalars)
9067 AltScalars->push_back(OpInst);
9068 } else {
9069 Mask[I] = Idx;
9070 if (OpScalars)
9071 OpScalars->push_back(OpInst);
9072 }
9073 }
9074 if (!ReuseShuffleIndices.empty()) {
9075 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9076 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9077 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9078 });
9079 Mask.swap(NewMask);
9080 }
9081}
9082
9083 static bool isAlternateInstruction(const Instruction *I,
9084 const Instruction *MainOp,
9085 const Instruction *AltOp,
9086 const TargetLibraryInfo &TLI) {
9087 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9088 auto *AltCI = cast<CmpInst>(AltOp);
9089 CmpInst::Predicate MainP = MainCI->getPredicate();
9090 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
9091 assert(MainP != AltP && "Expected different main/alternate predicates.");
9092 auto *CI = cast<CmpInst>(I);
9093 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9094 return false;
9095 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9096 return true;
9097 CmpInst::Predicate P = CI->getPredicate();
9098 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
9099
9100 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9101 "CmpInst expected to match either main or alternate predicate or "
9102 "their swap.");
9103 return MainP != P && MainP != SwappedP;
9104 }
9105 return I->getOpcode() == AltOp->getOpcode();
9106}
9107
9108TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9109 assert(!Ops.empty());
9110 const auto *Op0 = Ops.front();
9111
9112 const bool IsConstant = all_of(Ops, [](Value *V) {
9113 // TODO: We should allow undef elements here
9114 return isConstant(V) && !isa<UndefValue>(V);
9115 });
9116 const bool IsUniform = all_of(Ops, [=](Value *V) {
9117 // TODO: We should allow undef elements here
9118 return V == Op0;
9119 });
9120 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9121 // TODO: We should allow undef elements here
9122 if (auto *CI = dyn_cast<ConstantInt>(V))
9123 return CI->getValue().isPowerOf2();
9124 return false;
9125 });
9126 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9127 // TODO: We should allow undef elements here
9128 if (auto *CI = dyn_cast<ConstantInt>(V))
9129 return CI->getValue().isNegatedPowerOf2();
9130 return false;
9131 });
9132
9133 TTI::OperandValueKind VK = TTI::OK_AnyValue;
9134 if (IsConstant && IsUniform)
9135 VK = TTI::OK_UniformConstantValue;
9136 else if (IsConstant)
9137 VK = TTI::OK_NonUniformConstantValue;
9138 else if (IsUniform)
9139 VK = TTI::OK_UniformValue;
9140
9141 TTI::OperandValueProperties VP = TTI::OP_None;
9142 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9143 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9144
9145 return {VK, VP};
9146}
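// Illustrative examples (assumed operand lists, not from the LLVM source):
// Ops = {i32 8, i32 8, i32 8} yields {OK_UniformConstantValue, OP_PowerOf2},
// Ops = {i32 3, i32 5} yields {OK_NonUniformConstantValue, OP_None}, and
// Ops = {%x, %x} yields {OK_UniformValue, OP_None}.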
9147
9148namespace {
9149/// The base class for shuffle instruction emission and shuffle cost estimation.
9150class BaseShuffleAnalysis {
9151protected:
9152 Type *ScalarTy = nullptr;
9153
9154 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9155
9156 /// V is expected to be a vectorized value.
9157 /// When REVEC is disabled, there is no difference between VF and
9158 /// VNumElements.
9159 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9160 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9161 /// of 8.
9162 unsigned getVF(Value *V) const {
9163 assert(V && "V cannot be nullptr");
9164 assert(isa<FixedVectorType>(V->getType()) &&
9165 "V does not have FixedVectorType");
9166 assert(ScalarTy && "ScalarTy cannot be nullptr");
9167 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9168 unsigned VNumElements =
9169 cast<FixedVectorType>(V->getType())->getNumElements();
9170 assert(VNumElements > ScalarTyNumElements &&
9171 "the number of elements of V is not large enough");
9172 assert(VNumElements % ScalarTyNumElements == 0 &&
9173 "the number of elements of V is not a vectorized value");
9174 return VNumElements / ScalarTyNumElements;
9175 }
9176
9177 /// Checks if the mask is an identity mask.
9178 /// \param IsStrict if true, the function returns false if the mask size does
9179 /// not match the vector size.
9180 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9181 bool IsStrict) {
9182 int Limit = Mask.size();
9183 int VF = VecTy->getNumElements();
9184 int Index = -1;
9185 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9186 return true;
9187 if (!IsStrict) {
9188 // Consider extract subvector starting from index 0.
9189 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9190 Index == 0)
9191 return true;
9192 // All VF-size submasks are identity (e.g.
9193 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9194 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9195 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9196 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9197 ShuffleVectorInst::isIdentityMask(Slice, VF);
9198 }))
9199 return true;
9200 }
9201 return false;
9202 }
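// Illustrative examples (assumed masks, not from the LLVM source): for a
// 4-element vector, <0,1,2,3> is an identity in both modes; <0,1> is rejected
// when IsStrict is true (size mismatch) but accepted otherwise as an extract
// of the low subvector; <poison,poison,poison,poison,0,1,2,3> is accepted in
// the non-strict mode because every 4-wide submask is either all-poison or an
// identity.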
9203
9204 /// Tries to combine 2 different masks into single one.
9205 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9206 /// change the size of the vector, \p LocalVF is the original size of the
9207 /// shuffled vector.
9208 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9209 ArrayRef<int> ExtMask) {
9210 unsigned VF = Mask.size();
9211 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9212 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9213 if (ExtMask[I] == PoisonMaskElem)
9214 continue;
9215 int MaskedIdx = Mask[ExtMask[I] % VF];
9216 NewMask[I] =
9217 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9218 }
9219 Mask.swap(NewMask);
9220 }
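// Worked example (assumed masks, not from the LLVM source): with LocalVF = 2,
// Mask = <1, 0> (the mask of an already emitted shuffle) and ExtMask = <1, 0>
// (the new permutation applied on top of it), the combined mask becomes
// <0, 1>: the two swaps cancel out and the outer shuffle degenerates into an
// identity of the original operand.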
9221
9222 /// Looks through shuffles, trying to reduce the final number of shuffles in
9223 /// the code. The function looks through the previously emitted shuffle
9224 /// instructions and properly marks indices in the mask as undef.
9225 /// For example, given the code
9226 /// \code
9227 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9228 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9229 /// \endcode
9230 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9231 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9232 /// <0, 1, 2, 3> for the shuffle.
9233 /// If 2 operands are of different size, the smallest one will be resized and
9234 /// the mask recalculated properly.
9235 /// For example, given the code
9236 /// \code
9237 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9238 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9239 /// \endcode
9240 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9241 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9242 /// <0, 1, 2, 3> for the shuffle.
9243 /// So, it tries to transform permutations to simple vector merge, if
9244 /// possible.
9245 /// \param V The input vector which must be shuffled using the given \p Mask.
9246 /// If the better candidate is found, \p V is set to this best candidate
9247 /// vector.
9248 /// \param Mask The input mask for the shuffle. If the best candidate is found
9249 /// during looking-through-shuffles attempt, it is updated accordingly.
9250 /// \param SinglePermute true if the shuffle operation is originally a
9251 /// single-value-permutation. In this case the look-through-shuffles procedure
9252 /// may look for resizing shuffles as the best candidates.
9253 /// \return true if the shuffle results in the non-resizing identity shuffle
9254 /// (and thus can be ignored), false - otherwise.
9255 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9256 bool SinglePermute) {
9257 Value *Op = V;
9258 ShuffleVectorInst *IdentityOp = nullptr;
9259 SmallVector<int> IdentityMask;
9260 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9261 // Exit if not a fixed vector type or changing size shuffle.
9262 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9263 if (!SVTy)
9264 break;
9265 // Remember the identity or broadcast mask, if it is not a resizing
9266 // shuffle. If no better candidates are found, this Op and Mask will be
9267 // used in the final shuffle.
9268 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9269 if (!IdentityOp || !SinglePermute ||
9270 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9271 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9272 IdentityMask.size()))) {
9273 IdentityOp = SV;
9274 // Store the current mask in IdentityMask so we do not lose this info
9275 // later if IdentityOp is selected as the best candidate for the
9276 // permutation.
9277 IdentityMask.assign(Mask);
9278 }
9279 }
9280 // Remember the broadcast mask. If no better candidates are found, this Op
9281 // and Mask will be used in the final shuffle.
9282 // Zero splat can be used as identity too, since it might be used with
9283 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9284 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9285 // expensive, and the analysis finds out that the source vector is just a
9286 // broadcast, this original mask can be transformed to the identity mask <0,
9287 // 1, 2, 3>.
9288 // \code
9289 // %0 = shuffle %v, poison, zeroinitalizer
9290 // %res = shuffle %0, poison, <3, 1, 2, 0>
9291 // \endcode
9292 // may be transformed to
9293 // \code
9294 // %0 = shuffle %v, poison, zeroinitalizer
9295 // %res = shuffle %0, poison, <0, 1, 2, 3>
9296 // \endcode
9297 if (SV->isZeroEltSplat()) {
9298 IdentityOp = SV;
9299 IdentityMask.assign(Mask);
9300 }
9301 int LocalVF = Mask.size();
9302 if (auto *SVOpTy =
9303 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9304 LocalVF = SVOpTy->getNumElements();
9305 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9306 for (auto [Idx, I] : enumerate(Mask)) {
9307 if (I == PoisonMaskElem ||
9308 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9309 continue;
9310 ExtMask[Idx] = SV->getMaskValue(I);
9311 }
9312 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9313 SV->getOperand(0),
9314 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9315 .all();
9316 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9317 SV->getOperand(1),
9318 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9319 .all();
9320 if (!IsOp1Undef && !IsOp2Undef) {
9321 // Update mask and mark undef elems.
9322 for (int &I : Mask) {
9323 if (I == PoisonMaskElem)
9324 continue;
9325 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9326 PoisonMaskElem)
9327 I = PoisonMaskElem;
9328 }
9329 break;
9330 }
9331 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9332 combineMasks(LocalVF, ShuffleMask, Mask);
9333 Mask.swap(ShuffleMask);
9334 if (IsOp2Undef)
9335 Op = SV->getOperand(0);
9336 else
9337 Op = SV->getOperand(1);
9338 }
9339 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9340 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9341 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9342 if (IdentityOp) {
9343 V = IdentityOp;
9344 assert(Mask.size() == IdentityMask.size() &&
9345 "Expected masks of same sizes.");
9346 // Clear known poison elements.
9347 for (auto [I, Idx] : enumerate(Mask))
9348 if (Idx == PoisonMaskElem)
9349 IdentityMask[I] = PoisonMaskElem;
9350 Mask.swap(IdentityMask);
9351 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9352 return SinglePermute &&
9353 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9354 /*IsStrict=*/true) ||
9355 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9356 Shuffle->isZeroEltSplat() &&
9357 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9358 }
9359 V = Op;
9360 return false;
9361 }
9362 V = Op;
9363 return true;
9364 }
9365
9366 /// Smart shuffle instruction emission, walks through shuffles trees and
9367 /// tries to find the best matching vector for the actual shuffle
9368 /// instruction.
9369 template <typename T, typename ShuffleBuilderTy>
9370 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9371 ShuffleBuilderTy &Builder, Type *ScalarTy) {
9372 assert(V1 && "Expected at least one vector value.");
9373 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9374 SmallVector<int> NewMask(Mask);
9375 if (ScalarTyNumElements != 1) {
9376 assert(SLPReVec && "FixedVectorType is not expected.");
9377 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
9378 Mask = NewMask;
9379 }
9380 if (V2)
9381 Builder.resizeToMatch(V1, V2);
9382 int VF = Mask.size();
9383 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9384 VF = FTy->getNumElements();
9385 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9386 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9387 .all()) {
9388 // Peek through shuffles.
9389 Value *Op1 = V1;
9390 Value *Op2 = V2;
9391 int VF =
9392 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9393 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9394 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9395 for (int I = 0, E = Mask.size(); I < E; ++I) {
9396 if (Mask[I] < VF)
9397 CombinedMask1[I] = Mask[I];
9398 else
9399 CombinedMask2[I] = Mask[I] - VF;
9400 }
9401 Value *PrevOp1;
9402 Value *PrevOp2;
9403 do {
9404 PrevOp1 = Op1;
9405 PrevOp2 = Op2;
9406 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9407 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9408 // Check if we have 2 resizing shuffles - need to peek through operands
9409 // again.
9410 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9411 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9412 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9413 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9414 if (I == PoisonMaskElem)
9415 continue;
9416 ExtMask1[Idx] = SV1->getMaskValue(I);
9417 }
9418 SmallBitVector UseMask1 = buildUseMask(
9419 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9420 ->getNumElements(),
9421 ExtMask1, UseMask::SecondArg);
9422 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9423 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9424 if (I == PoisonMaskElem)
9425 continue;
9426 ExtMask2[Idx] = SV2->getMaskValue(I);
9427 }
9428 SmallBitVector UseMask2 = buildUseMask(
9429 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9430 ->getNumElements(),
9431 ExtMask2, UseMask::SecondArg);
9432 if (SV1->getOperand(0)->getType() ==
9433 SV2->getOperand(0)->getType() &&
9434 SV1->getOperand(0)->getType() != SV1->getType() &&
9435 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9436 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9437 Op1 = SV1->getOperand(0);
9438 Op2 = SV2->getOperand(0);
9439 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9440 int LocalVF = ShuffleMask1.size();
9441 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9442 LocalVF = FTy->getNumElements();
9443 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9444 CombinedMask1.swap(ShuffleMask1);
9445 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9446 LocalVF = ShuffleMask2.size();
9447 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9448 LocalVF = FTy->getNumElements();
9449 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9450 CombinedMask2.swap(ShuffleMask2);
9451 }
9452 }
9453 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9454 Builder.resizeToMatch(Op1, Op2);
9455 VF = std::max(cast<VectorType>(Op1->getType())
9456 ->getElementCount()
9457 .getKnownMinValue(),
9458 cast<VectorType>(Op2->getType())
9459 ->getElementCount()
9460 .getKnownMinValue());
9461 for (int I = 0, E = Mask.size(); I < E; ++I) {
9462 if (CombinedMask2[I] != PoisonMaskElem) {
9463 assert(CombinedMask1[I] == PoisonMaskElem &&
9464 "Expected undefined mask element");
9465 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9466 }
9467 }
9468 if (Op1 == Op2 &&
9469 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9470 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9471 isa<ShuffleVectorInst>(Op1) &&
9472 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9473 ArrayRef(CombinedMask1))))
9474 return Builder.createIdentity(Op1);
9475 return Builder.createShuffleVector(
9476 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9477 CombinedMask1);
9478 }
9479 if (isa<PoisonValue>(V1))
9480 return Builder.createPoison(
9481 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9482 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9483 assert(V1 && "Expected non-null value after looking through shuffles.");
9484
9485 if (!IsIdentity)
9486 return Builder.createShuffleVector(V1, NewMask);
9487 return Builder.createIdentity(V1);
9488 }
9489
9490 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
9491 /// shuffle emission.
9492 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
9493 ArrayRef<int> Mask) {
9494 for (unsigned I : seq<unsigned>(CommonMask.size()))
9495 if (Mask[I] != PoisonMaskElem)
9496 CommonMask[I] = I;
9497 }
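  // Worked example (assumed masks, not from the LLVM source): if a shuffle was
  // just emitted for Mask = <2, poison, 0, poison>, the lanes it defined now
  // sit at their own positions, so CommonMask = <a, b, c, d> becomes
  // <0, b, 2, d>: defined lanes are remapped to the identity and untouched
  // lanes keep their previous values.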
9498};
9499} // namespace
9500
9501/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9502static std::pair<InstructionCost, InstructionCost>
9503getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9504 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9505 Type *ScalarTy, VectorType *VecTy) {
9506 InstructionCost ScalarCost = 0;
9507 InstructionCost VecCost = 0;
9508 // Here we differentiate two cases: (1) when Ptrs represent a regular
9509 // vectorization tree node (as they are pointer arguments of scattered
9510 // loads) or (2) when Ptrs are the arguments of loads or stores being
9511 // vectorized as a plain wide unit-stride load/store since all the
9512 // loads/stores are known to be from/to adjacent locations.
9513 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9514 // Case 2: estimate costs for pointer related costs when vectorizing to
9515 // a wide load/store.
9516 // Scalar cost is estimated as a set of pointers with known relationship
9517 // between them.
9518 // For vector code we will use BasePtr as argument for the wide load/store
9519 // but we also need to account all the instructions which are going to
9520 // stay in vectorized code due to uses outside of these scalar
9521 // loads/stores.
9522 ScalarCost = TTI.getPointersChainCost(
9523 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9524 CostKind);
9525
9526 SmallVector<const Value *> PtrsRetainedInVecCode;
9527 for (Value *V : Ptrs) {
9528 if (V == BasePtr) {
9529 PtrsRetainedInVecCode.push_back(V);
9530 continue;
9531 }
9532 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9533 // For simplicity, assume Ptr stays in the vectorized code if it's not a
9534 // GEP instruction. We don't care since its cost is considered free.
9535 // TODO: We should check for any uses outside of vectorizable tree
9536 // rather than just single use.
9537 if (!Ptr || !Ptr->hasOneUse())
9538 PtrsRetainedInVecCode.push_back(V);
9539 }
9540
9541 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9542 // If all pointers stay in vectorized code then we don't have
9543 // any savings on that.
9544 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9545 }
9546 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9547 TTI::PointersChainInfo::getKnownStride(),
9548 VecTy, CostKind);
9549 } else {
9550 // Case 1: Ptrs are the arguments of loads that we are going to transform
9551 // into masked gather load intrinsic.
9552 // All the scalar GEPs will be removed as a result of vectorization.
9553 // For any external uses of some lanes extract element instructions will
9554 // be generated (which cost is estimated separately).
9555 TTI::PointersChainInfo PtrsInfo =
9556 all_of(Ptrs,
9557 [](const Value *V) {
9558 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9559 return Ptr && !Ptr->hasAllConstantIndices();
9560 })
9561 ? TTI::PointersChainInfo::getUnknownStride()
9562 : TTI::PointersChainInfo::getKnownStride();
9563
9564 ScalarCost =
9565 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9566 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9567 if (!BaseGEP) {
9568 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9569 if (It != Ptrs.end())
9570 BaseGEP = cast<GEPOperator>(*It);
9571 }
9572 if (BaseGEP) {
9573 SmallVector<const Value *> Indices(BaseGEP->indices());
9574 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9575 BaseGEP->getPointerOperand(), Indices, VecTy,
9576 CostKind);
9577 }
9578 }
9579
9580 return std::make_pair(ScalarCost, VecCost);
9581}
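// Illustrative example (assumed IR, not from the LLVM source): for four
// consecutive loads from %p, %p+1, %p+2, %p+3 widened into one vector load,
// the scalar cost covers the whole unit-stride pointer chain, while the
// vector cost only charges the GEPs that must stay around for users outside
// the tree; if every pointer would survive vectorization anyway, the function
// returns {TCC_Free, TCC_Free}, i.e. no savings from address computation.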
9582
9583void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9584 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9585 "Expected gather node without reordering.");
9586 SmallDenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
9587 SmallSet<size_t, 2> LoadKeyUsed;
9588
9589 // Do not reorder the node if it is small (just 2 elements), all-constant,
9590 // or all its instructions already have the same opcode.
9591 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9592 all_of(TE.Scalars, isConstant))
9593 return;
9594
9595 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9596 return VectorizableTree[Idx]->isSame(TE.Scalars);
9597 }))
9598 return;
9599
9600 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9601 Key = hash_combine(hash_value(LI->getParent()), Key);
9602 Value *Ptr =
9603 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
9604 if (LoadKeyUsed.contains(Key)) {
9605 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9606 if (LIt != LoadsMap.end()) {
9607 for (LoadInst *RLI : LIt->second) {
9608 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9609 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9610 /*StrictCheck=*/true))
9611 return hash_value(RLI->getPointerOperand());
9612 }
9613 for (LoadInst *RLI : LIt->second) {
9614 if (arePointersCompatible(RLI->getPointerOperand(),
9615 LI->getPointerOperand(), *TLI)) {
9616 hash_code SubKey = hash_value(RLI->getPointerOperand());
9617 return SubKey;
9618 }
9619 }
9620 if (LIt->second.size() > 2) {
9621 hash_code SubKey =
9622 hash_value(LIt->second.back()->getPointerOperand());
9623 return SubKey;
9624 }
9625 }
9626 }
9627 LoadKeyUsed.insert(Key);
9628 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9629 return hash_value(LI->getPointerOperand());
9630 };
9631 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9632 SmallDenseMap<Value *, SmallVector<unsigned>> KeyToIndex;
9633 bool IsOrdered = true;
9634 unsigned NumInstructions = 0;
9635 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9636 // nodes.
9637 for (auto [I, V] : enumerate(TE.Scalars)) {
9638 size_t Key = 1, Idx = 1;
9639 if (auto *Inst = dyn_cast<Instruction>(V);
9640 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9641 !isDeleted(Inst) && !isVectorized(V)) {
9642 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9643 /*AllowAlternate=*/false);
9644 ++NumInstructions;
9645 }
9646 auto &Container = SortedValues[Key];
9647 if (IsOrdered && !KeyToIndex.contains(V) &&
9648 !(isa<Constant, ExtractElementInst>(V) ||
9649 isVectorLikeInstWithConstOps(V)) &&
9650 ((Container.contains(Idx) &&
9651 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9652 (!Container.empty() && !Container.contains(Idx) &&
9653 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9654 IsOrdered = false;
9655 auto &KTI = KeyToIndex[V];
9656 if (KTI.empty())
9657 Container[Idx].push_back(V);
9658 KTI.push_back(I);
9659 }
9660 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9661 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9662 if (!IsOrdered && NumInstructions > 1) {
9663 unsigned Cnt = 0;
9664 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9665 for (const auto &D : SortedValues) {
9666 for (const auto &P : D.second) {
9667 unsigned Sz = 0;
9668 for (Value *V : P.second) {
9669 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9670 for (auto [K, Idx] : enumerate(Indices)) {
9671 TE.ReorderIndices[Cnt + K] = Idx;
9672 TE.Scalars[Cnt + K] = V;
9673 }
9674 Sz += Indices.size();
9675 Cnt += Indices.size();
9676 }
9677 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9678 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9679 *TTI, TE.Scalars.front()->getType(), Sz);
9680 SubVectors.emplace_back(Cnt - Sz, SubVF);
9681 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9682 DemandedElts.clearBit(I);
9683 } else if (!P.second.empty() && isConstant(P.second.front())) {
9684 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9685 DemandedElts.clearBit(I);
9686 }
9687 }
9688 }
9689 }
9690 // Reuses always require shuffles, so consider it as profitable.
9691 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9692 return;
9693 // Do simple cost estimation.
9694 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9695 InstructionCost Cost = 0;
9696 auto *ScalarTy = TE.Scalars.front()->getType();
9697 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9698 for (auto [Idx, Sz] : SubVectors) {
9699 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
9700 Idx, getWidenedType(ScalarTy, Sz));
9701 }
9702 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9703 assert(SLPReVec && "Only supported by REVEC.");
9704 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9705 // of CreateInsertElement.
9706 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9707 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9708 if (DemandedElts[I])
9709 Cost +=
9710 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9711 CostKind, I * ScalarTyNumElements, FTy);
9712 } else {
9713 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9714 /*Extract=*/false, CostKind);
9715 }
9716 int Sz = TE.Scalars.size();
9717 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9718 TE.ReorderIndices.end());
9719 for (unsigned I : seq<unsigned>(Sz)) {
9720 Value *V = TE.getOrdered(I);
9721 if (isa<PoisonValue>(V)) {
9722 ReorderMask[I] = PoisonMaskElem;
9723 } else if (isConstant(V) || DemandedElts[I]) {
9724 ReorderMask[I] = I + TE.ReorderIndices.size();
9725 }
9726 }
9727 InstructionCost BVCost = ::getShuffleCost(*TTI,
9728 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9729 ? TTI::SK_PermuteTwoSrc
9730 : TTI::SK_PermuteSingleSrc,
9731 VecTy, ReorderMask);
9732 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9733 ReorderMask.assign(Sz, PoisonMaskElem);
9734 for (unsigned I : seq<unsigned>(Sz)) {
9735 Value *V = TE.getOrdered(I);
9736 if (isConstant(V)) {
9737 DemandedElts.clearBit(I);
9738 if (!isa<PoisonValue>(V))
9739 ReorderMask[I] = I;
9740 } else {
9741 ReorderMask[I] = I + Sz;
9742 }
9743 }
9744 BVCost += TTI->getScalarizationOverhead(
9745 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9746 if (!DemandedElts.isAllOnes())
9747 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9748 if (Cost >= BVCost) {
9749 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9750 reorderScalars(TE.Scalars, Mask);
9751 TE.ReorderIndices.clear();
9752 }
9753}
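// Illustrative example (assumed scalars, not from the LLVM source): for a
// gather node with Scalars = {add0, mul0, add1, mul1}, clustering by
// instruction key produces ReorderIndices that group the adds and the muls
// together ({add0, add1, mul0, mul1}), so each pair can later form its own
// vectorizable sub-node; the reordering is kept only if the estimated
// insert/shuffle cost of the reordered form beats the plain build-vector cost.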
9754
9755void BoUpSLP::transformNodes() {
9756 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9757 BaseGraphSize = VectorizableTree.size();
9758 // Turn graph transforming mode on and off, when done.
9759 class GraphTransformModeRAAI {
9760 bool &SavedIsGraphTransformMode;
9761
9762 public:
9763 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9764 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9765 IsGraphTransformMode = true;
9766 }
9767 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9768 } TransformContext(IsGraphTransformMode);
9769 // Operands are profitable if they are:
9770 // 1. At least one constant
9771 // or
9772 // 2. Splats
9773 // or
9774 // 3. Results in good vectorization opportunity, i.e. may generate vector
9775 // nodes and reduce cost of the graph.
9776 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9777 const InstructionsState &S) {
9778 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9779 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9780 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9781 I2->getOperand(Op));
9782 return all_of(
9783 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9784 return all_of(Cand,
9785 [](const std::pair<Value *, Value *> &P) {
9786 return isa<Constant>(P.first) ||
9787 isa<Constant>(P.second) || P.first == P.second;
9788 }) ||
9789 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9790 });
9791 };
9792
9793 // Try to reorder gather nodes for better vectorization opportunities.
9794 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9795 TreeEntry &E = *VectorizableTree[Idx];
9796 if (E.isGather())
9797 reorderGatherNode(E);
9798 }
9799
9800 // The tree may grow here, so iterate over nodes, built before.
9801 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9802 TreeEntry &E = *VectorizableTree[Idx];
9803 if (E.isGather()) {
9804 ArrayRef<Value *> VL = E.Scalars;
9805 const unsigned Sz = getVectorElementSize(VL.front());
9806 unsigned MinVF = getMinVF(2 * Sz);
9807 // Do not try partial vectorization for small nodes (<= 2), nodes with the
9808 // same opcode and same parent block or all constants.
9809 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9810 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9811 E.isAltShuffle() || !allSameBlock(VL)) ||
9812 allConstant(VL) || isSplat(VL))
9813 continue;
9814 // Try to find vectorizable sequences and transform them into a series of
9815 // insertvector instructions.
9816 unsigned StartIdx = 0;
9817 unsigned End = VL.size();
9818 for (unsigned VF = getFloorFullVectorNumberOfElements(
9819 *TTI, VL.front()->getType(), VL.size() - 1);
9820 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9821 *TTI, VL.front()->getType(), VF - 1)) {
9822 if (StartIdx + VF > End)
9823 continue;
9824 SmallVector<std::pair<unsigned, unsigned>> Slices;
9825 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9826 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9827 // If any instruction is vectorized already - do not try again.
9828 // Reuse the existing node, if it fully matches the slice.
9829 if (isVectorized(Slice.front()) &&
9830 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
9831 continue;
9832 // Constants are already handled effectively - skip.
9833 if (allConstant(Slice))
9834 continue;
9835 // Do not try to vectorize small splats (less than vector register and
9836 // only with the single non-undef element).
9837 bool IsSplat = isSplat(Slice);
9838 bool IsTwoRegisterSplat = true;
9839 if (IsSplat && VF == 2) {
9840 unsigned NumRegs2VF = ::getNumberOfParts(
9841 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
9842 IsTwoRegisterSplat = NumRegs2VF == 2;
9843 }
9844 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
9845 count(Slice, Slice.front()) ==
9846 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9847 : 1)) {
9848 if (IsSplat)
9849 continue;
9850 InstructionsState S = getSameOpcode(Slice, *TLI);
9851 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9852 (S.getOpcode() == Instruction::Load &&
9853 areKnownNonVectorizableLoads(Slice)) ||
9854 (S.getOpcode() != Instruction::Load &&
9855 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
9856 continue;
9857 if (VF == 2) {
9858 // Try to vectorize reduced values or if all users are vectorized.
9859 // For expensive instructions extra extracts might be profitable.
9860 if ((!UserIgnoreList || E.Idx != 0) &&
9861 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9862 TTI::TCC_Expensive &&
9863 !all_of(Slice, [&](Value *V) {
9864 if (isa<PoisonValue>(V))
9865 return true;
9866 return areAllUsersVectorized(cast<Instruction>(V),
9867 UserIgnoreList);
9868 }))
9869 continue;
9870 if (S.getOpcode() == Instruction::Load) {
9871 OrdersType Order;
9872 SmallVector<Value *> PointerOps;
9873 LoadsState Res =
9874 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9875 // Do not vectorize gathers.
9876 if (Res == LoadsState::ScatterVectorize ||
9877 Res == LoadsState::Gather) {
9878 if (Res == LoadsState::Gather) {
9879 registerNonVectorizableLoads(Slice);
9880 // If reductions and the scalars from the root node are
9881 // analyzed - mark as non-vectorizable reduction.
9882 if (UserIgnoreList && E.Idx == 0)
9883 analyzedReductionVals(Slice);
9884 }
9885 continue;
9886 }
9887 } else if (S.getOpcode() == Instruction::ExtractElement ||
9888 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9889 TTI::TCC_Expensive &&
9890 !CheckOperandsProfitability(
9891 S.getMainOp(),
9892 cast<Instruction>(*find_if(reverse(Slice),
9893 IsaPred<Instruction>)),
9894 S))) {
9895 // Do not vectorize extractelements (handled effectively
9896 // already). Do not vectorize non-profitable instructions (with
9897 // low cost and non-vectorizable operands).
9898 continue;
9899 }
9900 }
9901 }
9902 Slices.emplace_back(Cnt, Slice.size());
9903 }
9904 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9905 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9906 if (StartIdx == Cnt)
9907 StartIdx = Cnt + Sz;
9908 if (End == Cnt + Sz)
9909 End = Cnt;
9910 };
9911 for (auto [Cnt, Sz] : Slices) {
9912 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9913 // If any instruction is vectorized already - do not try again.
9914 if (TreeEntry *SE = getSameValuesTreeEntry(Slice.front(), Slice,
9915 /*SameVF=*/true)) {
9916 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9917 AddCombinedNode(SE->Idx, Cnt, Sz);
9918 continue;
9919 }
9920 unsigned PrevSize = VectorizableTree.size();
9921 [[maybe_unused]] unsigned PrevEntriesSize =
9922 LoadEntriesToVectorize.size();
9923 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9924 if (PrevSize + 1 == VectorizableTree.size() &&
9925 VectorizableTree[PrevSize]->isGather() &&
9926 VectorizableTree[PrevSize]->hasState() &&
9927 VectorizableTree[PrevSize]->getOpcode() !=
9928 Instruction::ExtractElement &&
9929 !isSplat(Slice)) {
9930 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9931 analyzedReductionVals(Slice);
9932 VectorizableTree.pop_back();
9933 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9934 "LoadEntriesToVectorize expected to remain the same");
9935 continue;
9936 }
9937 AddCombinedNode(PrevSize, Cnt, Sz);
9938 }
9939 }
9940 // Restore ordering, if no extra vectorization happened.
9941 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9942 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9943 reorderScalars(E.Scalars, Mask);
9944 E.ReorderIndices.clear();
9945 }
9946 }
9947 if (!E.hasState())
9948 continue;
9949 switch (E.getOpcode()) {
9950 case Instruction::Load: {
9951 // No need to reorder masked gather loads, just reorder the scalar
9952 // operands.
9953 if (E.State != TreeEntry::Vectorize)
9954 break;
9955 Type *ScalarTy = E.getMainOp()->getType();
9956 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9957 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9958 // Check if profitable to represent consecutive load + reverse as strided
9959 // load with stride -1.
9960 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9961 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9962 SmallVector<int> Mask;
9963 inversePermutation(E.ReorderIndices, Mask);
9964 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9965 InstructionCost OriginalVecCost =
9966 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9967 BaseLI->getPointerAddressSpace(), CostKind,
9968 TTI::OperandValueInfo()) +
9969 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9970 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9971 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9972 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9973 if (StridedCost < OriginalVecCost)
9974 // Strided load is more profitable than consecutive load + reverse -
9975 // transform the node to strided load.
9976 E.State = TreeEntry::StridedVectorize;
9977 }
9978 break;
9979 }
9980 case Instruction::Store: {
9981 Type *ScalarTy =
9982 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9983 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9984 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9985 // Check if profitable to represent consecutive store + reverse as strided
9986 // store with stride -1.
9987 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9988 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9989 SmallVector<int> Mask;
9990 inversePermutation(E.ReorderIndices, Mask);
9991 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9992 InstructionCost OriginalVecCost =
9993 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9994 BaseSI->getPointerAddressSpace(), CostKind,
9995 TTI::OperandValueInfo()) +
9996 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9997 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9998 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9999 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
10000 if (StridedCost < OriginalVecCost)
10001 // Strided store is more profitable than reverse + consecutive store -
10002 // transform the node to strided store.
10003 E.State = TreeEntry::StridedVectorize;
10004 } else if (!E.ReorderIndices.empty()) {
10005 // Check for interleaved stores.
10006 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
10007 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
10008 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
10009 if (Mask.size() < 4)
10010 return 0u;
10011 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
10012 if (ShuffleVectorInst::isInterleaveMask(
10013 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
10014 TTI.isLegalInterleavedAccessType(
10015 VecTy, Factor, BaseSI->getAlign(),
10016 BaseSI->getPointerAddressSpace()))
10017 return Factor;
10018 }
10019
10020 return 0u;
10021 };
10022 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
10023 unsigned InterleaveFactor = IsInterleaveMask(Mask);
10024 if (InterleaveFactor != 0)
10025 E.setInterleave(InterleaveFactor);
10026 }
10027 break;
10028 }
10029 case Instruction::Select: {
10030 if (E.State != TreeEntry::Vectorize)
10031 break;
10032 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
10033 if (MinMaxID == Intrinsic::not_intrinsic)
10034 break;
10035 // This node is a minmax node.
10036 E.CombinedOp = TreeEntry::MinMax;
10037 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
10038 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10039 CondEntry->State == TreeEntry::Vectorize) {
10040 // The condition node is part of the combined minmax node.
10041 CondEntry->State = TreeEntry::CombinedVectorize;
10042 }
10043 break;
10044 }
10045 default:
10046 break;
10047 }
10048 }
10049
10050 if (LoadEntriesToVectorize.empty()) {
10051 // Single load node - exit.
10052 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10053 VectorizableTree.front()->getOpcode() == Instruction::Load)
10054 return;
10055 // Small graph with small VF - exit.
10056 constexpr unsigned SmallTree = 3;
10057 constexpr unsigned SmallVF = 2;
10058 if ((VectorizableTree.size() <= SmallTree &&
10059 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10060 (VectorizableTree.size() <= 2 && UserIgnoreList))
10061 return;
10062
10063 if (VectorizableTree.front()->isNonPowOf2Vec() &&
10064 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
10065 getCanonicalGraphSize() <= SmallTree &&
10066 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10067 [](const std::unique_ptr<TreeEntry> &TE) {
10068 return TE->isGather() && TE->hasState() &&
10069 TE->getOpcode() == Instruction::Load &&
10070 !allSameBlock(TE->Scalars);
10071 }) == 1)
10072 return;
10073 }
10074
10075 // A list of loads to be gathered during the vectorization process. We can
10076 // try to vectorize them at the end, if profitable.
10077 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
10078 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
10079 GatheredLoads;
10080
10081 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10082 TreeEntry &E = *TE;
10083 if (E.isGather() &&
10084 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10085 (!E.hasState() && any_of(E.Scalars,
10086 [&](Value *V) {
10087 return isa<LoadInst>(V) &&
10088 !isVectorized(V) &&
10089 !isDeleted(cast<Instruction>(V));
10090 }))) &&
10091 !isSplat(E.Scalars)) {
10092 for (Value *V : E.Scalars) {
10093 auto *LI = dyn_cast<LoadInst>(V);
10094 if (!LI)
10095 continue;
10096 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10097 continue;
10098 gatherPossiblyVectorizableLoads(
10099 *this, V, *DL, *SE, *TTI,
10100 GatheredLoads[std::make_tuple(
10101 LI->getParent(),
10102 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
10103 LI->getType())]);
10104 }
10105 }
10106 }
10107 // Try to vectorize gathered loads if this is not just a gather of loads.
10108 if (!GatheredLoads.empty())
10109 tryToVectorizeGatheredLoads(GatheredLoads);
10110}
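// Illustrative example (assumed node, not from the LLVM source): a vectorized
// load node whose ReorderIndices are <3, 2, 1, 0> (a plain reverse) is
// switched to TreeEntry::StridedVectorize when the target reports a legal
// strided load for the type and the stride -1 form is cheaper than a wide
// load followed by an SK_Reverse shuffle.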
10111
10112/// Merges shuffle masks and emits final shuffle instruction, if required. It
10113/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
10114/// the actual shuffle instruction is generated only when it is actually
10115/// required. Otherwise, the shuffle instruction emission is delayed till the
10116/// end of the process, to reduce the number of emitted instructions and further
10117/// analysis/transformations.
10118class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10119 bool IsFinalized = false;
10120 SmallVector<int> CommonMask;
10121 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10122 const TargetTransformInfo &TTI;
10123 InstructionCost Cost = 0;
10124 SmallDenseSet<Value *> VectorizedVals;
10125 BoUpSLP &R;
10126 SmallPtrSetImpl<Value *> &CheckedExtracts;
10127 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10128 /// While set, still trying to estimate the cost for the same nodes and we
10129 /// can delay actual cost estimation (virtual shuffle instruction emission).
10130 /// May help better estimate the cost if same nodes must be permuted + allows
10131 /// to move most of the long shuffles cost estimation to TTI.
10132 bool SameNodesEstimated = true;
10133
10134 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10135 if (Ty->getScalarType()->isPointerTy()) {
10136 Constant *Res = ConstantExpr::getIntToPtr(
10137 ConstantInt::getAllOnesValue(
10138 IntegerType::get(Ty->getContext(),
10139 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10140 Ty->getScalarType());
10141 if (auto *VTy = dyn_cast<VectorType>(Ty))
10142 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10143 return Res;
10144 }
10145 return Constant::getAllOnesValue(Ty);
10146 }
10147
10148 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10149 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10150 return TTI::TCC_Free;
10151 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10152 InstructionCost GatherCost = 0;
10153 SmallVector<Value *> Gathers(VL);
10154 if (!Root && isSplat(VL)) {
10155 // Found the broadcasting of the single scalar, calculate the cost as
10156 // the broadcast.
10157 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10158 assert(It != VL.end() && "Expected at least one non-undef value.");
10159 // Add broadcast for non-identity shuffle only.
10160 bool NeedShuffle =
10161 count(VL, *It) > 1 &&
10162 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10163 if (!NeedShuffle) {
10164 if (isa<FixedVectorType>(ScalarTy)) {
10165 assert(SLPReVec && "FixedVectorType is not expected.");
10166 return TTI.getShuffleCost(
10167 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10168 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10169 cast<FixedVectorType>(ScalarTy));
10170 }
10171 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10172 CostKind, std::distance(VL.begin(), It),
10173 PoisonValue::get(VecTy), *It);
10174 }
10175
10176 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10177 transform(VL, ShuffleMask.begin(), [](Value *V) {
10178 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10179 });
10180 InstructionCost InsertCost =
10181 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10182 PoisonValue::get(VecTy), *It);
10183 return InsertCost + ::getShuffleCost(TTI,
10184 TTI::SK_Broadcast,
10185 VecTy, ShuffleMask, CostKind,
10186 /*Index=*/0, /*SubTp=*/nullptr,
10187 /*Args=*/*It);
10188 }
10189 return GatherCost +
10190 (all_of(Gathers, IsaPred<UndefValue>)
10191 ? TTI::TCC_Free
10192 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10193 ScalarTy));
10194 };
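  // Illustrative example (assumed scalars, not from the LLVM source): for
  // VL = {%x, %x, %x, %x} (a splat) the cost is one insertelement plus an
  // SK_Broadcast shuffle, while for VL = {%x, undef, undef, undef} only the
  // single insertelement is charged, since no broadcast is needed.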
10195
10196 /// Compute the cost of creating a vector containing the extracted values from
10197 /// \p VL.
10198 InstructionCost
10199 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10200 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10201 unsigned NumParts) {
10202 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10203 unsigned NumElts =
10204 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10205 auto *EE = dyn_cast<ExtractElementInst>(V);
10206 if (!EE)
10207 return Sz;
10208 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10209 if (!VecTy)
10210 return Sz;
10211 return std::max(Sz, VecTy->getNumElements());
10212 });
10213 // FIXME: this must be moved to TTI for better estimation.
10214 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10215 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10216 SmallVectorImpl<unsigned> &Indices)
10217 -> std::optional<TTI::ShuffleKind> {
10218 if (NumElts <= EltsPerVector)
10219 return std::nullopt;
10220 int OffsetReg0 =
10221 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10222 [](int S, int I) {
10223 if (I == PoisonMaskElem)
10224 return S;
10225 return std::min(S, I);
10226 }),
10227 EltsPerVector);
10228 int OffsetReg1 = OffsetReg0;
10229 DenseSet<int> RegIndices;
10230 // Check if we are trying to permute the same single/2 input vectors.
10231 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
10232 int FirstRegId = -1;
10233 Indices.assign(1, OffsetReg0);
10234 for (auto [Pos, I] : enumerate(Mask)) {
10235 if (I == PoisonMaskElem)
10236 continue;
10237 int Idx = I - OffsetReg0;
10238 int RegId =
10239 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10240 if (FirstRegId < 0)
10241 FirstRegId = RegId;
10242 RegIndices.insert(RegId);
10243 if (RegIndices.size() > 2)
10244 return std::nullopt;
10245 if (RegIndices.size() == 2) {
10246 ShuffleKind = TTI::SK_PermuteTwoSrc;
10247 if (Indices.size() == 1) {
10248 OffsetReg1 = alignDown(
10249 std::accumulate(
10250 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10251 [&](int S, int I) {
10252 if (I == PoisonMaskElem)
10253 return S;
10254 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10255 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10256 if (RegId == FirstRegId)
10257 return S;
10258 return std::min(S, I);
10259 }),
10260 EltsPerVector);
10261 Indices.push_back(OffsetReg1 % NumElts);
10262 }
10263 Idx = I - OffsetReg1;
10264 }
10265 I = (Idx % NumElts) % EltsPerVector +
10266 (RegId == FirstRegId ? 0 : EltsPerVector);
10267 }
10268 return ShuffleKind;
10269 };
10270 InstructionCost Cost = 0;
10271
10272 // Process extracts in blocks of EltsPerVector to check if the source vector
10273 // operand can be re-used directly. If not, add the cost of creating a
10274 // shuffle to extract the values into a vector register.
10275 for (unsigned Part : seq<unsigned>(NumParts)) {
10276 if (!ShuffleKinds[Part])
10277 continue;
10278 ArrayRef<int> MaskSlice = Mask.slice(
10279 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10280 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10281 copy(MaskSlice, SubMask.begin());
10282 SmallVector<unsigned, 2> Indices;
10283 std::optional<TTI::ShuffleKind> RegShuffleKind =
10284 CheckPerRegistersShuffle(SubMask, Indices);
10285 if (!RegShuffleKind) {
10286 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10287 !ShuffleVectorInst::isIdentityMask(
10288 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10289 Cost +=
10290 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10291 getWidenedType(ScalarTy, NumElts), MaskSlice);
10292 continue;
10293 }
10294 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10295 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10296 Cost +=
10297 ::getShuffleCost(TTI, *RegShuffleKind,
10298 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10299 }
10300 const unsigned BaseVF = getFullVectorNumberOfElements(
10301 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10302 for (unsigned Idx : Indices) {
10303 assert((Idx + EltsPerVector) <= BaseVF &&
10304 "SK_ExtractSubvector index out of range");
10305 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
10306 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10307 Idx, getWidenedType(ScalarTy, EltsPerVector));
10308 }
10309 // Second attempt to check, if just a permute is better estimated than
10310 // subvector extract.
10311 SubMask.assign(NumElts, PoisonMaskElem);
10312 copy(MaskSlice, SubMask.begin());
10313 InstructionCost OriginalCost = ::getShuffleCost(
10314 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10315 if (OriginalCost < Cost)
10316 Cost = OriginalCost;
10317 }
10318 return Cost;
10319 }
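  // Illustrative example (assumed extracts, not from the LLVM source): for 8
  // extractelements taken in order from one 8-wide source with NumParts == 2,
  // each 4-element slice is an in-register identity, so no per-register
  // permute cost is added and essentially only the SK_ExtractSubvector cost of
  // reusing each source half is accounted.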
10320 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10321 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10322 /// elements.
10323 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10324 ArrayRef<int> Mask, unsigned Part,
10325 unsigned SliceSize) {
10326 if (SameNodesEstimated) {
10327 // Delay the cost estimation if the same nodes are reshuffling.
10328 // If we already requested the cost of reshuffling of E1 and E2 before, no
10329 // need to estimate another cost with the sub-Mask, instead include this
10330 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10331 // estimation.
10332 if ((InVectors.size() == 2 &&
10333 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10334 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10335 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10336 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10337 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10338 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10339 "Expected all poisoned elements.");
10340 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10341 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10342 return;
10343 }
10344 // Found non-matching nodes - need to estimate the cost for the matched
10345 // and transform mask.
10346 Cost += createShuffle(InVectors.front(),
10347 InVectors.size() == 1 ? nullptr : InVectors.back(),
10348 CommonMask);
10349 transformMaskAfterShuffle(CommonMask, CommonMask);
10350 } else if (InVectors.size() == 2) {
10351 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10352 transformMaskAfterShuffle(CommonMask, CommonMask);
10353 }
10354 SameNodesEstimated = false;
10355 if (!E2 && InVectors.size() == 1) {
10356 unsigned VF = E1.getVectorFactor();
10357 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10358 VF = std::max(VF,
10359 cast<FixedVectorType>(V1->getType())->getNumElements());
10360 } else {
10361 const auto *E = cast<const TreeEntry *>(InVectors.front());
10362 VF = std::max(VF, E->getVectorFactor());
10363 }
10364 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10365 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10366 CommonMask[Idx] = Mask[Idx] + VF;
10367 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10368 transformMaskAfterShuffle(CommonMask, CommonMask);
10369 } else {
10370 auto P = InVectors.front();
10371 Cost += createShuffle(&E1, E2, Mask);
10372 unsigned VF = Mask.size();
10373 if (Value *V1 = P.dyn_cast<Value *>()) {
10374 VF = std::max(VF,
10375 getNumElements(V1->getType()));
10376 } else {
10377 const auto *E = cast<const TreeEntry *>(P);
10378 VF = std::max(VF, E->getVectorFactor());
10379 }
10380 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10381 if (Mask[Idx] != PoisonMaskElem)
10382 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10383 Cost += createShuffle(P, InVectors.front(), CommonMask);
10384 transformMaskAfterShuffle(CommonMask, CommonMask);
10385 }
10386 }
10387
10388 class ShuffleCostBuilder {
10389 const TargetTransformInfo &TTI;
10390
10391 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10392 int Index = -1;
10393 return Mask.empty() ||
10394 (VF == Mask.size() &&
10395 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10396 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10397 Index == 0);
10398 }
10399
10400 public:
10401 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10402 ~ShuffleCostBuilder() = default;
10403 InstructionCost createShuffleVector(Value *V1, Value *,
10404 ArrayRef<int> Mask) const {
10405 // Empty mask or identity mask are free.
10406 unsigned VF =
10407 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10408 if (isEmptyOrIdentity(Mask, VF))
10409 return TTI::TCC_Free;
10410 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10411 cast<VectorType>(V1->getType()), Mask);
10412 }
10413 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10414 // Empty mask or identity mask are free.
10415 unsigned VF =
10416 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10417 if (isEmptyOrIdentity(Mask, VF))
10418 return TTI::TCC_Free;
10419 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10420 cast<VectorType>(V1->getType()), Mask);
10421 }
10422 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10423 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10424 return TTI::TCC_Free;
10425 }
10426 void resizeToMatch(Value *&, Value *&) const {}
10427 };
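  // Illustrative example (assumed masks, not from the LLVM source): for a
  // 4-wide vector V, createShuffleVector(V, <0, 1, 2, 3>) is reported as
  // TCC_Free (identity), while createShuffleVector(V, <1, 0, 3, 2>) is priced
  // as a single-source permute via getShuffleCost(SK_PermuteSingleSrc, ...).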
10428
10429 /// Smart shuffle instruction emission, walks through shuffles trees and
10430 /// tries to find the best matching vector for the actual shuffle
10431 /// instruction.
10432 InstructionCost
10433 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10434 const PointerUnion<Value *, const TreeEntry *> &P2,
10435 ArrayRef<int> Mask) {
10436 ShuffleCostBuilder Builder(TTI);
10437 SmallVector<int> CommonMask(Mask);
10438 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10439 unsigned CommonVF = Mask.size();
10440 InstructionCost ExtraCost = 0;
10441 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10442 unsigned VF) -> InstructionCost {
10443 if (E.isGather() && allConstant(E.Scalars))
10444 return TTI::TCC_Free;
10445 Type *EScalarTy = E.Scalars.front()->getType();
10446 bool IsSigned = true;
10447 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10448 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10449 IsSigned = It->second.second;
10450 }
10451 if (EScalarTy != ScalarTy) {
10452 unsigned CastOpcode = Instruction::Trunc;
10453 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10454 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10455 if (DstSz > SrcSz)
10456 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10457 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10458 getWidenedType(EScalarTy, VF),
10459 TTI::CastContextHint::None, CostKind);
10460 }
10461 return TTI::TCC_Free;
10462 };
10463 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10464 if (isa<Constant>(V))
10465 return TTI::TCC_Free;
10466 auto *VecTy = cast<VectorType>(V->getType());
10467 Type *EScalarTy = VecTy->getElementType();
10468 if (EScalarTy != ScalarTy) {
10469 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10470 unsigned CastOpcode = Instruction::Trunc;
10471 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10472 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10473 if (DstSz > SrcSz)
10474 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10475 return TTI.getCastInstrCost(
10476 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10477 VecTy, TTI::CastContextHint::None, CostKind);
10478 }
10479 return TTI::TCC_Free;
10480 };
10481 if (!V1 && !V2 && !P2.isNull()) {
10482 // Shuffle 2 entry nodes.
10483 const TreeEntry *E = cast<const TreeEntry *>(P1);
10484 unsigned VF = E->getVectorFactor();
10485 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10486 CommonVF = std::max(VF, E2->getVectorFactor());
10487 assert(all_of(Mask,
10488 [=](int Idx) {
10489 return Idx < 2 * static_cast<int>(CommonVF);
10490 }) &&
10491 "All elements in mask must be less than 2 * CommonVF.");
10492 if (E->Scalars.size() == E2->Scalars.size()) {
10493 SmallVector<int> EMask = E->getCommonMask();
10494 SmallVector<int> E2Mask = E2->getCommonMask();
10495 if (!EMask.empty() || !E2Mask.empty()) {
10496 for (int &Idx : CommonMask) {
10497 if (Idx == PoisonMaskElem)
10498 continue;
10499 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10500 Idx = EMask[Idx];
10501 else if (Idx >= static_cast<int>(CommonVF))
10502 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10503 E->Scalars.size();
10504 }
10505 }
10506 CommonVF = E->Scalars.size();
10507 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10508 GetNodeMinBWAffectedCost(*E2, CommonVF);
10509 } else {
10510 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10511 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10512 }
10513 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10514 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10515 } else if (!V1 && P2.isNull()) {
10516 // Shuffle single entry node.
10517 const TreeEntry *E = cast<const TreeEntry *>(P1);
10518 unsigned VF = E->getVectorFactor();
10519 CommonVF = VF;
10520 assert(
10521 all_of(Mask,
10522 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10523 "All elements in mask must be less than CommonVF.");
10524 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10525 SmallVector<int> EMask = E->getCommonMask();
10526 assert(!EMask.empty() && "Expected non-empty common mask.");
10527 for (int &Idx : CommonMask) {
10528 if (Idx != PoisonMaskElem)
10529 Idx = EMask[Idx];
10530 }
10531 CommonVF = E->Scalars.size();
10532 } else if (unsigned Factor = E->getInterleaveFactor();
10533 Factor > 0 && E->Scalars.size() != Mask.size() &&
10534 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10535 Factor)) {
10536 // Deinterleaved nodes are free.
10537 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10538 }
10539 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10540 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10541 // Not identity/broadcast? Try to see if the original vector is better.
10542 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10543 CommonVF == CommonMask.size() &&
10544 any_of(enumerate(CommonMask),
10545 [](const auto &&P) {
10546 return P.value() != PoisonMaskElem &&
10547 static_cast<unsigned>(P.value()) != P.index();
10548 }) &&
10549 any_of(CommonMask,
10550 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10551 SmallVector<int> ReorderMask;
10552 inversePermutation(E->ReorderIndices, ReorderMask);
10553 ::addMask(CommonMask, ReorderMask);
10554 }
10555 } else if (V1 && P2.isNull()) {
10556 // Shuffle single vector.
10557 ExtraCost += GetValueMinBWAffectedCost(V1);
10558 CommonVF = getVF(V1);
10559 assert(
10560 all_of(Mask,
10561 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10562 "All elements in mask must be less than CommonVF.");
10563 } else if (V1 && !V2) {
10564 // Shuffle vector and tree node.
10565 unsigned VF = getVF(V1);
10566 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10567 CommonVF = std::max(VF, E2->getVectorFactor());
10568 assert(all_of(Mask,
10569 [=](int Idx) {
10570 return Idx < 2 * static_cast<int>(CommonVF);
10571 }) &&
10572 "All elements in mask must be less than 2 * CommonVF.");
10573 if (E2->Scalars.size() == VF && VF != CommonVF) {
10574 SmallVector<int> E2Mask = E2->getCommonMask();
10575 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10576 for (int &Idx : CommonMask) {
10577 if (Idx == PoisonMaskElem)
10578 continue;
10579 if (Idx >= static_cast<int>(CommonVF))
10580 Idx = E2Mask[Idx - CommonVF] + VF;
10581 }
10582 CommonVF = VF;
10583 }
10584 ExtraCost += GetValueMinBWAffectedCost(V1);
10585 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10586 ExtraCost += GetNodeMinBWAffectedCost(
10587 *E2, std::min(CommonVF, E2->getVectorFactor()));
10588 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10589 } else if (!V1 && V2) {
10590 // Shuffle vector and tree node.
10591 unsigned VF = getVF(V2);
10592 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10593 CommonVF = std::max(VF, E1->getVectorFactor());
10594 assert(all_of(Mask,
10595 [=](int Idx) {
10596 return Idx < 2 * static_cast<int>(CommonVF);
10597 }) &&
10598 "All elements in mask must be less than 2 * CommonVF.");
10599 if (E1->Scalars.size() == VF && VF != CommonVF) {
10600 SmallVector<int> E1Mask = E1->getCommonMask();
10601 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10602 for (int &Idx : CommonMask) {
10603 if (Idx == PoisonMaskElem)
10604 continue;
10605 if (Idx >= static_cast<int>(CommonVF))
10606 Idx = E1Mask[Idx - CommonVF] + VF;
10607 else
10608 Idx = E1Mask[Idx];
10609 }
10610 CommonVF = VF;
10611 }
10612 ExtraCost += GetNodeMinBWAffectedCost(
10613 *E1, std::min(CommonVF, E1->getVectorFactor()));
10614 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10615 ExtraCost += GetValueMinBWAffectedCost(V2);
10616 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10617 } else {
10618 assert(V1 && V2 && "Expected both vectors.");
10619 unsigned VF = getVF(V1);
10620 CommonVF = std::max(VF, getVF(V2));
10621 assert(all_of(Mask,
10622 [=](int Idx) {
10623 return Idx < 2 * static_cast<int>(CommonVF);
10624 }) &&
10625 "All elements in mask must be less than 2 * CommonVF.");
10626 ExtraCost +=
10627 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10628 if (V1->getType() != V2->getType()) {
10629 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10630 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10631 } else {
10632 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10633 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10634 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10635 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10636 }
10637 }
10638 InVectors.front() =
10639 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10640 if (InVectors.size() == 2)
10641 InVectors.pop_back();
10642 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10643 V1, V2, CommonMask, Builder, ScalarTy);
10644 }
10645
10646public:
10647 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10648 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10649 SmallPtrSetImpl<Value *> &CheckedExtracts)
10650 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10651 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10652 CheckedExtracts(CheckedExtracts) {}
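// Roughly speaking, adjustExtracts below scans the entry's scalars for
// extractelement instructions: extracts that will become dead after
// vectorization are credited against the cost, the remaining ones are
// modelled as a shuffle of their source vectors, and the chosen vector base
// (or a placeholder when several distinct bases are used across parts) is
// returned.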
10653 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10654 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10655 unsigned NumParts, bool &UseVecBaseAsInput) {
10656 UseVecBaseAsInput = false;
10657 if (Mask.empty())
10658 return nullptr;
10659 Value *VecBase = nullptr;
10660 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10661 if (!E->ReorderIndices.empty()) {
10662 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10663 E->ReorderIndices.end());
10664 reorderScalars(VL, ReorderMask);
10665 }
10666 // Check if the extracts can be considered reused, i.e. the same
10667 // extractelements were already vectorized in an earlier tree node.
10668 bool PrevNodeFound = any_of(
10669 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10670 [&](const std::unique_ptr<TreeEntry> &TE) {
10671 return ((TE->hasState() && !TE->isAltShuffle() &&
10672 TE->getOpcode() == Instruction::ExtractElement) ||
10673 TE->isGather()) &&
10674 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10675 return VL.size() > Data.index() &&
10676 (Mask[Data.index()] == PoisonMaskElem ||
10677 isa<UndefValue>(VL[Data.index()]) ||
10678 Data.value() == VL[Data.index()]);
10679 });
10680 });
10681 SmallPtrSet<Value *, 4> UniqueBases;
10682 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10683 for (unsigned Part : seq<unsigned>(NumParts)) {
10684 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10685 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10686 for (auto [I, V] :
10687 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10688 // Ignore non-extractelement scalars.
10689 if (isa<UndefValue>(V) ||
10690 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10691 continue;
10692 // If all users of the instruction are going to be vectorized and this
10693 // instruction itself is not going to be vectorized, consider this
10694 // instruction as dead and remove its cost from the final cost of the
10695 // vectorized tree.
10696 // Also, avoid adjusting the cost for extractelements with multiple uses
10697 // in different graph entries.
10698 auto *EE = cast<ExtractElementInst>(V);
10699 VecBase = EE->getVectorOperand();
10700 UniqueBases.insert(VecBase);
10701 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
10702 if (!CheckedExtracts.insert(V).second ||
10703 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10704 any_of(EE->users(),
10705 [&](User *U) {
10706 return isa<GetElementPtrInst>(U) &&
10707 !R.areAllUsersVectorized(cast<Instruction>(U),
10708 &VectorizedVals);
10709 }) ||
10710 (!VEs.empty() && !is_contained(VEs, E)))
10711 continue;
10712 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10713 if (!EEIdx)
10714 continue;
10715 unsigned Idx = *EEIdx;
10716 // Take credit for instruction that will become dead.
10717 if (EE->hasOneUse() || !PrevNodeFound) {
10718 Instruction *Ext = EE->user_back();
10719 if (isa<SExtInst, ZExtInst>(Ext) &&
10720 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10721 // Use getExtractWithExtendCost() to calculate the cost of
10722 // extractelement/ext pair.
10723 Cost -=
10724 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10725 EE->getVectorOperandType(), Idx);
10726 // Add back the cost of s|zext which is subtracted separately.
10727 Cost += TTI.getCastInstrCost(
10728 Ext->getOpcode(), Ext->getType(), EE->getType(),
10729 TTI::getCastContextHint(Ext), CostKind, Ext);
10730 continue;
10731 }
10732 }
10733 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10734 CostKind, Idx);
10735 }
10736 }
10737 // Check that the gather of extractelements can be represented as just a
10738 // shuffle of the one or two vectors the scalars are extracted from.
10739 // We found a bunch of extractelement instructions that must be gathered
10740 // into a vector and can be represented as a permutation of the elements
10741 // of one or two input vectors. The cost is skipped if the same
10742 // extractelements were already vectorized in an earlier node (reused).
10743 if (!PrevNodeFound)
10744 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10745 InVectors.assign(1, E);
10746 CommonMask.assign(Mask.begin(), Mask.end());
10747 transformMaskAfterShuffle(CommonMask, CommonMask);
10748 SameNodesEstimated = false;
10749 if (NumParts != 1 && UniqueBases.size() != 1) {
10750 UseVecBaseAsInput = true;
10751 VecBase =
10752 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10753 }
10754 return VecBase;
10755 }
10756 /// Checks if the specified entry \p E needs to be delayed because of its
10757 /// dependency nodes.
10758 std::optional<InstructionCost>
10759 needToDelay(const TreeEntry *,
10761 // No need to delay the cost estimation during analysis.
10762 return std::nullopt;
10763 }
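// Roughly: the add() overloads below feed shuffle inputs into the estimator.
// Tree entries and/or already-vectorized values are recorded in InVectors and
// their lane selections folded into CommonMask; the shuffle cost itself is
// charged lazily, either when a third input forces an intermediate shuffle or
// in finalize().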
10764 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10765 if (&E1 == &E2) {
10766 assert(all_of(Mask,
10767 [&](int Idx) {
10768 return Idx < static_cast<int>(E1.getVectorFactor());
10769 }) &&
10770 "Expected single vector shuffle mask.");
10771 add(E1, Mask);
10772 return;
10773 }
10774 if (InVectors.empty()) {
10775 CommonMask.assign(Mask.begin(), Mask.end());
10776 InVectors.assign({&E1, &E2});
10777 return;
10778 }
10779 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10780 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10781 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10782 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10783 const auto *It =
10784 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10785 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10786 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10787 }
10788 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10789 if (InVectors.empty()) {
10790 CommonMask.assign(Mask.begin(), Mask.end());
10791 InVectors.assign(1, &E1);
10792 return;
10793 }
10794 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10795 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10796 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10797 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10798 const auto *It =
10799 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10800 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10801 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10802 if (!SameNodesEstimated && InVectors.size() == 1)
10803 InVectors.emplace_back(&E1);
10804 }
10805 /// Adds 2 input vectors and the mask for their shuffling.
10806 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10807 // This may be reached only when shuffling 2 vectors of extractelements,
10808 // which were already handled in adjustExtracts.
10809 assert(InVectors.size() == 1 &&
10810 all_of(enumerate(CommonMask),
10811 [&](auto P) {
10812 if (P.value() == PoisonMaskElem)
10813 return Mask[P.index()] == PoisonMaskElem;
10814 auto *EI = cast<ExtractElementInst>(
10815 cast<const TreeEntry *>(InVectors.front())
10816 ->getOrdered(P.index()));
10817 return EI->getVectorOperand() == V1 ||
10818 EI->getVectorOperand() == V2;
10819 }) &&
10820 "Expected extractelement vectors.");
10821 }
10822 /// Adds another one input vector and the mask for the shuffling.
10823 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10824 if (InVectors.empty()) {
10825 assert(CommonMask.empty() && !ForExtracts &&
10826 "Expected empty input mask/vectors.");
10827 CommonMask.assign(Mask.begin(), Mask.end());
10828 InVectors.assign(1, V1);
10829 return;
10830 }
10831 if (ForExtracts) {
10832 // No need to add vectors here, already handled them in adjustExtracts.
10833 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10834 !CommonMask.empty() &&
10835 all_of(enumerate(CommonMask),
10836 [&](auto P) {
10837 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10838 ->getOrdered(P.index());
10839 if (P.value() == PoisonMaskElem)
10840 return P.value() == Mask[P.index()] ||
10841 isa<UndefValue>(Scalar);
10842 if (isa<Constant>(V1))
10843 return true;
10844 auto *EI = cast<ExtractElementInst>(Scalar);
10845 return EI->getVectorOperand() == V1;
10846 }) &&
10847 "Expected only tree entry for extractelement vectors.");
10848 return;
10849 }
10850 assert(!InVectors.empty() && !CommonMask.empty() &&
10851 "Expected only tree entries from extracts/reused buildvectors.");
10852 unsigned VF = getVF(V1);
10853 if (InVectors.size() == 2) {
10854 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10855 transformMaskAfterShuffle(CommonMask, CommonMask);
10856 VF = std::max<unsigned>(VF, CommonMask.size());
10857 } else if (const auto *InTE =
10858 InVectors.front().dyn_cast<const TreeEntry *>()) {
10859 VF = std::max(VF, InTE->getVectorFactor());
10860 } else {
10861 VF = std::max(
10862 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10863 ->getNumElements());
10864 }
10865 InVectors.push_back(V1);
10866 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10867 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10868 CommonMask[Idx] = Mask[Idx] + VF;
10869 }
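// For illustration of the mask convention used here: indices below VF select
// from the inputs already recorded, indices at or above VF select from the
// newly added vector. E.g. with VF = 4, a combined mask {0, 5, 2, 7} takes
// lanes 0 and 2 from the existing input and lanes 1 and 3 from the new one.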
10870 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10871 Value *Root = nullptr) {
10872 Cost += getBuildVectorCost(VL, Root);
10873 if (!Root) {
10874 // FIXME: Need to find a way to avoid use of getNullValue here.
10875 SmallVector<Constant *> Vals;
10876 unsigned VF = VL.size();
10877 if (MaskVF != 0)
10878 VF = std::min(VF, MaskVF);
10879 for (Value *V : VL.take_front(VF)) {
10880 if (isa<UndefValue>(V)) {
10881 Vals.push_back(cast<Constant>(V));
10882 continue;
10883 }
10884 Vals.push_back(Constant::getNullValue(V->getType()));
10885 }
10886 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10887 assert(SLPReVec && "FixedVectorType is not expected.");
10888 // When REVEC is enabled, we need to expand vector types into scalar
10889 // types.
10890 unsigned VecTyNumElements = VecTy->getNumElements();
10891 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10892 for (auto [I, V] : enumerate(Vals)) {
10893 Type *ScalarTy = V->getType()->getScalarType();
10894 Constant *NewVal;
10895 if (isa<PoisonValue>(V))
10896 NewVal = PoisonValue::get(ScalarTy);
10897 else if (isa<UndefValue>(V))
10898 NewVal = UndefValue::get(ScalarTy);
10899 else
10900 NewVal = Constant::getNullValue(ScalarTy);
10901 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10902 NewVal);
10903 }
10904 Vals.swap(NewVals);
10905 }
10906 return ConstantVector::get(Vals);
10907 }
10910 cast<FixedVectorType>(Root->getType())->getNumElements()),
10911 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10912 }
10914 /// Finalize emission of the shuffles.
10917 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10918 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10919 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10920 IsFinalized = true;
10921 if (Action) {
10922 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10923 if (InVectors.size() == 2)
10924 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10925 else
10926 Cost += createShuffle(Vec, nullptr, CommonMask);
10927 transformMaskAfterShuffle(CommonMask, CommonMask);
10928 assert(VF > 0 &&
10929 "Expected vector length for the final value before action.");
10930 Value *V = cast<Value *>(Vec);
10931 Action(V, CommonMask);
10932 InVectors.front() = V;
10933 }
10934 if (!SubVectors.empty()) {
10935 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10936 if (InVectors.size() == 2)
10937 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10938 else
10939 Cost += createShuffle(Vec, nullptr, CommonMask);
10940 transformMaskAfterShuffle(CommonMask, CommonMask);
10941 // Add subvectors permutation cost.
10942 if (!SubVectorsMask.empty()) {
10943 assert(SubVectorsMask.size() <= CommonMask.size() &&
10944 "Expected same size of masks for subvectors and common mask.");
10945 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10946 copy(SubVectorsMask, SVMask.begin());
10947 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10948 if (I2 != PoisonMaskElem) {
10949 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10950 I1 = I2 + CommonMask.size();
10951 }
10952 }
10954 getWidenedType(ScalarTy, CommonMask.size()),
10955 SVMask, CostKind);
10956 }
10957 for (auto [E, Idx] : SubVectors) {
10958 Type *EScalarTy = E->Scalars.front()->getType();
10959 bool IsSigned = true;
10960 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10961 EScalarTy =
10962 IntegerType::get(EScalarTy->getContext(), It->second.first);
10963 IsSigned = It->second.second;
10964 }
10965 if (ScalarTy != EScalarTy) {
10966 unsigned CastOpcode = Instruction::Trunc;
10967 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10968 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10969 if (DstSz > SrcSz)
10970 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10972 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10973 getWidenedType(EScalarTy, E->getVectorFactor()),
10975 }
10978 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10979 getWidenedType(ScalarTy, E->getVectorFactor()));
10980 if (!CommonMask.empty()) {
10981 std::iota(std::next(CommonMask.begin(), Idx),
10982 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10983 Idx);
10984 }
10985 }
10986 }
10987
10988 if (!ExtMask.empty()) {
10989 if (CommonMask.empty()) {
10990 CommonMask.assign(ExtMask.begin(), ExtMask.end());
10991 } else {
10992 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
10993 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
10994 if (ExtMask[I] == PoisonMaskElem)
10995 continue;
10996 NewMask[I] = CommonMask[ExtMask[I]];
10997 }
10998 CommonMask.swap(NewMask);
10999 }
11000 }
11001 if (CommonMask.empty()) {
11002 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11003 return Cost;
11004 }
11005 return Cost +
11006 createShuffle(InVectors.front(),
11007 InVectors.size() == 2 ? InVectors.back() : nullptr,
11008 CommonMask);
11009 }
11010
11011 ~ShuffleCostEstimator() {
11012 assert((IsFinalized || CommonMask.empty()) &&
11013 "Shuffle construction must be finalized.");
11014 }
11015};
11016
11017const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
11018 unsigned Idx) const {
11019 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
11020 return VE;
11021 const auto *It =
11022 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11023 return TE->isGather() &&
11024 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
11025 return EI.EdgeIdx == Idx && EI.UserTE == E;
11026 }) != TE->UserTreeIndices.end();
11027 });
11028 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
11029 return It->get();
11030}
11031
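// Roughly speaking, the cast context hint tells TTI how the value being cast
// is produced or consumed (plain contiguous load, reversed access,
// gather/scatter, ...), because e.g. an extend that can be folded into a
// widening contiguous load is often cheaper than one fed by a gathered load.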
11032TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
11033 if (TE.State == TreeEntry::ScatterVectorize ||
11034 TE.State == TreeEntry::StridedVectorize)
11035 return TTI::CastContextHint::GatherScatter;
11036 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11037 !TE.isAltShuffle()) {
11038 if (TE.ReorderIndices.empty())
11039 return TTI::CastContextHint::Normal;
11040 SmallVector<int> Mask;
11041 inversePermutation(TE.ReorderIndices, Mask);
11042 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
11043 return TTI::CastContextHint::Reversed;
11044 }
11045 return TTI::CastContextHint::None;
11046}
11047
11048/// Builds the arguments types vector for the given call instruction with the
11049/// given \p ID for the specified vector factor.
11052 const unsigned VF, unsigned MinBW,
11053 const TargetTransformInfo *TTI) {
11054 SmallVector<Type *> ArgTys;
11055 for (auto [Idx, Arg] : enumerate(CI->args())) {
11058 ArgTys.push_back(Arg->getType());
11059 continue;
11060 }
11061 if (MinBW > 0) {
11062 ArgTys.push_back(
11063 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11064 continue;
11065 }
11066 }
11067 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11068 }
11069 return ArgTys;
11070}
11071
11073BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11074 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11075 ArrayRef<Value *> VL = E->Scalars;
11076
11077 Type *ScalarTy = getValueType(VL[0]);
11078 if (!isValidElementType(ScalarTy))
11081
11082 // If we have computed a smaller type for the expression, update VecTy so
11083 // that the costs will be accurate.
11084 auto It = MinBWs.find(E);
11085 Type *OrigScalarTy = ScalarTy;
11086 if (It != MinBWs.end()) {
11087 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11088 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11089 if (VecTy)
11090 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11091 }
11092 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11093 unsigned EntryVF = E->getVectorFactor();
11094 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11095
11096 if (E->isGather()) {
11097 if (allConstant(VL))
11098 return 0;
11099 if (isa<InsertElementInst>(VL[0]))
11100 return InstructionCost::getInvalid();
11101 if (isa<CmpInst>(VL.front()))
11102 ScalarTy = VL.front()->getType();
11103 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11104 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11105 }
11106 InstructionCost CommonCost = 0;
11107 SmallVector<int> Mask;
11108 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11109 !isReverseOrder(E->ReorderIndices))) {
11110 SmallVector<int> NewMask;
11111 if (E->getOpcode() == Instruction::Store) {
11112 // For stores the order is actually a mask.
11113 NewMask.resize(E->ReorderIndices.size());
11114 copy(E->ReorderIndices, NewMask.begin());
11115 } else {
11116 inversePermutation(E->ReorderIndices, NewMask);
11117 }
11118 ::addMask(Mask, NewMask);
11119 }
11120 if (!E->ReuseShuffleIndices.empty())
11121 ::addMask(Mask, E->ReuseShuffleIndices);
11122 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11123 CommonCost =
11124 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11125 assert((E->State == TreeEntry::Vectorize ||
11126 E->State == TreeEntry::ScatterVectorize ||
11127 E->State == TreeEntry::StridedVectorize) &&
11128 "Unhandled state");
11129 assert(E->getOpcode() &&
11130 ((allSameType(VL) && allSameBlock(VL)) ||
11131 (E->getOpcode() == Instruction::GetElementPtr &&
11132 E->getMainOp()->getType()->isPointerTy())) &&
11133 "Invalid VL");
11134 Instruction *VL0 = E->getMainOp();
11135 unsigned ShuffleOrOp =
11136 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11137 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11138 ShuffleOrOp = E->CombinedOp;
11139 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11140 const unsigned Sz = UniqueValues.size();
11141 SmallBitVector UsedScalars(Sz, false);
11142 for (unsigned I = 0; I < Sz; ++I) {
11143 if (isa<Instruction>(UniqueValues[I]) &&
11144 is_contained(getTreeEntries(UniqueValues[I]), E))
11145 continue;
11146 UsedScalars.set(I);
11147 }
11148 auto GetCastContextHint = [&](Value *V) {
11149 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
11150 return getCastContextHint(*OpTEs.front());
11151 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11152 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11153 !SrcState.isAltShuffle())
11154 return TTI::CastContextHint::GatherScatter;
11155 return TTI::CastContextHint::None;
11156 };
11157 auto GetCostDiff =
11158 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11160 // Calculate the cost of this instruction.
11161 InstructionCost ScalarCost = 0;
11162 if (isa<CastInst, CallInst>(VL0)) {
11163 // For some of the instructions no need to calculate cost for each
11164 // particular instruction, we can use the cost of the single
11165 // instruction x total number of scalar instructions.
11166 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11167 } else {
11168 for (unsigned I = 0; I < Sz; ++I) {
11169 if (UsedScalars.test(I))
11170 continue;
11171 ScalarCost += ScalarEltCost(I);
11172 }
11173 }
11174
11175 InstructionCost VecCost = VectorCost(CommonCost);
11176 // Check if the current node must be resized, if the parent node is not
11177 // resized.
11178 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11179 E->Idx != 0 &&
11180 (E->getOpcode() != Instruction::Load ||
11181 !E->UserTreeIndices.empty())) {
11182 const EdgeInfo &EI =
11183 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11184 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11185 });
11186 if (EI.UserTE->getOpcode() != Instruction::Select ||
11187 EI.EdgeIdx != 0) {
11188 auto UserBWIt = MinBWs.find(EI.UserTE);
11189 Type *UserScalarTy =
11190 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11191 if (UserBWIt != MinBWs.end())
11192 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11193 UserBWIt->second.first);
11194 if (ScalarTy != UserScalarTy) {
11195 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11196 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11197 unsigned VecOpcode;
11198 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11199 if (BWSz > SrcBWSz)
11200 VecOpcode = Instruction::Trunc;
11201 else
11202 VecOpcode =
11203 It->second.second ? Instruction::SExt : Instruction::ZExt;
11204 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11205 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11206 CostKind);
11207 }
11208 }
11209 }
11210 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11211 ScalarCost, "Calculated costs for Tree"));
11212 return VecCost - ScalarCost;
11213 };
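// For illustration (numbers made up): if the 4 scalar instructions of a node
// cost 4 in total and the single vector instruction plus the node's reorder
// shuffle cost 2, the returned difference is -2, i.e. vectorizing this node
// saves 2. A truncation/extension cast is added to the vector cost when the
// node was demoted to a narrower type than its user expects.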
11214 // Calculate cost difference from vectorizing set of GEPs.
11215 // Negative value means vectorizing is profitable.
11216 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11217 assert((E->State == TreeEntry::Vectorize ||
11218 E->State == TreeEntry::StridedVectorize) &&
11219 "Entry state expected to be Vectorize or StridedVectorize here.");
11220 InstructionCost ScalarCost = 0;
11221 InstructionCost VecCost = 0;
11222 std::tie(ScalarCost, VecCost) = getGEPCosts(
11223 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11224 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11225 "Calculated GEPs cost for Tree"));
11226
11227 return VecCost - ScalarCost;
11228 };
11229
11230 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11231 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11232 if (MinMaxID == Intrinsic::not_intrinsic)
11234 Type *CanonicalType = Ty;
11235 if (CanonicalType->isPtrOrPtrVectorTy())
11236 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11237 CanonicalType->getContext(),
11238 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11239
11240 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11241 {CanonicalType, CanonicalType});
11242 InstructionCost IntrinsicCost =
11243 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11244 // If the selects are the only uses of the compares, they will be
11245 // dead and we can adjust the cost by removing their cost.
11246 if (VI && SelectOnly) {
11247 assert((!Ty->isVectorTy() || SLPReVec) &&
11248 "Expected only for scalar type.");
11249 auto *CI = cast<CmpInst>(VI->getOperand(0));
11250 IntrinsicCost -= TTI->getCmpSelInstrCost(
11251 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11252 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11253 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11254 }
11255 return IntrinsicCost;
11256 };
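// Roughly: a cmp+select pair that implements min/max, e.g. (a < b) ? a : b,
// is costed as the matching min/max intrinsic; pointer types are canonicalized
// to same-width integers first, and when the select is the compare's only user
// the compare's cost is credited back since it becomes dead.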
11257 switch (ShuffleOrOp) {
11258 case Instruction::PHI: {
11259 // Count reused scalars.
11260 InstructionCost ScalarCost = 0;
11262 for (Value *V : UniqueValues) {
11263 auto *PHI = dyn_cast<PHINode>(V);
11264 if (!PHI)
11265 continue;
11266
11267 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11268 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11269 Value *Op = PHI->getIncomingValue(I);
11270 Operands[I] = Op;
11271 }
11272 if (const TreeEntry *OpTE =
11273 getSameValuesTreeEntry(Operands.front(), Operands))
11274 if (CountedOps.insert(OpTE).second &&
11275 !OpTE->ReuseShuffleIndices.empty())
11276 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11277 OpTE->Scalars.size());
11278 }
11279
11280 return CommonCost - ScalarCost;
11281 }
11282 case Instruction::ExtractValue:
11283 case Instruction::ExtractElement: {
11284 auto GetScalarCost = [&](unsigned Idx) {
11285 if (isa<PoisonValue>(UniqueValues[Idx]))
11287
11288 auto *I = cast<Instruction>(UniqueValues[Idx]);
11289 VectorType *SrcVecTy;
11290 if (ShuffleOrOp == Instruction::ExtractElement) {
11291 auto *EE = cast<ExtractElementInst>(I);
11292 SrcVecTy = EE->getVectorOperandType();
11293 } else {
11294 auto *EV = cast<ExtractValueInst>(I);
11295 Type *AggregateTy = EV->getAggregateOperand()->getType();
11296 unsigned NumElts;
11297 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11298 NumElts = ATy->getNumElements();
11299 else
11300 NumElts = AggregateTy->getStructNumElements();
11301 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11302 }
11303 if (I->hasOneUse()) {
11304 Instruction *Ext = I->user_back();
11305 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11306 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11307 // Use getExtractWithExtendCost() to calculate the cost of
11308 // extractelement/ext pair.
11310 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11311 // Subtract the cost of s|zext which is subtracted separately.
11313 Ext->getOpcode(), Ext->getType(), I->getType(),
11315 return Cost;
11316 }
11317 }
11318 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11320 };
11321 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11322 return GetCostDiff(GetScalarCost, GetVectorCost);
11323 }
11324 case Instruction::InsertElement: {
11325 assert(E->ReuseShuffleIndices.empty() &&
11326 "Unique insertelements only are expected.");
11327 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11328 unsigned const NumElts = SrcVecTy->getNumElements();
11329 unsigned const NumScalars = VL.size();
11330
11331 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
11332
11333 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11334 unsigned OffsetBeg = *getElementIndex(VL.front());
11335 unsigned OffsetEnd = OffsetBeg;
11336 InsertMask[OffsetBeg] = 0;
11337 for (auto [I, V] : enumerate(VL.drop_front())) {
11338 unsigned Idx = *getElementIndex(V);
11339 if (OffsetBeg > Idx)
11340 OffsetBeg = Idx;
11341 else if (OffsetEnd < Idx)
11342 OffsetEnd = Idx;
11343 InsertMask[Idx] = I + 1;
11344 }
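// For illustration: inserting 4 scalars into lanes 2..5 of an 8-element
// vector (assuming the scalars appear in lane order) gives OffsetBeg = 2,
// OffsetEnd = 5 and InsertMask = {P,P,0,1,2,3,P,P} with P = poison.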
11345 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11346 if (NumOfParts > 0 && NumOfParts < NumElts)
11347 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11348 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11349 VecScalarsSz;
11350 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11351 unsigned InsertVecSz = std::min<unsigned>(
11352 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11353 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11354 bool IsWholeSubvector =
11355 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11356 // Check if we can safely insert a subvector. If it is not possible, just
11357 // generate a whole-sized vector and shuffle the source vector and the new
11358 // subvector.
11359 if (OffsetBeg + InsertVecSz > VecSz) {
11360 // Align OffsetBeg to generate correct mask.
11361 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11362 InsertVecSz = VecSz;
11363 }
11364
11365 APInt DemandedElts = APInt::getZero(NumElts);
11366 // TODO: Add support for Instruction::InsertValue.
11368 if (!E->ReorderIndices.empty()) {
11369 inversePermutation(E->ReorderIndices, Mask);
11370 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11371 } else {
11372 Mask.assign(VecSz, PoisonMaskElem);
11373 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11374 }
11375 bool IsIdentity = true;
11376 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11377 Mask.swap(PrevMask);
11378 for (unsigned I = 0; I < NumScalars; ++I) {
11379 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11380 DemandedElts.setBit(InsertIdx);
11381 IsIdentity &= InsertIdx - OffsetBeg == I;
11382 Mask[InsertIdx - OffsetBeg] = I;
11383 }
11384 assert(Offset < NumElts && "Failed to find vector index offset");
11385
11387 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11388 /*Insert*/ true, /*Extract*/ false,
11389 CostKind);
11390
11391 // First cost - resize to actual vector size if not identity shuffle or
11392 // need to shift the vector.
11393 // Do not calculate the cost if the actual size is the register size and
11394 // we can merge this shuffle with the following SK_Select.
11395 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11396 if (!IsIdentity)
11398 InsertVecTy, Mask);
11399 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11400 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11401 }));
11402 // Second cost - permutation with subvector, if some elements are from the
11403 // initial vector or inserting a subvector.
11404 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11405 // subvector of ActualVecTy.
11406 SmallBitVector InMask =
11407 isUndefVector(FirstInsert->getOperand(0),
11408 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11409 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11410 if (InsertVecSz != VecSz) {
11411 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11412 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11413 CostKind, OffsetBeg - Offset, InsertVecTy);
11414 } else {
11415 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11416 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11417 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11418 I <= End; ++I)
11419 if (Mask[I] != PoisonMaskElem)
11420 Mask[I] = I + VecSz;
11421 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11422 Mask[I] =
11423 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11424 Cost +=
11425 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11426 }
11427 }
11428 return Cost;
11429 }
11430 case Instruction::ZExt:
11431 case Instruction::SExt:
11432 case Instruction::FPToUI:
11433 case Instruction::FPToSI:
11434 case Instruction::FPExt:
11435 case Instruction::PtrToInt:
11436 case Instruction::IntToPtr:
11437 case Instruction::SIToFP:
11438 case Instruction::UIToFP:
11439 case Instruction::Trunc:
11440 case Instruction::FPTrunc:
11441 case Instruction::BitCast: {
11442 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11443 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11444 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11445 unsigned Opcode = ShuffleOrOp;
11446 unsigned VecOpcode = Opcode;
11447 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11448 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11449 // Check if the values are candidates to demote.
11450 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11451 if (SrcIt != MinBWs.end()) {
11452 SrcBWSz = SrcIt->second.first;
11453 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11454 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11455 SrcVecTy =
11456 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11457 }
11458 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11459 if (BWSz == SrcBWSz) {
11460 VecOpcode = Instruction::BitCast;
11461 } else if (BWSz < SrcBWSz) {
11462 VecOpcode = Instruction::Trunc;
11463 } else if (It != MinBWs.end()) {
11464 assert(BWSz > SrcBWSz && "Invalid cast!");
11465 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11466 } else if (SrcIt != MinBWs.end()) {
11467 assert(BWSz > SrcBWSz && "Invalid cast!");
11468 VecOpcode =
11469 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11470 }
11471 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11472 !SrcIt->second.second) {
11473 VecOpcode = Instruction::UIToFP;
11474 }
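// Roughly: with demoted operands the effective vector cast may change. E.g. a
// zext i16 -> i32 whose source and destination were both demoted to i16
// becomes a bitcast and is treated as free below, while a sext i8 -> i64 whose
// destination was demoted to i16 is costed as sext i8 -> i16 instead.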
11475 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11476 assert(Idx == 0 && "Expected 0 index only");
11477 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11478 VL0->getOperand(0)->getType(),
11480 };
11481 auto GetVectorCost = [=](InstructionCost CommonCost) {
11482 // Do not count cost here if minimum bitwidth is in effect and it is just
11483 // a bitcast (here it is just a noop).
11484 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11485 return CommonCost;
11486 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11487 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11488
11489 bool IsArithmeticExtendedReduction =
11490 E->Idx == 0 && UserIgnoreList &&
11491 all_of(*UserIgnoreList, [](Value *V) {
11492 auto *I = cast<Instruction>(V);
11493 return is_contained({Instruction::Add, Instruction::FAdd,
11494 Instruction::Mul, Instruction::FMul,
11495 Instruction::And, Instruction::Or,
11496 Instruction::Xor},
11497 I->getOpcode());
11498 });
11499 if (IsArithmeticExtendedReduction &&
11500 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11501 return CommonCost;
11502 return CommonCost +
11503 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11504 VecOpcode == Opcode ? VI : nullptr);
11505 };
11506 return GetCostDiff(GetScalarCost, GetVectorCost);
11507 }
11508 case Instruction::FCmp:
11509 case Instruction::ICmp:
11510 case Instruction::Select: {
11511 CmpPredicate VecPred, SwappedVecPred;
11512 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11513 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11514 match(VL0, MatchCmp))
11515 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11516 else
11517 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11520 auto GetScalarCost = [&](unsigned Idx) {
11521 if (isa<PoisonValue>(UniqueValues[Idx]))
11523
11524 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11525 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11528 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11529 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11530 !match(VI, MatchCmp)) ||
11531 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11532 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11533 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11536
11538 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11539 CostKind, getOperandInfo(VI->getOperand(0)),
11540 getOperandInfo(VI->getOperand(1)), VI);
11541 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11542 if (IntrinsicCost.isValid())
11543 ScalarCost = IntrinsicCost;
11544
11545 return ScalarCost;
11546 };
11547 auto GetVectorCost = [&](InstructionCost CommonCost) {
11548 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11549
11550 InstructionCost VecCost =
11551 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11552 CostKind, getOperandInfo(E->getOperand(0)),
11553 getOperandInfo(E->getOperand(1)), VL0);
11554 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11555 auto *CondType =
11556 getWidenedType(SI->getCondition()->getType(), VL.size());
11557 unsigned CondNumElements = CondType->getNumElements();
11558 unsigned VecTyNumElements = getNumElements(VecTy);
11559 assert(VecTyNumElements >= CondNumElements &&
11560 VecTyNumElements % CondNumElements == 0 &&
11561 "Cannot vectorize Instruction::Select");
11562 if (CondNumElements != VecTyNumElements) {
11563 // When the return type is i1 but the source is a fixed vector type, we
11564 // need to duplicate the condition value.
11565 VecCost += ::getShuffleCost(
11566 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11567 createReplicatedMask(VecTyNumElements / CondNumElements,
11568 CondNumElements));
11569 }
11570 }
11571 return VecCost + CommonCost;
11572 };
11573 return GetCostDiff(GetScalarCost, GetVectorCost);
11574 }
11575 case TreeEntry::MinMax: {
11576 auto GetScalarCost = [&](unsigned Idx) {
11577 return GetMinMaxCost(OrigScalarTy);
11578 };
11579 auto GetVectorCost = [&](InstructionCost CommonCost) {
11580 InstructionCost VecCost = GetMinMaxCost(VecTy);
11581 return VecCost + CommonCost;
11582 };
11583 return GetCostDiff(GetScalarCost, GetVectorCost);
11584 }
11585 case Instruction::FNeg:
11586 case Instruction::Add:
11587 case Instruction::FAdd:
11588 case Instruction::Sub:
11589 case Instruction::FSub:
11590 case Instruction::Mul:
11591 case Instruction::FMul:
11592 case Instruction::UDiv:
11593 case Instruction::SDiv:
11594 case Instruction::FDiv:
11595 case Instruction::URem:
11596 case Instruction::SRem:
11597 case Instruction::FRem:
11598 case Instruction::Shl:
11599 case Instruction::LShr:
11600 case Instruction::AShr:
11601 case Instruction::And:
11602 case Instruction::Or:
11603 case Instruction::Xor: {
11604 auto GetScalarCost = [&](unsigned Idx) {
11605 if (isa<PoisonValue>(UniqueValues[Idx]))
11607
11608 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11609 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11610 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11611 TTI::OperandValueInfo Op2Info =
11612 TTI::getOperandInfo(VI->getOperand(OpIdx));
11613 SmallVector<const Value *> Operands(VI->operand_values());
11614 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11615 Op1Info, Op2Info, Operands, VI);
11616 };
11617 auto GetVectorCost = [=](InstructionCost CommonCost) {
11618 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11619 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11620 ArrayRef<Value *> Ops = E->getOperand(I);
11621 if (all_of(Ops, [&](Value *Op) {
11622 auto *CI = dyn_cast<ConstantInt>(Op);
11623 return CI && CI->getValue().countr_one() >= It->second.first;
11624 }))
11625 return CommonCost;
11626 }
11627 }
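// For illustration: an `and i32 %x, 255` in an expression demoted to i8 keeps
// all 8 low bits (countr_one(255) == 8 >= MinBW), so the vector `and` would be
// a no-op after truncation and only the common shuffle cost remains.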
11628 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11629 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11630 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11631 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11632 Op2Info, {}, nullptr, TLI) +
11633 CommonCost;
11634 };
11635 return GetCostDiff(GetScalarCost, GetVectorCost);
11636 }
11637 case Instruction::GetElementPtr: {
11638 return CommonCost + GetGEPCostDiff(VL, VL0);
11639 }
11640 case Instruction::Load: {
11641 auto GetScalarCost = [&](unsigned Idx) {
11642 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11643 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11644 VI->getAlign(), VI->getPointerAddressSpace(),
11646 };
11647 auto *LI0 = cast<LoadInst>(VL0);
11648 auto GetVectorCost = [&](InstructionCost CommonCost) {
11649 InstructionCost VecLdCost;
11650 switch (E->State) {
11651 case TreeEntry::Vectorize:
11652 if (unsigned Factor = E->getInterleaveFactor()) {
11653 VecLdCost = TTI->getInterleavedMemoryOpCost(
11654 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11655 LI0->getPointerAddressSpace(), CostKind);
11656
11657 } else {
11658 VecLdCost = TTI->getMemoryOpCost(
11659 Instruction::Load, VecTy, LI0->getAlign(),
11660 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11661 }
11662 break;
11663 case TreeEntry::StridedVectorize: {
11664 Align CommonAlignment =
11665 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11666 VecLdCost = TTI->getStridedMemoryOpCost(
11667 Instruction::Load, VecTy, LI0->getPointerOperand(),
11668 /*VariableMask=*/false, CommonAlignment, CostKind);
11669 break;
11670 }
11671 case TreeEntry::ScatterVectorize: {
11672 Align CommonAlignment =
11673 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11674 VecLdCost = TTI->getGatherScatterOpCost(
11675 Instruction::Load, VecTy, LI0->getPointerOperand(),
11676 /*VariableMask=*/false, CommonAlignment, CostKind);
11677 break;
11678 }
11679 case TreeEntry::CombinedVectorize:
11680 case TreeEntry::NeedToGather:
11681 llvm_unreachable("Unexpected vectorization state.");
11682 }
11683 return VecLdCost + CommonCost;
11684 };
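// Roughly: a consecutive load node is costed as one wide load (or an
// interleaved-access load when an interleave factor was detected), a strided
// node uses the strided-memory-access cost, and a masked-gather node uses the
// gather/scatter cost; the node's CommonCost (reorder/reuse shuffles) is added
// on top.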
11685
11686 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11687 // If this node generates a masked gather load, it is not a terminal node.
11688 // Hence the address operand cost is estimated separately.
11689 if (E->State == TreeEntry::ScatterVectorize)
11690 return Cost;
11691
11692 // Estimate the cost of the GEPs since this tree node is a terminal node.
11693 SmallVector<Value *> PointerOps(VL.size());
11694 for (auto [I, V] : enumerate(VL))
11695 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11696 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11697 }
11698 case Instruction::Store: {
11699 bool IsReorder = !E->ReorderIndices.empty();
11700 auto GetScalarCost = [=](unsigned Idx) {
11701 auto *VI = cast<StoreInst>(VL[Idx]);
11702 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11703 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11704 VI->getAlign(), VI->getPointerAddressSpace(),
11705 CostKind, OpInfo, VI);
11706 };
11707 auto *BaseSI =
11708 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11709 auto GetVectorCost = [=](InstructionCost CommonCost) {
11710 // We know that we can merge the stores. Calculate the cost.
11711 InstructionCost VecStCost;
11712 if (E->State == TreeEntry::StridedVectorize) {
11713 Align CommonAlignment =
11714 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11715 VecStCost = TTI->getStridedMemoryOpCost(
11716 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11717 /*VariableMask=*/false, CommonAlignment, CostKind);
11718 } else {
11719 assert(E->State == TreeEntry::Vectorize &&
11720 "Expected either strided or consecutive stores.");
11721 if (unsigned Factor = E->getInterleaveFactor()) {
11722 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11723 "No reused shuffles expected");
11724 CommonCost = 0;
11725 VecStCost = TTI->getInterleavedMemoryOpCost(
11726 Instruction::Store, VecTy, Factor, std::nullopt,
11727 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11728 } else {
11729 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11730 VecStCost = TTI->getMemoryOpCost(
11731 Instruction::Store, VecTy, BaseSI->getAlign(),
11732 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11733 }
11734 }
11735 return VecStCost + CommonCost;
11736 };
11737 SmallVector<Value *> PointerOps(VL.size());
11738 for (auto [I, V] : enumerate(VL)) {
11739 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11740 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11741 }
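// Roughly: the store pointer operands are collected in vectorized order
// (respecting ReorderIndices when the stores were reordered) so the GEP cost
// is estimated against the base pointer of that order.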
11742
11743 return GetCostDiff(GetScalarCost, GetVectorCost) +
11744 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11745 }
11746 case Instruction::Call: {
11747 auto GetScalarCost = [&](unsigned Idx) {
11748 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11751 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11752 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11753 }
11756 CI->getFunctionType()->params(), CostKind);
11757 };
11758 auto GetVectorCost = [=](InstructionCost CommonCost) {
11759 auto *CI = cast<CallInst>(VL0);
11762 CI, ID, VecTy->getNumElements(),
11763 It != MinBWs.end() ? It->second.first : 0, TTI);
11764 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11765 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11766 };
11767 return GetCostDiff(GetScalarCost, GetVectorCost);
11768 }
11769 case Instruction::ShuffleVector: {
11770 if (!SLPReVec || E->isAltShuffle())
11771 assert(E->isAltShuffle() &&
11772 ((Instruction::isBinaryOp(E->getOpcode()) &&
11773 Instruction::isBinaryOp(E->getAltOpcode())) ||
11774 (Instruction::isCast(E->getOpcode()) &&
11775 Instruction::isCast(E->getAltOpcode())) ||
11776 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11777 "Invalid Shuffle Vector Operand");
11778 // Try to find the previous shuffle node with the same operands and same
11779 // main/alternate ops.
11780 auto TryFindNodeWithEqualOperands = [=]() {
11781 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11782 if (TE.get() == E)
11783 break;
11784 if (TE->hasState() && TE->isAltShuffle() &&
11785 ((TE->getOpcode() == E->getOpcode() &&
11786 TE->getAltOpcode() == E->getAltOpcode()) ||
11787 (TE->getOpcode() == E->getAltOpcode() &&
11788 TE->getAltOpcode() == E->getOpcode())) &&
11789 TE->hasEqualOperands(*E))
11790 return true;
11791 }
11792 return false;
11793 };
11794 auto GetScalarCost = [&](unsigned Idx) {
11795 if (isa<PoisonValue>(UniqueValues[Idx]))
11797
11798 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11799 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11800 (void)E;
11801 return TTI->getInstructionCost(VI, CostKind);
11802 };
11803 // Need to clear CommonCost since the final shuffle cost is included in the
11804 // vector cost.
11805 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11806 // VecCost is equal to sum of the cost of creating 2 vectors
11807 // and the cost of creating shuffle.
11808 InstructionCost VecCost = 0;
11809 if (TryFindNodeWithEqualOperands()) {
11810 LLVM_DEBUG({
11811 dbgs() << "SLP: diamond match for alternate node found.\n";
11812 E->dump();
11813 });
11814 // No need to add new vector costs here since we're going to reuse
11815 // same main/alternate vector ops, just do different shuffling.
11816 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11817 VecCost =
11818 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11819 VecCost +=
11820 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11821 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11822 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11823 VecCost = TTIRef.getCmpSelInstrCost(
11824 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11825 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11826 VL0);
11827 VecCost += TTIRef.getCmpSelInstrCost(
11828 E->getOpcode(), VecTy, MaskTy,
11829 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11830 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11831 E->getAltOp());
11832 } else {
11833 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11834 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11835 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11836 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11837 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11838 unsigned SrcBWSz =
11839 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11840 if (SrcIt != MinBWs.end()) {
11841 SrcBWSz = SrcIt->second.first;
11842 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11843 SrcTy = getWidenedType(SrcSclTy, VL.size());
11844 }
11845 if (BWSz <= SrcBWSz) {
11846 if (BWSz < SrcBWSz)
11847 VecCost =
11848 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11850 LLVM_DEBUG({
11851 dbgs()
11852 << "SLP: alternate extension, which should be truncated.\n";
11853 E->dump();
11854 });
11855 return VecCost;
11856 }
11857 }
11858 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11860 VecCost +=
11861 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11863 }
11864 SmallVector<int> Mask;
11865 E->buildAltOpShuffleMask(
11866 [&](Instruction *I) {
11867 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11868 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11869 *TLI);
11870 },
11871 Mask);
11873 FinalVecTy, Mask, CostKind);
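// For illustration: for scalars {fadd, fsub, fadd, fsub} the alternate-op mask
// is {0, 5, 2, 7} (even lanes from the main-op vector, odd lanes from the
// alt-op vector, second-source indices offset by the number of scalars), and
// the blend of the two full-width vectors is costed as a two-source permute.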
11874 // Patterns like [fadd,fsub] can be combined into a single instruction
11875 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11876 // need to take into account their order when looking for the most used
11877 // order.
11878 unsigned Opcode0 = E->getOpcode();
11879 unsigned Opcode1 = E->getAltOpcode();
11880 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11881 // If this pattern is supported by the target then we consider the
11882 // order.
11883 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11884 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11885 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11886 return AltVecCost < VecCost ? AltVecCost : VecCost;
11887 }
11888 // TODO: Check the reverse order too.
11889 return VecCost;
11890 };
11891 if (SLPReVec && !E->isAltShuffle())
11892 return GetCostDiff(
11893 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11894 // If a group uses the mask in order, the shufflevector can be
11895 // eliminated by instcombine and the cost is 0.
11896 assert(isa<ShuffleVectorInst>(VL.front()) &&
11897 "Not supported shufflevector usage.");
11898 auto *SV = cast<ShuffleVectorInst>(VL.front());
11899 unsigned SVNumElements =
11900 cast<FixedVectorType>(SV->getOperand(0)->getType())
11901 ->getNumElements();
11902 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11903 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11904 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11905 int NextIndex = 0;
11906 if (!all_of(Group, [&](Value *V) {
11907 assert(isa<ShuffleVectorInst>(V) &&
11908 "Not supported shufflevector usage.");
11909 auto *SV = cast<ShuffleVectorInst>(V);
11910 int Index;
11911 [[maybe_unused]] bool IsExtractSubvectorMask =
11912 SV->isExtractSubvectorMask(Index);
11913 assert(IsExtractSubvectorMask &&
11914 "Not supported shufflevector usage.");
11915 if (NextIndex != Index)
11916 return false;
11917 NextIndex += SV->getShuffleMask().size();
11918 return true;
11919 }))
11920 return ::getShuffleCost(
11922 calculateShufflevectorMask(E->Scalars));
11923 }
11924 return TTI::TCC_Free;
11925 });
11926 return GetCostDiff(GetScalarCost, GetVectorCost);
11927 }
11928 case Instruction::Freeze:
11929 return CommonCost;
11930 default:
11931 llvm_unreachable("Unknown instruction");
11932 }
11933}
11934
11935bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11936 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11937 << VectorizableTree.size() << " is fully vectorizable .\n");
11938
11939 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11941 return TE->isGather() &&
11942 !any_of(TE->Scalars,
11943 [this](Value *V) { return EphValues.contains(V); }) &&
11944 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11945 TE->Scalars.size() < Limit ||
11946 (((TE->hasState() &&
11947 TE->getOpcode() == Instruction::ExtractElement) ||
11948 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11949 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11950 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
11951 !TE->isAltShuffle()) ||
11952 any_of(TE->Scalars, IsaPred<LoadInst>));
11953 };
11954
11955 // We only handle trees of heights 1 and 2.
11956 if (VectorizableTree.size() == 1 &&
11957 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11958 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11959 (ForReduction &&
11960 AreVectorizableGathers(VectorizableTree[0].get(),
11961 VectorizableTree[0]->Scalars.size()) &&
11962 VectorizableTree[0]->getVectorFactor() > 2)))
11963 return true;
11964
11965 if (VectorizableTree.size() != 2)
11966 return false;
11967
11968 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11969 // whose second node is a gather with fewer scalar operands than the initial
11970 // tree element (it may be profitable to shuffle the second gather), or whose
11971 // scalars are extractelements that form a shuffle.
11973 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11974 AreVectorizableGathers(VectorizableTree[1].get(),
11975 VectorizableTree[0]->Scalars.size()))
11976 return true;
11977
11978 // Gathering cost would be too much for tiny trees.
11979 if (VectorizableTree[0]->isGather() ||
11980 (VectorizableTree[1]->isGather() &&
11981 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11982 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11983 return false;
11984
11985 return true;
11986}
11987
11988static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11990 bool MustMatchOrInst) {
11991 // Look past the root to find a source value. Arbitrarily follow the
11992 // path through operand 0 of any 'or'. Also, peek through optional
11993 // shift-left-by-multiple-of-8-bits.
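// For illustration, a typical candidate root looks like
//   ((zext i8 %b3 to i32) << 24) | ((zext i8 %b2 to i32) << 16) | ...
// where each byte comes from an adjacent load; the backend can usually turn
// such a sequence into a single wide load, so SLP vectorization would not pay
// off here.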
11994 Value *ZextLoad = Root;
11995 const APInt *ShAmtC;
11996 bool FoundOr = false;
11997 while (!isa<ConstantExpr>(ZextLoad) &&
11998 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
11999 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
12000 ShAmtC->urem(8) == 0))) {
12001 auto *BinOp = cast<BinaryOperator>(ZextLoad);
12002 ZextLoad = BinOp->getOperand(0);
12003 if (BinOp->getOpcode() == Instruction::Or)
12004 FoundOr = true;
12005 }
12006 // Check if the input is an extended load of the required or/shift expression.
12007 Value *Load;
12008 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
12009 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
12010 return false;
12011
12012 // Require that the total load bit width is a legal integer type.
12013 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
12014 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
12015 Type *SrcTy = Load->getType();
12016 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12017 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12018 return false;
12019
12020 // Everything matched - assume that we can fold the whole sequence using
12021 // load combining.
12022 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
12023 << *(cast<Instruction>(Root)) << "\n");
12024
12025 return true;
12026}
12027
12029 if (RdxKind != RecurKind::Or)
12030 return false;
12031
12032 unsigned NumElts = VectorizableTree[0]->Scalars.size();
12033 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12034 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
12035 /* MatchOr */ false);
12036}
12037
12039 // Peek through a final sequence of stores and check if all operations are
12040 // likely to be load-combined.
12041 unsigned NumElts = Stores.size();
12042 for (Value *Scalar : Stores) {
12043 Value *X;
12044 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
12045 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
12046 return false;
12047 }
12048 return true;
12049}
12050
12051bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
12052 if (!DebugCounter::shouldExecute(VectorizedGraphs))
12053 return true;
12054
12055 // Graph is empty - do nothing.
12056 if (VectorizableTree.empty()) {
12057 assert(ExternalUses.empty() && "We shouldn't have any external users");
12058
12059 return true;
12060 }
12061
12062 // No need to vectorize inserts of gathered values.
12063 if (VectorizableTree.size() == 2 &&
12064 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12065 VectorizableTree[1]->isGather() &&
12066 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12067 !(isSplat(VectorizableTree[1]->Scalars) ||
12068 allConstant(VectorizableTree[1]->Scalars))))
12069 return true;
12070
12071 // If the graph includes only PHI nodes and gathers, it is definitely not
12072 // profitable to vectorize, and we can skip it if the cost threshold is the
12073 // default. The cost of vectorized PHI nodes is almost always 0 plus the cost
12074 // of gathers/buildvectors.
12075 constexpr int Limit = 4;
12076 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12077 !VectorizableTree.empty() &&
12078 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12079 return (TE->isGather() &&
12080 (!TE->hasState() ||
12081 TE->getOpcode() != Instruction::ExtractElement) &&
12082 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12083 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
12084 }))
12085 return true;
12086
12087 // We can vectorize the tree if its size is greater than or equal to the
12088 // minimum size specified by the MinTreeSize command line option.
12089 if (VectorizableTree.size() >= MinTreeSize)
12090 return false;
12091
12092 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12093 // can vectorize it if we can prove it fully vectorizable.
12094 if (isFullyVectorizableTinyTree(ForReduction))
12095 return false;
12096
12097 // Check if any of the gather nodes forms an insertelement buildvector
12098 // somewhere.
12099 bool IsAllowedSingleBVNode =
12100 VectorizableTree.size() > 1 ||
12101 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12102 !VectorizableTree.front()->isAltShuffle() &&
12103 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12104 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12105 allSameBlock(VectorizableTree.front()->Scalars));
12106 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12107 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12108 return isa<ExtractElementInst, UndefValue>(V) ||
12109 (IsAllowedSingleBVNode &&
12110 !V->hasNUsesOrMore(UsesLimit) &&
12111 any_of(V->users(), IsaPred<InsertElementInst>));
12112 });
12113 }))
12114 return false;
12115
12116 if (VectorizableTree.back()->isGather() &&
12117 VectorizableTree.back()->hasState() &&
12118 VectorizableTree.back()->isAltShuffle() &&
12119 VectorizableTree.back()->getVectorFactor() > 2 &&
12120 allSameBlock(VectorizableTree.back()->Scalars) &&
12121 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12123 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12124 VectorizableTree.back()->getVectorFactor()),
12125 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12126 /*Insert=*/true, /*Extract=*/false,
12128 return false;
12129
12130 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12131 // vectorizable.
12132 return true;
12133}
12134
12135bool BoUpSLP::isTreeNotExtendable() const {
12136 if (getCanonicalGraphSize() != getTreeSize()) {
12137 constexpr unsigned SmallTree = 3;
12138 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12139 getCanonicalGraphSize() <= SmallTree &&
12140 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12141 [](const std::unique_ptr<TreeEntry> &TE) {
12142 return TE->isGather() && TE->hasState() &&
12143 TE->getOpcode() == Instruction::Load &&
12144 !allSameBlock(TE->Scalars);
12145 }) == 1)
12146 return true;
12147 return false;
12148 }
12149 bool Res = false;
12150 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12151 TreeEntry &E = *VectorizableTree[Idx];
12152 if (!E.isGather())
12153 continue;
12154 if (E.hasState() && E.getOpcode() != Instruction::Load)
12155 return false;
12156 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12157 continue;
12158 Res = true;
12159 }
12160 return Res;
12161}
12162
12163InstructionCost BoUpSLP::getSpillCost() const {
12164 // Walk from the bottom of the tree to the top, tracking which values are
12165 // live. When we see a call instruction that is not part of our tree,
12166 // query TTI to see if there is a cost to keeping values live over it
12167 // (for example, if spills and fills are required).
12168 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12169 InstructionCost Cost = 0;
12170
12171 SmallPtrSet<Instruction *, 4> LiveValues;
12172 Instruction *PrevInst = nullptr;
12173
12174 // The entries in VectorizableTree are not necessarily ordered by their
12175 // position in basic blocks. Collect them and order them by dominance so later
12176 // instructions are guaranteed to be visited first. For instructions in
12177 // different basic blocks, we only scan to the beginning of the block, so
12178 // their order does not matter, as long as all instructions in a basic block
12179 // are grouped together. Using dominance ensures a deterministic order.
12180 SmallVector<Instruction *, 16> OrderedScalars;
12181 for (const auto &TEPtr : VectorizableTree) {
12182 if (TEPtr->State != TreeEntry::Vectorize)
12183 continue;
12184 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12185 if (!Inst)
12186 continue;
12187 OrderedScalars.push_back(Inst);
12188 }
12189 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12190 auto *NodeA = DT->getNode(A->getParent());
12191 auto *NodeB = DT->getNode(B->getParent());
12192 assert(NodeA && "Should only process reachable instructions");
12193 assert(NodeB && "Should only process reachable instructions");
12194 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12195 "Different nodes should have different DFS numbers");
12196 if (NodeA != NodeB)
12197 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12198 return B->comesBefore(A);
12199 });
12200
12201 for (Instruction *Inst : OrderedScalars) {
12202 if (!PrevInst) {
12203 PrevInst = Inst;
12204 continue;
12205 }
12206
12207 // Update LiveValues.
12208 LiveValues.erase(PrevInst);
12209 for (auto &J : PrevInst->operands()) {
12210 if (isa<Instruction>(&*J) && isVectorized(&*J))
12211 LiveValues.insert(cast<Instruction>(&*J));
12212 }
12213
12214 LLVM_DEBUG({
12215 dbgs() << "SLP: #LV: " << LiveValues.size();
12216 for (auto *X : LiveValues)
12217 dbgs() << " " << X->getName();
12218 dbgs() << ", Looking at ";
12219 Inst->dump();
12220 });
12221
12222 // Now find the sequence of instructions between PrevInst and Inst.
12223 unsigned NumCalls = 0;
12224 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12225 PrevInstIt =
12226 PrevInst->getIterator().getReverse();
12227 while (InstIt != PrevInstIt) {
12228 if (PrevInstIt == PrevInst->getParent()->rend()) {
12229 PrevInstIt = Inst->getParent()->rbegin();
12230 continue;
12231 }
12232
12233 auto NoCallIntrinsic = [this](Instruction *I) {
12234 auto *II = dyn_cast<IntrinsicInst>(I);
12235 if (!II)
12236 return false;
12237 if (II->isAssumeLikeIntrinsic())
12238 return true;
12239 FastMathFlags FMF;
12241 for (auto &ArgOp : II->args())
12242 Tys.push_back(ArgOp->getType());
12243 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12244 FMF = FPMO->getFastMathFlags();
12245 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12246 FMF);
12247 InstructionCost IntrCost =
12250 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12251 return IntrCost < CallCost;
12252 };
12253
12254 // Debug information does not impact spill cost.
12255 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12256 &*PrevInstIt != PrevInst)
12257 NumCalls++;
12258
12259 ++PrevInstIt;
12260 }
12261
12262 if (NumCalls) {
12264 for (auto *II : LiveValues) {
12265 auto *ScalarTy = II->getType();
12266 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12267 ScalarTy = VectorTy->getElementType();
12268 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12269 }
12270 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12271 }
12272
12273 PrevInst = Inst;
12274 }
12275
12276 return Cost;
12277}
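
// Illustrative sketch (an addition for exposition, not part of the original
// file): getSpillCost() models values of a bundle staying live across a call.
// In the hypothetical example below the two adds form one SLP bundle, but the
// resulting <2 x double> register is live across the opaque call and may have
// to be spilled and refilled; the loop above charges
// TTI->getCostOfKeepingLiveOverCall() once per such call.
void spillCostExampleSink(double); // assumed external, opaque to the optimizer
[[maybe_unused]] static void spillCostExample(double *P, double *Q) {
  double A = P[0] + 1.0;   // lane 0 of the bundle
  double B = P[1] + 1.0;   // lane 1 of the bundle
  spillCostExampleSink(A); // call between the bundle's def and its last use
  Q[0] = A;                // bundle values consumed after the call
  Q[1] = B;
}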
12278
12279/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
12280/// the buildvector sequence.
12281static bool isFirstInsertElement(const InsertElementInst *IE1,
12282 const InsertElementInst *IE2) {
12283 if (IE1 == IE2)
12284 return false;
12285 const auto *I1 = IE1;
12286 const auto *I2 = IE2;
12287 const InsertElementInst *PrevI1;
12288 const InsertElementInst *PrevI2;
12289 unsigned Idx1 = *getElementIndex(IE1);
12290 unsigned Idx2 = *getElementIndex(IE2);
12291 do {
12292 if (I2 == IE1)
12293 return true;
12294 if (I1 == IE2)
12295 return false;
12296 PrevI1 = I1;
12297 PrevI2 = I2;
12298 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12299 getElementIndex(I1).value_or(Idx2) != Idx2)
12300 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12301 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12302 getElementIndex(I2).value_or(Idx1) != Idx1)
12303 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12304 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12305 llvm_unreachable("Two different buildvectors not expected.");
12306}
12307
12308namespace {
12309/// Returns the incoming Value * if the requested type is Value * too, or a
12310/// default-constructed value otherwise.
12311struct ValueSelect {
12312 template <typename U>
12313 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12314 return V;
12315 }
12316 template <typename U>
12317 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12318 return U();
12319 }
12320};
12321} // namespace
12322
12323/// Does the analysis of the provided shuffle masks and performs the requested
12324/// actions on the vectors with the given shuffle masks. It tries to do it in
12325/// several steps.
12326/// 1. If the Base vector is not an undef vector, resize the very first mask to
12327/// a common VF and perform the action for 2 input vectors (including the
12328/// non-undef Base). Other shuffle masks are combined with the result of the
12329/// first stage and processed as a shuffle of 2 elements.
12330/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12331/// the action only for 1 vector with the given mask, if it is not the identity
12332/// mask.
12333/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12334/// vectors, combining the masks properly between the steps.
12335template <typename T>
12336static T *performExtractsShuffleAction(
12337 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12338 function_ref<unsigned(T *)> GetVF,
12339 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12340 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
12341 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12342 SmallVector<int> Mask(ShuffleMask.begin()->second);
12343 auto VMIt = std::next(ShuffleMask.begin());
12344 T *Prev = nullptr;
12345 SmallBitVector UseMask =
12346 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12347 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12348 if (!IsBaseUndef.all()) {
12349 // Base is not undef, need to combine it with the next subvectors.
12350 std::pair<T *, bool> Res =
12351 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12352 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12353 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12354 if (Mask[Idx] == PoisonMaskElem)
12355 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12356 else
12357 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12358 }
12359 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
12360 assert((!V || GetVF(V) == Mask.size()) &&
12361 "Expected base vector of VF number of elements.");
12362 Prev = Action(Mask, {nullptr, Res.first});
12363 } else if (ShuffleMask.size() == 1) {
12364 // Base is undef and only 1 vector is shuffled - perform the action only for
12365 // single vector, if the mask is not the identity mask.
12366 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12367 /*ForSingleMask=*/true);
12368 if (Res.second)
12369 // Identity mask is found.
12370 Prev = Res.first;
12371 else
12372 Prev = Action(Mask, {ShuffleMask.begin()->first});
12373 } else {
12374 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
12375 // shuffles step by step, combining shuffle between the steps.
12376 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12377 unsigned Vec2VF = GetVF(VMIt->first);
12378 if (Vec1VF == Vec2VF) {
12379 // No need to resize the input vectors since they are of the same size, we
12380 // can shuffle them directly.
12381 ArrayRef<int> SecMask = VMIt->second;
12382 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12383 if (SecMask[I] != PoisonMaskElem) {
12384 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12385 Mask[I] = SecMask[I] + Vec1VF;
12386 }
12387 }
12388 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12389 } else {
12390 // Vectors of different sizes - resize and reshuffle.
12391 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12392 /*ForSingleMask=*/false);
12393 std::pair<T *, bool> Res2 =
12394 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12395 ArrayRef<int> SecMask = VMIt->second;
12396 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12397 if (Mask[I] != PoisonMaskElem) {
12398 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12399 if (Res1.second)
12400 Mask[I] = I;
12401 } else if (SecMask[I] != PoisonMaskElem) {
12402 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12403 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12404 }
12405 }
12406 Prev = Action(Mask, {Res1.first, Res2.first});
12407 }
12408 VMIt = std::next(VMIt);
12409 }
12410 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
12411 // Perform requested actions for the remaining masks/vectors.
12412 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12413 // Shuffle other input vectors, if any.
12414 std::pair<T *, bool> Res =
12415 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12416 ArrayRef<int> SecMask = VMIt->second;
12417 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12418 if (SecMask[I] != PoisonMaskElem) {
12419 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12420 "Multiple uses of scalars.");
12421 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12422 } else if (Mask[I] != PoisonMaskElem) {
12423 Mask[I] = I;
12424 }
12425 }
12426 Prev = Action(Mask, {Prev, Res.first});
12427 }
12428 return Prev;
12429}
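
// Illustrative sketch (an addition for exposition, not part of the original
// file): when two input vectors have the same VF, the Vec1VF == Vec2VF branch
// above merges two single-source masks into one two-source mask by offsetting
// the second mask's indices by VF, with PoisonMaskElem (-1) marking unused
// lanes. A hypothetical, self-contained version of that step:
[[maybe_unused]] static void combineEqualVFMasksExample() {
  constexpr int Poison = -1;
  constexpr int VF = 4;
  int Mask[VF] = {0, Poison, 3, Poison};    // lanes taken from vector 1
  int SecMask[VF] = {Poison, 1, Poison, 2}; // lanes taken from vector 2
  for (int I = 0; I < VF; ++I) {
    if (SecMask[I] != Poison) {
      assert(Mask[I] == Poison && "Multiple uses of scalars.");
      Mask[I] = SecMask[I] + VF; // Mask becomes {0, 5, 3, 6}
    }
  }
  (void)Mask;
}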
12430
12431namespace {
12432/// Data type for handling buildvector sequences with the reused scalars from
12433/// other tree entries.
12434template <typename T> struct ShuffledInsertData {
12435 /// List of insertelements to be replaced by shuffles.
12436 SmallVector<InsertElementInst *> InsertElements;
12437 /// The parent vectors and shuffle mask for the given list of inserts.
12439};
12440} // namespace
12441
12442InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12443 InstructionCost Cost = 0;
12444 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12445 << VectorizableTree.size() << ".\n");
12446
12447 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12448
12449 SmallPtrSet<Value *, 4> CheckedExtracts;
12450 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12451 TreeEntry &TE = *VectorizableTree[I];
12452 // No need to count the cost for combined entries; they are combined, so
12453 // just skip their cost.
12454 if (TE.State == TreeEntry::CombinedVectorize) {
12455 LLVM_DEBUG(
12456 dbgs() << "SLP: Skipping cost for combined node that starts with "
12457 << *TE.Scalars[0] << ".\n";
12458 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12459 continue;
12460 }
12461 if (TE.isGather() && TE.hasState()) {
12462 if (const TreeEntry *E =
12463 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
12464 E && E->getVectorFactor() == TE.getVectorFactor()) {
12465 // Some gather nodes might be absolutely the same as some vectorizable
12466 // nodes after reordering, need to handle it.
12467 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12468 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12469 << "SLP: Current total cost = " << Cost << "\n");
12470 continue;
12471 }
12472 }
12473
12474 // Exclude cost of gather loads nodes which are not used. These nodes were
12475 // built as part of the final attempt to vectorize gathered loads.
12476 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12477 "Expected gather nodes with users only.");
12478
12479 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12480 Cost += C;
12481 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12482 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12483 << "SLP: Current total cost = " << Cost << "\n");
12484 }
12485
12486 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12487 InstructionCost ExtractCost = 0;
12489 SmallVector<APInt> DemandedElts;
12490 SmallDenseSet<Value *, 4> UsedInserts;
12492 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12494 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12495 // Keep track of each {Scalar, Index, User} tuple.
12496 // On AArch64, this helps in fusing a mov instruction, associated with
12497 // extractelement, with fmul in the backend so that extractelement is free.
12499 for (ExternalUser &EU : ExternalUses) {
12500 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12501 }
12502 for (ExternalUser &EU : ExternalUses) {
12503 // Uses by ephemeral values are free (because the ephemeral value will be
12504 // removed prior to code generation, and so the extraction will be
12505 // removed as well).
12506 if (EphValues.count(EU.User))
12507 continue;
12508
12509 // Skip scalars whose user is in an unreachable block or an EH pad (rarely
12510 // executed), or in a block terminated with an unreachable instruction.
12511 if (BasicBlock *UserParent =
12512 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12513 UserParent &&
12514 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12515 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12516 continue;
12517
12518 // We only add extract cost once for the same scalar.
12519 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12520 !ExtractCostCalculated.insert(EU.Scalar).second)
12521 continue;
12522
12523 // No extract cost for vector "scalar"
12524 if (isa<FixedVectorType>(EU.Scalar->getType()))
12525 continue;
12526
12527 // If found user is an insertelement, do not calculate extract cost but try
12528 // to detect it as a final shuffled/identity match.
12529 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12530 VU && VU->getOperand(1) == EU.Scalar) {
12531 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12532 if (!UsedInserts.insert(VU).second)
12533 continue;
12534 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12535 if (InsertIdx) {
12536 const TreeEntry *ScalarTE = &EU.E;
12537 auto *It = find_if(
12538 ShuffledInserts,
12539 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12540 // Checks if 2 insertelements are from the same buildvector.
12541 InsertElementInst *VecInsert = Data.InsertElements.front();
12543 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12544 Value *Op0 = II->getOperand(0);
12545 if (isVectorized(II) && !isVectorized(Op0))
12546 return nullptr;
12547 return Op0;
12548 });
12549 });
12550 int VecId = -1;
12551 if (It == ShuffledInserts.end()) {
12552 auto &Data = ShuffledInserts.emplace_back();
12553 Data.InsertElements.emplace_back(VU);
12554 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12555 VecId = ShuffledInserts.size() - 1;
12556 auto It = MinBWs.find(ScalarTE);
12557 if (It != MinBWs.end() &&
12558 VectorCasts
12559 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12560 .second) {
12561 unsigned BWSz = It->second.first;
12562 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12563 unsigned VecOpcode;
12564 if (DstBWSz < BWSz)
12565 VecOpcode = Instruction::Trunc;
12566 else
12567 VecOpcode =
12568 It->second.second ? Instruction::SExt : Instruction::ZExt;
12571 VecOpcode, FTy,
12572 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12573 FTy->getNumElements()),
12575 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12576 << " for extending externally used vector with "
12577 "non-equal minimum bitwidth.\n");
12578 Cost += C;
12579 }
12580 } else {
12581 if (isFirstInsertElement(VU, It->InsertElements.front()))
12582 It->InsertElements.front() = VU;
12583 VecId = std::distance(ShuffledInserts.begin(), It);
12584 }
12585 int InIdx = *InsertIdx;
12586 SmallVectorImpl<int> &Mask =
12587 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12588 if (Mask.empty())
12589 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12590 Mask[InIdx] = EU.Lane;
12591 DemandedElts[VecId].setBit(InIdx);
12592 continue;
12593 }
12594 }
12595 }
12596
12598 // If we plan to rewrite the tree in a smaller type, we will need to sign
12599 // extend the extracted value back to the original type. Here, we account
12600 // for the extract and the added cost of the sign extend if needed.
12601 InstructionCost ExtraCost = TTI::TCC_Free;
12602 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12603 const TreeEntry *Entry = &EU.E;
12604 auto It = MinBWs.find(Entry);
12605 if (It != MinBWs.end()) {
12606 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12607 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12608 ? Instruction::ZExt
12609 : Instruction::SExt;
12610 VecTy = getWidenedType(MinTy, BundleWidth);
12611 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12612 VecTy, EU.Lane);
12613 } else {
12614 ExtraCost =
12615 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12616 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12617 }
12618 // Leave the scalar instructions as is if they are cheaper than extracts.
12619 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12620 Entry->getOpcode() == Instruction::Load) {
12621 // Checks if the user of the external scalar is phi in loop body.
12622 auto IsPhiInLoop = [&](const ExternalUser &U) {
12623 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12624 auto *I = cast<Instruction>(U.Scalar);
12625 const Loop *L = LI->getLoopFor(Phi->getParent());
12626 return L && (Phi->getParent() == I->getParent() ||
12627 L == LI->getLoopFor(I->getParent()));
12628 }
12629 return false;
12630 };
12631 if (!ValueToExtUses) {
12632 ValueToExtUses.emplace();
12633 for_each(enumerate(ExternalUses), [&](const auto &P) {
12634 // Ignore phis in loops.
12635 if (IsPhiInLoop(P.value()))
12636 return;
12637
12638 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12639 });
12640 }
12641 // We can use the original instruction if none of its operands are
12642 // vectorized or they are already marked as externally used.
12643 auto *Inst = cast<Instruction>(EU.Scalar);
12644 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12645 auto OperandIsScalar = [&](Value *V) {
12646 if (!isVectorized(V)) {
12647 // Some extractelements might be not vectorized, but
12648 // transformed into shuffle and removed from the function,
12649 // consider it here.
12650 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12651 return !EE->hasOneUse() || !MustGather.contains(EE);
12652 return true;
12653 }
12654 return ValueToExtUses->contains(V);
12655 };
12656 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12657 bool CanBeUsedAsScalarCast = false;
12658 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12659 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12660 Op && all_of(Op->operands(), OperandIsScalar)) {
12661 InstructionCost OpCost =
12662 (isVectorized(Op) && !ValueToExtUses->contains(Op))
12664 : 0;
12665 if (ScalarCost + OpCost <= ExtraCost) {
12666 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12667 ScalarCost += OpCost;
12668 }
12669 }
12670 }
12671 if (CanBeUsedAsScalar) {
12672 bool KeepScalar = ScalarCost <= ExtraCost;
12673 // Try to keep the original scalar if the user is a phi node from the same
12674 // block as the root phis currently being vectorized. This preserves better
12675 // ordering info for the PHIs being vectorized.
12676 bool IsProfitablePHIUser =
12677 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12678 VectorizableTree.front()->Scalars.size() > 2)) &&
12679 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12680 !Inst->hasNUsesOrMore(UsesLimit) &&
12681 none_of(Inst->users(),
12682 [&](User *U) {
12683 auto *PHIUser = dyn_cast<PHINode>(U);
12684 return (!PHIUser ||
12685 PHIUser->getParent() !=
12686 cast<Instruction>(
12687 VectorizableTree.front()->getMainOp())
12688 ->getParent()) &&
12689 !isVectorized(U);
12690 }) &&
12691 count_if(Entry->Scalars, [&](Value *V) {
12692 return ValueToExtUses->contains(V);
12693 }) <= 2;
12694 if (IsProfitablePHIUser) {
12695 KeepScalar = true;
12696 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12697 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12698 (!GatheredLoadsEntriesFirst.has_value() ||
12699 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12700 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12701 return ValueToExtUses->contains(V);
12702 });
12703 auto It = ExtractsCount.find(Entry);
12704 if (It != ExtractsCount.end()) {
12705 assert(ScalarUsesCount >= It->getSecond().size() &&
12706 "Expected total number of external uses not less than "
12707 "number of scalar uses.");
12708 ScalarUsesCount -= It->getSecond().size();
12709 }
12710 // Keep the original scalar if the number of externally used instructions
12711 // in the same entry is not a power of 2. It may help to do some extra
12712 // vectorization for now.
12713 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12714 }
12715 if (KeepScalar) {
12716 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12717 for_each(Inst->operands(), [&](Value *V) {
12718 auto It = ValueToExtUses->find(V);
12719 if (It != ValueToExtUses->end()) {
12720 // Replace all uses to avoid compiler crash.
12721 ExternalUses[It->second].User = nullptr;
12722 }
12723 });
12724 ExtraCost = ScalarCost;
12725 if (!IsPhiInLoop(EU))
12726 ExtractsCount[Entry].insert(Inst);
12727 if (CanBeUsedAsScalarCast) {
12728 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12729 // Update the users of the operands of the cast operand to avoid
12730 // compiler crash.
12731 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12732 for_each(IOp->operands(), [&](Value *V) {
12733 auto It = ValueToExtUses->find(V);
12734 if (It != ValueToExtUses->end()) {
12735 // Replace all uses to avoid compiler crash.
12736 ExternalUses[It->second].User = nullptr;
12737 }
12738 });
12739 }
12740 }
12741 }
12742 }
12743 }
12744
12745 ExtractCost += ExtraCost;
12746 }
12747 // Insert external uses for the operands of casts that are to be emitted as
12748 // scalars instead of extractelement.
12749 for (Value *V : ScalarOpsFromCasts) {
12750 ExternalUsesAsOriginalScalar.insert(V);
12751 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
12752 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
12753 TEs.front()->findLaneForValue(V));
12754 }
12755 }
12756 // Add reduced value cost, if resized.
12757 if (!VectorizedVals.empty()) {
12758 const TreeEntry &Root = *VectorizableTree.front();
12759 auto BWIt = MinBWs.find(&Root);
12760 if (BWIt != MinBWs.end()) {
12761 Type *DstTy = Root.Scalars.front()->getType();
12762 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12763 unsigned SrcSz =
12764 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12765 if (OriginalSz != SrcSz) {
12766 unsigned Opcode = Instruction::Trunc;
12767 if (OriginalSz > SrcSz)
12768 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12769 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12770 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12771 assert(SLPReVec && "Only supported by REVEC.");
12772 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12773 }
12774 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12777 }
12778 }
12779 }
12780
12781 InstructionCost SpillCost = getSpillCost();
12782 Cost += SpillCost + ExtractCost;
12783 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12784 bool) {
12785 InstructionCost C = 0;
12786 unsigned VF = Mask.size();
12787 unsigned VecVF = TE->getVectorFactor();
12788 if (VF != VecVF &&
12789 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12791 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12792 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12793 OrigMask.begin());
12795 getWidenedType(TE->getMainOp()->getType(), VecVF),
12796 OrigMask);
12797 LLVM_DEBUG(
12798 dbgs() << "SLP: Adding cost " << C
12799 << " for final shuffle of insertelement external users.\n";
12800 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12801 Cost += C;
12802 return std::make_pair(TE, true);
12803 }
12804 return std::make_pair(TE, false);
12805 };
12806 // Calculate the cost of the reshuffled vectors, if any.
12807 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12808 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12809 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12810 unsigned VF = 0;
12811 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12813 assert((TEs.size() == 1 || TEs.size() == 2) &&
12814 "Expected exactly 1 or 2 tree entries.");
12815 if (TEs.size() == 1) {
12816 if (VF == 0)
12817 VF = TEs.front()->getVectorFactor();
12818 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12819 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12820 !all_of(enumerate(Mask), [=](const auto &Data) {
12821 return Data.value() == PoisonMaskElem ||
12822 (Data.index() < VF &&
12823 static_cast<int>(Data.index()) == Data.value());
12824 })) {
12827 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12828 << " for final shuffle of insertelement "
12829 "external users.\n";
12830 TEs.front()->dump();
12831 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12832 Cost += C;
12833 }
12834 } else {
12835 if (VF == 0) {
12836 if (TEs.front() &&
12837 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12838 VF = TEs.front()->getVectorFactor();
12839 else
12840 VF = Mask.size();
12841 }
12842 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12845 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12846 << " for final shuffle of vector node and external "
12847 "insertelement users.\n";
12848 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12849 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12850 Cost += C;
12851 }
12852 VF = Mask.size();
12853 return TEs.back();
12854 };
12855 (void)performExtractsShuffleAction<const TreeEntry>(
12856 MutableArrayRef(Vector.data(), Vector.size()), Base,
12857 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12858 EstimateShufflesCost);
12860 cast<FixedVectorType>(
12861 ShuffledInserts[I].InsertElements.front()->getType()),
12862 DemandedElts[I],
12863 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12864 Cost -= InsertCost;
12865 }
12866
12867 // Add the cost for reduced value resize (if required).
12868 if (ReductionBitWidth != 0) {
12869 assert(UserIgnoreList && "Expected reduction tree.");
12870 const TreeEntry &E = *VectorizableTree.front();
12871 auto It = MinBWs.find(&E);
12872 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12873 unsigned SrcSize = It->second.first;
12874 unsigned DstSize = ReductionBitWidth;
12875 unsigned Opcode = Instruction::Trunc;
12876 if (SrcSize < DstSize) {
12877 bool IsArithmeticExtendedReduction =
12878 all_of(*UserIgnoreList, [](Value *V) {
12879 auto *I = cast<Instruction>(V);
12880 return is_contained({Instruction::Add, Instruction::FAdd,
12881 Instruction::Mul, Instruction::FMul,
12882 Instruction::And, Instruction::Or,
12883 Instruction::Xor},
12884 I->getOpcode());
12885 });
12886 if (IsArithmeticExtendedReduction)
12887 Opcode =
12888 Instruction::BitCast; // Handle it by getExtendedReductionCost
12889 else
12890 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12891 }
12892 if (Opcode != Instruction::BitCast) {
12893 auto *SrcVecTy =
12894 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12895 auto *DstVecTy =
12896 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12897 TTI::CastContextHint CCH = getCastContextHint(E);
12898 InstructionCost CastCost;
12899 switch (E.getOpcode()) {
12900 case Instruction::SExt:
12901 case Instruction::ZExt:
12902 case Instruction::Trunc: {
12903 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12904 CCH = getCastContextHint(*OpTE);
12905 break;
12906 }
12907 default:
12908 break;
12909 }
12910 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12912 Cost += CastCost;
12913 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12914 << " for final resize for reduction from " << SrcVecTy
12915 << " to " << DstVecTy << "\n";
12916 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12917 }
12918 }
12919 }
12920
12921#ifndef NDEBUG
12922 SmallString<256> Str;
12923 {
12925 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12926 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12927 << "SLP: Total Cost = " << Cost << ".\n";
12928 }
12929 LLVM_DEBUG(dbgs() << Str);
12930 if (ViewSLPTree)
12931 ViewGraph(this, "SLP" + F->getName(), false, Str);
12932#endif
12933
12934 return Cost;
12935}
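
// Illustrative sketch (an addition for exposition, not part of the original
// file): the value returned above is, roughly, the sum of the per-node vector
// costs plus the spill and external-extract costs, minus the insertion cost
// saved for insertelement sequences that become shuffles. The hypothetical
// helper below merely restates that accumulation; a negative total indicates
// expected profit and is what the caller compares against the SLP cost
// threshold.
[[maybe_unused]] static int treeCostSketch(int NodeCosts, int SpillCost,
                                           int ExtractCost,
                                           int SavedInsertCost) {
  return NodeCosts + SpillCost + ExtractCost - SavedInsertCost;
}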
12936
12937/// Tries to find extractelement instructions with constant indices from fixed
12938/// vector type and gather such instructions into a bunch, which is highly
12939/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
12940/// was successful, the matched scalars are replaced by poison values in \p VL
12941/// for future analysis.
12942std::optional<TTI::ShuffleKind>
12943BoUpSLP::tryToGatherSingleRegisterExtractElements(
12945 // Scan list of gathered scalars for extractelements that can be represented
12946 // as shuffles.
12948 SmallVector<int> UndefVectorExtracts;
12949 for (int I = 0, E = VL.size(); I < E; ++I) {
12950 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12951 if (!EI) {
12952 if (isa<UndefValue>(VL[I]))
12953 UndefVectorExtracts.push_back(I);
12954 continue;
12955 }
12956 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12957 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12958 continue;
12959 std::optional<unsigned> Idx = getExtractIndex(EI);
12960 // Undefined index.
12961 if (!Idx) {
12962 UndefVectorExtracts.push_back(I);
12963 continue;
12964 }
12965 if (Idx >= VecTy->getNumElements()) {
12966 UndefVectorExtracts.push_back(I);
12967 continue;
12968 }
12969 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12970 ExtractMask.reset(*Idx);
12971 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12972 UndefVectorExtracts.push_back(I);
12973 continue;
12974 }
12975 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12976 }
12977 // Sort the vector operands by the maximum number of uses in extractelements.
12979 VectorOpToIdx.takeVector();
12980 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12981 return P1.second.size() > P2.second.size();
12982 });
12983 // Find the best pair of the vectors or a single vector.
12984 const int UndefSz = UndefVectorExtracts.size();
12985 unsigned SingleMax = 0;
12986 unsigned PairMax = 0;
12987 if (!Vectors.empty()) {
12988 SingleMax = Vectors.front().second.size() + UndefSz;
12989 if (Vectors.size() > 1) {
12990 auto *ItNext = std::next(Vectors.begin());
12991 PairMax = SingleMax + ItNext->second.size();
12992 }
12993 }
12994 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12995 return std::nullopt;
12996 // Check whether it is better to perform a shuffle of 2 vectors or just of a
12997 // single vector.
12998 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
12999 SmallVector<Value *> GatheredExtracts(
13000 VL.size(), PoisonValue::get(VL.front()->getType()));
13001 if (SingleMax >= PairMax && SingleMax) {
13002 for (int Idx : Vectors.front().second)
13003 std::swap(GatheredExtracts[Idx], VL[Idx]);
13004 } else if (!Vectors.empty()) {
13005 for (unsigned Idx : {0, 1})
13006 for (int Idx : Vectors[Idx].second)
13007 std::swap(GatheredExtracts[Idx], VL[Idx]);
13008 }
13009 // Add extracts from undefs too.
13010 for (int Idx : UndefVectorExtracts)
13011 std::swap(GatheredExtracts[Idx], VL[Idx]);
13012 // Check that the gather of extractelements can be represented as just a
13013 // shuffle of one or two vectors from which the scalars are extracted.
13014 std::optional<TTI::ShuffleKind> Res =
13015 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13016 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
13017 // TODO: try to check other subsets if possible.
13018 // Restore the original VL if attempt was not successful.
13019 copy(SavedVL, VL.begin());
13020 return std::nullopt;
13021 }
13022 // Restore unused scalars from mask, if some of the extractelements were not
13023 // selected for shuffle.
13024 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
13025 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13026 isa<UndefValue>(GatheredExtracts[I])) {
13027 std::swap(VL[I], GatheredExtracts[I]);
13028 continue;
13029 }
13030 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13031 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13032 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13033 is_contained(UndefVectorExtracts, I))
13034 continue;
13035 }
13036 return Res;
13037}
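
// Illustrative sketch (an addition for exposition, not part of the original
// file): if the gathered scalars are, say, {v[2], v[0], v[3], undef} - all
// extractelements with constant indices from one fixed vector - the node can
// be emitted as a single shuffle of that vector with mask {2, 0, 3, poison}
// instead of a buildvector. A particularly cheap case is an identity mask,
// where the "gather" is really the source vector itself; the hypothetical
// helper below checks for that shape (-1 stands for PoisonMaskElem).
[[maybe_unused]] static bool isIdentityExtractMaskSketch(const int *Mask,
                                                         int NumElts) {
  for (int I = 0; I < NumElts; ++I)
    if (Mask[I] != -1 && Mask[I] != I)
      return false; // lane I comes from another position: a real shuffle
  return true;
}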
13038
13039/// Tries to find extractelement instructions with constant indices from fixed
13040/// vector type and gather such instructions into a bunch, which is highly
13041/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
13042/// was successful, the matched scalars are replaced by poison values in \p VL
13043/// for future analysis.
13045BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13047 unsigned NumParts) const {
13048 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
13049 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13050 Mask.assign(VL.size(), PoisonMaskElem);
13051 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13052 for (unsigned Part : seq<unsigned>(NumParts)) {
13053 // Scan list of gathered scalars for extractelements that can be represented
13054 // as shuffles.
13056 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13057 SmallVector<int> SubMask;
13058 std::optional<TTI::ShuffleKind> Res =
13059 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13060 ShufflesRes[Part] = Res;
13061 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13062 }
13063 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13064 return Res.has_value();
13065 }))
13066 ShufflesRes.clear();
13067 return ShufflesRes;
13068}
13069
13070std::optional<TargetTransformInfo::ShuffleKind>
13071BoUpSLP::isGatherShuffledSingleRegisterEntry(
13072 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13073 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13074 Entries.clear();
13075 // TODO: currently checking only for Scalars in the tree entry, need to count
13076 // reused elements too for better cost estimation.
13077 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13078 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13079 : TE->UserTreeIndices.front();
13080 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13081 const BasicBlock *TEInsertBlock = nullptr;
13082 // Main node of PHI entries keeps the correct order of operands/incoming
13083 // blocks.
13084 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13085 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13086 TEInsertPt = TEInsertBlock->getTerminator();
13087 } else {
13088 TEInsertBlock = TEInsertPt->getParent();
13089 }
13090 if (!DT->isReachableFromEntry(TEInsertBlock))
13091 return std::nullopt;
13092 auto *NodeUI = DT->getNode(TEInsertBlock);
13093 assert(NodeUI && "Should only process reachable instructions");
13094 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13095 auto CheckOrdering = [&](const Instruction *InsertPt) {
13096 // Argument InsertPt is an instruction where vector code for some other
13097 // tree entry (one that shares one or more scalars with TE) is going to be
13098 // generated. This lambda returns true if insertion point of vector code
13099 // for the TE dominates that point (otherwise dependency is the other way
13100 // around). The other node is not limited to be of a gather kind. Gather
13101 // nodes are not scheduled and their vector code is inserted before their
13102 // first user. If user is PHI, that is supposed to be at the end of a
13103 // predecessor block. Otherwise it is the last instruction among scalars of
13104 // the user node. So, instead of checking dependency between instructions
13105 // themselves, we check dependency between their insertion points for vector
13106 // code (since each scalar instruction ends up as a lane of a vector
13107 // instruction).
13108 const BasicBlock *InsertBlock = InsertPt->getParent();
13109 auto *NodeEUI = DT->getNode(InsertBlock);
13110 if (!NodeEUI)
13111 return false;
13112 assert((NodeUI == NodeEUI) ==
13113 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13114 "Different nodes should have different DFS numbers");
13115 // Check the order of the gather nodes users.
13116 if (TEInsertPt->getParent() != InsertBlock &&
13117 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13118 return false;
13119 if (TEInsertPt->getParent() == InsertBlock &&
13120 TEInsertPt->comesBefore(InsertPt))
13121 return false;
13122 return true;
13123 };
13124 // Find all tree entries used by the gathered values. If no common entries
13125 // are found - not a shuffle.
13126 // Here we build a set of tree nodes for each gathered value and try to
13127 // find the intersection between these sets. If we have at least one common
13128 // tree node for each gathered value - we have just a permutation of a
13129 // single vector. If we have 2 different sets, we're in a situation where we
13130 // have a permutation of 2 input vectors.
13132 DenseMap<Value *, int> UsedValuesEntry;
13133 for (Value *V : VL) {
13134 if (isConstant(V))
13135 continue;
13136 // Build a list of tree entries where V is used.
13138 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13139 if (TEPtr == TE || TEPtr->Idx == 0)
13140 continue;
13141 assert(any_of(TEPtr->Scalars,
13142 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13143 "Must contain at least single gathered value.");
13144 assert(TEPtr->UserTreeIndices.size() == 1 &&
13145 "Expected only single user of a gather node.");
13146 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13147
13148 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13149 const Instruction *InsertPt =
13150 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13151 : &getLastInstructionInBundle(UseEI.UserTE);
13152 if (TEInsertPt == InsertPt) {
13153 // If 2 gathers are operands of the same entry (regardless of whether
13154 // user is PHI or else), compare operands indices, use the earlier one
13155 // as the base.
13156 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13157 continue;
13158 // If the user instruction is used for some reason in different
13159 // vectorized nodes - make it depend on index.
13160 if (TEUseEI.UserTE != UseEI.UserTE &&
13161 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13162 continue;
13163 }
13164
13165 // Check if the user node of the TE comes after user node of TEPtr,
13166 // otherwise TEPtr depends on TE.
13167 if ((TEInsertBlock != InsertPt->getParent() ||
13168 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13169 !CheckOrdering(InsertPt))
13170 continue;
13171 VToTEs.insert(TEPtr);
13172 }
13173 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
13174 const TreeEntry *VTE = VTEs.front();
13175 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
13176 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
13177 VTEs = VTEs.drop_front();
13178 // Iterate through all vectorized nodes.
13179 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
13180 return MTE->State == TreeEntry::Vectorize;
13181 });
13182 if (MIt == VTEs.end())
13183 continue;
13184 VTE = *MIt;
13185 }
13186 if (none_of(TE->CombinedEntriesWithIndices,
13187 [&](const auto &P) { return P.first == VTE->Idx; })) {
13188 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13189 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13190 continue;
13191 }
13192 VToTEs.insert(VTE);
13193 }
13194 if (VToTEs.empty())
13195 continue;
13196 if (UsedTEs.empty()) {
13197 // The first iteration, just insert the list of nodes to vector.
13198 UsedTEs.push_back(VToTEs);
13199 UsedValuesEntry.try_emplace(V, 0);
13200 } else {
13201 // Need to check if there are any previously used tree nodes which use V.
13202 // If there are no such nodes, consider that we have another input
13203 // vector.
13204 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13205 unsigned Idx = 0;
13206 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13207 // Do we have a non-empty intersection of previously listed tree entries
13208 // and tree entries using current V?
13209 set_intersect(VToTEs, Set);
13210 if (!VToTEs.empty()) {
13211 // Yes, write the new subset and continue analysis for the next
13212 // scalar.
13213 Set.swap(VToTEs);
13214 break;
13215 }
13216 VToTEs = SavedVToTEs;
13217 ++Idx;
13218 }
13219 // No non-empty intersection found - need to add a second set of possible
13220 // source vectors.
13221 if (Idx == UsedTEs.size()) {
13222 // If the number of input vectors is greater than 2 - not a permutation,
13223 // fall back to the regular gather.
13224 // TODO: support multiple reshuffled nodes.
13225 if (UsedTEs.size() == 2)
13226 continue;
13227 UsedTEs.push_back(SavedVToTEs);
13228 Idx = UsedTEs.size() - 1;
13229 }
13230 UsedValuesEntry.try_emplace(V, Idx);
13231 }
13232 }
13233
13234 if (UsedTEs.empty()) {
13235 Entries.clear();
13236 return std::nullopt;
13237 }
13238
13239 unsigned VF = 0;
13240 if (UsedTEs.size() == 1) {
13241 // Keep the order to avoid non-determinism.
13242 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13243 UsedTEs.front().end());
13244 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13245 return TE1->Idx < TE2->Idx;
13246 });
13247 // Try to find the perfect match in another gather node at first.
13248 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13249 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13250 });
13251 if (It != FirstEntries.end() &&
13252 ((*It)->getVectorFactor() == VL.size() ||
13253 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13254 TE->ReuseShuffleIndices.size() == VL.size() &&
13255 (*It)->isSame(TE->Scalars)))) {
13256 Entries.push_back(*It);
13257 if ((*It)->getVectorFactor() == VL.size()) {
13258 std::iota(std::next(Mask.begin(), Part * VL.size()),
13259 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13260 } else {
13261 SmallVector<int> CommonMask = TE->getCommonMask();
13262 copy(CommonMask, Mask.begin());
13263 }
13264 // Clear undef scalars.
13265 for (unsigned I : seq<unsigned>(VL.size()))
13266 if (isa<PoisonValue>(VL[I]))
13267 Mask[Part * VL.size() + I] = PoisonMaskElem;
13269 }
13270 // No perfect match, just shuffle, so choose the first tree node from the
13271 // tree.
13272 Entries.push_back(FirstEntries.front());
13273 VF = FirstEntries.front()->getVectorFactor();
13274 } else {
13275 // Try to find nodes with the same vector factor.
13276 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13277 // Keep the order of tree nodes to avoid non-determinism.
13279 for (const TreeEntry *TE : UsedTEs.front()) {
13280 unsigned VF = TE->getVectorFactor();
13281 auto It = VFToTE.find(VF);
13282 if (It != VFToTE.end()) {
13283 if (It->second->Idx > TE->Idx)
13284 It->getSecond() = TE;
13285 continue;
13286 }
13287 VFToTE.try_emplace(VF, TE);
13288 }
13289 // Same, keep the order to avoid non-determinism.
13290 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13291 UsedTEs.back().end());
13292 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13293 return TE1->Idx < TE2->Idx;
13294 });
13295 for (const TreeEntry *TE : SecondEntries) {
13296 auto It = VFToTE.find(TE->getVectorFactor());
13297 if (It != VFToTE.end()) {
13298 VF = It->first;
13299 Entries.push_back(It->second);
13300 Entries.push_back(TE);
13301 break;
13302 }
13303 }
13304 // No 2 source vectors with the same vector factor - just choose 2 with max
13305 // index.
13306 if (Entries.empty()) {
13307 Entries.push_back(*llvm::max_element(
13308 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13309 return TE1->Idx < TE2->Idx;
13310 }));
13311 Entries.push_back(SecondEntries.front());
13312 VF = std::max(Entries.front()->getVectorFactor(),
13313 Entries.back()->getVectorFactor());
13314 } else {
13315 VF = Entries.front()->getVectorFactor();
13316 }
13317 }
13318
13319 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13320 // Checks if the 2 PHIs are compatible in terms of their likelihood of being
13321 // vectorized together.
13322 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13323 auto *PHI = cast<PHINode>(V);
13324 auto *PHI1 = cast<PHINode>(V1);
13325 // Check that all incoming values are compatible/from same parent (if they
13326 // are instructions).
13327 // The incoming values are compatible if they all are constants, or
13328 // instructions with the same/alternate opcodes from the same basic block.
13329 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13330 Value *In = PHI->getIncomingValue(I);
13331 Value *In1 = PHI1->getIncomingValue(I);
13332 if (isConstant(In) && isConstant(In1))
13333 continue;
13334 if (!getSameOpcode({In, In1}, *TLI))
13335 return false;
13336 if (cast<Instruction>(In)->getParent() !=
13337 cast<Instruction>(In1)->getParent())
13338 return false;
13339 }
13340 return true;
13341 };
13342 // Check if the value can be ignored during analysis for shuffled gathers.
13343 // We suppose it is better to ignore instructions which do not form splats,
13344 // are not vectorized/not extractelements (these instructions will be handled
13345 // by extractelement processing) or may form a vector node in the future.
13346 auto MightBeIgnored = [=](Value *V) {
13347 auto *I = dyn_cast<Instruction>(V);
13348 return I && !IsSplatOrUndefs && !isVectorized(I) &&
13350 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13351 };
13352 // Check that the neighbor instruction may form a full vector node with the
13353 // current instruction V. It is possible if they have the same/alternate
13354 // opcode and the same parent basic block.
13355 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13356 Value *V1 = VL[Idx];
13357 bool UsedInSameVTE = false;
13358 auto It = UsedValuesEntry.find(V1);
13359 if (It != UsedValuesEntry.end())
13360 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13361 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13362 getSameOpcode({V, V1}, *TLI) &&
13363 cast<Instruction>(V)->getParent() ==
13364 cast<Instruction>(V1)->getParent() &&
13365 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13366 };
13367 // Build a shuffle mask for better cost estimation and vector emission.
13368 SmallBitVector UsedIdxs(Entries.size());
13370 for (int I = 0, E = VL.size(); I < E; ++I) {
13371 Value *V = VL[I];
13372 auto It = UsedValuesEntry.find(V);
13373 if (It == UsedValuesEntry.end())
13374 continue;
13375 // Do not try to shuffle scalars if they are constants, or instructions
13376 // that can be vectorized as a result of a subsequent buildvector
13377 // vectorization.
13378 if (isConstant(V) || (MightBeIgnored(V) &&
13379 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13380 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13381 continue;
13382 unsigned Idx = It->second;
13383 EntryLanes.emplace_back(Idx, I);
13384 UsedIdxs.set(Idx);
13385 }
13386 // Iterate through all shuffled scalars and select entries, which can be used
13387 // for final shuffle.
13389 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13390 if (!UsedIdxs.test(I))
13391 continue;
13392 // Fix the entry number for the given scalar. If it is the first entry, set
13393 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
13394 // These indices are used when calculating final shuffle mask as the vector
13395 // offset.
13396 for (std::pair<unsigned, int> &Pair : EntryLanes)
13397 if (Pair.first == I)
13398 Pair.first = TempEntries.size();
13399 TempEntries.push_back(Entries[I]);
13400 }
13401 Entries.swap(TempEntries);
13402 if (EntryLanes.size() == Entries.size() &&
13403 !VL.equals(ArrayRef(TE->Scalars)
13404 .slice(Part * VL.size(),
13405 std::min<int>(VL.size(), TE->Scalars.size())))) {
13406 // We may have only 1 or 2 entries here. If the number of scalars is equal
13407 // to the number of entries, no need to do the analysis, it is not very
13408 // profitable. Since VL is not the same as TE->Scalars, it means we already
13409 // have some shuffles before. Cut off the non-profitable case.
13410 Entries.clear();
13411 return std::nullopt;
13412 }
13413 // Build the final mask, check for the identity shuffle, if possible.
13414 bool IsIdentity = Entries.size() == 1;
13415 // Pair.first is the offset to the vector, while Pair.second is the index of
13416 // scalar in the list.
13417 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13418 unsigned Idx = Part * VL.size() + Pair.second;
13419 Mask[Idx] =
13420 Pair.first * VF +
13421 (ForOrder ? std::distance(
13422 Entries[Pair.first]->Scalars.begin(),
13423 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13424 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13425 IsIdentity &= Mask[Idx] == Pair.second;
13426 }
13427 if (ForOrder || IsIdentity || Entries.empty()) {
13428 switch (Entries.size()) {
13429 case 1:
13430 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13432 break;
13433 case 2:
13434 if (EntryLanes.size() > 2 || VL.size() <= 2)
13436 break;
13437 default:
13438 break;
13439 }
13440 } else if (!isa<VectorType>(VL.front()->getType()) &&
13441 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13442 // Estimate the cost to check if a shuffle is more beneficial than a buildvector.
13443 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13444 std::next(Mask.begin(), (Part + 1) * VL.size()));
13445 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13446 for (int Idx : SubMask) {
13447 if (Idx == PoisonMaskElem)
13448 continue;
13449 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13450 MinElement = Idx;
13451 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13452 MaxElement = Idx;
13453 }
13454 assert(MaxElement >= 0 && MinElement >= 0 &&
13455 MaxElement % VF >= MinElement % VF &&
13456 "Expected at least single element.");
13457 unsigned NewVF = std::max<unsigned>(
13458 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13459 (MaxElement % VF) -
13460 (MinElement % VF) + 1));
13461 if (NewVF < VF) {
13462 for_each(SubMask, [&](int &Idx) {
13463 if (Idx == PoisonMaskElem)
13464 return;
13465 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13466 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13467 });
13468 } else {
13469 NewVF = VF;
13470 }
13471
13472 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13473 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
13474 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13475 auto GetShuffleCost = [&,
13476 &TTI = *TTI](ArrayRef<int> Mask,
13477 ArrayRef<const TreeEntry *> Entries,
13478 VectorType *VecTy) -> InstructionCost {
13479 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13480 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13481 Mask, Entries.front()->getInterleaveFactor()))
13482 return TTI::TCC_Free;
13483 return ::getShuffleCost(TTI,
13484 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13485 : TTI::SK_PermuteSingleSrc,
13486 VecTy, Mask, CostKind);
13487 };
13488 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13489 InstructionCost FirstShuffleCost = 0;
13490 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13491 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13492 FirstShuffleCost = ShuffleCost;
13493 } else {
13494 // Transform mask to include only first entry.
13495 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13496 bool IsIdentity = true;
13497 for (auto [I, Idx] : enumerate(FirstMask)) {
13498 if (Idx >= static_cast<int>(NewVF)) {
13499 Idx = PoisonMaskElem;
13500 } else {
13501 DemandedElts.clearBit(I);
13502 if (Idx != PoisonMaskElem)
13503 IsIdentity &= static_cast<int>(I) == Idx;
13504 }
13505 }
13506 if (!IsIdentity)
13507 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13508 FirstShuffleCost += TTI->getScalarizationOverhead(
13509 MaskVecTy, DemandedElts, /*Insert=*/true,
13510 /*Extract=*/false, CostKind);
13511 }
13512 InstructionCost SecondShuffleCost = 0;
13513 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13514 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13515 SecondShuffleCost = ShuffleCost;
13516 } else {
13517 // Transform the mask to include only the second entry.
13518 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13519 bool IsIdentity = true;
13520 for (auto [I, Idx] : enumerate(SecondMask)) {
13521 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
13522 Idx = PoisonMaskElem;
13523 } else {
13524 DemandedElts.clearBit(I);
13525 if (Idx != PoisonMaskElem) {
13526 Idx -= NewVF;
13527 IsIdentity &= static_cast<int>(I) == Idx;
13528 }
13529 }
13530 }
13531 if (!IsIdentity)
13532 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13533 SecondShuffleCost += TTI->getScalarizationOverhead(
13534 MaskVecTy, DemandedElts, /*Insert=*/true,
13535 /*Extract=*/false, CostKind);
13536 }
13537 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13538 for (auto [I, Idx] : enumerate(SubMask))
13539 if (Idx == PoisonMaskElem)
13540 DemandedElts.clearBit(I);
13541 InstructionCost BuildVectorCost =
13542 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13543 /*Extract=*/false, CostKind);
13544 const TreeEntry *BestEntry = nullptr;
13545 if (FirstShuffleCost < ShuffleCost) {
13546 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13547 std::next(Mask.begin(), (Part + 1) * VL.size()),
13548 [&](int &Idx) {
13549 if (Idx >= static_cast<int>(VF))
13550 Idx = PoisonMaskElem;
13551 });
13552 BestEntry = Entries.front();
13553 ShuffleCost = FirstShuffleCost;
13554 }
13555 if (SecondShuffleCost < ShuffleCost) {
13556 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13557 std::next(Mask.begin(), (Part + 1) * VL.size()),
13558 [&](int &Idx) {
13559 if (Idx < static_cast<int>(VF))
13560 Idx = PoisonMaskElem;
13561 else
13562 Idx -= VF;
13563 });
13564 BestEntry = Entries[1];
13565 ShuffleCost = SecondShuffleCost;
13566 }
13567 if (BuildVectorCost >= ShuffleCost) {
13568 if (BestEntry) {
13569 Entries.clear();
13570 Entries.push_back(BestEntry);
13571 }
13572 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13573 : TargetTransformInfo::SK_PermuteSingleSrc;
13574 }
13575 }
13576 Entries.clear();
13577 // Clear the corresponding mask elements.
13578 std::fill(std::next(Mask.begin(), Part * VL.size()),
13579 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13580 return std::nullopt;
13581}
13582
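// Editorial aside (standalone C++ sketch, not part of SLPVectorizer.cpp): the
// mask encoding used in isGatherShuffledSingleRegisterEntry above. In a
// two-source shuffle of width VF, an element taken from entry Pair.first at
// lane L is encoded as Pair.first * VF + L, and the shuffle is an identity
// only if every mask slot equals its own index. The EntryLanes values below
// are invented for illustration, and the lane is assumed to equal the output
// position (a simplification of findLaneForValue()).
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  const unsigned VF = 4;
  // (entry index, output position) pairs selected for the final shuffle.
  std::vector<std::pair<unsigned, int>> EntryLanes = {
      {0, 0}, {1, 1}, {0, 2}, {1, 3}};
  std::vector<int> Mask(VF, -1); // -1 stands in for PoisonMaskElem.
  bool IsIdentity = true;
  for (const auto &Pair : EntryLanes) {
    Mask[Pair.second] = Pair.first * VF + Pair.second;
    IsIdentity &= Mask[Pair.second] == Pair.second;
  }
  for (int M : Mask)
    std::printf("%d ", M); // Prints: 0 5 2 7
  std::printf("identity=%d\n", IsIdentity); // identity=0: lanes 1 and 3 come
                                            // from the second source vector.
  return 0;
}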
13583 SmallVector<std::optional<TTI::ShuffleKind>>
13584 BoUpSLP::isGatherShuffledEntry(
13585 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13586 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13587 bool ForOrder) {
13588 assert(NumParts > 0 && NumParts < VL.size() &&
13589 "Expected positive number of registers.");
13590 Entries.clear();
13591 // No need to check for the topmost gather node.
13592 if (TE == VectorizableTree.front().get() &&
13593 (!GatheredLoadsEntriesFirst.has_value() ||
13594 none_of(ArrayRef(VectorizableTree).drop_front(),
13595 [](const std::unique_ptr<TreeEntry> &TE) {
13596 return !TE->isGather();
13597 })))
13598 return {};
13599 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13600 // implemented yet.
13601 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
13602 return {};
13603 Mask.assign(VL.size(), PoisonMaskElem);
13604 assert((TE->UserTreeIndices.size() == 1 ||
13605 TE == VectorizableTree.front().get()) &&
13606 "Expected only single user of the gather node.");
13607 assert(VL.size() % NumParts == 0 &&
13608 "Number of scalars must be divisible by NumParts.");
13609 if (!TE->UserTreeIndices.empty() &&
13610 TE->UserTreeIndices.front().UserTE->isGather() &&
13611 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13612 assert(
13613 (TE->Idx == 0 ||
13614 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
13615 isSplat(TE->Scalars)) &&
13616 "Expected splat or extractelements only node.");
13617 return {};
13618 }
13619 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13620 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13621 for (unsigned Part : seq<unsigned>(NumParts)) {
13622 ArrayRef<Value *> SubVL =
13623 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13624 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13625 std::optional<TTI::ShuffleKind> SubRes =
13626 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13627 ForOrder);
13628 if (!SubRes)
13629 SubEntries.clear();
13630 Res.push_back(SubRes);
13631 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13632 SubEntries.front()->getVectorFactor() == VL.size() &&
13633 (SubEntries.front()->isSame(TE->Scalars) ||
13634 SubEntries.front()->isSame(VL))) {
13635 SmallVector<const TreeEntry *> LocalSubEntries;
13636 LocalSubEntries.swap(SubEntries);
13637 Entries.clear();
13638 Res.clear();
13639 std::iota(Mask.begin(), Mask.end(), 0);
13640 // Clear undef scalars.
13641 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13642 if (isa<PoisonValue>(VL[I]))
13643 Mask[I] = PoisonMaskElem;
13644 Entries.emplace_back(1, LocalSubEntries.front());
13645 Res.push_back(TTI::SK_PermuteSingleSrc);
13646 return Res;
13647 }
13648 }
13649 if (all_of(Res,
13650 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13651 Entries.clear();
13652 return {};
13653 }
13654 return Res;
13655}
13656
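// Editorial aside (standalone sketch, not part of SLPVectorizer.cpp): how a
// gathered scalar list is split into NumParts register-sized slices, mirroring
// the SliceSize/getNumElems pattern used by isGatherShuffledEntry above. The
// real getPartNumElems helper may additionally round the slice size; the ceil
// division below is a simplification, and the std::min clamp only matters when
// the final slice would be narrower than the others.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> VL(8);      // Eight scalars to gather.
  const unsigned NumParts = 2; // Assume they need two vector registers.
  const unsigned SliceSize =
      (VL.size() + NumParts - 1) / NumParts; // Ceil division: 4.
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    unsigned Begin = Part * SliceSize;
    unsigned Limit = std::min<unsigned>(SliceSize, VL.size() - Begin);
    // Each slice [Begin, Begin + Limit) is analyzed independently and
    // contributes its own sub-mask and sub-entries.
    std::printf("part %u: scalars [%u, %u)\n", Part, Begin, Begin + Limit);
  }
  return 0;
}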
13657InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13658 Type *ScalarTy) const {
13659 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13660 bool DuplicateNonConst = false;
13661 // Find the cost of inserting/extracting values from the vector.
13662 // Check if the same elements are inserted several times and count them as
13663 // shuffle candidates.
13664 APInt ShuffledElements = APInt::getZero(VL.size());
13665 DenseMap<Value *, unsigned> UniqueElements;
13666 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13667 InstructionCost Cost;
13668 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13669 if (V->getType() != ScalarTy) {
13670 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13671 TTI::CastContextHint::None, CostKind);
13672 V = nullptr;
13673 }
13674 if (!ForPoisonSrc)
13675 Cost +=
13676 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13677 I, Constant::getNullValue(VecTy), V);
13678 };
13679 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13680 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13681 Value *V = VL[I];
13682 // No need to shuffle duplicates for constants.
13683 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13684 ShuffledElements.setBit(I);
13685 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13686 continue;
13687 }
13688
13689 auto Res = UniqueElements.try_emplace(V, I);
13690 if (Res.second) {
13691 EstimateInsertCost(I, V);
13692 ShuffleMask[I] = I;
13693 continue;
13694 }
13695
13696 DuplicateNonConst = true;
13697 ShuffledElements.setBit(I);
13698 ShuffleMask[I] = Res.first->second;
13699 }
13700 if (ForPoisonSrc) {
13701 if (isa<FixedVectorType>(ScalarTy)) {
13702 assert(SLPReVec && "Only supported by REVEC.");
13703 // We don't need to insert elements one by one. Instead, we can insert the
13704 // entire vector into the destination.
13705 Cost = 0;
13706 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13707 for (unsigned I : seq<unsigned>(VL.size()))
13708 if (!ShuffledElements[I])
13709 Cost += TTI->getShuffleCost(
13710 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13711 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13712 } else {
13713 Cost = TTI->getScalarizationOverhead(VecTy,
13714 /*DemandedElts*/ ~ShuffledElements,
13715 /*Insert*/ true,
13716 /*Extract*/ false, CostKind, VL);
13717 }
13718 }
13719 if (DuplicateNonConst)
13720 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13721 VecTy, ShuffleMask);
13722 return Cost;
13723}
13724
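// Editorial aside (standalone sketch, not part of SLPVectorizer.cpp): the
// deduplication performed by getGatherCost above. Each first occurrence of a
// scalar keeps its own lane, while repeats are redirected to the lane of the
// first occurrence, so a single permute of the deduplicated vector can
// reproduce the full gather. The integer scalars below are invented stand-ins
// for IR values.
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<int> VL = {7, 3, 7, 9, 3, 7}; // Scalars with duplicates.
  std::vector<int> ShuffleMask(VL.size(), -1);
  std::unordered_map<int, unsigned> UniqueElements;
  bool DuplicateNonConst = false;
  for (unsigned I = 0; I < VL.size(); ++I) {
    auto Res = UniqueElements.try_emplace(VL[I], I);
    if (Res.second) {
      ShuffleMask[I] = I;       // First occurrence: keep its own lane.
    } else {
      DuplicateNonConst = true; // Repeat: reuse the earlier lane.
      ShuffleMask[I] = Res.first->second;
    }
  }
  for (int M : ShuffleMask)
    std::printf("%d ", M); // Prints: 0 1 0 3 1 0
  std::printf("needs-permute=%d\n", DuplicateNonConst);
  return 0;
}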
13725Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13726 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13727 if (Res)
13728 return *Res;
13729 // Get the basic block this bundle is in. All instructions in the bundle
13730 // should be in this block (except for extractelement-like instructions with
13731 // constant indices or gathered loads).
13732 auto *Front = E->getMainOp();
13733 auto *BB = Front->getParent();
13734 assert(((GatheredLoadsEntriesFirst.has_value() &&
13735 E->getOpcode() == Instruction::Load && E->isGather() &&
13736 E->Idx < *GatheredLoadsEntriesFirst) ||
13737 all_of(E->Scalars,
13738 [=](Value *V) -> bool {
13739 if (E->getOpcode() == Instruction::GetElementPtr &&
13740 !isa<GetElementPtrInst>(V))
13741 return true;
13742 auto *I = dyn_cast<Instruction>(V);
13743 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13744 isVectorLikeInstWithConstOps(I);
13745 })) &&
13746 "Expected gathered loads or GEPs or instructions from same basic "
13747 "block.");
13748
13749 auto FindLastInst = [&]() {
13750 Instruction *LastInst = Front;
13751 for (Value *V : E->Scalars) {
13752 auto *I = dyn_cast<Instruction>(V);
13753 if (!I)
13754 continue;
13755 if (LastInst->getParent() == I->getParent()) {
13756 if (LastInst->comesBefore(I))
13757 LastInst = I;
13758 continue;
13759 }
13760 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13761 !isa<GetElementPtrInst>(I)) ||
13762 (isVectorLikeInstWithConstOps(LastInst) &&
13763 isVectorLikeInstWithConstOps(I)) ||
13764 (GatheredLoadsEntriesFirst.has_value() &&
13765 E->getOpcode() == Instruction::Load && E->isGather() &&
13766 E->Idx < *GatheredLoadsEntriesFirst)) &&
13767 "Expected vector-like or non-GEP in GEP node insts only.");
13768 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13769 LastInst = I;
13770 continue;
13771 }
13772 if (!DT->isReachableFromEntry(I->getParent()))
13773 continue;
13774 auto *NodeA = DT->getNode(LastInst->getParent());
13775 auto *NodeB = DT->getNode(I->getParent());
13776 assert(NodeA && "Should only process reachable instructions");
13777 assert(NodeB && "Should only process reachable instructions");
13778 assert((NodeA == NodeB) ==
13779 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13780 "Different nodes should have different DFS numbers");
13781 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13782 LastInst = I;
13783 }
13784 BB = LastInst->getParent();
13785 return LastInst;
13786 };
13787
13788 auto FindFirstInst = [&]() {
13789 Instruction *FirstInst = Front;
13790 for (Value *V : E->Scalars) {
13791 auto *I = dyn_cast<Instruction>(V);
13792 if (!I)
13793 continue;
13794 if (FirstInst->getParent() == I->getParent()) {
13795 if (I->comesBefore(FirstInst))
13796 FirstInst = I;
13797 continue;
13798 }
13799 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13800 !isa<GetElementPtrInst>(I)) ||
13801 (isVectorLikeInstWithConstOps(FirstInst) &&
13802 isVectorLikeInstWithConstOps(I))) &&
13803 "Expected vector-like or non-GEP in GEP node insts only.");
13804 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13805 FirstInst = I;
13806 continue;
13807 }
13808 if (!DT->isReachableFromEntry(I->getParent()))
13809 continue;
13810 auto *NodeA = DT->getNode(FirstInst->getParent());
13811 auto *NodeB = DT->getNode(I->getParent());
13812 assert(NodeA && "Should only process reachable instructions");
13813 assert(NodeB && "Should only process reachable instructions");
13814 assert((NodeA == NodeB) ==
13815 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13816 "Different nodes should have different DFS numbers");
13817 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13818 FirstInst = I;
13819 }
13820 return FirstInst;
13821 };
13822
13823 // Set insertpoint for gathered loads to the very first load.
13824 if (GatheredLoadsEntriesFirst.has_value() &&
13825 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13826 E->getOpcode() == Instruction::Load) {
13827 Res = FindFirstInst();
13828 return *Res;
13829 }
13830
13831 // Set the insert point to the beginning of the basic block if the entry
13832 // should not be scheduled.
13833 if (doesNotNeedToSchedule(E->Scalars) ||
13834 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13835 if ((E->getOpcode() == Instruction::GetElementPtr &&
13836 any_of(E->Scalars,
13837 [](Value *V) {
13838 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13839 })) ||
13840 all_of(E->Scalars,
13841 [](Value *V) {
13842 return isa<PoisonValue>(V) ||
13843 (!isVectorLikeInstWithConstOps(V) &&
13844 isUsedOutsideBlock(V));
13845 }) ||
13846 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13847 return isa<ExtractElementInst, UndefValue>(V) ||
13848 areAllOperandsNonInsts(V);
13849 })))
13850 Res = FindLastInst();
13851 else
13852 Res = FindFirstInst();
13853 return *Res;
13854 }
13855
13856 // Find the last instruction. The common case should be that BB has been
13857 // scheduled, and the last instruction is VL.back(). So we start with
13858 // VL.back() and iterate over schedule data until we reach the end of the
13859 // bundle. The end of the bundle is marked by null ScheduleData.
13860 if (BlocksSchedules.count(BB) && !E->isGather()) {
13861 Value *V = E->isOneOf(E->Scalars.back());
13862 if (doesNotNeedToBeScheduled(V))
13863 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13864 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13865 if (Bundle && Bundle->isPartOfBundle())
13866 for (; Bundle; Bundle = Bundle->NextInBundle)
13867 Res = Bundle->Inst;
13868 }
13869
13870 // LastInst can still be null at this point if there's either not an entry
13871 // for BB in BlocksSchedules or there's no ScheduleData available for
13872 // VL.back(). This can be the case if buildTree_rec aborts for various
13873 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13874 // size is reached, etc.). ScheduleData is initialized in the scheduling
13875 // "dry-run".
13876 //
13877 // If this happens, we can still find the last instruction by brute force. We
13878 // iterate forwards from Front (inclusive) until we either see all
13879 // instructions in the bundle or reach the end of the block. If Front is the
13880 // last instruction in program order, LastInst will be set to Front, and we
13881 // will visit all the remaining instructions in the block.
13882 //
13883 // One of the reasons we exit early from buildTree_rec is to place an upper
13884 // bound on compile-time. Thus, taking an additional compile-time hit here is
13885 // not ideal. However, this should be exceedingly rare since it requires that
13886 // we both exit early from buildTree_rec and that the bundle be out-of-order
13887 // (causing us to iterate all the way to the end of the block).
13888 if (!Res)
13889 Res = FindLastInst();
13890 assert(Res && "Failed to find last instruction in bundle");
13891 return *Res;
13892}
13893
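// Editorial aside (standalone sketch, not part of SLPVectorizer.cpp): the
// selection rule used by FindLastInst above. Instructions are compared by the
// dominator-tree DFS-in number of their parent block, and by program order
// only when the block is the same; the DFS numbers below are hypothetical
// stand-ins for DT->getNode(BB)->getDFSNumIn().
#include <cstdio>
#include <vector>

struct InstInfo {
  unsigned BlockDFSIn; // DFS-in number of the parent block.
  unsigned PosInBlock; // Position within that block (program order).
  const char *Name;
};

int main() {
  std::vector<InstInfo> Bundle = {
      {1, 5, "a"}, {3, 0, "b"}, {3, 2, "c"}, {2, 7, "d"}};
  const InstInfo *Last = &Bundle.front();
  for (const InstInfo &I : Bundle)
    if (I.BlockDFSIn > Last->BlockDFSIn ||
        (I.BlockDFSIn == Last->BlockDFSIn && I.PosInBlock > Last->PosInBlock))
      Last = &I;
  std::printf("last = %s\n", Last->Name); // Prints: last = c
  return 0;
}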
13894void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13895 auto *Front = E->getMainOp();
13896 Instruction *LastInst = &getLastInstructionInBundle(E);
13897 assert(LastInst && "Failed to find last instruction in bundle");
13898 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13899 // If the instruction is PHI, set the insert point after all the PHIs.
13900 bool IsPHI = isa<PHINode>(LastInst);
13901 if (IsPHI)
13902 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13903 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13904 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13905 } else {
13906 // Set the insertion point after the last instruction in the bundle. Set the
13907 // debug location to Front.
13908 Builder.SetInsertPoint(
13909 LastInst->getParent(),
13911 }
13912 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13913}
13914
13915Value *BoUpSLP::gather(
13916 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13917 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13918 // List of instructions/lanes from current block and/or the blocks which are
13919 // part of the current loop. These instructions will be inserted at the end to
13920 // make it possible to optimize loops and hoist invariant instructions out of
13921 // the loop's body with better chances of success.
13922 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
13923 SmallSet<int, 4> PostponedIndices;
13924 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13925 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13926 SmallPtrSet<BasicBlock *, 4> Visited;
13927 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13928 InsertBB = InsertBB->getSinglePredecessor();
13929 return InsertBB && InsertBB == InstBB;
13930 };
13931 for (int I = 0, E = VL.size(); I < E; ++I) {
13932 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13933 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13934 isVectorized(Inst) ||
13935 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13936 PostponedIndices.insert(I).second)
13937 PostponedInsts.emplace_back(Inst, I);
13938 }
13939
13940 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13941 Type *Ty) {
13942 Value *Scalar = V;
13943 if (Scalar->getType() != Ty) {
13944 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13945 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13946 Value *V = Scalar;
13947 if (auto *CI = dyn_cast<CastInst>(Scalar);
13948 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13949 Value *Op = CI->getOperand(0);
13950 if (auto *IOp = dyn_cast<Instruction>(Op);
13951 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
13952 V = Op;
13953 }
13954 Scalar = Builder.CreateIntCast(
13955 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13956 }
13957
13958 Instruction *InsElt;
13959 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13960 assert(SLPReVec && "FixedVectorType is not expected.");
13961 Vec =
13962 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
13963 auto *II = dyn_cast<IntrinsicInst>(Vec);
13964 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13965 return Vec;
13966 InsElt = II;
13967 } else {
13968 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13969 InsElt = dyn_cast<InsertElementInst>(Vec);
13970 if (!InsElt)
13971 return Vec;
13972 }
13973 GatherShuffleExtractSeq.insert(InsElt);
13974 CSEBlocks.insert(InsElt->getParent());
13975 // Add to our 'need-to-extract' list.
13976 if (isa<Instruction>(V)) {
13977 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
13978 // Find which lane we need to extract.
13979 User *UserOp = nullptr;
13980 if (Scalar != V) {
13981 if (auto *SI = dyn_cast<Instruction>(Scalar))
13982 UserOp = SI;
13983 } else {
13984 UserOp = InsElt;
13985 }
13986 if (UserOp) {
13987 unsigned FoundLane = Entries.front()->findLaneForValue(V);
13988 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
13989 }
13990 }
13991 }
13992 return Vec;
13993 };
13994 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13995 Value *Vec = PoisonValue::get(VecTy);
13996 SmallVector<int> NonConsts;
13997 SmallVector<int> Mask(VL.size());
13998 std::iota(Mask.begin(), Mask.end(), 0);
13999 Value *OriginalRoot = Root;
14000 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14001 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14002 SV->getOperand(0)->getType() == VecTy) {
14003 Root = SV->getOperand(0);
14004 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14005 }
14006 // Insert constant values first.
14007 for (int I = 0, E = VL.size(); I < E; ++I) {
14008 if (PostponedIndices.contains(I))
14009 continue;
14010 if (!isConstant(VL[I])) {
14011 NonConsts.push_back(I);
14012 continue;
14013 }
14014 if (isa<PoisonValue>(VL[I]))
14015 continue;
14016 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14017 Mask[I] = I + E;
14018 }
14019 if (Root) {
14020 if (isa<PoisonValue>(Vec)) {
14021 Vec = OriginalRoot;
14022 } else {
14023 Vec = CreateShuffle(Root, Vec, Mask);
14024 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14025 OI && OI->hasNUses(0) &&
14026 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14027 return TE->VectorizedValue == OI;
14028 }))
14029 eraseInstruction(OI);
14030 }
14031 }
14032 // Insert non-constant values.
14033 for (int I : NonConsts)
14034 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14035 // Append instructions which are or may be part of the loop at the end, to
14036 // make it possible to hoist non-loop-based instructions.
14037 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14038 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14039
14040 return Vec;
14041}
14042
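// Editorial aside (standalone sketch, not part of SLPVectorizer.cpp): the
// blend mask built by gather() above. For a two-source shufflevector of width
// E, mask index I selects lane I of the first operand and index I + E selects
// lane I of the second, so setting Mask[I] = I + E wherever a constant was
// materialized takes that lane from the freshly built constant vector and
// leaves the remaining lanes coming from Root. IsConstLane is invented for
// the example.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int E = 4; // Vector width.
  std::vector<int> Mask(E);
  std::iota(Mask.begin(), Mask.end(), 0); // Start as an identity over Root.
  std::vector<bool> IsConstLane = {false, true, false, true};
  for (int I = 0; I < E; ++I)
    if (IsConstLane[I])
      Mask[I] = I + E; // Take this lane from the constant vector instead.
  for (int M : Mask)
    std::printf("%d ", M); // Prints: 0 5 2 7
  std::printf("\n");
  return 0;
}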
14043/// Merges shuffle masks and emits final shuffle instruction, if required. It
14044 /// supports shuffling of 2 input vectors. It implements lazy emission of
14045 /// shuffles: the actual shuffle instruction is generated only if it is actually
14046/// required. Otherwise, the shuffle instruction emission is delayed till the
14047/// end of the process, to reduce the number of emitted instructions and further
14048/// analysis/transformations.
14049 /// The class will also look through previously emitted shuffle instructions
14050 /// and properly mark indices in the mask as undef.
14051/// For example, given the code
14052/// \code
14053/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14054/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14055/// \endcode
14056 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14057/// look through %s1 and %s2 and emit
14058/// \code
14059/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14060/// \endcode
14061/// instead.
14062/// If 2 operands are of different size, the smallest one will be resized and
14063/// the mask recalculated properly.
14064/// For example, given the code
14065/// \code
14066/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14067/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14068/// \endcode
14069 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14070/// look through %s1 and %s2 and emit
14071/// \code
14072/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14073/// \endcode
14074/// instead.
14075class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
14076 bool IsFinalized = false;
14077 /// Combined mask for all applied operands and masks. It is built during
14078 /// analysis and actual emission of shuffle vector instructions.
14079 SmallVector<int> CommonMask;
14080 /// List of operands for the shuffle vector instruction. It holds at most 2
14081 /// operands. If a 3rd one is going to be added, the first 2 are combined into
14082 /// a shuffle with the \p CommonMask mask, the first operand is set to the
14083 /// resulting shuffle and the second operand is set to the newly added
14084 /// operand. The \p CommonMask is transformed accordingly after that.
14085 SmallVector<Value *, 2> InVectors;
14086 IRBuilderBase &Builder;
14087 BoUpSLP &R;
14088
14089 class ShuffleIRBuilder {
14090 IRBuilderBase &Builder;
14091 /// Holds all of the instructions that we gathered.
14092 SetVector<Instruction *> &GatherShuffleExtractSeq;
14093 /// A list of blocks that we are going to CSE.
14094 DenseSet<BasicBlock *> &CSEBlocks;
14095 /// Data layout.
14096 const DataLayout &DL;
14097
14098 public:
14099 ShuffleIRBuilder(IRBuilderBase &Builder,
14100 SetVector<Instruction *> &GatherShuffleExtractSeq,
14101 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14102 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14103 CSEBlocks(CSEBlocks), DL(DL) {}
14104 ~ShuffleIRBuilder() = default;
14105 /// Creates shufflevector for the 2 operands with the given mask.
14106 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14107 if (V1->getType() != V2->getType()) {
14108 assert(V2->getType()->isIntOrIntVectorTy() &&
14109 V1->getType()->isIntOrIntVectorTy() &&
14110 "Expected integer vector types only.");
14111 if (V1->getType() != V2->getType()) {
14112 if (cast<VectorType>(V2->getType())
14113 ->getElementType()
14114 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14115 ->getElementType()
14116 ->getIntegerBitWidth())
14117 V2 = Builder.CreateIntCast(
14118 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14119 else
14120 V1 = Builder.CreateIntCast(
14121 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14122 }
14123 }
14124 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14125 if (auto *I = dyn_cast<Instruction>(Vec)) {
14126 GatherShuffleExtractSeq.insert(I);
14127 CSEBlocks.insert(I->getParent());
14128 }
14129 return Vec;
14130 }
14131 /// Creates permutation of the single vector operand with the given mask, if
14132 /// it is not identity mask.
14133 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14134 if (Mask.empty())
14135 return V1;
14136 unsigned VF = Mask.size();
14137 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14138 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14139 return V1;
14140 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14141 if (auto *I = dyn_cast<Instruction>(Vec)) {
14142 GatherShuffleExtractSeq.insert(I);
14143 CSEBlocks.insert(I->getParent());
14144 }
14145 return Vec;
14146 }
14147 Value *createIdentity(Value *V) { return V; }
14148 Value *createPoison(Type *Ty, unsigned VF) {
14149 return PoisonValue::get(getWidenedType(Ty, VF));
14150 }
14151 /// Resizes the 2 input vectors to match in size, if they are not equal
14152 /// yet. The smaller vector is resized to the size of the larger vector.
14153 void resizeToMatch(Value *&V1, Value *&V2) {
14154 if (V1->getType() == V2->getType())
14155 return;
14156 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14157 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14158 int VF = std::max(V1VF, V2VF);
14159 int MinVF = std::min(V1VF, V2VF);
14160 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14161 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14162 0);
14163 Value *&Op = MinVF == V1VF ? V1 : V2;
14164 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14165 if (auto *I = dyn_cast<Instruction>(Op)) {
14166 GatherShuffleExtractSeq.insert(I);
14167 CSEBlocks.insert(I->getParent());
14168 }
14169 if (MinVF == V1VF)
14170 V1 = Op;
14171 else
14172 V2 = Op;
14173 }
14174 };
14175
14176 /// Smart shuffle instruction emission, walks through shuffles trees and
14177 /// tries to find the best matching vector for the actual shuffle
14178 /// instruction.
14179 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14180 assert(V1 && "Expected at least one vector value.");
14181 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14182 R.CSEBlocks, *R.DL);
14183 return BaseShuffleAnalysis::createShuffle<Value *>(
14184 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14185 }
14186
14187 /// Cast value \p V to the vector type with the same number of elements, but
14188 /// the base type \p ScalarTy.
14189 Value *castToScalarTyElem(Value *V,
14190 std::optional<bool> IsSigned = std::nullopt) {
14191 auto *VecTy = cast<VectorType>(V->getType());
14192 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14193 if (VecTy->getElementType() == ScalarTy->getScalarType())
14194 return V;
14195 return Builder.CreateIntCast(
14196 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14197 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14198 }
14199
14200 public:
14201 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
14202 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14203
14204 /// Adjusts extractelements after reusing them.
14205 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14206 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14207 unsigned NumParts, bool &UseVecBaseAsInput) {
14208 UseVecBaseAsInput = false;
14209 SmallPtrSet<Value *, 4> UniqueBases;
14210 Value *VecBase = nullptr;
14211 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14212 if (!E->ReorderIndices.empty()) {
14213 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14214 E->ReorderIndices.end());
14215 reorderScalars(VL, ReorderMask);
14216 }
14217 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14218 int Idx = Mask[I];
14219 if (Idx == PoisonMaskElem)
14220 continue;
14221 auto *EI = cast<ExtractElementInst>(VL[I]);
14222 VecBase = EI->getVectorOperand();
14223 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
14224 VecBase = TEs.front()->VectorizedValue;
14225 assert(VecBase && "Expected vectorized value.");
14226 UniqueBases.insert(VecBase);
14227 // If the only use is vectorized - we can delete the extractelement
14228 // itself.
14229 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14230 (NumParts != 1 && count(VL, EI) > 1) ||
14231 any_of(EI->users(), [&](User *U) {
14232 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
14233 return UTEs.empty() || UTEs.size() > 1 ||
14234 (isa<GetElementPtrInst>(U) &&
14235 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14236 (!UTEs.empty() &&
14237 count_if(R.VectorizableTree,
14238 [&](const std::unique_ptr<TreeEntry> &TE) {
14239 return any_of(TE->UserTreeIndices,
14240 [&](const EdgeInfo &Edge) {
14241 return Edge.UserTE ==
14242 UTEs.front();
14243 }) &&
14244 is_contained(VL, EI);
14245 }) != 1);
14246 }))
14247 continue;
14248 R.eraseInstruction(EI);
14249 }
14250 if (NumParts == 1 || UniqueBases.size() == 1) {
14251 assert(VecBase && "Expected vectorized value.");
14252 return castToScalarTyElem(VecBase);
14253 }
14254 UseVecBaseAsInput = true;
14255 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14256 for (auto [I, Idx] : enumerate(Mask))
14257 if (Idx != PoisonMaskElem)
14258 Idx = I;
14259 };
14260 // Perform a multi-register vector shuffle, joining the parts into a single
14261 // virtual long vector.
14262 // Each part needs to be shuffled independently and then all the parts are
14263 // inserted into a long virtual vector register, forming the original vector.
14264 Value *Vec = nullptr;
14265 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14266 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14267 for (unsigned Part : seq<unsigned>(NumParts)) {
14268 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14269 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14270 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14271 constexpr int MaxBases = 2;
14272 SmallVector<Value *, MaxBases> Bases(MaxBases);
14273 auto VLMask = zip(SubVL, SubMask);
14274 const unsigned VF = std::accumulate(
14275 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14276 if (std::get<1>(D) == PoisonMaskElem)
14277 return S;
14278 Value *VecOp =
14279 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14280 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
14281 !TEs.empty())
14282 VecOp = TEs.front()->VectorizedValue;
14283 assert(VecOp && "Expected vectorized value.");
14284 const unsigned Size =
14285 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14286 return std::max(S, Size);
14287 });
14288 for (const auto [V, I] : VLMask) {
14289 if (I == PoisonMaskElem)
14290 continue;
14291 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14292 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
14293 VecOp = TEs.front()->VectorizedValue;
14294 assert(VecOp && "Expected vectorized value.");
14295 VecOp = castToScalarTyElem(VecOp);
14296 Bases[I / VF] = VecOp;
14297 }
14298 if (!Bases.front())
14299 continue;
14300 Value *SubVec;
14301 if (Bases.back()) {
14302 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14303 TransformToIdentity(SubMask);
14304 } else {
14305 SubVec = Bases.front();
14306 }
14307 if (!Vec) {
14308 Vec = SubVec;
14309 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14310 [&](unsigned P) {
14311 ArrayRef<int> SubMask =
14312 Mask.slice(P * SliceSize,
14313 getNumElems(Mask.size(),
14314 SliceSize, P));
14315 return all_of(SubMask, [](int Idx) {
14316 return Idx == PoisonMaskElem;
14317 });
14318 })) &&
14319 "Expected first part or all previous parts masked.");
14320 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14321 } else {
14322 unsigned NewVF =
14323 cast<FixedVectorType>(Vec->getType())->getNumElements();
14324 if (Vec->getType() != SubVec->getType()) {
14325 unsigned SubVecVF =
14326 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14327 NewVF = std::max(NewVF, SubVecVF);
14328 }
14329 // Adjust SubMask.
14330 for (int &Idx : SubMask)
14331 if (Idx != PoisonMaskElem)
14332 Idx += NewVF;
14333 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14334 Vec = createShuffle(Vec, SubVec, VecMask);
14335 TransformToIdentity(VecMask);
14336 }
14337 }
14338 copy(VecMask, Mask.begin());
14339 return Vec;
14340 }
14341 /// Checks if the specified entry \p E needs to be delayed because of its
14342 /// dependency nodes.
14343 std::optional<Value *>
14344 needToDelay(const TreeEntry *E,
14345 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14346 // No need to delay emission if all deps are ready.
14347 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14348 return all_of(
14349 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14350 }))
14351 return std::nullopt;
14352 // Postpone gather emission, will be emitted after the end of the
14353 // process to keep correct order.
14354 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14355 return Builder.CreateAlignedLoad(
14356 ResVecTy,
14357 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14358 MaybeAlign());
14359 }
14360 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14361 /// shuffling.
14362 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14363 Value *V1 = E1.VectorizedValue;
14364 if (V1->getType()->isIntOrIntVectorTy())
14365 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14366 if (isa<PoisonValue>(V))
14367 return false;
14368 return !isKnownNonNegative(
14369 V, SimplifyQuery(*R.DL));
14370 }));
14371 Value *V2 = E2.VectorizedValue;
14372 if (V2->getType()->isIntOrIntVectorTy())
14373 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14374 if (isa<PoisonValue>(V))
14375 return false;
14376 return !isKnownNonNegative(
14377 V, SimplifyQuery(*R.DL));
14378 }));
14379 add(V1, V2, Mask);
14380 }
14381 /// Adds single input vector (in form of tree entry) and the mask for its
14382 /// shuffling.
14383 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14384 Value *V1 = E1.VectorizedValue;
14385 if (V1->getType()->isIntOrIntVectorTy())
14386 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14387 if (isa<PoisonValue>(V))
14388 return false;
14389 return !isKnownNonNegative(
14390 V, SimplifyQuery(*R.DL));
14391 }));
14392 add(V1, Mask);
14393 }
14394 /// Adds 2 input vectors and the mask for their shuffling.
14395 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14396 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14397 assert(isa<FixedVectorType>(V1->getType()) &&
14398 isa<FixedVectorType>(V2->getType()) &&
14399 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14400 V1 = castToScalarTyElem(V1);
14401 V2 = castToScalarTyElem(V2);
14402 if (InVectors.empty()) {
14403 InVectors.push_back(V1);
14404 InVectors.push_back(V2);
14405 CommonMask.assign(Mask.begin(), Mask.end());
14406 return;
14407 }
14408 Value *Vec = InVectors.front();
14409 if (InVectors.size() == 2) {
14410 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14411 transformMaskAfterShuffle(CommonMask, CommonMask);
14412 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14413 Mask.size()) {
14414 Vec = createShuffle(Vec, nullptr, CommonMask);
14415 transformMaskAfterShuffle(CommonMask, CommonMask);
14416 }
14417 V1 = createShuffle(V1, V2, Mask);
14418 unsigned VF = std::max(getVF(V1), getVF(Vec));
14419 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14420 if (Mask[Idx] != PoisonMaskElem)
14421 CommonMask[Idx] = Idx + VF;
14422 InVectors.front() = Vec;
14423 if (InVectors.size() == 2)
14424 InVectors.back() = V1;
14425 else
14426 InVectors.push_back(V1);
14427 }
14428 /// Adds one more input vector and the mask for its shuffling.
14429 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14430 assert(isa<FixedVectorType>(V1->getType()) &&
14431 "castToScalarTyElem expects V1 to be FixedVectorType");
14432 V1 = castToScalarTyElem(V1);
14433 if (InVectors.empty()) {
14434 InVectors.push_back(V1);
14435 CommonMask.assign(Mask.begin(), Mask.end());
14436 return;
14437 }
14438 const auto *It = find(InVectors, V1);
14439 if (It == InVectors.end()) {
14440 if (InVectors.size() == 2 ||
14441 InVectors.front()->getType() != V1->getType()) {
14442 Value *V = InVectors.front();
14443 if (InVectors.size() == 2) {
14444 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14445 transformMaskAfterShuffle(CommonMask, CommonMask);
14446 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14447 CommonMask.size()) {
14448 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14449 transformMaskAfterShuffle(CommonMask, CommonMask);
14450 }
14451 unsigned VF = std::max(CommonMask.size(), Mask.size());
14452 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14453 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14454 CommonMask[Idx] =
14455 V->getType() != V1->getType()
14456 ? Idx + VF
14457 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14458 ->getNumElements();
14459 if (V->getType() != V1->getType())
14460 V1 = createShuffle(V1, nullptr, Mask);
14461 InVectors.front() = V;
14462 if (InVectors.size() == 2)
14463 InVectors.back() = V1;
14464 else
14465 InVectors.push_back(V1);
14466 return;
14467 }
14468 // Check if the second vector is required, i.e. if the new mask uses any
14469 // elements that are not already taken from the first one.
14470 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14471 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14472 InVectors.push_back(V1);
14473 break;
14474 }
14475 }
14476 unsigned VF = 0;
14477 for (Value *V : InVectors)
14478 VF = std::max(VF, getVF(V));
14479 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14480 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14481 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14482 }
14483 /// Adds one more input vector and the order for its shuffling.
14484 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14485 SmallVector<int> NewMask;
14486 inversePermutation(Order, NewMask);
14487 add(V1, NewMask);
14488 }
14489 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14490 Value *Root = nullptr) {
14491 return R.gather(VL, Root, ScalarTy,
14492 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14493 return createShuffle(V1, V2, Mask);
14494 });
14495 }
14496 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14497 /// Finalize emission of the shuffles.
14498 /// \param Action the action (if any) to be performed before final applying of
14499 /// the \p ExtMask mask.
14500 Value *
14501 finalize(ArrayRef<int> ExtMask,
14502 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14503 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14504 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14505 IsFinalized = true;
14506 if (Action) {
14507 Value *Vec = InVectors.front();
14508 if (InVectors.size() == 2) {
14509 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14510 InVectors.pop_back();
14511 } else {
14512 Vec = createShuffle(Vec, nullptr, CommonMask);
14513 }
14514 transformMaskAfterShuffle(CommonMask, CommonMask);
14515 assert(VF > 0 &&
14516 "Expected vector length for the final value before action.");
14517 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14518 if (VecVF < VF) {
14519 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14520 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14521 Vec = createShuffle(Vec, nullptr, ResizeMask);
14522 }
14523 Action(Vec, CommonMask);
14524 InVectors.front() = Vec;
14525 }
14526 if (!SubVectors.empty()) {
14527 Value *Vec = InVectors.front();
14528 if (InVectors.size() == 2) {
14529 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14530 InVectors.pop_back();
14531 } else {
14532 Vec = createShuffle(Vec, nullptr, CommonMask);
14533 }
14534 transformMaskAfterShuffle(CommonMask, CommonMask);
14535 auto CreateSubVectors = [&](Value *Vec,
14536 SmallVectorImpl<int> &CommonMask) {
14537 for (auto [E, Idx] : SubVectors) {
14538 Value *V = E->VectorizedValue;
14539 if (V->getType()->isIntOrIntVectorTy())
14540 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14541 if (isa<PoisonValue>(V))
14542 return false;
14543 return !isKnownNonNegative(
14544 V, SimplifyQuery(*R.DL));
14545 }));
14546 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
14547 Vec = createInsertVector(
14548 Builder, Vec, V, InsertionIndex,
14549 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
14550 _3));
14551 if (!CommonMask.empty()) {
14552 std::iota(std::next(CommonMask.begin(), Idx),
14553 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14554 Idx);
14555 }
14556 }
14557 return Vec;
14558 };
14559 if (SubVectorsMask.empty()) {
14560 Vec = CreateSubVectors(Vec, CommonMask);
14561 } else {
14562 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14563 copy(SubVectorsMask, SVMask.begin());
14564 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14565 if (I2 != PoisonMaskElem) {
14566 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14567 I1 = I2 + CommonMask.size();
14568 }
14569 }
14570 Value *InsertVec =
14571 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14572 Vec = createShuffle(InsertVec, Vec, SVMask);
14573 transformMaskAfterShuffle(CommonMask, SVMask);
14574 }
14575 InVectors.front() = Vec;
14576 }
14577
14578 if (!ExtMask.empty()) {
14579 if (CommonMask.empty()) {
14580 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14581 } else {
14582 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14583 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14584 if (ExtMask[I] == PoisonMaskElem)
14585 continue;
14586 NewMask[I] = CommonMask[ExtMask[I]];
14587 }
14588 CommonMask.swap(NewMask);
14589 }
14590 }
14591 if (CommonMask.empty()) {
14592 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14593 return InVectors.front();
14594 }
14595 if (InVectors.size() == 2)
14596 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14597 return createShuffle(InVectors.front(), nullptr, CommonMask);
14598 }
14599
14600 ~ShuffleInstructionBuilder() {
14601 assert((IsFinalized || CommonMask.empty()) &&
14602 "Shuffle construction must be finalized.");
14603 }
14604};
14605
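// Editorial aside (standalone sketch, not part of SLPVectorizer.cpp): the mask
// composition performed by ShuffleInstructionBuilder::finalize() above when an
// external mask is applied on top of the accumulated one: simply
// NewMask[I] = CommonMask[ExtMask[I]], with poison entries (encoded here as
// -1) propagated unchanged. The concrete mask values are invented.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> CommonMask = {2, 3, 0, 1}; // Shuffle accumulated so far.
  std::vector<int> ExtMask = {1, -1, 3, 0};   // Outer shuffle applied on top.
  std::vector<int> NewMask(ExtMask.size(), -1);
  for (unsigned I = 0; I < ExtMask.size(); ++I)
    if (ExtMask[I] != -1)
      NewMask[I] = CommonMask[ExtMask[I]];
  for (int M : NewMask)
    std::printf("%d ", M); // Prints: 3 -1 1 2
  std::printf("\n");
  return 0;
}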
14606BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14607 unsigned NodeIdx) {
14608 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14609 InstructionsState S = getSameOpcode(VL, *TLI);
14610 // Special processing for GEPs bundle, which may include non-gep values.
14611 if (!S && VL.front()->getType()->isPointerTy()) {
14612 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14613 if (It != VL.end())
14614 S = getSameOpcode(*It, *TLI);
14615 }
14616 if (!S)
14617 return nullptr;
14618 auto CheckSameVE = [&](const TreeEntry *VE) {
14619 return any_of(VE->UserTreeIndices,
14620 [E, NodeIdx](const EdgeInfo &EI) {
14621 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14622 }) ||
14623 any_of(VectorizableTree,
14624 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14625 return TE->isOperandGatherNode(
14626 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14627 VE->isSame(TE->Scalars);
14628 });
14629 };
14630 TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
14631 if (VE && CheckSameVE(VE))
14632 return VE;
14633 return nullptr;
14634}
14635
14636Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14637 bool PostponedPHIs) {
14638 ValueList &VL = E->getOperand(NodeIdx);
14639 const unsigned VF = VL.size();
14640 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14641 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14642 // V may be affected by MinBWs.
14643 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14644 // factor is the number of elements, not their type.
14645 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14646 unsigned NumElements = getNumElements(VL.front()->getType());
14647 ShuffleInstructionBuilder ShuffleBuilder(
14648 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14649 : ScalarTy,
14650 Builder, *this);
14651 ShuffleBuilder.add(V, Mask);
14652 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14653 E->CombinedEntriesWithIndices.size());
14654 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14655 [&](const auto &P) {
14656 return std::make_pair(VectorizableTree[P.first].get(),
14657 P.second);
14658 });
14659 assert((E->CombinedEntriesWithIndices.empty() ||
14660 E->ReorderIndices.empty()) &&
14661 "Expected either combined subnodes or reordering");
14662 return ShuffleBuilder.finalize({}, SubVectors, {});
14663 };
14664 Value *V = vectorizeTree(VE, PostponedPHIs);
14665 if (VF * getNumElements(VL[0]->getType()) !=
14666 cast<FixedVectorType>(V->getType())->getNumElements()) {
14667 if (!VE->ReuseShuffleIndices.empty()) {
14668 // Reshuffle to get only unique values.
14669 // If some of the scalars are duplicated in the vectorization
14670 // tree entry, we do not vectorize them but instead generate a
14671 // mask for the reuses. But if there are several users of the
14672 // same entry, they may have different vectorization factors.
14673 // This is especially important for PHI nodes. In this case, we
14674 // need to adapt the resulting instruction for the user
14675 // vectorization factor and have to reshuffle it again to take
14676 // only unique elements of the vector. Without this code the
14677 // function incorrectly returns reduced vector instruction with
14678 // the same elements, not with the unique ones.
14679
14680 // block:
14681 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14682 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14683 // ... (use %2)
14684 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14685 // br %block
14686 SmallVector<int> Mask(VF, PoisonMaskElem);
14687 for (auto [I, V] : enumerate(VL)) {
14688 if (isa<PoisonValue>(V))
14689 continue;
14690 Mask[I] = VE->findLaneForValue(V);
14691 }
14692 V = FinalShuffle(V, Mask);
14693 } else {
14694 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14695 "Expected vectorization factor less "
14696 "than original vector size.");
14697 SmallVector<int> UniformMask(VF, 0);
14698 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14699 V = FinalShuffle(V, UniformMask);
14700 }
14701 }
14702 // Need to update the operand gather node if the operand is actually not a
14703 // vectorized node but a buildvector/gather node which matches one of the
14704 // vectorized nodes.
14705 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14706 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14707 }) == VE->UserTreeIndices.end()) {
14708 auto *It =
14709 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14710 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14711 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14712 });
14713 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14714 (*It)->VectorizedValue = V;
14715 }
14716 return V;
14717 }
14718
14719 // Find the corresponding gather entry and vectorize it.
14720 // This allows us to be more accurate with tree/graph transformations and
14721 // checks the correctness of the transformations in many cases.
14722 auto *I = find_if(VectorizableTree,
14723 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14724 return TE->isOperandGatherNode({E, NodeIdx});
14725 });
14726 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14727 assert(I->get()->UserTreeIndices.size() == 1 &&
14728 "Expected only single user for the gather node.");
14729 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14730 return vectorizeTree(I->get(), PostponedPHIs);
14731}
14732
14733template <typename BVTy, typename ResTy, typename... Args>
14734ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14735 Args &...Params) {
14736 assert(E->isGather() && "Expected gather node.");
14737 unsigned VF = E->getVectorFactor();
14738
14739 bool NeedFreeze = false;
14740 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14741 E->ReuseShuffleIndices.end());
14742 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14743 // Clear values, to be replaced by insertvector instructions.
14744 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14745 for_each(MutableArrayRef(GatheredScalars)
14746 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14747 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14748 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14749 E->CombinedEntriesWithIndices.size());
14750 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14751 [&](const auto &P) {
14752 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14753 });
14754 // Build a mask out of the reorder indices and reorder scalars per this
14755 // mask.
14756 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14757 E->ReorderIndices.end());
14758 if (!ReorderMask.empty())
14759 reorderScalars(GatheredScalars, ReorderMask);
14760 SmallVector<int> SubVectorsMask;
14761 inversePermutation(E->ReorderIndices, SubVectorsMask);
14762 // Transform non-clustered elements in the mask to poison (-1).
14763 // "Clustered" operations will be reordered using this mask later.
14764 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14765 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14766 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14767 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14768 } else {
14769 SubVectorsMask.clear();
14770 }
14771 SmallVector<Value *> StoredGS(GatheredScalars);
14772 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14773 unsigned I, unsigned SliceSize,
14774 bool IsNotPoisonous) {
14775 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14776 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14777 }))
14778 return false;
14779 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14780 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14781 if (UserTE->getNumOperands() != 2)
14782 return false;
14783 if (!IsNotPoisonous) {
14784 auto *It =
14785 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14786 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14787 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14788 }) != TE->UserTreeIndices.end();
14789 });
14790 if (It == VectorizableTree.end())
14791 return false;
14792 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14793 if (!(*It)->ReorderIndices.empty()) {
14794 inversePermutation((*It)->ReorderIndices, ReorderMask);
14795 reorderScalars(GS, ReorderMask);
14796 }
14797 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14798 Value *V0 = std::get<0>(P);
14799 Value *V1 = std::get<1>(P);
14800 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14801 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14802 is_contained(E->Scalars, V1));
14803 }))
14804 return false;
14805 }
14806 int Idx;
14807 if ((Mask.size() < InputVF &&
14808 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
14809 Idx == 0) ||
14810 (Mask.size() == InputVF &&
14811 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14812 std::iota(
14813 std::next(Mask.begin(), I * SliceSize),
14814 std::next(Mask.begin(),
14815 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14816 0);
14817 } else {
14818 unsigned IVal =
14819 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14820 std::fill(
14821 std::next(Mask.begin(), I * SliceSize),
14822 std::next(Mask.begin(),
14823 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14824 IVal);
14825 }
14826 return true;
14827 };
14828 BVTy ShuffleBuilder(ScalarTy, Params...);
14829 ResTy Res = ResTy();
14830 SmallVector<int> Mask;
14831 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14832 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14833 Value *ExtractVecBase = nullptr;
14834 bool UseVecBaseAsInput = false;
14835 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14836 SmallVector<SmallVector<const TreeEntry *>> Entries;
14837 Type *OrigScalarTy = GatheredScalars.front()->getType();
14838 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14839 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
14840 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14841 // Check for gathered extracts.
14842 bool Resized = false;
14843 ExtractShuffles =
14844 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14845 if (!ExtractShuffles.empty()) {
14846 SmallVector<const TreeEntry *> ExtractEntries;
14847 for (auto [Idx, I] : enumerate(ExtractMask)) {
14848 if (I == PoisonMaskElem)
14849 continue;
14850 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
14851 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
14852 !TEs.empty())
14853 ExtractEntries.append(TEs.begin(), TEs.end());
14854 }
14855 if (std::optional<ResTy> Delayed =
14856 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14857 // Delay emission of gathers which are not ready yet.
14858 PostponedGathers.insert(E);
14859 // Postpone gather emission, will be emitted after the end of the
14860 // process to keep correct order.
14861 return *Delayed;
14862 }
14863 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14864 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14865 ExtractVecBase = VecBase;
14866 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14867 if (VF == VecBaseTy->getNumElements() &&
14868 GatheredScalars.size() != VF) {
14869 Resized = true;
14870 GatheredScalars.append(VF - GatheredScalars.size(),
14871 PoisonValue::get(OrigScalarTy));
14872 NumParts =
14873 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
14874 }
14875 }
14876 }
14877 // Gather extracts after we check for full matched gathers only.
14878 if (!ExtractShuffles.empty() || !E->hasState() ||
14879 E->getOpcode() != Instruction::Load ||
14880 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14881 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14882 any_of(E->Scalars,
14883 [this](Value *V) {
14884 return isa<LoadInst>(V) && isVectorized(V);
14885 })) ||
14886 (E->hasState() && E->isAltShuffle()) ||
14887 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
14888 isSplat(E->Scalars) ||
14889 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14890 GatherShuffles =
14891 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14892 }
14893 if (!GatherShuffles.empty()) {
14894 if (std::optional<ResTy> Delayed =
14895 ShuffleBuilder.needToDelay(E, Entries)) {
14896 // Delay emission of gathers which are not ready yet.
14897 PostponedGathers.insert(E);
14898 // Postpone gather emission, will be emitted after the end of the
14899 // process to keep correct order.
14900 return *Delayed;
14901 }
14902 if (GatherShuffles.size() == 1 &&
14903 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14904 Entries.front().front()->isSame(E->Scalars)) {
14905 // Perfect match in the graph, will reuse the previously vectorized
14906 // node. Cost is 0.
14907 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14908 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14909 // Restore the mask for previous partially matched values.
14910 Mask.resize(E->Scalars.size());
14911 const TreeEntry *FrontTE = Entries.front().front();
14912 if (FrontTE->ReorderIndices.empty() &&
14913 ((FrontTE->ReuseShuffleIndices.empty() &&
14914 E->Scalars.size() == FrontTE->Scalars.size()) ||
14915 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14916 std::iota(Mask.begin(), Mask.end(), 0);
14917 } else {
14918 for (auto [I, V] : enumerate(E->Scalars)) {
14919 if (isa<PoisonValue>(V)) {
14921 continue;
14922 }
14923 Mask[I] = FrontTE->findLaneForValue(V);
14924 }
14925 }
14926 ShuffleBuilder.add(*FrontTE, Mask);
14927 // Fully matched entry found, no need to insert subvectors.
14928 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14929 return Res;
14930 }
14931 if (!Resized) {
14932 if (GatheredScalars.size() != VF &&
14933 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14934 return any_of(TEs, [&](const TreeEntry *TE) {
14935 return TE->getVectorFactor() == VF;
14936 });
14937 }))
14938 GatheredScalars.append(VF - GatheredScalars.size(),
14939 PoisonValue::get(OrigScalarTy));
14940 }
14941 // Remove shuffled elements from list of gathers.
14942 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14943 if (Mask[I] != PoisonMaskElem)
14944 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14945 }
14946 }
14947 }
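  // TryPackScalars compacts the scalars before gathering: duplicates and
  // splats are expressed through the reuse mask, and non-poison undef lanes
  // are either redirected to a lane that is known not to be poison or
  // replaced by poison (which requests a final freeze), so the buildvector
  // only materializes the unique values.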
14948 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14949 SmallVectorImpl<int> &ReuseMask,
14950 bool IsRootPoison) {
14951 // For splats we can emit broadcasts instead of gathers, so try to find
14952 // such sequences.
14953 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14954 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14955 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14956 SmallVector<int> UndefPos;
14957 DenseMap<Value *, unsigned> UniquePositions;
14958 // Gather unique non-const values and all constant values.
14959 // For repeated values, just shuffle them.
14960 int NumNonConsts = 0;
14961 int SinglePos = 0;
14962 for (auto [I, V] : enumerate(Scalars)) {
14963 if (isa<UndefValue>(V)) {
14964 if (!isa<PoisonValue>(V)) {
14965 ReuseMask[I] = I;
14966 UndefPos.push_back(I);
14967 }
14968 continue;
14969 }
14970 if (isConstant(V)) {
14971 ReuseMask[I] = I;
14972 continue;
14973 }
14974 ++NumNonConsts;
14975 SinglePos = I;
14976 Value *OrigV = V;
14977 Scalars[I] = PoisonValue::get(OrigScalarTy);
14978 if (IsSplat) {
14979 Scalars.front() = OrigV;
14980 ReuseMask[I] = 0;
14981 } else {
14982 const auto Res = UniquePositions.try_emplace(OrigV, I);
14983 Scalars[Res.first->second] = OrigV;
14984 ReuseMask[I] = Res.first->second;
14985 }
14986 }
14987 if (NumNonConsts == 1) {
14988 // Restore single insert element.
14989 if (IsSplat) {
14990 ReuseMask.assign(VF, PoisonMaskElem);
14991 std::swap(Scalars.front(), Scalars[SinglePos]);
14992 if (!UndefPos.empty() && UndefPos.front() == 0)
14993 Scalars.front() = UndefValue::get(OrigScalarTy);
14994 }
14995 ReuseMask[SinglePos] = SinglePos;
14996 } else if (!UndefPos.empty() && IsSplat) {
14997 // For undef values, try to replace them with the simple broadcast.
14998 // We can do it if the broadcasted value is guaranteed to be
14999 // non-poisonous, or by freezing the incoming scalar value first.
15000 auto *It = find_if(Scalars, [this, E](Value *V) {
15001 return !isa<UndefValue>(V) &&
15003 (E->UserTreeIndices.size() == 1 &&
15004 any_of(V->uses(), [E](const Use &U) {
15005 // Check if the value is already used in the same operation in
15006 // one of the nodes.
15007 return E->UserTreeIndices.front().EdgeIdx !=
15008 U.getOperandNo() &&
15009 is_contained(
15010 E->UserTreeIndices.front().UserTE->Scalars,
15011 U.getUser());
15012 })));
15013 });
15014 if (It != Scalars.end()) {
15015 // Replace undefs by the non-poisoned scalars and emit broadcast.
15016 int Pos = std::distance(Scalars.begin(), It);
15017 for (int I : UndefPos) {
15018 // Set the undef position to the non-poisoned scalar.
15019 ReuseMask[I] = Pos;
15020 // Replace the undef with poison; in the mask it is already replaced
15021 // by the non-poisoned scalar.
15022 if (I != Pos)
15023 Scalars[I] = PoisonValue::get(OrigScalarTy);
15024 }
15025 } else {
15026 // Replace undefs by the poisons, emit broadcast and then emit
15027 // freeze.
15028 for (int I : UndefPos) {
15029 ReuseMask[I] = PoisonMaskElem;
15030 if (isa<UndefValue>(Scalars[I]))
15031 Scalars[I] = PoisonValue::get(OrigScalarTy);
15032 }
15033 NeedFreeze = true;
15034 }
15035 }
15036 };
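  // Combine the sources found above through the shuffle builder, then decide
  // whether the scalar constants justify a separate constant buildvector that
  // is shuffled in, with the remaining non-constant scalars inserted via the
  // finalize callback.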
15037 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15038 bool IsNonPoisoned = true;
15039 bool IsUsedInExpr = true;
15040 Value *Vec1 = nullptr;
15041 if (!ExtractShuffles.empty()) {
15042 // A gather of extractelements can be represented as just a shuffle of
15043 // the one or two vectors the scalars are extracted from.
15044 // Find input vectors.
15045 Value *Vec2 = nullptr;
15046 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15047 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15048 ExtractMask[I] = PoisonMaskElem;
15049 }
15050 if (UseVecBaseAsInput) {
15051 Vec1 = ExtractVecBase;
15052 } else {
15053 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15054 if (ExtractMask[I] == PoisonMaskElem)
15055 continue;
15056 if (isa<UndefValue>(E->Scalars[I]))
15057 continue;
15058 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15059 Value *VecOp = EI->getVectorOperand();
15060 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
15061 !TEs.empty() && TEs.front()->VectorizedValue)
15062 VecOp = TEs.front()->VectorizedValue;
15063 if (!Vec1) {
15064 Vec1 = VecOp;
15065 } else if (Vec1 != VecOp) {
15066 assert((!Vec2 || Vec2 == VecOp) &&
15067 "Expected only 1 or 2 vectors shuffle.");
15068 Vec2 = VecOp;
15069 }
15070 }
15071 }
15072 if (Vec2) {
15073 IsUsedInExpr = false;
15074 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15075 isGuaranteedNotToBePoison(Vec2, AC);
15076 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15077 } else if (Vec1) {
15078 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15079 IsUsedInExpr &= FindReusedSplat(
15080 ExtractMask,
15081 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15082 ExtractMask.size(), IsNotPoisonedVec);
15083 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15084 IsNonPoisoned &= IsNotPoisonedVec;
15085 } else {
15086 IsUsedInExpr = false;
15087 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15088 /*ForExtracts=*/true);
15089 }
15090 }
15091 if (!GatherShuffles.empty()) {
15092 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15093 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15094 for (const auto [I, TEs] : enumerate(Entries)) {
15095 if (TEs.empty()) {
15096 assert(!GatherShuffles[I] &&
15097 "No shuffles with empty entries list expected.");
15098 continue;
15099 }
15100 assert((TEs.size() == 1 || TEs.size() == 2) &&
15101 "Expected shuffle of 1 or 2 entries.");
15102 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15103 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15104 VecMask.assign(VecMask.size(), PoisonMaskElem);
15105 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15106 if (TEs.size() == 1) {
15107 bool IsNotPoisonedVec =
15108 TEs.front()->VectorizedValue
15109 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15110 : true;
15111 IsUsedInExpr &=
15112 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15113 SliceSize, IsNotPoisonedVec);
15114 ShuffleBuilder.add(*TEs.front(), VecMask);
15115 IsNonPoisoned &= IsNotPoisonedVec;
15116 } else {
15117 IsUsedInExpr = false;
15118 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15119 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15120 IsNonPoisoned &=
15121 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15122 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15123 }
15124 }
15125 }
15126 // Try to figure out the best way to combine values: build a shuffle and
15127 // insert elements, or just build several shuffles.
15128 // Insert non-constant scalars.
15129 SmallVector<Value *> NonConstants(GatheredScalars);
15130 int EMSz = ExtractMask.size();
15131 int MSz = Mask.size();
15132 // Try to build a constant vector and shuffle with it only if we currently
15133 // have a single permutation and more than one scalar constant.
15134 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15135 bool IsIdentityShuffle =
15136 ((UseVecBaseAsInput ||
15137 all_of(ExtractShuffles,
15138 [](const std::optional<TTI::ShuffleKind> &SK) {
15139 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15141 })) &&
15142 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15143 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15144 (!GatherShuffles.empty() &&
15145 all_of(GatherShuffles,
15146 [](const std::optional<TTI::ShuffleKind> &SK) {
15147 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15149 }) &&
15150 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15152 bool EnoughConstsForShuffle =
15153 IsSingleShuffle &&
15154 (none_of(GatheredScalars,
15155 [](Value *V) {
15156 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15157 }) ||
15158 any_of(GatheredScalars,
15159 [](Value *V) {
15160 return isa<Constant>(V) && !isa<UndefValue>(V);
15161 })) &&
15162 (!IsIdentityShuffle ||
15163 (GatheredScalars.size() == 2 &&
15164 any_of(GatheredScalars,
15165 [](Value *V) { return !isa<UndefValue>(V); })) ||
15166 count_if(GatheredScalars, [](Value *V) {
15167 return isa<Constant>(V) && !isa<PoisonValue>(V);
15168 }) > 1);
15169 // The NonConstants array contains just the non-constant values; GatheredScalars
15170 // contains only constants, used to build the final vector and then shuffle.
15171 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15172 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15173 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15174 else
15175 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15176 }
15177 // Generate constants for final shuffle and build a mask for them.
15178 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15179 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15180 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15181 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15182 ShuffleBuilder.add(BV, BVMask);
15183 }
15184 if (all_of(NonConstants, [=](Value *V) {
15185 return isa<PoisonValue>(V) ||
15186 (IsSingleShuffle && ((IsIdentityShuffle &&
15187 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15188 }))
15189 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15190 SubVectorsMask);
15191 else
15192 Res = ShuffleBuilder.finalize(
15193 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15194 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15195 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15196 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15197 });
15198 } else if (!allConstant(GatheredScalars)) {
15199 // Gather unique scalars and all constants.
15200 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15201 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15202 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15203 ShuffleBuilder.add(BV, ReuseMask);
15204 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15205 SubVectorsMask);
15206 } else {
15207 // Gather all constants.
15208 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15209 for (auto [I, V] : enumerate(GatheredScalars)) {
15210 if (!isa<PoisonValue>(V))
15211 Mask[I] = I;
15212 }
15213 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15214 ShuffleBuilder.add(BV, Mask);
15215 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15216 SubVectorsMask);
15217 }
15218
15219 if (NeedFreeze)
15220 Res = ShuffleBuilder.createFreeze(Res);
15221 return Res;
15222}
15223
15224Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15225 bool PostponedPHIs) {
15226 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15227 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15228 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15229 Builder, *this);
15230}
15231
15232/// \returns \p Inst after propagating metadata onto it from the instructions
15233/// in \p VL.
15234static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15235 SmallVector<Value *> Insts;
15236 for (Value *V : VL)
15237 if (isa<Instruction>(V))
15238 Insts.push_back(V);
15239 return llvm::propagateMetadata(Inst, Insts);
15240}
15241
15242Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15243 IRBuilderBase::InsertPointGuard Guard(Builder);
15244
15245 if (E->VectorizedValue &&
15246 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15247 E->isAltShuffle())) {
15248 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15249 return E->VectorizedValue;
15250 }
15251
15252 Value *V = E->Scalars.front();
15253 Type *ScalarTy = V->getType();
15254 if (!isa<CmpInst>(V))
15255 ScalarTy = getValueType(V);
15256 auto It = MinBWs.find(E);
15257 if (It != MinBWs.end()) {
15258 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15259 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15260 if (VecTy)
15261 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15262 }
15263 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15264 if (E->isGather()) {
15265 // Set insert point for non-reduction initial nodes.
15266 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15267 setInsertPointAfterBundle(E);
15268 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15269 E->VectorizedValue = Vec;
15270 return Vec;
15271 }
15272
15273 bool IsReverseOrder =
15274 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
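  // FinalShuffle applies the per-entry fixups to a freshly created vector:
  // the reorder indices, the reuse shuffle indices and any combined sub-nodes
  // recorded in CombinedEntriesWithIndices.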
15275 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15276 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15277 if (E->getOpcode() == Instruction::Store &&
15278 E->State == TreeEntry::Vectorize) {
15280 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15281 E->ReorderIndices.size());
15282 ShuffleBuilder.add(V, Mask);
15283 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15284 ShuffleBuilder.addOrdered(V, {});
15285 } else {
15286 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15287 }
15289 E->CombinedEntriesWithIndices.size());
15290 transform(
15291 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15292 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15293 });
15294 assert(
15295 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15296 "Expected either combined subnodes or reordering");
15297 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15298 };
15299
15300 assert(!E->isGather() && "Unhandled state");
15301 unsigned ShuffleOrOp =
15302 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15303 Instruction *VL0 = E->getMainOp();
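  // For operands demoted to a narrower integer type (MinBWs), the recorded
  // signedness decides whether sext or zext is used to widen them back;
  // GetOperandSignedness derives it from MinBWs or from known non-negativity
  // of the scalars.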
15304 auto GetOperandSignedness = [&](unsigned Idx) {
15305 const TreeEntry *OpE = getOperandEntry(E, Idx);
15306 bool IsSigned = false;
15307 auto It = MinBWs.find(OpE);
15308 if (It != MinBWs.end())
15309 IsSigned = It->second.second;
15310 else
15311 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15312 if (isa<PoisonValue>(V))
15313 return false;
15314 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15315 });
15316 return IsSigned;
15317 };
15318 switch (ShuffleOrOp) {
15319 case Instruction::PHI: {
15320 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15321 E != VectorizableTree.front().get() ||
15322 !E->UserTreeIndices.empty()) &&
15323 "PHI reordering is free.");
15324 if (PostponedPHIs && E->VectorizedValue)
15325 return E->VectorizedValue;
15326 auto *PH = cast<PHINode>(VL0);
15327 Builder.SetInsertPoint(PH->getParent(),
15328 PH->getParent()->getFirstNonPHIIt());
15329 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15330 if (PostponedPHIs || !E->VectorizedValue) {
15331 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15332 E->PHI = NewPhi;
15333 Value *V = NewPhi;
15334
15335 // Adjust insertion point once all PHIs have been generated.
15336 Builder.SetInsertPoint(PH->getParent(),
15337 PH->getParent()->getFirstInsertionPt());
15338 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15339
15340 V = FinalShuffle(V, E);
15341
15342 E->VectorizedValue = V;
15343 if (PostponedPHIs)
15344 return V;
15345 }
15346 PHINode *NewPhi = cast<PHINode>(E->PHI);
15347 // If the phi node is fully emitted, exit.
15348 if (NewPhi->getNumIncomingValues() != 0)
15349 return NewPhi;
15350
15351 // PHINodes may have multiple entries from the same block. We want to
15352 // visit every block once.
15354
15355 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15357 BasicBlock *IBB = PH->getIncomingBlock(I);
15358
15359 // Stop emission if all incoming values are generated.
15360 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15361 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15362 return NewPhi;
15363 }
15364
15365 if (!VisitedBBs.insert(IBB).second) {
15366 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15367 continue;
15368 }
15369
15370 Builder.SetInsertPoint(IBB->getTerminator());
15371 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15372 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15373 if (VecTy != Vec->getType()) {
15374 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15375 MinBWs.contains(getOperandEntry(E, I))) &&
15376 "Expected item in MinBWs.");
15377 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15378 }
15379 NewPhi->addIncoming(Vec, IBB);
15380 }
15381
15382 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15383 "Invalid number of incoming values");
15384 assert(E->VectorizedValue && "Expected vectorized value.");
15385 return E->VectorizedValue;
15386 }
15387
15388 case Instruction::ExtractElement: {
15389 Value *V = E->getSingleOperand(0);
15390 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
15391 V = TEs.front()->VectorizedValue;
15392 setInsertPointAfterBundle(E);
15393 V = FinalShuffle(V, E);
15394 E->VectorizedValue = V;
15395 return V;
15396 }
15397 case Instruction::ExtractValue: {
15398 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15399 Builder.SetInsertPoint(LI);
15400 Value *Ptr = LI->getPointerOperand();
15401 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15402 Value *NewV = ::propagateMetadata(V, E->Scalars);
15403 NewV = FinalShuffle(NewV, E);
15404 E->VectorizedValue = NewV;
15405 return NewV;
15406 }
15407 case Instruction::InsertElement: {
15408 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15409 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15410 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15411 ArrayRef<Value *> Op = E->getOperand(1);
15412 Type *ScalarTy = Op.front()->getType();
15413 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15414 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15415 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15416 assert(Res.first > 0 && "Expected item in MinBWs.");
15417 V = Builder.CreateIntCast(
15418 V,
15420 ScalarTy,
15421 cast<FixedVectorType>(V->getType())->getNumElements()),
15422 Res.second);
15423 }
15424
15425 // Create InsertVector shuffle if necessary
15426 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15427 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15428 }));
15429 const unsigned NumElts =
15430 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15431 const unsigned NumScalars = E->Scalars.size();
15432
15433 unsigned Offset = *getElementIndex(VL0);
15434 assert(Offset < NumElts && "Failed to find vector index offset");
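    // The scalars form (part of) a buildvector starting at lane Offset of the
    // destination vector: Mask maps every scalar to its lane relative to
    // Offset, and IsIdentity tracks whether a resizing/permuting shuffle is
    // needed at all.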
15435
15436 // Create shuffle to resize vector
15438 if (!E->ReorderIndices.empty()) {
15439 inversePermutation(E->ReorderIndices, Mask);
15440 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15441 } else {
15442 Mask.assign(NumElts, PoisonMaskElem);
15443 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15444 }
15445 // Create InsertVector shuffle if necessary
15446 bool IsIdentity = true;
15447 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15448 Mask.swap(PrevMask);
15449 for (unsigned I = 0; I < NumScalars; ++I) {
15450 Value *Scalar = E->Scalars[PrevMask[I]];
15451 unsigned InsertIdx = *getElementIndex(Scalar);
15452 IsIdentity &= InsertIdx - Offset == I;
15453 Mask[InsertIdx - Offset] = I;
15454 }
15455 if (!IsIdentity || NumElts != NumScalars) {
15456 Value *V2 = nullptr;
15457 bool IsVNonPoisonous =
15459 SmallVector<int> InsertMask(Mask);
15460 if (NumElts != NumScalars && Offset == 0) {
15461 // Follow all insert element instructions from the current buildvector
15462 // sequence.
15463 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15464 do {
15465 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15466 if (!InsertIdx)
15467 break;
15468 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15469 InsertMask[*InsertIdx] = *InsertIdx;
15470 if (!Ins->hasOneUse())
15471 break;
15472 Ins = dyn_cast_or_null<InsertElementInst>(
15473 Ins->getUniqueUndroppableUser());
15474 } while (Ins);
15475 SmallBitVector UseMask =
15476 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15477 SmallBitVector IsFirstPoison =
15478 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15479 SmallBitVector IsFirstUndef =
15480 isUndefVector(FirstInsert->getOperand(0), UseMask);
15481 if (!IsFirstPoison.all()) {
15482 unsigned Idx = 0;
15483 for (unsigned I = 0; I < NumElts; I++) {
15484 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15485 IsFirstUndef.test(I)) {
15486 if (IsVNonPoisonous) {
15487 InsertMask[I] = I < NumScalars ? I : 0;
15488 continue;
15489 }
15490 if (!V2)
15491 V2 = UndefValue::get(V->getType());
15492 if (Idx >= NumScalars)
15493 Idx = NumScalars - 1;
15494 InsertMask[I] = NumScalars + Idx;
15495 ++Idx;
15496 } else if (InsertMask[I] != PoisonMaskElem &&
15497 Mask[I] == PoisonMaskElem) {
15498 InsertMask[I] = PoisonMaskElem;
15499 }
15500 }
15501 } else {
15502 InsertMask = Mask;
15503 }
15504 }
15505 if (!V2)
15506 V2 = PoisonValue::get(V->getType());
15507 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15508 if (auto *I = dyn_cast<Instruction>(V)) {
15509 GatherShuffleExtractSeq.insert(I);
15510 CSEBlocks.insert(I->getParent());
15511 }
15512 }
15513
15514 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15515 for (unsigned I = 0; I < NumElts; I++) {
15516 if (Mask[I] != PoisonMaskElem)
15517 InsertMask[Offset + I] = I;
15518 }
15519 SmallBitVector UseMask =
15520 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15521 SmallBitVector IsFirstUndef =
15522 isUndefVector(FirstInsert->getOperand(0), UseMask);
15523 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15524 NumElts != NumScalars) {
15525 if (IsFirstUndef.all()) {
15526 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15527 SmallBitVector IsFirstPoison =
15528 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15529 if (!IsFirstPoison.all()) {
15530 for (unsigned I = 0; I < NumElts; I++) {
15531 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15532 InsertMask[I] = I + NumElts;
15533 }
15534 }
15535 V = Builder.CreateShuffleVector(
15536 V,
15537 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15538 : FirstInsert->getOperand(0),
15539 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15540 if (auto *I = dyn_cast<Instruction>(V)) {
15541 GatherShuffleExtractSeq.insert(I);
15542 CSEBlocks.insert(I->getParent());
15543 }
15544 }
15545 } else {
15546 SmallBitVector IsFirstPoison =
15547 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15548 for (unsigned I = 0; I < NumElts; I++) {
15549 if (InsertMask[I] == PoisonMaskElem)
15550 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15551 else
15552 InsertMask[I] += NumElts;
15553 }
15554 V = Builder.CreateShuffleVector(
15555 FirstInsert->getOperand(0), V, InsertMask,
15556 cast<Instruction>(E->Scalars.back())->getName());
15557 if (auto *I = dyn_cast<Instruction>(V)) {
15558 GatherShuffleExtractSeq.insert(I);
15559 CSEBlocks.insert(I->getParent());
15560 }
15561 }
15562 }
15563
15564 ++NumVectorInstructions;
15565 E->VectorizedValue = V;
15566 return V;
15567 }
15568 case Instruction::ZExt:
15569 case Instruction::SExt:
15570 case Instruction::FPToUI:
15571 case Instruction::FPToSI:
15572 case Instruction::FPExt:
15573 case Instruction::PtrToInt:
15574 case Instruction::IntToPtr:
15575 case Instruction::SIToFP:
15576 case Instruction::UIToFP:
15577 case Instruction::Trunc:
15578 case Instruction::FPTrunc:
15579 case Instruction::BitCast: {
15580 setInsertPointAfterBundle(E);
15581
15582 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15583 if (E->VectorizedValue) {
15584 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15585 return E->VectorizedValue;
15586 }
15587
15588 auto *CI = cast<CastInst>(VL0);
15589 Instruction::CastOps VecOpcode = CI->getOpcode();
15590 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15591 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
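    // If either side of the cast was demoted to a narrower integer type, the
    // original opcode may no longer match the actual vector types: equal bit
    // widths turn the cast into a (possibly elided) bitcast, a narrower
    // result becomes a trunc, and a wider one becomes sext/zext depending on
    // the recorded signedness.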
15592 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15593 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15594 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15595 // Check if the values are candidates to demote.
15596 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15597 if (SrcIt != MinBWs.end())
15598 SrcBWSz = SrcIt->second.first;
15599 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15600 if (BWSz == SrcBWSz) {
15601 VecOpcode = Instruction::BitCast;
15602 } else if (BWSz < SrcBWSz) {
15603 VecOpcode = Instruction::Trunc;
15604 } else if (It != MinBWs.end()) {
15605 assert(BWSz > SrcBWSz && "Invalid cast!");
15606 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15607 } else if (SrcIt != MinBWs.end()) {
15608 assert(BWSz > SrcBWSz && "Invalid cast!");
15609 VecOpcode =
15610 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15611 }
15612 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15613 !SrcIt->second.second) {
15614 VecOpcode = Instruction::UIToFP;
15615 }
15616 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15617 ? InVec
15618 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15619 V = FinalShuffle(V, E);
15620
15621 E->VectorizedValue = V;
15622 ++NumVectorInstructions;
15623 return V;
15624 }
15625 case Instruction::FCmp:
15626 case Instruction::ICmp: {
15627 setInsertPointAfterBundle(E);
15628
15629 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15630 if (E->VectorizedValue) {
15631 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15632 return E->VectorizedValue;
15633 }
15634 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15635 if (E->VectorizedValue) {
15636 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15637 return E->VectorizedValue;
15638 }
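    // The operands may have been demoted to different integer widths; compare
    // in the wider of the two types by extending the narrower operand.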
15639 if (L->getType() != R->getType()) {
15640 assert((getOperandEntry(E, 0)->isGather() ||
15641 getOperandEntry(E, 1)->isGather() ||
15642 MinBWs.contains(getOperandEntry(E, 0)) ||
15643 MinBWs.contains(getOperandEntry(E, 1))) &&
15644 "Expected item in MinBWs.");
15645 if (cast<VectorType>(L->getType())
15646 ->getElementType()
15647 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15648 ->getElementType()
15649 ->getIntegerBitWidth()) {
15650 Type *CastTy = R->getType();
15651 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15652 } else {
15653 Type *CastTy = L->getType();
15654 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15655 }
15656 }
15657
15658 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15659 Value *V = Builder.CreateCmp(P0, L, R);
15660 propagateIRFlags(V, E->Scalars, VL0);
15661 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15662 ICmp->setSameSign(/*B=*/false);
15663 // Do not cast for cmps.
15664 VecTy = cast<FixedVectorType>(V->getType());
15665 V = FinalShuffle(V, E);
15666
15667 E->VectorizedValue = V;
15668 ++NumVectorInstructions;
15669 return V;
15670 }
15671 case Instruction::Select: {
15672 setInsertPointAfterBundle(E);
15673
15674 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15675 if (E->VectorizedValue) {
15676 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15677 return E->VectorizedValue;
15678 }
15679 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15680 if (E->VectorizedValue) {
15681 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15682 return E->VectorizedValue;
15683 }
15684 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15685 if (E->VectorizedValue) {
15686 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15687 return E->VectorizedValue;
15688 }
15689 if (True->getType() != VecTy || False->getType() != VecTy) {
15690 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15691 getOperandEntry(E, 2)->isGather() ||
15692 MinBWs.contains(getOperandEntry(E, 1)) ||
15693 MinBWs.contains(getOperandEntry(E, 2))) &&
15694 "Expected item in MinBWs.");
15695 if (True->getType() != VecTy)
15696 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15697 if (False->getType() != VecTy)
15698 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15699 }
15700
15701 unsigned CondNumElements = getNumElements(Cond->getType());
15702 unsigned TrueNumElements = getNumElements(True->getType());
15703 assert(TrueNumElements >= CondNumElements &&
15704 TrueNumElements % CondNumElements == 0 &&
15705 "Cannot vectorize Instruction::Select");
15706 assert(TrueNumElements == getNumElements(False->getType()) &&
15707 "Cannot vectorize Instruction::Select");
15708 if (CondNumElements != TrueNumElements) {
15709 // When each scalar condition is an i1 but the selected scalars are fixed
15710 // vectors (ReVec), the condition has fewer lanes and must be replicated.
15711 Cond = Builder.CreateShuffleVector(
15712 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15713 CondNumElements));
15714 }
15715 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15716 "Cannot vectorize Instruction::Select");
15717 Value *V = Builder.CreateSelect(Cond, True, False);
15718 V = FinalShuffle(V, E);
15719
15720 E->VectorizedValue = V;
15721 ++NumVectorInstructions;
15722 return V;
15723 }
15724 case Instruction::FNeg: {
15725 setInsertPointAfterBundle(E);
15726
15727 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15728
15729 if (E->VectorizedValue) {
15730 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15731 return E->VectorizedValue;
15732 }
15733
15734 Value *V = Builder.CreateUnOp(
15735 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15736 propagateIRFlags(V, E->Scalars, VL0);
15737 if (auto *I = dyn_cast<Instruction>(V))
15738 V = ::propagateMetadata(I, E->Scalars);
15739
15740 V = FinalShuffle(V, E);
15741
15742 E->VectorizedValue = V;
15743 ++NumVectorInstructions;
15744
15745 return V;
15746 }
15747 case Instruction::Freeze: {
15748 setInsertPointAfterBundle(E);
15749
15750 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15751
15752 if (E->VectorizedValue) {
15753 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15754 return E->VectorizedValue;
15755 }
15756
15757 if (Op->getType() != VecTy) {
15758 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15759 MinBWs.contains(getOperandEntry(E, 0))) &&
15760 "Expected item in MinBWs.");
15761 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15762 }
15763 Value *V = Builder.CreateFreeze(Op);
15764 V = FinalShuffle(V, E);
15765
15766 E->VectorizedValue = V;
15767 ++NumVectorInstructions;
15768
15769 return V;
15770 }
15771 case Instruction::Add:
15772 case Instruction::FAdd:
15773 case Instruction::Sub:
15774 case Instruction::FSub:
15775 case Instruction::Mul:
15776 case Instruction::FMul:
15777 case Instruction::UDiv:
15778 case Instruction::SDiv:
15779 case Instruction::FDiv:
15780 case Instruction::URem:
15781 case Instruction::SRem:
15782 case Instruction::FRem:
15783 case Instruction::Shl:
15784 case Instruction::LShr:
15785 case Instruction::AShr:
15786 case Instruction::And:
15787 case Instruction::Or:
15788 case Instruction::Xor: {
15789 setInsertPointAfterBundle(E);
15790
15791 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15792 if (E->VectorizedValue) {
15793 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15794 return E->VectorizedValue;
15795 }
15796 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15797 if (E->VectorizedValue) {
15798 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15799 return E->VectorizedValue;
15800 }
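    // Special case for a demoted 'and': if one operand consists of constants
    // whose low It->second.first bits are all ones, the 'and' is a no-op in
    // the demoted type, so the other operand (after the final shuffle) is
    // used as the result directly.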
15801 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15802 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15803 ArrayRef<Value *> Ops = E->getOperand(I);
15804 if (all_of(Ops, [&](Value *Op) {
15805 auto *CI = dyn_cast<ConstantInt>(Op);
15806 return CI && CI->getValue().countr_one() >= It->second.first;
15807 })) {
15808 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15809 E->VectorizedValue = V;
15810 ++NumVectorInstructions;
15811 return V;
15812 }
15813 }
15814 }
15815 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15816 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15817 getOperandEntry(E, 1)->isGather() ||
15818 MinBWs.contains(getOperandEntry(E, 0)) ||
15819 MinBWs.contains(getOperandEntry(E, 1))) &&
15820 "Expected item in MinBWs.");
15821 if (LHS->getType() != VecTy)
15822 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15823 if (RHS->getType() != VecTy)
15824 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15825 }
15826
15827 Value *V = Builder.CreateBinOp(
15828 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15829 RHS);
15830 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15831 if (auto *I = dyn_cast<Instruction>(V)) {
15832 V = ::propagateMetadata(I, E->Scalars);
15833 // Drop nuw flags for abs(sub(commutative), true).
15834 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15835 any_of(E->Scalars, [](Value *V) {
15836 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15837 }))
15838 I->setHasNoUnsignedWrap(/*b=*/false);
15839 }
15840
15841 V = FinalShuffle(V, E);
15842
15843 E->VectorizedValue = V;
15844 ++NumVectorInstructions;
15845
15846 return V;
15847 }
15848 case Instruction::Load: {
15849 // Loads are inserted at the head of the tree because we don't want to
15850 // sink them all the way down past store instructions.
15851 setInsertPointAfterBundle(E);
15852
15853 LoadInst *LI = cast<LoadInst>(VL0);
15854 Instruction *NewLI;
15855 Value *PO = LI->getPointerOperand();
15856 if (E->State == TreeEntry::Vectorize) {
15857 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15858 } else if (E->State == TreeEntry::StridedVectorize) {
15859 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15860 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15861 PO = IsReverseOrder ? PtrN : Ptr0;
15862 std::optional<int> Diff = getPointersDiff(
15863 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15864 Type *StrideTy = DL->getIndexType(PO->getType());
15865 Value *StrideVal;
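      // The stride is a constant when the pointer difference between the
      // first and last scalar is known at compile time, otherwise it is
      // computed at run time; either way it is scaled by the element
      // allocation size and negated for reversed orders before being fed to
      // an llvm.experimental.vp.strided.load call, roughly of the form
      //   call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
      //       ptr %base, i64 %stride, <4 x i1> splat (i1 true), i32 4)
      // for a <4 x i32> node.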
15866 if (Diff) {
15867 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15868 StrideVal =
15869 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15870 DL->getTypeAllocSize(ScalarTy));
15871 } else {
15872 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15873 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15874 return cast<LoadInst>(V)->getPointerOperand();
15875 });
15876 OrdersType Order;
15877 std::optional<Value *> Stride =
15878 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15879 &*Builder.GetInsertPoint());
15880 Value *NewStride =
15881 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15882 StrideVal = Builder.CreateMul(
15883 NewStride,
15884 ConstantInt::get(
15885 StrideTy,
15886 (IsReverseOrder ? -1 : 1) *
15887 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15888 }
15889 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15890 auto *Inst = Builder.CreateIntrinsic(
15891 Intrinsic::experimental_vp_strided_load,
15892 {VecTy, PO->getType(), StrideTy},
15893 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15894 Builder.getInt32(E->Scalars.size())});
15895 Inst->addParamAttr(
15896 /*ArgNo=*/0,
15897 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15898 NewLI = Inst;
15899 } else {
15900 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15901 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15902 if (E->VectorizedValue) {
15903 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15904 return E->VectorizedValue;
15905 }
15906 if (isa<FixedVectorType>(ScalarTy)) {
15907 assert(SLPReVec && "FixedVectorType is not expected.");
15908 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
15909 // need to expand VecPtr if ScalarTy is a vector type.
15910 unsigned ScalarTyNumElements =
15911 cast<FixedVectorType>(ScalarTy)->getNumElements();
15912 unsigned VecTyNumElements =
15913 cast<FixedVectorType>(VecTy)->getNumElements();
15914 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15915 "Cannot expand getelementptr.");
15916 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15917 SmallVector<Constant *> Indices(VecTyNumElements);
15918 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15919 return Builder.getInt64(I % ScalarTyNumElements);
15920 });
15921 VecPtr = Builder.CreateGEP(
15922 VecTy->getElementType(),
15923 Builder.CreateShuffleVector(
15924 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15925 ConstantVector::get(Indices));
15926 }
15927 // Use the minimum alignment of the gathered loads.
15928 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15929 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15930 }
15931 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15932
15933 V = FinalShuffle(V, E);
15934 E->VectorizedValue = V;
15935 ++NumVectorInstructions;
15936 return V;
15937 }
15938 case Instruction::Store: {
15939 auto *SI = cast<StoreInst>(VL0);
15940
15941 setInsertPointAfterBundle(E);
15942
15943 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15944 if (VecValue->getType() != VecTy)
15945 VecValue =
15946 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15947 VecValue = FinalShuffle(VecValue, E);
15948
15949 Value *Ptr = SI->getPointerOperand();
15950 Instruction *ST;
15951 if (E->State == TreeEntry::Vectorize) {
15952 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15953 } else {
15954 assert(E->State == TreeEntry::StridedVectorize &&
15955 "Expected either strided or consecutive stores.");
15956 if (!E->ReorderIndices.empty()) {
15957 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15958 Ptr = SI->getPointerOperand();
15959 }
15960 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15961 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
15962 auto *Inst = Builder.CreateIntrinsic(
15963 Intrinsic::experimental_vp_strided_store,
15964 {VecTy, Ptr->getType(), StrideTy},
15965 {VecValue, Ptr,
15966 ConstantInt::get(
15967 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15968 Builder.getAllOnesMask(VecTy->getElementCount()),
15969 Builder.getInt32(E->Scalars.size())});
15970 Inst->addParamAttr(
15971 /*ArgNo=*/1,
15972 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15973 ST = Inst;
15974 }
15975
15976 Value *V = ::propagateMetadata(ST, E->Scalars);
15977
15978 E->VectorizedValue = V;
15979 ++NumVectorInstructions;
15980 return V;
15981 }
15982 case Instruction::GetElementPtr: {
15983 auto *GEP0 = cast<GetElementPtrInst>(VL0);
15984 setInsertPointAfterBundle(E);
15985
15986 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
15987 if (E->VectorizedValue) {
15988 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15989 return E->VectorizedValue;
15990 }
15991
15992 SmallVector<Value *> OpVecs;
15993 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
15994 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
15995 if (E->VectorizedValue) {
15996 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15997 return E->VectorizedValue;
15998 }
15999 OpVecs.push_back(OpVec);
16000 }
16001
16002 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16003 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16005 for (Value *V : E->Scalars) {
16006 if (isa<GetElementPtrInst>(V))
16007 GEPs.push_back(V);
16008 }
16009 V = ::propagateMetadata(I, GEPs);
16010 }
16011
16012 V = FinalShuffle(V, E);
16013
16014 E->VectorizedValue = V;
16015 ++NumVectorInstructions;
16016
16017 return V;
16018 }
16019 case Instruction::Call: {
16020 CallInst *CI = cast<CallInst>(VL0);
16021 setInsertPointAfterBundle(E);
16022
16024
16026 CI, ID, VecTy->getNumElements(),
16027 It != MinBWs.end() ? It->second.first : 0, TTI);
16028 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16029 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
16030 VecCallCosts.first <= VecCallCosts.second;
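      // Choose between emitting a vector intrinsic and a vectorized library
      // function (looked up through VFDatabase), whichever is cheaper
      // according to the cost model; scalar intrinsic arguments are passed
      // through unvectorized below.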
16031
16032 Value *ScalarArg = nullptr;
16033 SmallVector<Value *> OpVecs;
16034 SmallVector<Type *, 2> TysForDecl;
16035 // Add return type if intrinsic is overloaded on it.
16036 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
16037 TysForDecl.push_back(VecTy);
16038 auto *CEI = cast<CallInst>(VL0);
16039 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16040 ValueList OpVL;
16041 // Some intrinsics have scalar arguments. Such an argument should not be
16042 // vectorized.
16043 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16044 ScalarArg = CEI->getArgOperand(I);
16045 // If we decided to reduce the bitwidth of the abs intrinsic, its second
16046 // argument must be set to false (do not return poison if the value is INT_MIN).
16047 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16048 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16049 ScalarArg = Builder.getFalse();
16050 OpVecs.push_back(ScalarArg);
16052 TysForDecl.push_back(ScalarArg->getType());
16053 continue;
16054 }
16055
16056 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16057 if (E->VectorizedValue) {
16058 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16059 return E->VectorizedValue;
16060 }
16061 ScalarArg = CEI->getArgOperand(I);
16062 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16063 ScalarArg->getType()->getScalarType() &&
16064 It == MinBWs.end()) {
16065 auto *CastTy =
16066 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16067 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16068 } else if (It != MinBWs.end()) {
16069 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16070 }
16071 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16072 OpVecs.push_back(OpVec);
16073 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16074 TysForDecl.push_back(OpVec->getType());
16075 }
16076
16077 Function *CF;
16078 if (!UseIntrinsic) {
16079 VFShape Shape =
16082 static_cast<unsigned>(VecTy->getNumElements())),
16083 false /*HasGlobalPred*/);
16084 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16085 } else {
16086 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16087 }
16088
16090 CI->getOperandBundlesAsDefs(OpBundles);
16091 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16092
16093 propagateIRFlags(V, E->Scalars, VL0);
16094 V = FinalShuffle(V, E);
16095
16096 E->VectorizedValue = V;
16097 ++NumVectorInstructions;
16098 return V;
16099 }
16100 case Instruction::ShuffleVector: {
16101 Value *V;
16102 if (SLPReVec && !E->isAltShuffle()) {
16103 setInsertPointAfterBundle(E);
16104 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16105 if (E->VectorizedValue) {
16106 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16107 return E->VectorizedValue;
16108 }
16109 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16110 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16111 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16112 "Not supported shufflevector usage.");
16113 SmallVector<int> NewMask(ThisMask.size());
16114 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16115 return SVSrc->getShuffleMask()[Mask];
16116 });
16117 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16118 } else {
16119 V = Builder.CreateShuffleVector(Src, ThisMask);
16120 }
16121 propagateIRFlags(V, E->Scalars, VL0);
16122 if (auto *I = dyn_cast<Instruction>(V))
16123 V = ::propagateMetadata(I, E->Scalars);
16124 V = FinalShuffle(V, E);
16125 } else {
16126 assert(E->isAltShuffle() &&
16127 ((Instruction::isBinaryOp(E->getOpcode()) &&
16128 Instruction::isBinaryOp(E->getAltOpcode())) ||
16129 (Instruction::isCast(E->getOpcode()) &&
16130 Instruction::isCast(E->getAltOpcode())) ||
16131 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16132 "Invalid Shuffle Vector Operand");
16133
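      // Alternate-opcode nodes are emitted as two full-width instructions,
      // one with the main and one with the alternate opcode, which are then
      // blended with a shuffle whose mask selects, per lane, the result of
      // the opcode matching the original scalar.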
16134 Value *LHS = nullptr, *RHS = nullptr;
16135 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16136 setInsertPointAfterBundle(E);
16137 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16138 if (E->VectorizedValue) {
16139 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16140 return E->VectorizedValue;
16141 }
16142 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16143 } else {
16144 setInsertPointAfterBundle(E);
16145 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16146 }
16147 if (E->VectorizedValue) {
16148 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16149 return E->VectorizedValue;
16150 }
16151 if (LHS && RHS &&
16152 ((Instruction::isBinaryOp(E->getOpcode()) &&
16153 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16154 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16155 assert((It != MinBWs.end() ||
16156 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16157 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16158 MinBWs.contains(getOperandEntry(E, 0)) ||
16159 MinBWs.contains(getOperandEntry(E, 1))) &&
16160 "Expected item in MinBWs.");
16161 Type *CastTy = VecTy;
16162 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16163 if (cast<VectorType>(LHS->getType())
16164 ->getElementType()
16165 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16166 ->getElementType()
16167 ->getIntegerBitWidth())
16168 CastTy = RHS->getType();
16169 else
16170 CastTy = LHS->getType();
16171 }
16172 if (LHS->getType() != CastTy)
16173 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16174 if (RHS->getType() != CastTy)
16175 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16176 }
16177
16178 Value *V0, *V1;
16179 if (Instruction::isBinaryOp(E->getOpcode())) {
16180 V0 = Builder.CreateBinOp(
16181 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16182 V1 = Builder.CreateBinOp(
16183 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16184 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16185 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16186 auto *AltCI = cast<CmpInst>(E->getAltOp());
16187 CmpInst::Predicate AltPred = AltCI->getPredicate();
16188 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16189 } else {
16190 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16191 unsigned SrcBWSz = DL->getTypeSizeInBits(
16192 cast<VectorType>(LHS->getType())->getElementType());
16193 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16194 if (BWSz <= SrcBWSz) {
16195 if (BWSz < SrcBWSz)
16196 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16197 assert(LHS->getType() == VecTy &&
16198 "Expected same type as operand.");
16199 if (auto *I = dyn_cast<Instruction>(LHS))
16200 LHS = ::propagateMetadata(I, E->Scalars);
16201 LHS = FinalShuffle(LHS, E);
16202 E->VectorizedValue = LHS;
16203 ++NumVectorInstructions;
16204 return LHS;
16205 }
16206 }
16207 V0 = Builder.CreateCast(
16208 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16209 V1 = Builder.CreateCast(
16210 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16211 }
16212 // Add V0 and V1 to later analysis to try to find and remove matching
16213 // instructions, if any.
16214 for (Value *V : {V0, V1}) {
16215 if (auto *I = dyn_cast<Instruction>(V)) {
16216 GatherShuffleExtractSeq.insert(I);
16217 CSEBlocks.insert(I->getParent());
16218 }
16219 }
16220
16221 // Create shuffle to take alternate operations from the vector.
16222 // Also, gather up main and alt scalar ops to propagate IR flags to
16223 // each vector operation.
16224 ValueList OpScalars, AltScalars;
16226 E->buildAltOpShuffleMask(
16227 [E, this](Instruction *I) {
16228 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16229 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16230 *TLI);
16231 },
16232 Mask, &OpScalars, &AltScalars);
16233
16234 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16235 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16236 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16237 // Drop nuw flags for abs(sub(commutative), true).
16238 if (auto *I = dyn_cast<Instruction>(Vec);
16239 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16240 any_of(E->Scalars, [](Value *V) {
16241 if (isa<PoisonValue>(V))
16242 return false;
16243 auto *IV = cast<Instruction>(V);
16244 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16245 }))
16246 I->setHasNoUnsignedWrap(/*b=*/false);
16247 };
16248 DropNuwFlag(V0, E->getOpcode());
16249 DropNuwFlag(V1, E->getAltOpcode());
16250
16251 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16252 assert(SLPReVec && "FixedVectorType is not expected.");
16254 }
16255 V = Builder.CreateShuffleVector(V0, V1, Mask);
16256 if (auto *I = dyn_cast<Instruction>(V)) {
16257 V = ::propagateMetadata(I, E->Scalars);
16258 GatherShuffleExtractSeq.insert(I);
16259 CSEBlocks.insert(I->getParent());
16260 }
16261 }
16262
16263 E->VectorizedValue = V;
16264 ++NumVectorInstructions;
16265
16266 return V;
16267 }
16268 default:
16269 llvm_unreachable("unknown inst");
16270 }
16271 return nullptr;
16272}
16273
16274Value *BoUpSLP::vectorizeTree() {
16275 ExtraValueToDebugLocsMap ExternallyUsedValues;
16276 return vectorizeTree(ExternallyUsedValues);
16277}
16278
16279Value *
16280BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16281 Instruction *ReductionRoot) {
16282 // All blocks must be scheduled before any instructions are inserted.
16283 for (auto &BSIter : BlocksSchedules) {
16284 scheduleBlock(BSIter.second.get());
16285 }
16286 // Clear the Entry-to-LastInstruction table. It can be invalidated by
16287 // scheduling, so it needs to be rebuilt.
16288 EntryToLastInstruction.clear();
16289
16290 if (ReductionRoot)
16291 Builder.SetInsertPoint(ReductionRoot->getParent(),
16292 ReductionRoot->getIterator());
16293 else
16294 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16295
16296 // Emit gathered loads first to generate better code for the users of
16297 // those gathered loads.
16298 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16299 if (GatheredLoadsEntriesFirst.has_value() &&
16300 TE->Idx >= *GatheredLoadsEntriesFirst &&
16301 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16302 assert((!TE->UserTreeIndices.empty() ||
16303 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16304 "Expected gathered load node.");
16305 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16306 }
16307 }
16308 // Postpone emission of PHI operands to avoid cyclic dependency issues.
16309 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16310 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16311 if (TE->State == TreeEntry::Vectorize &&
16312 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16313 TE->VectorizedValue)
16314 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16315 // Run through the list of postponed gathers and emit them, replacing the
16316 // temporarily emitted allocas with actual vector instructions.
16317 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16319 for (const TreeEntry *E : PostponedNodes) {
16320 auto *TE = const_cast<TreeEntry *>(E);
16321 if (auto *VecTE = getSameValuesTreeEntry(
16322 TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
16323 TE->UserTreeIndices.front().EdgeIdx));
16324 VecTE && VecTE->isSame(TE->Scalars))
16325 // Found a gather node which is exactly the same as one of the
16326 // vectorized nodes. This may happen after reordering.
16327 continue;
16328 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16329 TE->VectorizedValue = nullptr;
16330 auto *UserI =
16331 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16332 // If the user is a PHI node, its vector code has to be inserted right before
16333 // the block terminator. Since the node was delayed, there were some unresolved
16334 // dependencies at the moment the stub instruction was emitted. If any of
16335 // these dependencies turns out to be an operand of another PHI coming from
16336 // this same block, the position of the stub instruction becomes invalid.
16337 // This is because the source vector that is supposed to feed this gather node
16338 // was inserted at the end of the block [after the stub instruction]. So we
16339 // need to adjust the insertion point again to the end of the block.
16340 if (isa<PHINode>(UserI)) {
16341 // Insert before all users.
16342 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16343 for (User *U : PrevVec->users()) {
16344 if (U == UserI)
16345 continue;
16346 auto *UI = dyn_cast<Instruction>(U);
16347 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16348 continue;
16349 if (UI->comesBefore(InsertPt))
16350 InsertPt = UI;
16351 }
16352 Builder.SetInsertPoint(InsertPt);
16353 } else {
16354 Builder.SetInsertPoint(PrevVec);
16355 }
16356 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16357 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16358 if (auto *VecI = dyn_cast<Instruction>(Vec);
16359 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16360 Builder.GetInsertPoint()->comesBefore(VecI))
16361 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16362 Builder.GetInsertPoint());
16363 if (Vec->getType() != PrevVec->getType()) {
16364 assert(Vec->getType()->isIntOrIntVectorTy() &&
16365 PrevVec->getType()->isIntOrIntVectorTy() &&
16366 "Expected integer vector types only.");
16367 std::optional<bool> IsSigned;
16368 for (Value *V : TE->Scalars) {
16369 if (isVectorized(V)) {
16370 for (const TreeEntry *MNTE : getTreeEntries(V)) {
16371 auto It = MinBWs.find(MNTE);
16372 if (It != MinBWs.end()) {
16373 IsSigned = IsSigned.value_or(false) || It->second.second;
16374 if (*IsSigned)
16375 break;
16376 }
16377 }
16378 if (IsSigned.value_or(false))
16379 break;
16380 // Scan through gather nodes.
16381 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16382 auto It = MinBWs.find(BVE);
16383 if (It != MinBWs.end()) {
16384 IsSigned = IsSigned.value_or(false) || It->second.second;
16385 if (*IsSigned)
16386 break;
16387 }
16388 }
16389 if (IsSigned.value_or(false))
16390 break;
16391 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16392 IsSigned =
16393 IsSigned.value_or(false) ||
16394 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16395 continue;
16396 }
16397 if (IsSigned.value_or(false))
16398 break;
16399 }
16400 }
16401 if (IsSigned.value_or(false)) {
16402 // Final attempt - check user node.
16403 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16404 if (It != MinBWs.end())
16405 IsSigned = It->second.second;
16406 }
16407 assert(IsSigned &&
16408 "Expected user node or perfect diamond match in MinBWs.");
16409 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16410 }
16411 PrevVec->replaceAllUsesWith(Vec);
16412 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16413 // Replace the stub vector node, if it was already used for one of the
16414 // buildvector nodes.
16415 auto It = PostponedValues.find(PrevVec);
16416 if (It != PostponedValues.end()) {
16417 for (TreeEntry *VTE : It->getSecond())
16418 VTE->VectorizedValue = Vec;
16419 }
16420 eraseInstruction(PrevVec);
16421 }
16422
16423 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16424 << " values .\n");
16425
16426 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16427 // Maps vector instruction to original insertelement instruction
16428 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16429 // Maps extract Scalar to the corresponding extractelement instruction in the
16430 // basic block. Only one extractelement per block should be emitted.
16431 DenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16432 ScalarToEEs;
16433 SmallDenseSet<Value *, 4> UsedInserts;
16434 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
16435 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16436 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16437 // Extract all of the elements with the external uses.
16438 for (const auto &ExternalUse : ExternalUses) {
16439 Value *Scalar = ExternalUse.Scalar;
16440 llvm::User *User = ExternalUse.User;
16441
16442 // Skip users that we have already RAUWed. This happens when one instruction
16443 // has multiple uses of the same value.
16444 if (User && !is_contained(Scalar->users(), User))
16445 continue;
16446 const TreeEntry *E = &ExternalUse.E;
16447 assert(E && "Invalid scalar");
16448 assert(!E->isGather() && "Extracting from a gather list");
16449 // Non-instruction pointers are not deleted, just skip them.
16450 if (E->getOpcode() == Instruction::GetElementPtr &&
16451 !isa<GetElementPtrInst>(Scalar))
16452 continue;
16453
16454 Value *Vec = E->VectorizedValue;
16455 assert(Vec && "Can't find vectorizable value");
16456
16457 Value *Lane = Builder.getInt32(ExternalUse.Lane);
16458 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16459 if (Scalar->getType() != Vec->getType()) {
16460 Value *Ex = nullptr;
16461 Value *ExV = nullptr;
16462 auto *Inst = dyn_cast<Instruction>(Scalar);
16463 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16464 auto It = ScalarToEEs.find(Scalar);
16465 if (It != ScalarToEEs.end()) {
16466 // No need to emit many extracts, just move the only one in the
16467 // current block.
16468 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16469 : Builder.GetInsertBlock());
16470 if (EEIt != It->second.end()) {
16471 Value *PrevV = EEIt->second.first;
16472 if (auto *I = dyn_cast<Instruction>(PrevV);
16473 I && !ReplaceInst &&
16474 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16475 Builder.GetInsertPoint()->comesBefore(I)) {
16476 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16477 Builder.GetInsertPoint());
16478 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16479 CI->moveAfter(I);
16480 }
16481 Ex = PrevV;
16482 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16483 }
16484 }
16485 if (!Ex) {
16486 // "Reuse" the existing extract to improve final codegen.
16487 if (ReplaceInst) {
16488 // Leave the instruction as is, if it is a cheap extract and all its
16489 // operands are scalar.
16490 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16491 IgnoredExtracts.insert(EE);
16492 Ex = EE;
16493 } else {
16494 auto *CloneInst = Inst->clone();
16495 CloneInst->insertBefore(Inst->getIterator());
16496 if (Inst->hasName())
16497 CloneInst->takeName(Inst);
16498 Ex = CloneInst;
16499 }
16500 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16501 ES && isa<Instruction>(Vec)) {
16502 Value *V = ES->getVectorOperand();
16503 auto *IVec = cast<Instruction>(Vec);
16504 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
16505 V = ETEs.front()->VectorizedValue;
16506 if (auto *IV = dyn_cast<Instruction>(V);
16507 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16508 IV->comesBefore(IVec))
16509 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16510 else
16511 Ex = Builder.CreateExtractElement(Vec, Lane);
16512 } else if (auto *VecTy =
16513 dyn_cast<FixedVectorType>(Scalar->getType())) {
16514 assert(SLPReVec && "FixedVectorType is not expected.");
16515 unsigned VecTyNumElements = VecTy->getNumElements();
16516 // When REVEC is enabled, we need to extract a vector.
16517 // Note: The element size of Scalar may be different from the
16518 // element size of Vec.
16519 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
16520 ExternalUse.Lane * VecTyNumElements);
16521 } else {
16522 Ex = Builder.CreateExtractElement(Vec, Lane);
16523 }
16524 // If necessary, sign-extend or zero-extend the extracted value
16525 // to the larger scalar type.
16526 ExV = Ex;
16527 if (Scalar->getType() != Ex->getType())
16528 ExV = Builder.CreateIntCast(
16529 Ex, Scalar->getType(),
16530 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16531 auto *I = dyn_cast<Instruction>(Ex);
16532 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16533 : &F->getEntryBlock(),
16534 std::make_pair(Ex, ExV));
16535 }
16536 // The then-branch of the previous if may produce constants, since
16537 // operand 0 might be a constant.
16538 if (auto *ExI = dyn_cast<Instruction>(Ex);
16539 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16540 GatherShuffleExtractSeq.insert(ExI);
16541 CSEBlocks.insert(ExI->getParent());
16542 }
16543 return ExV;
16544 }
16545 assert(isa<FixedVectorType>(Scalar->getType()) &&
16546 isa<InsertElementInst>(Scalar) &&
16547 "In-tree scalar of vector type is not insertelement?");
16548 auto *IE = cast<InsertElementInst>(Scalar);
16549 VectorToInsertElement.try_emplace(Vec, IE);
16550 return Vec;
16551 };
16552 // If User == nullptr, the Scalar remains as scalar in vectorized
16553 // instructions or is used as extra arg. Generate ExtractElement instruction
16554 // and update the record for this scalar in ExternallyUsedValues.
16555 if (!User) {
16556 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16557 continue;
16558 assert(
16559 (ExternallyUsedValues.count(Scalar) ||
16560 Scalar->hasNUsesOrMore(UsesLimit) ||
16561 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16562 any_of(
16563 Scalar->users(),
16564 [&, TTI = TTI](llvm::User *U) {
16565 if (ExternalUsesAsOriginalScalar.contains(U))
16566 return true;
16567 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
16568 return !UseEntries.empty() &&
16569 (E->State == TreeEntry::Vectorize ||
16570 E->State == TreeEntry::StridedVectorize) &&
16571 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
16572 return (UseEntry->State == TreeEntry::Vectorize ||
16573 UseEntry->State ==
16574 TreeEntry::StridedVectorize) &&
16575 doesInTreeUserNeedToExtract(
16576 Scalar, getRootEntryInstruction(*UseEntry),
16577 TLI, TTI);
16578 });
16579 })) &&
16580 "Scalar with nullptr User must be registered in "
16581 "ExternallyUsedValues map or remain as scalar in vectorized "
16582 "instructions");
16583 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16584 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16585 if (PHI->getParent()->isLandingPad())
16586 Builder.SetInsertPoint(
16587 PHI->getParent(),
16588 std::next(
16589 PHI->getParent()->getLandingPadInst()->getIterator()));
16590 else
16591 Builder.SetInsertPoint(PHI->getParent(),
16592 PHI->getParent()->getFirstNonPHIIt());
16593 } else {
16594 Builder.SetInsertPoint(VecI->getParent(),
16595 std::next(VecI->getIterator()));
16596 }
16597 } else {
16598 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16599 }
16600 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16601 // Required to update internally referenced instructions.
16602 if (Scalar != NewInst) {
16603 assert((!isa<ExtractElementInst>(Scalar) ||
16604 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16605 "Extractelements should not be replaced.");
16606 Scalar->replaceAllUsesWith(NewInst);
16607 }
16608 continue;
16609 }
16610
16611 if (auto *VU = dyn_cast<InsertElementInst>(User);
16612 VU && VU->getOperand(1) == Scalar) {
16613 // Skip if the scalar is another vector op or Vec is not an instruction.
16614 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16615 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16616 if (!UsedInserts.insert(VU).second)
16617 continue;
16618 // Need to use original vector, if the root is truncated.
16619 auto BWIt = MinBWs.find(E);
16620 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16621 auto *ScalarTy = FTy->getElementType();
16622 auto Key = std::make_pair(Vec, ScalarTy);
16623 auto VecIt = VectorCasts.find(Key);
16624 if (VecIt == VectorCasts.end()) {
16625 IRBuilderBase::InsertPointGuard Guard(Builder);
16626 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16627 if (IVec->getParent()->isLandingPad())
16628 Builder.SetInsertPoint(IVec->getParent(),
16629 std::next(IVec->getParent()
16630 ->getLandingPadInst()
16631 ->getIterator()));
16632 else
16633 Builder.SetInsertPoint(
16634 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16635 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16636 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16637 }
16638 Vec = Builder.CreateIntCast(
16639 Vec,
16640 getWidenedType(
16641 ScalarTy,
16642 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16643 BWIt->second.second);
16644 VectorCasts.try_emplace(Key, Vec);
16645 } else {
16646 Vec = VecIt->second;
16647 }
16648 }
16649
16650 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16651 if (InsertIdx) {
16652 auto *It = find_if(
16653 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16654 // Checks if 2 insertelements are from the same buildvector.
16655 InsertElementInst *VecInsert = Data.InsertElements.front();
16656 return areTwoInsertFromSameBuildVector(
16657 VU, VecInsert,
16658 [](InsertElementInst *II) { return II->getOperand(0); });
16659 });
16660 unsigned Idx = *InsertIdx;
16661 if (It == ShuffledInserts.end()) {
16662 (void)ShuffledInserts.emplace_back();
16663 It = std::next(ShuffledInserts.begin(),
16664 ShuffledInserts.size() - 1);
16665 }
16666 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16667 if (Mask.empty())
16668 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16669 Mask[Idx] = ExternalUse.Lane;
16670 It->InsertElements.push_back(cast<InsertElementInst>(User));
16671 continue;
16672 }
16673 }
16674 }
16675 }
16676
16677 // Generate extracts for out-of-tree users.
16678 // Find the insertion point for the extractelement lane.
16679 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16680 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16681 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16682 if (PH->getIncomingValue(I) == Scalar) {
16683 Instruction *IncomingTerminator =
16684 PH->getIncomingBlock(I)->getTerminator();
16685 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16686 Builder.SetInsertPoint(VecI->getParent(),
16687 std::next(VecI->getIterator()));
16688 } else {
16689 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16690 }
16691 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16692 PH->setOperand(I, NewInst);
16693 }
16694 }
16695 } else {
16696 Builder.SetInsertPoint(cast<Instruction>(User));
16697 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16698 User->replaceUsesOfWith(Scalar, NewInst);
16699 }
16700 } else {
16701 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16702 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16703 User->replaceUsesOfWith(Scalar, NewInst);
16704 }
16705
16706 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16707 }
16708
16709 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16710 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16711 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16712 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16713 for (int I = 0, E = Mask.size(); I < E; ++I) {
16714 if (Mask[I] < VF)
16715 CombinedMask1[I] = Mask[I];
16716 else
16717 CombinedMask2[I] = Mask[I] - VF;
16718 }
16719 ShuffleInstructionBuilder ShuffleBuilder(
16720 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16721 ShuffleBuilder.add(V1, CombinedMask1);
16722 if (V2)
16723 ShuffleBuilder.add(V2, CombinedMask2);
16724 return ShuffleBuilder.finalize({}, {}, {});
16725 };
16726
16727 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16728 bool ForSingleMask) {
16729 unsigned VF = Mask.size();
16730 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16731 if (VF != VecVF) {
16732 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16733 Vec = CreateShuffle(Vec, nullptr, Mask);
16734 return std::make_pair(Vec, true);
16735 }
16736 if (!ForSingleMask) {
16737 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16738 for (unsigned I = 0; I < VF; ++I) {
16739 if (Mask[I] != PoisonMaskElem)
16740 ResizeMask[Mask[I]] = Mask[I];
16741 }
16742 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16743 }
16744 }
16745
16746 return std::make_pair(Vec, false);
16747 };
16748 // Perform shuffling of the vectorized tree entries for better handling of
16749 // external extracts.
16750 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16751 // Find the first and the last instruction in the list of insertelements.
16752 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16753 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16754 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16755 Builder.SetInsertPoint(LastInsert);
16756 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16757 Value *NewInst = performExtractsShuffleAction<Value>(
16758 MutableArrayRef(Vector.data(), Vector.size()),
16759 FirstInsert->getOperand(0),
16760 [](Value *Vec) {
16761 return cast<VectorType>(Vec->getType())
16762 ->getElementCount()
16763 .getKnownMinValue();
16764 },
16765 ResizeToVF,
16766 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16767 ArrayRef<Value *> Vals) {
16768 assert((Vals.size() == 1 || Vals.size() == 2) &&
16769 "Expected exactly 1 or 2 input values.");
16770 if (Vals.size() == 1) {
16771 // Do not create shuffle if the mask is a simple identity
16772 // non-resizing mask.
16773 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16774 ->getNumElements() ||
16775 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16776 return CreateShuffle(Vals.front(), nullptr, Mask);
16777 return Vals.front();
16778 }
16779 return CreateShuffle(Vals.front() ? Vals.front()
16780 : FirstInsert->getOperand(0),
16781 Vals.back(), Mask);
16782 });
16783 auto It = ShuffledInserts[I].InsertElements.rbegin();
16784 // Rebuild buildvector chain.
16785 InsertElementInst *II = nullptr;
16786 if (It != ShuffledInserts[I].InsertElements.rend())
16787 II = *It;
16788 SmallVector<Instruction *> Inserts;
16789 while (It != ShuffledInserts[I].InsertElements.rend()) {
16790 assert(II && "Must be an insertelement instruction.");
16791 if (*It == II)
16792 ++It;
16793 else
16794 Inserts.push_back(cast<Instruction>(II));
16795 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16796 }
16797 for (Instruction *II : reverse(Inserts)) {
16798 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16799 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16800 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16801 II->moveAfter(NewI);
16802 NewInst = II;
16803 }
16804 LastInsert->replaceAllUsesWith(NewInst);
16805 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16806 IE->replaceUsesOfWith(IE->getOperand(0),
16807 PoisonValue::get(IE->getOperand(0)->getType()));
16808 IE->replaceUsesOfWith(IE->getOperand(1),
16809 PoisonValue::get(IE->getOperand(1)->getType()));
16810 eraseInstruction(IE);
16811 }
16812 CSEBlocks.insert(LastInsert->getParent());
16813 }
16814
16815 SmallVector<Instruction *> RemovedInsts;
16816 // For each vectorized value:
16817 for (auto &TEPtr : VectorizableTree) {
16818 TreeEntry *Entry = TEPtr.get();
16819
16820 // No need to handle users of gathered values.
16821 if (Entry->isGather())
16822 continue;
16823
16824 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16825
16826 // For each lane:
16827 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16828 Value *Scalar = Entry->Scalars[Lane];
16829
16830 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16831 !isa<GetElementPtrInst>(Scalar))
16832 continue;
16833 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16834 EE && IgnoredExtracts.contains(EE))
16835 continue;
16836 if (isa<PoisonValue>(Scalar))
16837 continue;
16838#ifndef NDEBUG
16839 Type *Ty = Scalar->getType();
16840 if (!Ty->isVoidTy()) {
16841 for (User *U : Scalar->users()) {
16842 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16843
16844 // It is legal to delete users in the ignorelist.
16845 assert((isVectorized(U) ||
16846 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16847 (isa_and_nonnull<Instruction>(U) &&
16848 isDeleted(cast<Instruction>(U)))) &&
16849 "Deleting out-of-tree value");
16850 }
16851 }
16852#endif
16853 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16854 auto *I = cast<Instruction>(Scalar);
16855 RemovedInsts.push_back(I);
16856 }
16857 }
16858
16859 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16860 // new vector instruction.
16861 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16862 V->mergeDIAssignID(RemovedInsts);
16863
16864 // Clear up reduction references, if any.
16865 if (UserIgnoreList) {
16866 for (Instruction *I : RemovedInsts) {
16867 const TreeEntry *IE = getTreeEntries(I).front();
16868 if (IE->Idx != 0 &&
16869 !(VectorizableTree.front()->isGather() &&
16870 !IE->UserTreeIndices.empty() &&
16871 (ValueToGatherNodes.lookup(I).contains(
16872 VectorizableTree.front().get()) ||
16873 any_of(IE->UserTreeIndices,
16874 [&](const EdgeInfo &EI) {
16875 return EI.UserTE == VectorizableTree.front().get() &&
16876 EI.EdgeIdx == UINT_MAX;
16877 }))) &&
16878 !(GatheredLoadsEntriesFirst.has_value() &&
16879 IE->Idx >= *GatheredLoadsEntriesFirst &&
16880 VectorizableTree.front()->isGather() &&
16881 is_contained(VectorizableTree.front()->Scalars, I)))
16882 continue;
16883 SmallVector<SelectInst *> LogicalOpSelects;
16884 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16885 // Do not replace the condition of a logical op in select form (logical and/or).
16886 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16887 (match(U.getUser(), m_LogicalAnd()) ||
16888 match(U.getUser(), m_LogicalOr())) &&
16889 U.getOperandNo() == 0;
16890 if (IsPoisoningLogicalOp) {
16891 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16892 return false;
16893 }
16894 return UserIgnoreList->contains(U.getUser());
16895 });
16896 // Replace conditions of the poisoning logical ops with the non-poison
16897 // constant value.
16898 for (SelectInst *SI : LogicalOpSelects)
16899 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16900 }
16901 }
16902 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16903 // cache correctness.
16904 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
16905 // - instructions are not deleted until later.
16906 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16907
16908 Builder.ClearInsertionPoint();
16909 InstrElementSize.clear();
16910
16911 const TreeEntry &RootTE = *VectorizableTree.front();
16912 Value *Vec = RootTE.VectorizedValue;
16913 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16914 It != MinBWs.end() &&
16915 ReductionBitWidth != It->second.first) {
16916 IRBuilder<>::InsertPointGuard Guard(Builder);
16917 Builder.SetInsertPoint(ReductionRoot->getParent(),
16918 ReductionRoot->getIterator());
16919 Vec = Builder.CreateIntCast(
16920 Vec,
16921 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16922 cast<VectorType>(Vec->getType())->getElementCount()),
16923 It->second.second);
16924 }
16925 return Vec;
16926}
16927
16928 void BoUpSLP::optimizeGatherSequence() {
16929 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16930 << " gather sequences instructions.\n");
16931 // LICM InsertElementInst sequences.
16932 for (Instruction *I : GatherShuffleExtractSeq) {
16933 if (isDeleted(I))
16934 continue;
16935
16936 // Check if this block is inside a loop.
16937 Loop *L = LI->getLoopFor(I->getParent());
16938 if (!L)
16939 continue;
16940
16941 // Check if it has a preheader.
16942 BasicBlock *PreHeader = L->getLoopPreheader();
16943 if (!PreHeader)
16944 continue;
16945
16946 // If the vector or the element that we insert into it are
16947 // instructions that are defined in this basic block then we can't
16948 // hoist this instruction.
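// Illustrative example (added for exposition): an
//   %v = insertelement <4 x float> poison, float %inv, i32 0
// whose operands are all defined outside the loop can be hoisted to the
// preheader, while an insertelement of a value computed inside the loop
// body must stay where it is.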
16949 if (any_of(I->operands(), [L](Value *V) {
16950 auto *OpI = dyn_cast<Instruction>(V);
16951 return OpI && L->contains(OpI);
16952 }))
16953 continue;
16954
16955 // We can hoist this instruction. Move it to the pre-header.
16956 I->moveBefore(PreHeader->getTerminator()->getIterator());
16957 CSEBlocks.insert(PreHeader);
16958 }
16959
16960 // Make a list of all reachable blocks in our CSE queue.
16961 SmallVector<const DomTreeNode *, 8> CSEWorkList;
16962 CSEWorkList.reserve(CSEBlocks.size());
16963 for (BasicBlock *BB : CSEBlocks)
16964 if (DomTreeNode *N = DT->getNode(BB)) {
16965 assert(DT->isReachableFromEntry(N->getBlock()));
16966 CSEWorkList.push_back(N);
16967 }
16968
16969 // Sort blocks by domination. This ensures we visit a block after all blocks
16970 // dominating it are visited.
16971 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
16972 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
16973 "Different nodes should have different DFS numbers");
16974 return A->getDFSNumIn() < B->getDFSNumIn();
16975 });
16976
16977 // Less defined shuffles can be replaced by the more defined copies.
16978 // Between two shuffles one is less defined if it has the same vector operands
16979 // and its mask indices are the same as in the first one or undefs. E.g.
16980 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
16981 // poison, <0, 0, 0, 0>.
16982 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
16983 Instruction *I2,
16984 SmallVectorImpl<int> &NewMask) {
16985 if (I1->getType() != I2->getType())
16986 return false;
16987 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
16988 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
16989 if (!SI1 || !SI2)
16990 return I1->isIdenticalTo(I2);
16991 if (SI1->isIdenticalTo(SI2))
16992 return true;
16993 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
16994 if (SI1->getOperand(I) != SI2->getOperand(I))
16995 return false;
16996 // Check if the second instruction is more defined than the first one.
16997 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
16998 ArrayRef<int> SM1 = SI1->getShuffleMask();
16999 // Count trailing undefs in the mask to check the final number of used
17000 // registers.
17001 unsigned LastUndefsCnt = 0;
17002 for (int I = 0, E = NewMask.size(); I < E; ++I) {
17003 if (SM1[I] == PoisonMaskElem)
17004 ++LastUndefsCnt;
17005 else
17006 LastUndefsCnt = 0;
17007 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
17008 NewMask[I] != SM1[I])
17009 return false;
17010 if (NewMask[I] == PoisonMaskElem)
17011 NewMask[I] = SM1[I];
17012 }
17013 // Check if the last undefs actually change the final number of used vector
17014 // registers.
17015 return SM1.size() - LastUndefsCnt > 1 &&
17016 ::getNumberOfParts(*TTI, SI1->getType()) ==
17017 ::getNumberOfParts(
17018 *TTI, getWidenedType(SI1->getType()->getElementType(),
17019 SM1.size() - LastUndefsCnt));
17020 };
17021 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
17022 // instructions. TODO: We can further optimize this scan if we split the
17023 // instructions into different buckets based on the insert lane.
17024 SmallVector<Instruction *, 16> Visited;
17025 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
17026 assert(*I &&
17027 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17028 "Worklist not sorted properly!");
17029 BasicBlock *BB = (*I)->getBlock();
17030 // For all instructions in blocks containing gather sequences:
17031 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
17032 if (isDeleted(&In))
17033 continue;
17034 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17035 !GatherShuffleExtractSeq.contains(&In))
17036 continue;
17037
17038 // Check if we can replace this instruction with any of the
17039 // visited instructions.
17040 bool Replaced = false;
17041 for (Instruction *&V : Visited) {
17042 SmallVector<int> NewMask;
17043 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17044 DT->dominates(V->getParent(), In.getParent())) {
17045 In.replaceAllUsesWith(V);
17046 eraseInstruction(&In);
17047 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17048 if (!NewMask.empty())
17049 SI->setShuffleMask(NewMask);
17050 Replaced = true;
17051 break;
17052 }
17053 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17054 GatherShuffleExtractSeq.contains(V) &&
17055 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17056 DT->dominates(In.getParent(), V->getParent())) {
17057 In.moveAfter(V);
17058 V->replaceAllUsesWith(&In);
17059 eraseInstruction(V);
17060 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17061 if (!NewMask.empty())
17062 SI->setShuffleMask(NewMask);
17063 V = &In;
17064 Replaced = true;
17065 break;
17066 }
17067 }
17068 if (!Replaced) {
17069 assert(!is_contained(Visited, &In));
17070 Visited.push_back(&In);
17071 }
17072 }
17073 }
17074 CSEBlocks.clear();
17075 GatherShuffleExtractSeq.clear();
17076}
17077
17078BoUpSLP::ScheduleData *
17079BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17080 ScheduleData *Bundle = nullptr;
17081 ScheduleData *PrevInBundle = nullptr;
17082 for (Value *V : VL) {
17083 if (doesNotNeedToBeScheduled(V))
17084 continue;
17085 ScheduleData *BundleMember = getScheduleData(V);
17086 assert(BundleMember &&
17087 "no ScheduleData for bundle member "
17088 "(maybe not in same basic block)");
17089 assert(BundleMember->isSchedulingEntity() &&
17090 "bundle member already part of other bundle");
17091 if (PrevInBundle) {
17092 PrevInBundle->NextInBundle = BundleMember;
17093 } else {
17094 Bundle = BundleMember;
17095 }
17096
17097 // Group the instructions into a bundle.
17098 BundleMember->FirstInBundle = Bundle;
17099 PrevInBundle = BundleMember;
17100 }
17101 assert(Bundle && "Failed to find schedule bundle");
17102 return Bundle;
17103}
17104
17105 // Groups the instructions into a bundle (which is then a single scheduling entity)
17106// and schedules instructions until the bundle gets ready.
17107std::optional<BoUpSLP::ScheduleData *>
17108BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17109 const InstructionsState &S) {
17110 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17111 // instructions.
17112 if (isa<PHINode>(S.getMainOp()) ||
17113 isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
17114 return nullptr;
17115
17116 // Initialize the instruction bundle.
17117 Instruction *OldScheduleEnd = ScheduleEnd;
17118 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17119
17120 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17121 ScheduleData *Bundle) {
17122 // The scheduling region got new instructions at the lower end (or it is a
17123 // new region for the first bundle). This makes it necessary to
17124 // recalculate all dependencies.
17125 // It is seldom that this needs to be done a second time after adding the
17126 // initial bundle to the region.
17127 if (ScheduleEnd != OldScheduleEnd) {
17128 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17129 if (ScheduleData *SD = getScheduleData(I))
17130 SD->clearDependencies();
17131 ReSchedule = true;
17132 }
17133 if (Bundle) {
17134 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17135 << " in block " << BB->getName() << "\n");
17136 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17137 }
17138
17139 if (ReSchedule) {
17140 resetSchedule();
17141 initialFillReadyList(ReadyInsts);
17142 }
17143
17144 // Now try to schedule the new bundle or (if no bundle) just calculate
17145 // dependencies. As soon as the bundle is "ready" it means that there are no
17146 // cyclic dependencies and we can schedule it. Note that it's important that we
17147 // don't "schedule" the bundle yet (see cancelScheduling).
17148 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17149 !ReadyInsts.empty()) {
17150 ScheduleData *Picked = ReadyInsts.pop_back_val();
17151 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17152 "must be ready to schedule");
17153 schedule(Picked, ReadyInsts);
17154 }
17155 };
17156
17157 // Make sure that the scheduling region contains all
17158 // instructions of the bundle.
17159 for (Value *V : VL) {
17160 if (doesNotNeedToBeScheduled(V))
17161 continue;
17162 if (!extendSchedulingRegion(V, S)) {
17163 // If the scheduling region got new instructions at the lower end (or it
17164 // is a new region for the first bundle), all dependencies must be
17165 // recalculated.
17166 // Otherwise the compiler may crash trying to calculate dependencies
17167 // incorrectly and emit instructions in the wrong order during the actual
17168 // scheduling.
17169 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17170 return std::nullopt;
17171 }
17172 }
17173
17174 bool ReSchedule = false;
17175 for (Value *V : VL) {
17176 if (doesNotNeedToBeScheduled(V))
17177 continue;
17178 ScheduleData *BundleMember = getScheduleData(V);
17179 assert(BundleMember &&
17180 "no ScheduleData for bundle member (maybe not in same basic block)");
17181
17182 // Make sure we don't leave the pieces of the bundle in the ready list when
17183 // the whole bundle might not be ready.
17184 ReadyInsts.remove(BundleMember);
17185
17186 if (!BundleMember->IsScheduled)
17187 continue;
17188 // A bundle member was scheduled as a single instruction before and now
17189 // needs to be scheduled as part of the bundle. We just get rid of the
17190 // existing schedule.
17191 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17192 << " was already scheduled\n");
17193 ReSchedule = true;
17194 }
17195
17196 auto *Bundle = buildBundle(VL);
17197 TryScheduleBundleImpl(ReSchedule, Bundle);
17198 if (!Bundle->isReady()) {
17199 cancelScheduling(VL, S.getMainOp());
17200 return std::nullopt;
17201 }
17202 return Bundle;
17203}
17204
17205void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17206 Value *OpValue) {
17207 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17208 doesNotNeedToSchedule(VL))
17209 return;
17210
17211 if (doesNotNeedToBeScheduled(OpValue))
17212 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17213 ScheduleData *Bundle = getScheduleData(OpValue);
17214 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17215 assert(!Bundle->IsScheduled &&
17216 "Can't cancel bundle which is already scheduled");
17217 assert(Bundle->isSchedulingEntity() &&
17218 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17219 "tried to unbundle something which is not a bundle");
17220
17221 // Remove the bundle from the ready list.
17222 if (Bundle->isReady())
17223 ReadyInsts.remove(Bundle);
17224
17225 // Un-bundle: make single instructions out of the bundle.
17226 ScheduleData *BundleMember = Bundle;
17227 while (BundleMember) {
17228 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17229 BundleMember->FirstInBundle = BundleMember;
17230 ScheduleData *Next = BundleMember->NextInBundle;
17231 BundleMember->NextInBundle = nullptr;
17232 BundleMember->TE = nullptr;
17233 if (BundleMember->unscheduledDepsInBundle() == 0) {
17234 ReadyInsts.insert(BundleMember);
17235 }
17236 BundleMember = Next;
17237 }
17238}
17239
17240BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17241 // Allocate a new ScheduleData for the instruction.
17242 if (ChunkPos >= ChunkSize) {
17243 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17244 ChunkPos = 0;
17245 }
17246 return &(ScheduleDataChunks.back()[ChunkPos++]);
17247}
17248
17249bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17250 Value *V, const InstructionsState &S) {
17251 Instruction *I = dyn_cast<Instruction>(V);
17252 assert(I && "bundle member must be an instruction");
17253 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17254 !doesNotNeedToBeScheduled(I) &&
17255 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17256 "be scheduled");
17257 if (getScheduleData(I))
17258 return true;
17259 if (!ScheduleStart) {
17260 // It's the first instruction in the new region.
17261 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17262 ScheduleStart = I;
17263 ScheduleEnd = I->getNextNode();
17264 assert(ScheduleEnd && "tried to vectorize a terminator?");
17265 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17266 return true;
17267 }
17268 // Search up and down at the same time, because we don't know if the new
17269 // instruction is above or below the existing scheduling region.
17270 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are
17271 // not counted against the budget. Otherwise debug info could affect codegen.
17272 BasicBlock::reverse_iterator UpIter =
17273 ++ScheduleStart->getIterator().getReverse();
17274 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17275 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17276 BasicBlock::iterator LowerEnd = BB->end();
17277 auto IsAssumeLikeIntr = [](const Instruction &I) {
17278 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17279 return II->isAssumeLikeIntrinsic();
17280 return false;
17281 };
17282 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17283 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17284 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17285 &*DownIter != I) {
17286 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17287 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17288 return false;
17289 }
17290
17291 ++UpIter;
17292 ++DownIter;
17293
17294 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17295 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17296 }
17297 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17298 assert(I->getParent() == ScheduleStart->getParent() &&
17299 "Instruction is in wrong basic block.");
17300 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17301 ScheduleStart = I;
17302 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17303 << "\n");
17304 return true;
17305 }
17306 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17307 "Expected to reach top of the basic block or instruction down the "
17308 "lower end.");
17309 assert(I->getParent() == ScheduleEnd->getParent() &&
17310 "Instruction is in wrong basic block.");
17311 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17312 nullptr);
17313 ScheduleEnd = I->getNextNode();
17314 assert(ScheduleEnd && "tried to vectorize a terminator?");
17315 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17316 return true;
17317}
17318
17319void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17320 Instruction *ToI,
17321 ScheduleData *PrevLoadStore,
17322 ScheduleData *NextLoadStore) {
17323 ScheduleData *CurrentLoadStore = PrevLoadStore;
17324 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17325 // No need to allocate data for non-schedulable instructions.
17326 if (doesNotNeedToBeScheduled(I))
17327 continue;
17328 ScheduleData *SD = ScheduleDataMap.lookup(I);
17329 if (!SD) {
17330 SD = allocateScheduleDataChunks();
17331 ScheduleDataMap[I] = SD;
17332 }
17333 assert(!isInSchedulingRegion(SD) &&
17334 "new ScheduleData already in scheduling region");
17335 SD->init(SchedulingRegionID, I);
17336
17337 if (I->mayReadOrWriteMemory() &&
17338 (!isa<IntrinsicInst>(I) ||
17339 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17340 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17341 Intrinsic::pseudoprobe))) {
17342 // Update the linked list of memory accessing instructions.
17343 if (CurrentLoadStore) {
17344 CurrentLoadStore->NextLoadStore = SD;
17345 } else {
17346 FirstLoadStoreInRegion = SD;
17347 }
17348 CurrentLoadStore = SD;
17349 }
17350
17351 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17352 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17353 RegionHasStackSave = true;
17354 }
17355 if (NextLoadStore) {
17356 if (CurrentLoadStore)
17357 CurrentLoadStore->NextLoadStore = NextLoadStore;
17358 } else {
17359 LastLoadStoreInRegion = CurrentLoadStore;
17360 }
17361}
17362
17363void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17364 bool InsertInReadyList,
17365 BoUpSLP *SLP) {
17366 assert(SD->isSchedulingEntity());
17367
17368 SmallVector<ScheduleData *, 10> WorkList;
17369 WorkList.push_back(SD);
17370
17371 while (!WorkList.empty()) {
17372 ScheduleData *SD = WorkList.pop_back_val();
17373 for (ScheduleData *BundleMember = SD; BundleMember;
17374 BundleMember = BundleMember->NextInBundle) {
17375 assert(isInSchedulingRegion(BundleMember));
17376 if (BundleMember->hasValidDependencies())
17377 continue;
17378
17379 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17380 << "\n");
17381 BundleMember->Dependencies = 0;
17382 BundleMember->resetUnscheduledDeps();
17383
17384 // Handle def-use chain dependencies.
17385 for (User *U : BundleMember->Inst->users()) {
17386 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17387 BundleMember->Dependencies++;
17388 ScheduleData *DestBundle = UseSD->FirstInBundle;
17389 if (!DestBundle->IsScheduled)
17390 BundleMember->incrementUnscheduledDeps(1);
17391 if (!DestBundle->hasValidDependencies())
17392 WorkList.push_back(DestBundle);
17393 }
17394 }
17395
17396 auto MakeControlDependent = [&](Instruction *I) {
17397 auto *DepDest = getScheduleData(I);
17398 assert(DepDest && "must be in schedule window");
17399 DepDest->ControlDependencies.push_back(BundleMember);
17400 BundleMember->Dependencies++;
17401 ScheduleData *DestBundle = DepDest->FirstInBundle;
17402 if (!DestBundle->IsScheduled)
17403 BundleMember->incrementUnscheduledDeps(1);
17404 if (!DestBundle->hasValidDependencies())
17405 WorkList.push_back(DestBundle);
17406 };
17407
17408 // Any instruction which isn't safe to speculate at the beginning of the
17409 // block is control dependent on any early exit or non-willreturn call
17410 // which precedes it.
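// Illustrative example (added for exposition): a udiv that may trap must
// not be reordered above an earlier call that might not return; the walk
// below records a control dependence between the two bundles so the
// scheduler preserves their relative order.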
17411 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17412 for (Instruction *I = BundleMember->Inst->getNextNode();
17413 I != ScheduleEnd; I = I->getNextNode()) {
17414 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17415 continue;
17416
17417 // Add the dependency
17418 MakeControlDependent(I);
17419
17420 if (!isGuaranteedToTransferExecutionToSuccessor(I))
17421 // Everything past here must be control dependent on I.
17422 break;
17423 }
17424 }
17425
17426 if (RegionHasStackSave) {
17427 // If we have an inalloca alloca instruction, it needs to be scheduled
17428 // after any preceding stacksave. We also need to prevent any alloca
17429 // from reordering above a preceding stackrestore.
17430 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17431 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17432 for (Instruction *I = BundleMember->Inst->getNextNode();
17433 I != ScheduleEnd; I = I->getNextNode()) {
17434 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17435 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17436 // Any allocas past here must be control dependent on I, and I
17437 // must be memory dependent on BundleMember->Inst.
17438 break;
17439
17440 if (!isa<AllocaInst>(I))
17441 continue;
17442
17443 // Add the dependency
17444 MakeControlDependent(I);
17445 }
17446 }
17447
17448 // In addition to the cases handled just above, we need to prevent
17449 // allocas and loads/stores from moving below a stacksave or a
17450 // stackrestore. Avoiding moving allocas below a stackrestore is currently
17451 // thought to be conservatism. Moving loads/stores below a stackrestore
17452 // can lead to incorrect code.
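// Illustrative example (added for exposition): a store into a dynamic
// alloca must not sink below a stackrestore that may release that alloca's
// stack memory, hence the extra dependencies added below.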
17453 if (isa<AllocaInst>(BundleMember->Inst) ||
17454 BundleMember->Inst->mayReadOrWriteMemory()) {
17455 for (Instruction *I = BundleMember->Inst->getNextNode();
17456 I != ScheduleEnd; I = I->getNextNode()) {
17457 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17458 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17459 continue;
17460
17461 // Add the dependency
17462 MakeControlDependent(I);
17463 break;
17464 }
17465 }
17466 }
17467
17468 // Handle the memory dependencies (if any).
17469 ScheduleData *DepDest = BundleMember->NextLoadStore;
17470 if (!DepDest)
17471 continue;
17472 Instruction *SrcInst = BundleMember->Inst;
17473 assert(SrcInst->mayReadOrWriteMemory() &&
17474 "NextLoadStore list for a non-memory-affecting bundle?");
17475 MemoryLocation SrcLoc = getLocation(SrcInst);
17476 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17477 unsigned NumAliased = 0;
17478 unsigned DistToSrc = 1;
17479
17480 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17481 assert(isInSchedulingRegion(DepDest));
17482
17483 // We have two limits to reduce the complexity:
17484 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17485 // SLP->isAliased (which is the expensive part in this loop).
17486 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17487 // the whole loop (even if the loop is fast, it's quadratic).
17488 // It's important for the loop break condition (see below) to
17489 // check this limit even between two read-only instructions.
17490 if (DistToSrc >= MaxMemDepDistance ||
17491 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17492 (NumAliased >= AliasedCheckLimit ||
17493 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17494
17495 // We increment the counter only if the locations are aliased
17496 // (instead of counting all alias checks). This gives a better
17497 // balance between reduced runtime and accurate dependencies.
17498 NumAliased++;
17499
17500 DepDest->MemoryDependencies.push_back(BundleMember);
17501 BundleMember->Dependencies++;
17502 ScheduleData *DestBundle = DepDest->FirstInBundle;
17503 if (!DestBundle->IsScheduled) {
17504 BundleMember->incrementUnscheduledDeps(1);
17505 }
17506 if (!DestBundle->hasValidDependencies()) {
17507 WorkList.push_back(DestBundle);
17508 }
17509 }
17510
17511 // Example, explaining the loop break condition: Let's assume our
17512 // starting instruction is i0 and MaxMemDepDistance = 3.
17513 //
17514 // +--------v--v--v
17515 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17516 // +--------^--^--^
17517 //
17518 // MaxMemDepDistance let us stop alias-checking at i3 and we add
17519 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17520 // Previously we already added dependencies from i3 to i6,i7,i8
17521 // (because of MaxMemDepDistance). As we added a dependency from
17522 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17523 // and we can abort this loop at i6.
17524 if (DistToSrc >= 2 * MaxMemDepDistance)
17525 break;
17526 DistToSrc++;
17527 }
17528 }
17529 if (InsertInReadyList && SD->isReady()) {
17530 ReadyInsts.insert(SD);
17531 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17532 << "\n");
17533 }
17534 }
17535}
17536
17537void BoUpSLP::BlockScheduling::resetSchedule() {
17538 assert(ScheduleStart &&
17539 "tried to reset schedule on block which has not been scheduled");
17540 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17541 if (ScheduleData *SD = getScheduleData(I)) {
17542 assert(isInSchedulingRegion(SD) &&
17543 "ScheduleData not in scheduling region");
17544 SD->IsScheduled = false;
17545 SD->resetUnscheduledDeps();
17546 }
17547 }
17548 ReadyInsts.clear();
17549}
17550
17551void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17552 if (!BS->ScheduleStart)
17553 return;
17554
17555 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17556
17557 // A key point - if we got here, pre-scheduling was able to find a valid
17558 // scheduling of the sub-graph of the scheduling window which consists
17559 // of all vector bundles and their transitive users. As such, we do not
17560 // need to reschedule anything *outside of* that subgraph.
17561
17562 BS->resetSchedule();
17563
17564 // For the real scheduling we use a more sophisticated ready-list: it is
17565 // sorted by the original instruction location. This lets the final schedule
17566 // be as close as possible to the original instruction order.
17567 // WARNING: If changing this order causes a correctness issue, that means
17568 // there is some missing dependence edge in the schedule data graph.
17569 struct ScheduleDataCompare {
17570 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17571 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17572 }
17573 };
17574 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17575
17576 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17577 // and fill the ready-list with initial instructions.
17578 int Idx = 0;
17579 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17580 I = I->getNextNode()) {
17581 if (ScheduleData *SD = BS->getScheduleData(I)) {
17582 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
17583 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17584 SD->isPartOfBundle() ==
17585 (!SDTEs.empty() &&
17586 !doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
17587 "scheduler and vectorizer bundle mismatch");
17588 SD->FirstInBundle->SchedulingPriority = Idx++;
17589
17590 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17591 BS->calculateDependencies(SD, false, this);
17592 }
17593 }
17594 BS->initialFillReadyList(ReadyInsts);
17595
17596 Instruction *LastScheduledInst = BS->ScheduleEnd;
17597
17598 // Do the "real" scheduling.
17599 while (!ReadyInsts.empty()) {
17600 ScheduleData *Picked = *ReadyInsts.begin();
17601 ReadyInsts.erase(ReadyInsts.begin());
17602
17603 // Move the scheduled instruction(s) to their dedicated places, if not
17604 // there yet.
17605 for (ScheduleData *BundleMember = Picked; BundleMember;
17606 BundleMember = BundleMember->NextInBundle) {
17607 Instruction *PickedInst = BundleMember->Inst;
17608 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17609 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17610 LastScheduledInst = PickedInst;
17611 }
17612
17613 BS->schedule(Picked, ReadyInsts);
17614 }
17615
17616 // Check that we didn't break any of our invariants.
17617#ifdef EXPENSIVE_CHECKS
17618 BS->verify();
17619#endif
17620
17621#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17622 // Check that all schedulable entities got scheduled
17623 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17624 ScheduleData *SD = BS->getScheduleData(I);
17625 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17626 assert(SD->IsScheduled && "must be scheduled at this point");
17627 }
17628#endif
17629
17630 // Avoid duplicate scheduling of the block.
17631 BS->ScheduleStart = nullptr;
17632}
17633
17634 unsigned BoUpSLP::getVectorElementSize(Value *V) {
17635 // If V is a store, just return the width of the stored value (or value
17636 // truncated just before storing) without traversing the expression tree.
17637 // This is the common case.
17638 if (auto *Store = dyn_cast<StoreInst>(V))
17639 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17640
17641 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17642 return getVectorElementSize(IEI->getOperand(1));
17643
17644 auto E = InstrElementSize.find(V);
17645 if (E != InstrElementSize.end())
17646 return E->second;
17647
17648 // If V is not a store, we can traverse the expression tree to find loads
17649 // that feed it. The type of the loaded value may indicate a more suitable
17650 // width than V's type. We want to base the vector element size on the width
17651 // of memory operations where possible.
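// Illustrative example (added for exposition): for %x = add i32 %a, %b
// where %a and %b are zero-extended i16 loads, the walk below reaches the
// loads and reports 16 bits instead of the 32 bits of %x's own type.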
17652 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17653 SmallPtrSet<Instruction *, 16> Visited;
17654 if (auto *I = dyn_cast<Instruction>(V)) {
17655 Worklist.emplace_back(I, I->getParent(), 0);
17656 Visited.insert(I);
17657 }
17658
17659 // Traverse the expression tree in bottom-up order looking for loads. If we
17660 // encounter an instruction we don't yet handle, we give up.
17661 auto Width = 0u;
17662 Value *FirstNonBool = nullptr;
17663 while (!Worklist.empty()) {
17664 auto [I, Parent, Level] = Worklist.pop_back_val();
17665
17666 // We should only be looking at scalar instructions here. If the current
17667 // instruction has a vector type, skip.
17668 auto *Ty = I->getType();
17669 if (isa<VectorType>(Ty))
17670 continue;
17671 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17672 FirstNonBool = I;
17673 if (Level > RecursionMaxDepth)
17674 continue;
17675
17676 // If the current instruction is a load, update Width to reflect the
17677 // width of the loaded value.
17678 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17679 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17680
17681 // Otherwise, we need to visit the operands of the instruction. We only
17682 // handle the interesting cases from buildTree here. If an operand is an
17683 // instruction we haven't yet visited and from the same basic block as the
17684 // user or the use is a PHI node, we add it to the worklist.
17685 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
17686 BinaryOperator, UnaryOperator>(I)) {
17687 for (Use &U : I->operands()) {
17688 if (auto *J = dyn_cast<Instruction>(U.get()))
17689 if (Visited.insert(J).second &&
17690 (isa<PHINode>(I) || J->getParent() == Parent)) {
17691 Worklist.emplace_back(J, J->getParent(), Level + 1);
17692 continue;
17693 }
17694 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17695 FirstNonBool = U.get();
17696 }
17697 } else {
17698 break;
17699 }
17700 }
17701
17702 // If we didn't encounter a memory access in the expression tree, or if we
17703 // gave up for some reason, just return the width of V. Otherwise, return the
17704 // maximum width we found.
17705 if (!Width) {
17706 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17707 V = FirstNonBool;
17708 Width = DL->getTypeSizeInBits(V->getType());
17709 }
17710
17711 for (Instruction *I : Visited)
17712 InstrElementSize[I] = Width;
17713
17714 return Width;
17715}
17716
17717bool BoUpSLP::collectValuesToDemote(
17718 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17719 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
17720 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17721 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17722 // We can always demote constants.
17723 if (all_of(E.Scalars, IsaPred<Constant>))
17724 return true;
17725
17726 unsigned OrigBitWidth =
17727 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17728 if (OrigBitWidth == BitWidth) {
17729 MaxDepthLevel = 1;
17730 return true;
17731 }
17732
17733 // Check if the node was analyzed already and must keep its original bitwidth.
17734 if (NodesToKeepBWs.contains(E.Idx))
17735 return false;
17736
17737 // If the value is not a vectorized instruction in the expression and not used
17738 // by the insertelement instruction and not used in multiple vector nodes, it
17739 // cannot be demoted.
17740 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17741 if (isa<PoisonValue>(R))
17742 return false;
17743 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17744 });
17745 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17746 if (isa<PoisonValue>(V))
17747 return true;
17748 if (getTreeEntries(V).size() > 1)
17749 return false;
17750 // For the last shuffle of sext/zext with many uses, we need to check the
17751 // extra bit for unsigned values; otherwise we may have incorrect casting for
17752 // reused scalars.
17753 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17754 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17755 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17756 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17757 return true;
17758 }
17759 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17760 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17761 if (IsSignedNode)
17762 ++BitWidth1;
17763 if (auto *I = dyn_cast<Instruction>(V)) {
17764 APInt Mask = DB->getDemandedBits(I);
17765 unsigned BitWidth2 =
17766 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17767 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17768 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17769 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17770 break;
17771 BitWidth2 *= 2;
17772 }
17773 BitWidth1 = std::min(BitWidth1, BitWidth2);
17774 }
17775 BitWidth = std::max(BitWidth, BitWidth1);
17776 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17777 };
17778 auto FinalAnalysis = [&, TTI = TTI]() {
17779 if (!IsProfitableToDemote)
17780 return false;
17781 bool Res = all_of(
17782 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17783 // Demote gathers.
17784 if (Res && E.isGather()) {
17785 // Check the bases of possible extractelement instructions and the final
17786 // vector length.
17787 SmallPtrSet<Value *, 4> UniqueBases;
17788 for (Value *V : E.Scalars) {
17789 auto *EE = dyn_cast<ExtractElementInst>(V);
17790 if (!EE)
17791 continue;
17792 UniqueBases.insert(EE->getVectorOperand());
17793 }
17794 const unsigned VF = E.Scalars.size();
17795 Type *OrigScalarTy = E.Scalars.front()->getType();
17796 if (UniqueBases.size() <= 2 ||
17797 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
17798 ::getNumberOfParts(
17799 *TTI,
17800 getWidenedType(
17801 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
17802 VF)))
17803 ToDemote.push_back(E.Idx);
17804 }
17805 return Res;
17806 };
17807 if (E.isGather() || !Visited.insert(&E).second ||
17808 any_of(E.Scalars, [&](Value *V) {
17809 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17810 return isa<InsertElementInst>(U) && !isVectorized(U);
17811 });
17812 }))
17813 return FinalAnalysis();
17814
17815 if (any_of(E.Scalars, [&](Value *V) {
17816 return !all_of(V->users(), [=](User *U) {
17817 return isVectorized(U) ||
17818 (E.Idx == 0 && UserIgnoreList &&
17819 UserIgnoreList->contains(U)) ||
17820 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17821 !U->getType()->isScalableTy() &&
17822 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17823 }) && !IsPotentiallyTruncated(V, BitWidth);
17824 }))
17825 return false;
17826
17827 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17828 bool &NeedToExit) {
17829 NeedToExit = false;
17830 unsigned InitLevel = MaxDepthLevel;
17831 for (const TreeEntry *Op : Operands) {
17832 unsigned Level = InitLevel;
17833 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17834 ToDemote, Visited, NodesToKeepBWs, Level,
17835 IsProfitableToDemote, IsTruncRoot)) {
17836 if (!IsProfitableToDemote)
17837 return false;
17838 NeedToExit = true;
17839 if (!FinalAnalysis())
17840 return false;
17841 continue;
17842 }
17843 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17844 }
17845 return true;
17846 };
17847 auto AttemptCheckBitwidth =
17848 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17849 // Try all bitwidth < OrigBitWidth.
17850 NeedToExit = false;
17851 unsigned BestFailBitwidth = 0;
17852 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17853 if (Checker(BitWidth, OrigBitWidth))
17854 return true;
17855 if (BestFailBitwidth == 0 && FinalAnalysis())
17856 BestFailBitwidth = BitWidth;
17857 }
17858 if (BitWidth >= OrigBitWidth) {
17859 if (BestFailBitwidth == 0) {
17860 BitWidth = OrigBitWidth;
17861 return false;
17862 }
17863 MaxDepthLevel = 1;
17864 BitWidth = BestFailBitwidth;
17865 NeedToExit = true;
17866 return true;
17867 }
17868 return false;
17869 };
17870 auto TryProcessInstruction =
17871 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17872 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17873 if (Operands.empty()) {
17874 if (!IsTruncRoot)
17875 MaxDepthLevel = 1;
17876 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17877 std::ref(BitWidth)));
17878 } else {
17879 // Several vectorized uses? Check if we can truncate it, otherwise -
17880 // exit.
17881 if (E.UserTreeIndices.size() > 1 &&
17882 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17883 std::ref(BitWidth))))
17884 return false;
17885 bool NeedToExit = false;
17886 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17887 return false;
17888 if (NeedToExit)
17889 return true;
17890 if (!ProcessOperands(Operands, NeedToExit))
17891 return false;
17892 if (NeedToExit)
17893 return true;
17894 }
17895
17896 ++MaxDepthLevel;
17897 // Record the entry that we can demote.
17898 ToDemote.push_back(E.Idx);
17899 return IsProfitableToDemote;
17900 };
17901 switch (E.getOpcode()) {
17902
17903 // We can always demote truncations and extensions. Since truncations can
17904 // seed additional demotion, we save the truncated value.
17905 case Instruction::Trunc:
17906 if (IsProfitableToDemoteRoot)
17907 IsProfitableToDemote = true;
17908 return TryProcessInstruction(BitWidth);
17909 case Instruction::ZExt:
17910 case Instruction::SExt:
17911 IsProfitableToDemote = true;
17912 return TryProcessInstruction(BitWidth);
17913
17914 // We can demote certain binary operations if we can demote both of their
17915 // operands.
17916 case Instruction::Add:
17917 case Instruction::Sub:
17918 case Instruction::Mul:
17919 case Instruction::And:
17920 case Instruction::Or:
17921 case Instruction::Xor: {
17922 return TryProcessInstruction(
17923 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17924 }
17925 case Instruction::Freeze:
17926 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17927 case Instruction::Shl: {
17928 // If we are truncating the result of this SHL, and if it's a shift of an
17929 // in-range amount, we can always perform a SHL in a smaller type.
17930 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17931 return all_of(E.Scalars, [&](Value *V) {
17932 if (isa<PoisonValue>(V))
17933 return true;
17934 auto *I = cast<Instruction>(V);
17935 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17936 return AmtKnownBits.getMaxValue().ult(BitWidth);
17937 });
17938 };
17939 return TryProcessInstruction(
17940 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17941 }
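// Illustrative example (hypothetical values, not from the original source):
// if the scalars are "shl i32 %x, 3" and we want to demote them to i16, the
// known maximum of the shift amount (3) is less than 16, so performing the
// shift directly in i16 cannot pull in bits from beyond the narrow type.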
17942 case Instruction::LShr: {
17943 // If this is a truncate of a logical shr, we can truncate it to a smaller
17944 // lshr iff we know that the bits we would otherwise be shifting in are
17945 // already zeros.
17946 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17947 return all_of(E.Scalars, [&](Value *V) {
17948 if (isa<PoisonValue>(V))
17949 return true;
17950 auto *I = cast<Instruction>(V);
17951 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17952 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17953 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17954 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17955 SimplifyQuery(*DL));
17956 });
17957 };
17958 return TryProcessInstruction(
17959 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17960 LShrChecker);
17961 }
17962 case Instruction::AShr: {
17963 // If this is a truncate of an arithmetic shr, we can truncate it to a
17964 // smaller ashr iff we know that all the bits between the sign bit of the
17965 // original type and the sign bit of the truncated type are copies of the sign bit.
17966 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17967 return all_of(E.Scalars, [&](Value *V) {
17968 if (isa<PoisonValue>(V))
17969 return true;
17970 auto *I = cast<Instruction>(V);
17971 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17972 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17973 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17974 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
17975 nullptr, DT);
17976 });
17977 };
17978 return TryProcessInstruction(
17979 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17980 AShrChecker);
17981 }
17982 case Instruction::UDiv:
17983 case Instruction::URem: {
17984 // UDiv and URem can be truncated if all the truncated bits are zero.
17985 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17986 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17987 return all_of(E.Scalars, [&](Value *V) {
17988 auto *I = cast<Instruction>(V);
17989 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17990 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
17991 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
17992 });
17993 };
17994 return TryProcessInstruction(
17995 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
17996 }
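// Illustrative example (hypothetical values): "udiv i32 %x, %y" can be
// demoted to i16 only if the upper 16 bits of both %x and %y are known to be
// zero; otherwise the discarded high bits could change the quotient or
// remainder.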
17997
17998 // We can demote selects if we can demote their true and false values.
17999 case Instruction::Select: {
18000 return TryProcessInstruction(
18001 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18002 }
18003
18004 // We can demote phis if we can demote all their incoming operands. Note that
18005 // we don't need to worry about cycles since we ensure single use above.
18006 case Instruction::PHI: {
18007 const unsigned NumOps = E.getNumOperands();
18008 SmallVector<const TreeEntry *> Ops(NumOps);
18009 transform(seq<unsigned>(0, NumOps), Ops.begin(),
18010 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
18011
18012 return TryProcessInstruction(BitWidth, Ops);
18013 }
18014
18015 case Instruction::Call: {
18016 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18017 if (!IC)
18018 break;
18019 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
18020 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
18021 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
18022 break;
18023 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
18024 function_ref<bool(unsigned, unsigned)> CallChecker;
18025 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18026 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18027 return all_of(E.Scalars, [&](Value *V) {
18028 auto *I = cast<Instruction>(V);
18029 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18030 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18031 return MaskedValueIsZero(I->getOperand(0), Mask,
18032 SimplifyQuery(*DL)) &&
18033 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18034 }
18035 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
18036 "Expected min/max intrinsics only.");
18037 unsigned SignBits = OrigBitWidth - BitWidth;
18038 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18039 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18040 nullptr, DT);
18041 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18042 nullptr, DT);
18043 return SignBits <= Op0SignBits &&
18044 ((SignBits != Op0SignBits &&
18045 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18046 MaskedValueIsZero(I->getOperand(0), Mask,
18047 SimplifyQuery(*DL))) &&
18048 SignBits <= Op1SignBits &&
18049 ((SignBits != Op1SignBits &&
18050 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18051 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18052 });
18053 };
18054 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18055 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18056 return all_of(E.Scalars, [&](Value *V) {
18057 auto *I = cast<Instruction>(V);
18058 unsigned SignBits = OrigBitWidth - BitWidth;
18059 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18060 unsigned Op0SignBits =
18061 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18062 return SignBits <= Op0SignBits &&
18063 ((SignBits != Op0SignBits &&
18064 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18065 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18066 });
18067 };
18068 if (ID != Intrinsic::abs) {
18069 Operands.push_back(getOperandEntry(&E, 1));
18070 CallChecker = CompChecker;
18071 } else {
18072 CallChecker = AbsChecker;
18073 }
18074 InstructionCost BestCost =
18075 std::numeric_limits<InstructionCost::CostType>::max();
18076 unsigned BestBitWidth = BitWidth;
18077 unsigned VF = E.Scalars.size();
18078 // Choose the best bitwidth based on cost estimations.
18079 auto Checker = [&](unsigned BitWidth, unsigned) {
18080 unsigned MinBW = PowerOf2Ceil(BitWidth);
18081 SmallVector<Type *> ArgTys =
18082 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18083 auto VecCallCosts = getVectorCallCosts(
18084 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18085 TTI, TLI, ArgTys);
18086 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18087 if (Cost < BestCost) {
18088 BestCost = Cost;
18089 BestBitWidth = BitWidth;
18090 }
18091 return false;
18092 };
18093 [[maybe_unused]] bool NeedToExit;
18094 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18095 BitWidth = BestBitWidth;
18096 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18097 }
18098
18099 // Otherwise, conservatively give up.
18100 default:
18101 break;
18102 }
18103 MaxDepthLevel = 1;
18104 return FinalAnalysis();
18105}
18106
18107static RecurKind getRdxKind(Value *V);
18108 
18109 void BoUpSLP::computeMinimumValueSizes() {
18110 // We only attempt to truncate integer expressions.
18111 bool IsStoreOrInsertElt =
18112 VectorizableTree.front()->hasState() &&
18113 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
18114 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
18115 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18116 ExtraBitWidthNodes.size() <= 1 &&
18117 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18118 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18119 return;
18120
18121 unsigned NodeIdx = 0;
18122 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18123 NodeIdx = 1;
18124
18125 // Ensure the roots of the vectorizable tree don't form a cycle.
18126 if (VectorizableTree[NodeIdx]->isGather() ||
18127 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18128 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18129 [NodeIdx](const EdgeInfo &EI) {
18130 return EI.UserTE->Idx > NodeIdx;
18131 })))
18132 return;
18133
18134 // If the first value node for store/insertelement is sext/zext/trunc, skip
18135 // it and resize to the final type.
18136 bool IsTruncRoot = false;
18137 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18138 SmallVector<unsigned> RootDemotes;
18139 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18140 if (NodeIdx != 0 &&
18141 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18142 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18143 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18144 IsTruncRoot = true;
18145 RootDemotes.push_back(NodeIdx);
18146 IsProfitableToDemoteRoot = true;
18147 ++NodeIdx;
18148 }
18149
18150 // If the reduction was already analyzed and found not profitable - exit.
18151 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18152 return;
18153
18154 SmallVector<unsigned> ToDemote;
18155 auto ComputeMaxBitWidth =
18156 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
18157 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
18158 ToDemote.clear();
18159 // If the root is trunc and the next node is gather/buildvector, keep trunc
18160 // in scalars, which is free in most cases.
18161 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18162 !NodesToKeepBWs.contains(E.Idx) &&
18163 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18164 all_of(E.Scalars, [&](Value *V) {
18165 return V->hasOneUse() || isa<Constant>(V) ||
18166 (!V->hasNUsesOrMore(UsesLimit) &&
18167 none_of(V->users(), [&](User *U) {
18168 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
18169 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18170 if (TEs.empty() || is_contained(TEs, UserTE))
18171 return false;
18172 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18173 SelectInst>(U) ||
18174 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18175 SelectInst>(UserTE->getMainOp()))
18176 return true;
18177 unsigned UserTESz = DL->getTypeSizeInBits(
18178 UserTE->Scalars.front()->getType());
18179 if (all_of(TEs, [&](const TreeEntry *TE) {
18180 auto It = MinBWs.find(TE);
18181 return It != MinBWs.end() &&
18182 It->second.first > UserTESz;
18183 }))
18184 return true;
18185 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18186 }));
18187 })) {
18188 ToDemote.push_back(E.Idx);
18189 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18190 auto It = MinBWs.find(UserTE);
18191 if (It != MinBWs.end())
18192 return It->second.first;
18193 unsigned MaxBitWidth =
18194 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18195 MaxBitWidth = bit_ceil(MaxBitWidth);
18196 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18197 MaxBitWidth = 8;
18198 return MaxBitWidth;
18199 }
18200
18201 if (!E.hasState())
18202 return 0u;
18203
18204 unsigned VF = E.getVectorFactor();
18205 Type *ScalarTy = E.Scalars.front()->getType();
18206 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18207 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18208 if (!TreeRootIT)
18209 return 0u;
18210
18211 if (any_of(E.Scalars,
18212 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18213 return 0u;
18214
18215 unsigned NumParts = ::getNumberOfParts(
18216 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18217
18218 // The maximum bit width required to represent all the values that can be
18219 // demoted without loss of precision. It would be safe to truncate the roots
18220 // of the expression to this width.
18221 unsigned MaxBitWidth = 1u;
18222
18223 // True if the roots can be zero-extended back to their original type,
18224 // rather than sign-extended. We know that if the leading bits are not
18225 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18226 // True.
18227 // Determine if the sign bit of all the roots is known to be zero. If not,
18228 // IsKnownPositive is set to False.
18229 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18230 if (isa<PoisonValue>(R))
18231 return true;
18232 KnownBits Known = computeKnownBits(R, *DL);
18233 return Known.isNonNegative();
18234 });
18235
18236 // We first check if all the bits of the roots are demanded. If they're not,
18237 // we can truncate the roots to this narrower type.
18238 for (Value *Root : E.Scalars) {
18239 if (isa<PoisonValue>(Root))
18240 continue;
18241 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18242 TypeSize NumTypeBits =
18243 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18244 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18245 // If we can't prove that the sign bit is zero, we must add one to the
18246 // maximum bit width to account for the unknown sign bit. This preserves
18247 // the existing sign bit so we can safely sign-extend the root back to the
18248 // original type. Otherwise, if we know the sign bit is zero, we will
18249 // zero-extend the root instead.
18250 //
18251 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18252 // one to the maximum bit width will yield a larger-than-necessary
18253 // type. In general, we need to add an extra bit only if we can't
18254 // prove that the upper bit of the original type is equal to the
18255 // upper bit of the proposed smaller type. If these two bits are
18256 // the same (either zero or one) we know that sign-extending from
18257 // the smaller type will result in the same value. Here, since we
18258 // can't yet prove this, we are just making the proposed smaller
18259 // type larger to ensure correctness.
18260 if (!IsKnownPositive)
18261 ++BitWidth1;
18262
18263 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18264 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18265 MaxBitWidth =
18266 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18267 }
18268
18269 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18270 MaxBitWidth = 8;
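// Illustrative example (hypothetical values): for an i32 root with 25 known
// sign bits, BitWidth1 = 32 - 25 = 7 (8 if the sign is unknown); if
// DemandedBits reports only the low 6 bits as live, BitWidth2 = 6, so the
// candidate width min(BitWidth1, BitWidth2) = 6 is then clamped up to 8.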
18271
18272 // If the original type is large but the reduced type does not improve
18273 // register usage - ignore it.
18274 if (NumParts > 1 &&
18275 NumParts ==
18276 ::getNumberOfParts(
18277 *TTI, getWidenedType(IntegerType::get(F->getContext(),
18278 bit_ceil(MaxBitWidth)),
18279 VF)))
18280 return 0u;
18281
18282 unsigned Opcode = E.getOpcode();
18283 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18284 Opcode == Instruction::SExt ||
18285 Opcode == Instruction::ZExt || NumParts > 1;
18286 // Conservatively determine if we can actually truncate the roots of the
18287 // expression. Collect the values that can be demoted in ToDemote and
18288 // additional roots that require investigating in Roots.
18289 DenseSet<const TreeEntry *> Visited;
18290 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18291 bool NeedToDemote = IsProfitableToDemote;
18292
18293 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18294 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18295 NeedToDemote, IsTruncRoot) ||
18296 (MaxDepthLevel <= Limit &&
18297 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18298 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18299 DL->getTypeSizeInBits(TreeRootIT) /
18300 DL->getTypeSizeInBits(
18301 E.getMainOp()->getOperand(0)->getType()) >
18302 2)))))
18303 return 0u;
18304 // Round MaxBitWidth up to the next power-of-two.
18305 MaxBitWidth = bit_ceil(MaxBitWidth);
18306
18307 return MaxBitWidth;
18308 };
18309
18310 // If we can truncate the root, we must collect additional values that might
18311 // be demoted as a result. That is, those seeded by truncations we will
18312 // modify.
18313 // Add reduction ops sizes, if any.
18314 if (UserIgnoreList &&
18315 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18316 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
18317 // x i1> to iN)).
18318 if (all_of(*UserIgnoreList,
18319 [](Value *V) {
18320 return isa<PoisonValue>(V) ||
18321 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18322 }) &&
18323 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18324 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18325 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18326 Builder.getInt1Ty()) {
18327 ReductionBitWidth = 1;
18328 } else {
18329 for (Value *V : *UserIgnoreList) {
18330 if (isa<PoisonValue>(V))
18331 continue;
18332 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18333 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18334 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18335 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18336 ++BitWidth1;
18337 unsigned BitWidth2 = BitWidth1;
18338 if (!isKnownNonNegative(V, SimplifyQuery(*DL))) {
18339 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18340 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18341 }
18342 ReductionBitWidth =
18343 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18344 }
18345 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18346 ReductionBitWidth = 8;
18347
18348 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18349 }
18350 }
18351 bool IsTopRoot = NodeIdx == 0;
18352 while (NodeIdx < VectorizableTree.size() &&
18353 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18354 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18355 RootDemotes.push_back(NodeIdx);
18356 ++NodeIdx;
18357 IsTruncRoot = true;
18358 }
18359 bool IsSignedCmp = false;
18360 while (NodeIdx < VectorizableTree.size()) {
18361 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18362 unsigned Limit = 2;
18363 if (IsTopRoot &&
18364 ReductionBitWidth ==
18365 DL->getTypeSizeInBits(
18366 VectorizableTree.front()->Scalars.front()->getType()))
18367 Limit = 3;
18368 unsigned MaxBitWidth = ComputeMaxBitWidth(
18369 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18370 IsTruncRoot, IsSignedCmp);
18371 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18372 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18373 ReductionBitWidth = bit_ceil(MaxBitWidth);
18374 else if (MaxBitWidth == 0)
18375 ReductionBitWidth = 0;
18376 }
18377
18378 for (unsigned Idx : RootDemotes) {
18379 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18380 uint32_t OrigBitWidth =
18381 DL->getTypeSizeInBits(V->getType()->getScalarType());
18382 if (OrigBitWidth > MaxBitWidth) {
18383 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18384 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18385 }
18386 return false;
18387 }))
18388 ToDemote.push_back(Idx);
18389 }
18390 RootDemotes.clear();
18391 IsTopRoot = false;
18392 IsProfitableToDemoteRoot = true;
18393
18394 if (ExtraBitWidthNodes.empty()) {
18395 NodeIdx = VectorizableTree.size();
18396 } else {
18397 unsigned NewIdx = 0;
18398 do {
18399 NewIdx = *ExtraBitWidthNodes.begin();
18400 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18401 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18402 NodeIdx = NewIdx;
18403 IsTruncRoot =
18404 NodeIdx < VectorizableTree.size() &&
18405 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18406 [](const EdgeInfo &EI) {
18407 return EI.EdgeIdx == 0 &&
18408 EI.UserTE->getOpcode() == Instruction::Trunc &&
18409 !EI.UserTE->isAltShuffle();
18410 });
18411 IsSignedCmp =
18412 NodeIdx < VectorizableTree.size() &&
18413 any_of(
18414 VectorizableTree[NodeIdx]->UserTreeIndices,
18415 [&](const EdgeInfo &EI) {
18416 return (EI.UserTE->hasState() &&
18417 EI.UserTE->getOpcode() == Instruction::ICmp) &&
18418 any_of(EI.UserTE->Scalars, [&](Value *V) {
18419 auto *IC = dyn_cast<ICmpInst>(V);
18420 return IC &&
18421 (IC->isSigned() ||
18422 !isKnownNonNegative(IC->getOperand(0),
18423 SimplifyQuery(*DL)) ||
18424 !isKnownNonNegative(IC->getOperand(1),
18425 SimplifyQuery(*DL)));
18426 });
18427 });
18428 }
18429
18430 // If the maximum bit width we compute is less than the width of the roots'
18431 // type, we can proceed with the narrowing. Otherwise, do nothing.
18432 if (MaxBitWidth == 0 ||
18433 MaxBitWidth >=
18434 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18435 ->getBitWidth()) {
18436 if (UserIgnoreList)
18437 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18438 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18439 continue;
18440 }
18441
18442 // Finally, map the values we can demote to the maximum bit width we
18443 // computed.
18444 for (unsigned Idx : ToDemote) {
18445 TreeEntry *TE = VectorizableTree[Idx].get();
18446 if (MinBWs.contains(TE))
18447 continue;
18448 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18449 if (isa<PoisonValue>(R))
18450 return false;
18451 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18452 });
18453 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18454 }
18455 }
18456}
18457 
18458 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18459 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18460 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18461 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18462 auto *AA = &AM.getResult<AAManager>(F);
18463 auto *LI = &AM.getResult<LoopAnalysis>(F);
18464 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18465 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18466 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18467 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18468 
18469 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18470 if (!Changed)
18471 return PreservedAnalyses::all();
18472 
18473 PreservedAnalyses PA;
18474 PA.preserveSet<CFGAnalyses>();
18475 return PA;
18476}
18477 
18478 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18479 TargetTransformInfo *TTI_,
18480 TargetLibraryInfo *TLI_, AAResults *AA_,
18481 LoopInfo *LI_, DominatorTree *DT_,
18482 AssumptionCache *AC_, DemandedBits *DB_,
18483 OptimizationRemarkEmitter *ORE_) {
18484 if (!RunSLPVectorization)
18485 return false;
18486 SE = SE_;
18487 TTI = TTI_;
18488 TLI = TLI_;
18489 AA = AA_;
18490 LI = LI_;
18491 DT = DT_;
18492 AC = AC_;
18493 DB = DB_;
18494 DL = &F.getDataLayout();
18495
18496 Stores.clear();
18497 GEPs.clear();
18498 bool Changed = false;
18499
18500 // If the target claims to have no vector registers don't attempt
18501 // vectorization.
18502 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18503 LLVM_DEBUG(
18504 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18505 return false;
18506 }
18507
18508 // Don't vectorize when the attribute NoImplicitFloat is used.
18509 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18510 return false;
18511
18512 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18513
18514 // Use the bottom up slp vectorizer to construct chains that start with
18515 // store instructions.
18516 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18517
18518 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18519 // delete instructions.
18520
18521 // Update DFS numbers now so that we can use them for ordering.
18522 DT->updateDFSNumbers();
18523
18524 // Scan the blocks in the function in post order.
18525 for (auto *BB : post_order(&F.getEntryBlock())) {
18526 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18527 continue;
18528
18529 // Start new block - clear the list of reduction roots.
18530 R.clearReductionData();
18531 collectSeedInstructions(BB);
18532
18533 // Vectorize trees that end at stores.
18534 if (!Stores.empty()) {
18535 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18536 << " underlying objects.\n");
18537 Changed |= vectorizeStoreChains(R);
18538 }
18539
18540 // Vectorize trees that end at reductions.
18541 Changed |= vectorizeChainsInBlock(BB, R);
18542
18543 // Vectorize the index computations of getelementptr instructions. This
18544 // is primarily intended to catch gather-like idioms ending at
18545 // non-consecutive loads.
18546 if (!GEPs.empty()) {
18547 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18548 << " underlying objects.\n");
18549 Changed |= vectorizeGEPIndices(BB, R);
18550 }
18551 }
18552
18553 if (Changed) {
18554 R.optimizeGatherSequence();
18555 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18556 }
18557 return Changed;
18558}
18559
18560std::optional<bool>
18561SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18562 unsigned Idx, unsigned MinVF,
18563 unsigned &Size) {
18564 Size = 0;
18565 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18566 << "\n");
18567 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18568 unsigned VF = Chain.size();
18569
18570 if (!has_single_bit(Sz) ||
18571 !hasFullVectorsOrPowerOf2(
18572 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18573 VF) ||
18574 VF < 2 || VF < MinVF) {
18575 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18576 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18577 // all vector lanes are used.
18578 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18579 return false;
18580 }
18581
18582 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18583 << "\n");
18584
18585 SetVector<Value *> ValOps;
18586 for (Value *V : Chain)
18587 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18588 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
18589 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18590 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18591 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18592 bool IsAllowedSize =
18593 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18594 ValOps.size()) ||
18595 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18596 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18597 (!S.getMainOp()->isSafeToRemove() ||
18598 any_of(ValOps.getArrayRef(),
18599 [&](Value *V) {
18600 return !isa<ExtractElementInst>(V) &&
18601 (V->getNumUses() > Chain.size() ||
18602 any_of(V->users(), [&](User *U) {
18603 return !Stores.contains(U);
18604 }));
18605 }))) ||
18606 (ValOps.size() > Chain.size() / 2 && !S)) {
18607 Size = (!IsAllowedSize && S) ? 1 : 2;
18608 return false;
18609 }
18610 }
18611 if (R.isLoadCombineCandidate(Chain))
18612 return true;
18613 R.buildTree(Chain);
18614 // Check if the tree is tiny and the store itself or its value is not vectorized.
18615 if (R.isTreeTinyAndNotFullyVectorizable()) {
18616 if (R.isGathered(Chain.front()) ||
18617 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18618 return std::nullopt;
18619 Size = R.getCanonicalGraphSize();
18620 return false;
18621 }
18622 R.reorderTopToBottom();
18623 R.reorderBottomToTop();
18624 R.transformNodes();
18625 R.buildExternalUses();
18626
18627 R.computeMinimumValueSizes();
18628
18629 Size = R.getCanonicalGraphSize();
18630 if (S && S.getOpcode() == Instruction::Load)
18631 Size = 2; // cut off masked gather small trees
18632 InstructionCost Cost = R.getTreeCost();
18633
18634 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18635 if (Cost < -SLPCostThreshold) {
18636 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18637
18638 using namespace ore;
18639
18640 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18641 cast<StoreInst>(Chain[0]))
18642 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18643 << " and with tree size "
18644 << NV("TreeSize", R.getTreeSize()));
18645
18646 R.vectorizeTree();
18647 return true;
18648 }
18649
18650 return false;
18651}
18652
18653 /// Checks if the quadratic mean deviation of the tree sizes is small relative to the mean size (Dev * 81 / Mean^2 == 0).
18654static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18655 bool First) {
18656 unsigned Num = 0;
18657 uint64_t Sum = std::accumulate(
18658 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18659 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18660 unsigned Size = First ? Val.first : Val.second;
18661 if (Size == 1)
18662 return V;
18663 ++Num;
18664 return V + Size;
18665 });
18666 if (Num == 0)
18667 return true;
18668 uint64_t Mean = Sum / Num;
18669 if (Mean == 0)
18670 return true;
18671 uint64_t Dev = std::accumulate(
18672 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18673 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18674 unsigned P = First ? Val.first : Val.second;
18675 if (P == 1)
18676 return V;
18677 return V + (P - Mean) * (P - Mean);
18678 }) /
18679 Num;
18680 return Dev * 81 / (Mean * Mean) == 0;
18681}
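// Illustrative example (hypothetical sizes): for tree sizes {4, 4, 5, 4} the
// mean is 4, the averaged squared deviation is 1/4 == 0 in integer math, and
// 0 * 81 / 16 == 0, so the sizes are treated as uniform. For sizes
// {2, 8, 2, 8} the mean is 5, the variance is 9, and 9 * 81 / 25 != 0, so
// they are not.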
18682
18683bool SLPVectorizerPass::vectorizeStores(
18684 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18685 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18686 &Visited) {
18687 // We may run into multiple chains that merge into a single chain. We mark the
18688 // stores that we vectorized so that we don't visit the same store twice.
18689 BoUpSLP::ValueSet VectorizedStores;
18690 bool Changed = false;
18691
18692 struct StoreDistCompare {
18693 bool operator()(const std::pair<unsigned, int> &Op1,
18694 const std::pair<unsigned, int> &Op2) const {
18695 return Op1.second < Op2.second;
18696 }
18697 };
18698 // A set of pairs (index of store in Stores array ref, Distance of the store
18699 // address relative to base store address in units).
18700 using StoreIndexToDistSet =
18701 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18702 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18703 int PrevDist = -1;
18704 SmallVector<Value *> Operands;
18705 // Collect the chain into a list.
18706 for (auto [Idx, Data] : enumerate(Set)) {
18707 if (Operands.empty() || Data.second - PrevDist == 1) {
18708 Operands.push_back(Stores[Data.first]);
18709 PrevDist = Data.second;
18710 if (Idx != Set.size() - 1)
18711 continue;
18712 }
18713 auto E = make_scope_exit([&, &DataVar = Data]() {
18714 Operands.clear();
18715 Operands.push_back(Stores[DataVar.first]);
18716 PrevDist = DataVar.second;
18717 });
18718
18719 if (Operands.size() <= 1 ||
18720 !Visited
18721 .insert({Operands.front(),
18722 cast<StoreInst>(Operands.front())->getValueOperand(),
18723 Operands.back(),
18724 cast<StoreInst>(Operands.back())->getValueOperand(),
18725 Operands.size()})
18726 .second)
18727 continue;
18728
18729 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18730 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18731 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18732
18733 unsigned MaxVF =
18734 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18735 auto *Store = cast<StoreInst>(Operands[0]);
18736 Type *StoreTy = Store->getValueOperand()->getType();
18737 Type *ValueTy = StoreTy;
18738 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18739 ValueTy = Trunc->getSrcTy();
18740 unsigned MinVF = std::max<unsigned>(
18741 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18742 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18743 ValueTy)));
18744
18745 if (MaxVF < MinVF) {
18746 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18747 << ") < "
18748 << "MinVF (" << MinVF << ")\n");
18749 continue;
18750 }
18751
18752 unsigned NonPowerOf2VF = 0;
18753 if (VectorizeNonPowerOf2) {
18754 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18755 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18756 // lanes are used.
18757 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18758 if (has_single_bit(CandVF + 1)) {
18759 NonPowerOf2VF = CandVF;
18760 assert(NonPowerOf2VF != MaxVF &&
18761 "Non-power-of-2 VF should not be equal to MaxVF");
18762 }
18763 }
18764
18765 unsigned MaxRegVF = MaxVF;
18766 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18767 if (MaxVF < MinVF) {
18768 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18769 << ") < "
18770 << "MinVF (" << MinVF << ")\n");
18771 continue;
18772 }
18773
18774 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18775 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18776 unsigned Size = MinVF;
18777 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18778 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18779 Size *= 2;
18780 });
18781 unsigned End = Operands.size();
18782 unsigned Repeat = 0;
18783 constexpr unsigned MaxAttempts = 4;
18784 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18785 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18786 P.first = P.second = 1;
18787 });
18788 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
18789 auto IsNotVectorized = [](bool First,
18790 const std::pair<unsigned, unsigned> &P) {
18791 return First ? P.first > 0 : P.second > 0;
18792 };
18793 auto IsVectorized = [](bool First,
18794 const std::pair<unsigned, unsigned> &P) {
18795 return First ? P.first == 0 : P.second == 0;
18796 };
18797 auto VFIsProfitable = [](bool First, unsigned Size,
18798 const std::pair<unsigned, unsigned> &P) {
18799 return First ? Size >= P.first : Size >= P.second;
18800 };
18801 auto FirstSizeSame = [](unsigned Size,
18802 const std::pair<unsigned, unsigned> &P) {
18803 return Size == P.first;
18804 };
18805 while (true) {
18806 ++Repeat;
18807 bool RepeatChanged = false;
18808 bool AnyProfitableGraph = false;
18809 for (unsigned Size : CandidateVFs) {
18810 AnyProfitableGraph = false;
18811 unsigned StartIdx = std::distance(
18812 RangeSizes.begin(),
18813 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18814 std::placeholders::_1)));
18815 while (StartIdx < End) {
18816 unsigned EndIdx =
18817 std::distance(RangeSizes.begin(),
18818 find_if(RangeSizes.drop_front(StartIdx),
18819 std::bind(IsVectorized, Size >= MaxRegVF,
18820 std::placeholders::_1)));
18821 unsigned Sz = EndIdx >= End ? End : EndIdx;
18822 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18823 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18824 Size >= MaxRegVF)) {
18825 ++Cnt;
18826 continue;
18827 }
18828 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18829 assert(all_of(Slice,
18830 [&](Value *V) {
18831 return cast<StoreInst>(V)
18832 ->getValueOperand()
18833 ->getType() ==
18834 cast<StoreInst>(Slice.front())
18835 ->getValueOperand()
18836 ->getType();
18837 }) &&
18838 "Expected all operands of same type.");
18839 if (!NonSchedulable.empty()) {
18840 auto [NonSchedSizeMax, NonSchedSizeMin] =
18841 NonSchedulable.lookup(Slice.front());
18842 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18843 Cnt += NonSchedSizeMax;
18844 continue;
18845 }
18846 }
18847 unsigned TreeSize;
18848 std::optional<bool> Res =
18849 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18850 if (!Res) {
18851 NonSchedulable
18852 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18853 .first->getSecond()
18854 .second = Size;
18855 } else if (*Res) {
18856 // Mark the vectorized stores so that we don't vectorize them
18857 // again.
18858 VectorizedStores.insert(Slice.begin(), Slice.end());
18859 // Record that this attempt vectorized something so that the search
18860 // continues and the overall result is marked as changed.
18861 AnyProfitableGraph = RepeatChanged = Changed = true;
18862 // If we vectorized initial block, no need to try to vectorize
18863 // it again.
18864 for_each(RangeSizes.slice(Cnt, Size),
18865 [](std::pair<unsigned, unsigned> &P) {
18866 P.first = P.second = 0;
18867 });
18868 if (Cnt < StartIdx + MinVF) {
18869 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18870 [](std::pair<unsigned, unsigned> &P) {
18871 P.first = P.second = 0;
18872 });
18873 StartIdx = Cnt + Size;
18874 }
18875 if (Cnt > Sz - Size - MinVF) {
18876 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18877 [](std::pair<unsigned, unsigned> &P) {
18878 P.first = P.second = 0;
18879 });
18880 if (Sz == End)
18881 End = Cnt;
18882 Sz = Cnt;
18883 }
18884 Cnt += Size;
18885 continue;
18886 }
18887 if (Size > 2 && Res &&
18888 !all_of(RangeSizes.slice(Cnt, Size),
18889 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18890 std::placeholders::_1))) {
18891 Cnt += Size;
18892 continue;
18893 }
18894 // For very big VFs, check that we are not rebuilding the same
18895 // trees, just with a larger number of elements.
18896 if (Size > MaxRegVF && TreeSize > 1 &&
18897 all_of(RangeSizes.slice(Cnt, Size),
18898 std::bind(FirstSizeSame, TreeSize,
18899 std::placeholders::_1))) {
18900 Cnt += Size;
18901 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18902 ++Cnt;
18903 continue;
18904 }
18905 if (TreeSize > 1)
18906 for_each(RangeSizes.slice(Cnt, Size),
18907 [&](std::pair<unsigned, unsigned> &P) {
18908 if (Size >= MaxRegVF)
18909 P.second = std::max(P.second, TreeSize);
18910 else
18911 P.first = std::max(P.first, TreeSize);
18912 });
18913 ++Cnt;
18914 AnyProfitableGraph = true;
18915 }
18916 if (StartIdx >= End)
18917 break;
18918 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18919 AnyProfitableGraph = true;
18920 StartIdx = std::distance(
18921 RangeSizes.begin(),
18922 find_if(RangeSizes.drop_front(Sz),
18923 std::bind(IsNotVectorized, Size >= MaxRegVF,
18924 std::placeholders::_1)));
18925 }
18926 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18927 break;
18928 }
18929 // All values vectorized - exit.
18930 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18931 return P.first == 0 && P.second == 0;
18932 }))
18933 break;
18934 // Check if tried all attempts or no need for the last attempts at all.
18935 if (Repeat >= MaxAttempts ||
18936 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18937 break;
18938 constexpr unsigned StoresLimit = 64;
18939 const unsigned MaxTotalNum = std::min<unsigned>(
18940 Operands.size(),
18941 static_cast<unsigned>(
18942 End -
18943 std::distance(
18944 RangeSizes.begin(),
18945 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18946 std::placeholders::_1))) +
18947 1));
18948 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18949 unsigned Limit =
18950 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18951 CandidateVFs.clear();
18952 if (bit_floor(Limit) == VF)
18953 CandidateVFs.push_back(Limit);
18954 if (VF > MaxTotalNum || VF >= StoresLimit)
18955 break;
18956 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18957 if (P.first != 0)
18958 P.first = std::max(P.second, P.first);
18959 });
18960 // Last attempt to vectorize max number of elements, if all previous
18961 // attempts were unsuccessful because of the cost issues.
18962 CandidateVFs.push_back(VF);
18963 }
18964 }
18965 };
18966
18967 // Stores pairs (first: index of the store in the Stores array ref whose
18968 // address is taken as the base; second: sorted set of pairs {index, dist},
18969 // which are indices of stores in the set and their store location distances
18970 // relative to the base address).
18971
18972 // Need to store the index of the very first store separately, since the set
18973 // may be reordered after the insertion and the first store may be moved. This
18974 // container allows us to reduce the number of calls to getPointersDiff().
18975 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
18976 // Inserts the specified store SI with the given index Idx to the set of the
18977 // stores. If the store with the same distance is found already - stop
18978 // insertion, try to vectorize already found stores. If some stores from this
18979 // sequence were not vectorized - try to vectorize them with the new store
18980 // later. But this logic is applied only to the stores that come before the
18981 // previous store with the same distance.
18982 // Example:
18983 // 1. store x, %p
18984 // 2. store y, %p+1
18985 // 3. store z, %p+2
18986 // 4. store a, %p
18987 // 5. store b, %p+3
18988 // - Scan this from the last to first store. The very first bunch of stores is
18989 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
18990 // vector).
18991 // - The next store in the list - #1 - has the same distance from store #5 as
18992 // the store #4.
18993 // - Try to vectorize sequence of stores 4,2,3,5.
18994 // - If all these stores are vectorized - just drop them.
18995 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
18996 // - Start new stores sequence.
18997 // The new bunch of stores is {1, {1, 0}}.
18998 // - Add the stores from previous sequence, that were not vectorized.
18999 // Here we consider the stores in reverse order relative to how they appear
19000 // in the IR (Stores is already reversed, see the vectorizeStoreChains() function).
19001 // Store #3 can be added -> comes after store #4 with the same distance as
19002 // store #1.
19003 // Store #5 cannot be added - comes before store #4.
19004 // This logic improves compile time: we assume that stores coming after a
19005 // previous store with the same distance most likely have memory dependencies,
19006 // so there is no need to waste compile time trying to vectorize them.
19007 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
19008 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
19009 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19010 std::optional<int> Diff = getPointersDiff(
19011 Stores[Set.first]->getValueOperand()->getType(),
19012 Stores[Set.first]->getPointerOperand(),
19013 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
19014 /*StrictCheck=*/true);
19015 if (!Diff)
19016 continue;
19017 auto It = Set.second.find(std::make_pair(Idx, *Diff));
19018 if (It == Set.second.end()) {
19019 Set.second.emplace(Idx, *Diff);
19020 return;
19021 }
19022 // Try to vectorize the first found set to avoid duplicate analysis.
19023 TryToVectorize(Set.second);
19024 unsigned ItIdx = It->first;
19025 int ItDist = It->second;
19026 StoreIndexToDistSet PrevSet;
19027 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19028 [&](const std::pair<unsigned, int> &Pair) {
19029 return Pair.first > ItIdx;
19030 });
19031 Set.second.clear();
19032 Set.first = Idx;
19033 Set.second.emplace(Idx, 0);
19034 // Insert stores that followed previous match to try to vectorize them
19035 // with this store.
19036 unsigned StartIdx = ItIdx + 1;
19037 SmallBitVector UsedStores(Idx - StartIdx);
19038 // Distances to previously found dup store (or this store, since they
19039 // store to the same addresses).
19040 SmallVector<int> Dists(Idx - StartIdx, 0);
19041 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
19042 // Do not try to vectorize sequences, we already tried.
19043 if (VectorizedStores.contains(Stores[Pair.first]))
19044 break;
19045 unsigned BI = Pair.first - StartIdx;
19046 UsedStores.set(BI);
19047 Dists[BI] = Pair.second - ItDist;
19048 }
19049 for (unsigned I = StartIdx; I < Idx; ++I) {
19050 unsigned BI = I - StartIdx;
19051 if (UsedStores.test(BI))
19052 Set.second.emplace(I, Dists[BI]);
19053 }
19054 return;
19055 }
19056 auto &Res = SortedStores.emplace_back();
19057 Res.first = Idx;
19058 Res.second.emplace(Idx, 0);
19059 };
19060 Type *PrevValTy = nullptr;
19061 for (auto [I, SI] : enumerate(Stores)) {
19062 if (R.isDeleted(SI))
19063 continue;
19064 if (!PrevValTy)
19065 PrevValTy = SI->getValueOperand()->getType();
19066 // Check that we do not try to vectorize stores of different types.
19067 if (PrevValTy != SI->getValueOperand()->getType()) {
19068 for (auto &Set : SortedStores)
19069 TryToVectorize(Set.second);
19070 SortedStores.clear();
19071 PrevValTy = SI->getValueOperand()->getType();
19072 }
19073 FillStoresSet(I, SI);
19074 }
19075
19076 // Final vectorization attempt.
19077 for (auto &Set : SortedStores)
19078 TryToVectorize(Set.second);
19079
19080 return Changed;
19081}
19082
19083void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19084 // Initialize the collections. We will make a single pass over the block.
19085 Stores.clear();
19086 GEPs.clear();
19087
19088 // Visit the store and getelementptr instructions in BB and organize them in
19089 // Stores and GEPs according to the underlying objects of their pointer
19090 // operands.
19091 for (Instruction &I : *BB) {
19092 // Ignore store instructions that are volatile or have a pointer operand
19093 // that doesn't point to a scalar type.
19094 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19095 if (!SI->isSimple())
19096 continue;
19097 if (!isValidElementType(SI->getValueOperand()->getType()))
19098 continue;
19099 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19100 }
19101
19102 // Ignore getelementptr instructions that have more than one index, a
19103 // constant index, or a pointer operand that doesn't point to a scalar
19104 // type.
19105 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19106 if (GEP->getNumIndices() != 1)
19107 continue;
19108 Value *Idx = GEP->idx_begin()->get();
19109 if (isa<Constant>(Idx))
19110 continue;
19111 if (!isValidElementType(Idx->getType()))
19112 continue;
19113 if (GEP->getType()->isVectorTy())
19114 continue;
19115 GEPs[GEP->getPointerOperand()].push_back(GEP);
19116 }
19117 }
19118}
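// Illustrative example (hypothetical IR): given
//   store i32 %a, ptr %p
//   %q = getelementptr i32, ptr %p, i64 %i
//   store i32 %b, ptr %q
// both stores are bucketed in Stores under the underlying object of %p, and
// the GEP (single, non-constant index) is recorded in GEPs under %p.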
19119
19120bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19121 bool MaxVFOnly) {
19122 if (VL.size() < 2)
19123 return false;
19124
19125 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19126 << VL.size() << ".\n");
19127
19128 // Check that all of the parts are instructions of the same type,
19129 // we permit an alternate opcode via InstructionsState.
19130 InstructionsState S = getSameOpcode(VL, *TLI);
19131 if (!S)
19132 return false;
19133
19134 Instruction *I0 = S.getMainOp();
19135 // Make sure invalid types (including vector type) are rejected before
19136 // determining vectorization factor for scalar instructions.
19137 for (Value *V : VL) {
19138 Type *Ty = V->getType();
19139 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19140 // NOTE: the following will give user internal llvm type name, which may
19141 // not be useful.
19142 R.getORE()->emit([&]() {
19143 std::string TypeStr;
19144 llvm::raw_string_ostream rso(TypeStr);
19145 Ty->print(rso);
19146 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19147 << "Cannot SLP vectorize list: type "
19148 << TypeStr + " is unsupported by vectorizer";
19149 });
19150 return false;
19151 }
19152 }
19153
19154 Type *ScalarTy = getValueType(VL[0]);
19155 unsigned Sz = R.getVectorElementSize(I0);
19156 unsigned MinVF = R.getMinVF(Sz);
19157 unsigned MaxVF = std::max<unsigned>(
19158 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
19159 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19160 if (MaxVF < 2) {
19161 R.getORE()->emit([&]() {
19162 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19163 << "Cannot SLP vectorize list: vectorization factor "
19164 << "less than 2 is not supported";
19165 });
19166 return false;
19167 }
19168
19169 bool Changed = false;
19170 bool CandidateFound = false;
19171 InstructionCost MinCost = SLPCostThreshold.getValue();
19172
19173 unsigned NextInst = 0, MaxInst = VL.size();
19174 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19175 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
19176 // No actual vectorization should happen, if number of parts is the same as
19177 // provided vectorization factor (i.e. the scalar type is used for vector
19178 // code during codegen).
19179 auto *VecTy = getWidenedType(ScalarTy, VF);
19180 if (TTI->getNumberOfParts(VecTy) == VF)
19181 continue;
19182 for (unsigned I = NextInst; I < MaxInst; ++I) {
19183 unsigned ActualVF = std::min(MaxInst - I, VF);
19184
19185 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19186 continue;
19187
19188 if (MaxVFOnly && ActualVF < MaxVF)
19189 break;
19190 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19191 break;
19192
19193 SmallVector<Value *> Ops(ActualVF, nullptr);
19194 unsigned Idx = 0;
19195 for (Value *V : VL.drop_front(I)) {
19196 // Check that a previous iteration of this loop did not delete the
19197 // Value.
19198 if (auto *Inst = dyn_cast<Instruction>(V);
19199 !Inst || !R.isDeleted(Inst)) {
19200 Ops[Idx] = V;
19201 ++Idx;
19202 if (Idx == ActualVF)
19203 break;
19204 }
19205 }
19206 // Not enough vectorizable instructions - exit.
19207 if (Idx != ActualVF)
19208 break;
19209
19210 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19211 << "\n");
19212
19213 R.buildTree(Ops);
19214 if (R.isTreeTinyAndNotFullyVectorizable())
19215 continue;
19216 R.reorderTopToBottom();
19217 R.reorderBottomToTop(
19218 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19219 !R.doesRootHaveInTreeUses());
19220 R.transformNodes();
19221 R.buildExternalUses();
19222
19223 R.computeMinimumValueSizes();
19224 InstructionCost Cost = R.getTreeCost();
19225 CandidateFound = true;
19226 MinCost = std::min(MinCost, Cost);
19227
19228 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19229 << " for VF=" << ActualVF << "\n");
19230 if (Cost < -SLPCostThreshold) {
19231 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19232 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19233 cast<Instruction>(Ops[0]))
19234 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19235 << " and with tree size "
19236 << ore::NV("TreeSize", R.getTreeSize()));
19237
19238 R.vectorizeTree();
19239 // Move to the next bundle.
19240 I += VF - 1;
19241 NextInst = I + 1;
19242 Changed = true;
19243 }
19244 }
19245 }
19246
19247 if (!Changed && CandidateFound) {
19248 R.getORE()->emit([&]() {
19249 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19250 << "List vectorization was possible but not beneficial with cost "
19251 << ore::NV("Cost", MinCost) << " >= "
19252 << ore::NV("Threshold", -SLPCostThreshold);
19253 });
19254 } else if (!Changed) {
19255 R.getORE()->emit([&]() {
19256 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19257 << "Cannot SLP vectorize list: vectorization was impossible"
19258 << " with available vectorization factors";
19259 });
19260 }
19261 return Changed;
19262}
19263
19264bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19265 if (!I)
19266 return false;
19267
19268 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19269 return false;
19270
19271 Value *P = I->getParent();
19272
19273 // Vectorize in current basic block only.
19274 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19275 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19276 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19277 R.isDeleted(Op0) || R.isDeleted(Op1))
19278 return false;
19279
19280 // First collect all possible candidates.
19281 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
19282 Candidates.emplace_back(Op0, Op1);
19283
19284 auto *A = dyn_cast<BinaryOperator>(Op0);
19285 auto *B = dyn_cast<BinaryOperator>(Op1);
19286 // Try to skip B.
19287 if (A && B && B->hasOneUse()) {
19288 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19289 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19290 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19291 Candidates.emplace_back(A, B0);
19292 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19293 Candidates.emplace_back(A, B1);
19294 }
19295 // Try to skip A.
19296 if (B && A && A->hasOneUse()) {
19297 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19298 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19299 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19300 Candidates.emplace_back(A0, B);
19301 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19302 Candidates.emplace_back(A1, B);
19303 }
19304
19305 if (Candidates.size() == 1)
19306 return tryToVectorizeList({Op0, Op1}, R);
19307
19308 // We have multiple options. Try to pick the single best.
19309 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19310 if (!BestCandidate)
19311 return false;
19312 return tryToVectorizeList(
19313 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19314}
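// Illustrative example (hypothetical IR): for "%r = add (mul %a, %b),
// (mul %c, %d)" the initial candidate pair is the two multiplies; if one of
// them has a single use, additional pairs that look through it to its binary
// operands are collected, and findBestRootPair picks the most promising one.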
19315
19316namespace {
19317
19318/// Model horizontal reductions.
19319///
19320/// A horizontal reduction is a tree of reduction instructions that has values
19321/// that can be put into a vector as its leaves. For example:
19322///
19323/// mul mul mul mul
19324/// \ / \ /
19325/// + +
19326/// \ /
19327/// +
19328/// This tree has "mul" as its leaf values and "+" as its reduction
19329/// instructions. A reduction can feed into a store or a binary operation
19330/// feeding a phi.
19331/// ...
19332/// \ /
19333/// +
19334/// |
19335/// phi +=
19336///
19337/// Or:
19338/// ...
19339/// \ /
19340/// +
19341/// |
19342/// *p =
19343///
19344class HorizontalReduction {
19345 using ReductionOpsType = SmallVector<Value *, 16>;
19346 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19347 ReductionOpsListType ReductionOps;
19348 /// List of possibly reduced values.
19349 SmallVector<SmallVector<Value *>> ReducedVals;
19350 /// Maps reduced value to the corresponding reduction operation.
19351 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
19352 WeakTrackingVH ReductionRoot;
19353 /// The type of reduction operation.
19354 RecurKind RdxKind;
19355 /// Checks if the optimization of original scalar identity operations on
19356 /// matched horizontal reductions is enabled and allowed.
19357 bool IsSupportedHorRdxIdentityOp = false;
19358
19359 static bool isCmpSelMinMax(Instruction *I) {
19360 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19361 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
19362 }
19363
19364 // And/or are potentially poison-safe logical patterns like:
19365 // select x, y, false
19366 // select x, true, y
19367 static bool isBoolLogicOp(Instruction *I) {
19368 return isa<SelectInst>(I) &&
19369 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19370 }
19371
19372 /// Checks if instruction is associative and can be vectorized.
19373 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19374 if (Kind == RecurKind::None)
19375 return false;
19376
19377 // Integer ops that map to select instructions or intrinsics are fine.
19378 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19379 isBoolLogicOp(I))
19380 return true;
19381
19382 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19383 // FP min/max are associative except for NaN and -0.0. We do not
19384 // have to rule out -0.0 here because the intrinsic semantics do not
19385 // specify a fixed result for it.
19386 return I->getFastMathFlags().noNaNs();
19387 }
19388
19389 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19390 return true;
19391
19392 return I->isAssociative();
19393 }
19394
19395 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19396 // Poison-safe 'or' takes the form: select X, true, Y
19397 // To make that work with the normal operand processing, we skip the
19398 // true value operand.
19399 // TODO: Change the code and data structures to handle this without a hack.
19400 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19401 return I->getOperand(2);
19402 return I->getOperand(Index);
19403 }
19404
19405 /// Creates reduction operation with the current opcode.
19406 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19407 Value *RHS, const Twine &Name, bool UseSelect) {
19408 switch (Kind) {
19409 case RecurKind::Or: {
19410 if (UseSelect &&
19411 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19412 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19413 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19414 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19415 Name);
19416 }
19417 case RecurKind::And: {
19418 if (UseSelect &&
19419 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19420 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19421 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19422 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19423 Name);
19424 }
19425 case RecurKind::Add:
19426 case RecurKind::Mul:
19427 case RecurKind::Xor:
19428 case RecurKind::FAdd:
19429 case RecurKind::FMul: {
19430 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19431 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19432 Name);
19433 }
19434 case RecurKind::SMax:
19435 case RecurKind::SMin:
19436 case RecurKind::UMax:
19437 case RecurKind::UMin:
19438 if (UseSelect) {
19440 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
19441 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19442 }
19443 [[fallthrough]];
19444 case RecurKind::FMax:
19445 case RecurKind::FMin:
19446 case RecurKind::FMaximum:
19447 case RecurKind::FMinimum: {
19449 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
19450 }
19451 default:
19452 llvm_unreachable("Unknown reduction operation.");
19453 }
19454 }
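  // A minimal sketch of what the helper above emits (illustrative only): for
  // RecurKind::SMax with UseSelect == true, one reduction step becomes
  //   %cmp = icmp sgt i32 %lhs, %rhs
  //   %max = select i1 %cmp, i32 %lhs, i32 %rhs
  // whereas with UseSelect == false it is emitted as the smax intrinsic:
  //   %max = call i32 @llvm.smax.i32(i32 %lhs, i32 %rhs)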
19455
19456 /// Creates a reduction operation with the current opcode, applying the IR
19457 /// flags from \p ReductionOps and dropping nuw/nsw flags.
19458 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19459 Value *RHS, const Twine &Name,
19460 const ReductionOpsListType &ReductionOps) {
19461 bool UseSelect = ReductionOps.size() == 2 ||
19462 // Logical or/and.
19463 (ReductionOps.size() == 1 &&
19464 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19465 assert((!UseSelect || ReductionOps.size() != 2 ||
19466 isa<SelectInst>(ReductionOps[1][0])) &&
19467 "Expected cmp + select pairs for reduction");
19468 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19470 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19471 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19472 /*IncludeWrapFlags=*/false);
19473 propagateIRFlags(Op, ReductionOps[1], nullptr,
19474 /*IncludeWrapFlags=*/false);
19475 return Op;
19476 }
19477 }
19478 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19479 return Op;
19480 }
19481
19482public:
19483 static RecurKind getRdxKind(Value *V) {
19484 auto *I = dyn_cast<Instruction>(V);
19485 if (!I)
19486 return RecurKind::None;
19487 if (match(I, m_Add(m_Value(), m_Value())))
19488 return RecurKind::Add;
19489 if (match(I, m_Mul(m_Value(), m_Value())))
19490 return RecurKind::Mul;
19491 if (match(I, m_And(m_Value(), m_Value())) ||
19493 return RecurKind::And;
19494 if (match(I, m_Or(m_Value(), m_Value())) ||
19496 return RecurKind::Or;
19497 if (match(I, m_Xor(m_Value(), m_Value())))
19498 return RecurKind::Xor;
19499 if (match(I, m_FAdd(m_Value(), m_Value())))
19500 return RecurKind::FAdd;
19501 if (match(I, m_FMul(m_Value(), m_Value())))
19502 return RecurKind::FMul;
19503
19504 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19505 return RecurKind::FMax;
19506 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19507 return RecurKind::FMin;
19508
19509 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19510 return RecurKind::FMaximum;
19511 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19512 return RecurKind::FMinimum;
19513 // This matches either cmp+select or intrinsics. SLP is expected to handle
19514 // either form.
19515 // TODO: If we are canonicalizing to intrinsics, we can remove several
19516 // special-case paths that deal with selects.
19517 if (match(I, m_SMax(m_Value(), m_Value())))
19518 return RecurKind::SMax;
19519 if (match(I, m_SMin(m_Value(), m_Value())))
19520 return RecurKind::SMin;
19521 if (match(I, m_UMax(m_Value(), m_Value())))
19522 return RecurKind::UMax;
19523 if (match(I, m_UMin(m_Value(), m_Value())))
19524 return RecurKind::UMin;
19525
19526 if (auto *Select = dyn_cast<SelectInst>(I)) {
19527 // Try harder: look for min/max pattern based on instructions producing
19528 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19529 // During the intermediate stages of SLP, it's very common to have
19530 // pattern like this (since optimizeGatherSequence is run only once
19531 // at the end):
19532 // %1 = extractelement <2 x i32> %a, i32 0
19533 // %2 = extractelement <2 x i32> %a, i32 1
19534 // %cond = icmp sgt i32 %1, %2
19535 // %3 = extractelement <2 x i32> %a, i32 0
19536 // %4 = extractelement <2 x i32> %a, i32 1
19537 // %select = select i1 %cond, i32 %3, i32 %4
19538 CmpPredicate Pred;
19539 Instruction *L1;
19540 Instruction *L2;
19541
19542 Value *LHS = Select->getTrueValue();
19543 Value *RHS = Select->getFalseValue();
19544 Value *Cond = Select->getCondition();
19545
19546 // TODO: Support inverse predicates.
19547 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19548 if (!isa<ExtractElementInst>(RHS) ||
19549 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19550 return RecurKind::None;
19551 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19552 if (!isa<ExtractElementInst>(LHS) ||
19553 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19554 return RecurKind::None;
19555 } else {
19556 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19557 return RecurKind::None;
19558 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19559 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19560 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19561 return RecurKind::None;
19562 }
19563
19564 switch (Pred) {
19565 default:
19566 return RecurKind::None;
19567 case CmpInst::ICMP_SGT:
19568 case CmpInst::ICMP_SGE:
19569 return RecurKind::SMax;
19570 case CmpInst::ICMP_SLT:
19571 case CmpInst::ICMP_SLE:
19572 return RecurKind::SMin;
19573 case CmpInst::ICMP_UGT:
19574 case CmpInst::ICMP_UGE:
19575 return RecurKind::UMax;
19576 case CmpInst::ICMP_ULT:
19577 case CmpInst::ICMP_ULE:
19578 return RecurKind::UMin;
19579 }
19580 }
19581 return RecurKind::None;
19582 }
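  // Illustrative examples of values this classifies (not exhaustive):
  //   %s = add i32 %a, %b                            -> RecurKind::Add
  //   %m = call i32 @llvm.smax.i32(i32 %a, i32 %b)   -> RecurKind::SMax
  //   %c = icmp ugt i32 %a, %b
  //   %u = select i1 %c, i32 %a, i32 %b              -> RecurKind::UMax
  // Anything that matches none of the patterns above is RecurKind::None.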
19583
19584 /// Get the index of the first operand.
19585 static unsigned getFirstOperandIndex(Instruction *I) {
19586 return isCmpSelMinMax(I) ? 1 : 0;
19587 }
19588
19589private:
19590 /// Total number of operands in the reduction operation.
19591 static unsigned getNumberOfOperands(Instruction *I) {
19592 return isCmpSelMinMax(I) ? 3 : 2;
19593 }
19594
19595 /// Checks if the instruction is in basic block \p BB.
19596 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19597 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19598 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19599 auto *Sel = cast<SelectInst>(I);
19600 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19601 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19602 }
19603 return I->getParent() == BB;
19604 }
19605
19606 /// Expected number of uses for reduction operations/reduced values.
19607 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19608 if (IsCmpSelMinMax) {
19609 // The SelectInst must be used twice, while the condition op must have a
19610 // single use only.
19611 if (auto *Sel = dyn_cast<SelectInst>(I))
19612 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19613 return I->hasNUses(2);
19614 }
19615
19616 // Arithmetic reduction operation must be used once only.
19617 return I->hasOneUse();
19618 }
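  // As an informal example of the use-count rule above (illustrative IR, not
  // from a test): in a chained smax reduction
  //   %c0 = icmp sgt i32 %x0, %x1
  //   %s0 = select i1 %c0, i32 %x0, i32 %x1
  //   %c1 = icmp sgt i32 %s0, %x2
  //   %s1 = select i1 %c1, i32 %s0, i32 %x2
  // the inner select %s0 has two uses (%c1 and %s1) while its condition %c0
  // has exactly one use, which is what the cmp+select path checks for.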
19619
19620 /// Initializes the list of reduction operations.
19621 void initReductionOps(Instruction *I) {
19622 if (isCmpSelMinMax(I))
19623 ReductionOps.assign(2, ReductionOpsType());
19624 else
19625 ReductionOps.assign(1, ReductionOpsType());
19626 }
19627
19628 /// Add all reduction operations for the reduction instruction \p I.
19629 void addReductionOps(Instruction *I) {
19630 if (isCmpSelMinMax(I)) {
19631 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19632 ReductionOps[1].emplace_back(I);
19633 } else {
19634 ReductionOps[0].emplace_back(I);
19635 }
19636 }
19637
19638 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19639 int Sz = Data.size();
19640 auto *I = dyn_cast<Instruction>(Data.front());
19641 return Sz > 1 || isConstant(Data.front()) ||
19642 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19643 }
19644
19645public:
19646 HorizontalReduction() = default;
19647
19648 /// Try to find a reduction tree.
19649 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19650 ScalarEvolution &SE, const DataLayout &DL,
19651 const TargetLibraryInfo &TLI) {
19652 RdxKind = HorizontalReduction::getRdxKind(Root);
19653 if (!isVectorizable(RdxKind, Root))
19654 return false;
19655
19656 // Analyze "regular" integer/FP types for reductions - no target-specific
19657 // types or pointers.
19658 Type *Ty = Root->getType();
19659 if (!isValidElementType(Ty) || Ty->isPointerTy())
19660 return false;
19661
19662 // Though the ultimate reduction may have multiple uses, its condition must
19663 // have only a single use.
19664 if (auto *Sel = dyn_cast<SelectInst>(Root))
19665 if (!Sel->getCondition()->hasOneUse())
19666 return false;
19667
19668 ReductionRoot = Root;
19669
19670 // Iterate through all the operands of the possible reduction tree and
19671 // gather all the reduced values, sorting them by their value id.
19672 BasicBlock *BB = Root->getParent();
19673 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19675 1, std::make_pair(Root, 0));
19676 // Checks if the operands of the \p TreeN instruction are also reduction
19677 // operations or should be treated as reduced values or an extra argument,
19678 // which is not part of the reduction.
19679 auto CheckOperands = [&](Instruction *TreeN,
19680 SmallVectorImpl<Value *> &PossibleReducedVals,
19681 SmallVectorImpl<Instruction *> &ReductionOps,
19682 unsigned Level) {
19683 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19684 getNumberOfOperands(TreeN)))) {
19685 Value *EdgeVal = getRdxOperand(TreeN, I);
19686 ReducedValsToOps[EdgeVal].push_back(TreeN);
19687 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19688 // If the edge is not an instruction, differs from the main reduction
19689 // opcode, or has too many uses, treat it as a possible reduced value.
19690 // Also, do not try to reduce constant values if the operation is not
19691 // foldable.
19692 if (!EdgeInst || Level > RecursionMaxDepth ||
19693 getRdxKind(EdgeInst) != RdxKind ||
19694 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19695 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19696 !isVectorizable(RdxKind, EdgeInst) ||
19697 (R.isAnalyzedReductionRoot(EdgeInst) &&
19698 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19699 PossibleReducedVals.push_back(EdgeVal);
19700 continue;
19701 }
19702 ReductionOps.push_back(EdgeInst);
19703 }
19704 };
19705 // Try to regroup the reduced values so that reducing them becomes more
19706 // profitable. Values are grouped by their value ids, instructions by
19707 // instruction opcode id and/or alternate opcode id, with extra analysis for
19708 // loads (grouping them by the distance between pointers) and cmp
19709 // instructions (grouping them by the predicate).
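// For example (hypothetical values, for illustration only): two loads from
// %p and %p+4 that getPointersDiff() can relate are given the same
// (key, subkey) pair and are later tried as one consecutive group, while a
// load from an unrelated pointer gets its own subkey and its own group.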
19712 8>
19713 PossibleReducedVals;
19714 initReductionOps(Root);
19716 SmallSet<size_t, 2> LoadKeyUsed;
19717
19718 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19719 Key = hash_combine(hash_value(LI->getParent()), Key);
19720 Value *Ptr =
19722 if (!LoadKeyUsed.insert(Key).second) {
19723 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19724 if (LIt != LoadsMap.end()) {
19725 for (LoadInst *RLI : LIt->second) {
19726 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19727 LI->getType(), LI->getPointerOperand(), DL, SE,
19728 /*StrictCheck=*/true))
19729 return hash_value(RLI->getPointerOperand());
19730 }
19731 for (LoadInst *RLI : LIt->second) {
19733 LI->getPointerOperand(), TLI)) {
19734 hash_code SubKey = hash_value(RLI->getPointerOperand());
19735 return SubKey;
19736 }
19737 }
19738 if (LIt->second.size() > 2) {
19739 hash_code SubKey =
19740 hash_value(LIt->second.back()->getPointerOperand());
19741 return SubKey;
19742 }
19743 }
19744 }
19745 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19746 .first->second.push_back(LI);
19747 return hash_value(LI->getPointerOperand());
19748 };
19749
19750 while (!Worklist.empty()) {
19751 auto [TreeN, Level] = Worklist.pop_back_val();
19752 SmallVector<Value *> PossibleRedVals;
19753 SmallVector<Instruction *> PossibleReductionOps;
19754 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19755 addReductionOps(TreeN);
19756 // Add reduction values. The values are sorted for better vectorization
19757 // results.
19758 for (Value *V : PossibleRedVals) {
19759 size_t Key, Idx;
19760 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19761 /*AllowAlternate=*/false);
19762 ++PossibleReducedVals[Key][Idx]
19763 .insert(std::make_pair(V, 0))
19764 .first->second;
19765 }
19766 for (Instruction *I : reverse(PossibleReductionOps))
19767 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19768 }
19769 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19770 // Sort values by the total number of value kinds so that the reduction
19771 // starts from the longest possible sequences of reduced values.
19772 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19773 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19774 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19775 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19776 It != E; ++It) {
19777 PossibleRedValsVect.emplace_back();
19778 auto RedValsVect = It->second.takeVector();
19779 stable_sort(RedValsVect, llvm::less_second());
19780 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19781 PossibleRedValsVect.back().append(Data.second, Data.first);
19782 }
19783 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19784 return P1.size() > P2.size();
19785 });
19786 int NewIdx = -1;
19787 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19788 if (NewIdx < 0 ||
19789 (!isGoodForReduction(Data) &&
19790 (!isa<LoadInst>(Data.front()) ||
19791 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19793 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19795 cast<LoadInst>(ReducedVals[NewIdx].front())
19796 ->getPointerOperand())))) {
19797 NewIdx = ReducedVals.size();
19798 ReducedVals.emplace_back();
19799 }
19800 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19801 }
19802 }
19803 // Sort the reduced values by the number of same/alternate opcodes and/or
19804 // pointer operands.
19805 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19806 return P1.size() > P2.size();
19807 });
19808 return true;
19809 }
19810
19811 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19812 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19813 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19814 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19815 constexpr unsigned RegMaxNumber = 4;
19816 constexpr unsigned RedValsMaxNumber = 128;
19817 // If there are a sufficient number of reduction values, reduce
19818 // to a nearby power-of-2. We can safely generate oversized
19819 // vectors and rely on the backend to split them to legal sizes.
19820 if (unsigned NumReducedVals = std::accumulate(
19821 ReducedVals.begin(), ReducedVals.end(), 0,
19822 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19823 if (!isGoodForReduction(Vals))
19824 return Num;
19825 return Num + Vals.size();
19826 });
19827 NumReducedVals < ReductionLimit &&
19828 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19829 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19830 })) {
19831 for (ReductionOpsType &RdxOps : ReductionOps)
19832 for (Value *RdxOp : RdxOps)
19833 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19834 return nullptr;
19835 }
19836
19837 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19838 TargetFolder(DL));
19839 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19840
19841 // Track the reduced values in case they are replaced by extractelement
19842 // instructions because of the vectorization.
19843 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19844 ReducedVals.front().size());
19845
19846 // The compare instruction of a min/max is the insertion point for new
19847 // instructions and may be replaced with a new compare instruction.
19848 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19849 assert(isa<SelectInst>(RdxRootInst) &&
19850 "Expected min/max reduction to have select root instruction");
19851 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19852 assert(isa<Instruction>(ScalarCond) &&
19853 "Expected min/max reduction to have compare condition");
19854 return cast<Instruction>(ScalarCond);
19855 };
19856
19857 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19858 return isBoolLogicOp(cast<Instruction>(V));
19859 });
19860 // Return new VectorizedTree, based on previous value.
19861 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19862 if (VectorizedTree) {
19863 // Update the final value in the reduction.
19865 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19866 if (AnyBoolLogicOp) {
19867 auto It = ReducedValsToOps.find(VectorizedTree);
19868 auto It1 = ReducedValsToOps.find(Res);
19869 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19870 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19871 (It != ReducedValsToOps.end() &&
19872 any_of(It->getSecond(), [&](Instruction *I) {
19873 return isBoolLogicOp(I) &&
19874 getRdxOperand(I, 0) == VectorizedTree;
19875 }))) {
19876 ;
19877 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19878 (It1 != ReducedValsToOps.end() &&
19879 any_of(It1->getSecond(), [&](Instruction *I) {
19880 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19881 }))) {
19882 std::swap(VectorizedTree, Res);
19883 } else {
19884 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19885 }
19886 }
19887
19888 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19889 ReductionOps);
19890 }
19891 // Initialize the final value in the reduction.
19892 return Res;
19893 };
19894 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19895 ReductionOps.front().size());
19896 for (ReductionOpsType &RdxOps : ReductionOps)
19897 for (Value *RdxOp : RdxOps) {
19898 if (!RdxOp)
19899 continue;
19900 IgnoreList.insert(RdxOp);
19901 }
19902 // Intersect the fast-math-flags from all reduction operations.
19903 FastMathFlags RdxFMF;
19904 RdxFMF.set();
19905 for (Value *U : IgnoreList)
19906 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19907 RdxFMF &= FPMO->getFastMathFlags();
19908 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19909
19910 // Need to track the reduced values, since they may be changed during the
19911 // vectorization of subvectors.
19912 for (ArrayRef<Value *> Candidates : ReducedVals)
19913 for (Value *V : Candidates)
19914 TrackedVals.try_emplace(V, V);
19915
19917 Value *V) -> unsigned & {
19918 auto *It = MV.find(V);
19919 assert(It != MV.end() && "Unable to find given key.");
19920 return It->second;
19921 };
19922
19923 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19924 // List of the values that were reduced in other trees as part of gather
19925 // nodes and thus require an extract if fully vectorized in other trees.
19926 SmallPtrSet<Value *, 4> RequiredExtract;
19927 WeakTrackingVH VectorizedTree = nullptr;
19928 bool CheckForReusedReductionOps = false;
19929 // Try to vectorize elements based on their type.
19931 for (ArrayRef<Value *> RV : ReducedVals)
19932 States.push_back(getSameOpcode(RV, TLI));
19933 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19934 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19935 InstructionsState S = States[I];
19936 SmallVector<Value *> Candidates;
19937 Candidates.reserve(2 * OrigReducedVals.size());
19938 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19939 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19940 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19941 // Check if the reduction value was overridden by an extractelement
19942 // instruction because of the vectorization, and exclude it if it is not
19943 // compatible with the other values.
19944 // Also check if the instruction was folded to a constant/another value.
19945 auto *Inst = dyn_cast<Instruction>(RdxVal);
19946 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19947 (!S || !S.isOpcodeOrAlt(Inst))) ||
19948 (S && !Inst))
19949 continue;
19950 Candidates.push_back(RdxVal);
19951 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19952 }
19953 bool ShuffledExtracts = false;
19954 // Try to handle shuffled extractelements.
19955 if (S && S.getOpcode() == Instruction::ExtractElement &&
19956 !S.isAltShuffle() && I + 1 < E) {
19957 SmallVector<Value *> CommonCandidates(Candidates);
19958 for (Value *RV : ReducedVals[I + 1]) {
19959 Value *RdxVal = TrackedVals.at(RV);
19960 // Check if the reduction value was overridden by the
19961 // extractelement instruction because of the vectorization, and
19962 // exclude it if it is not compatible with the other values.
19963 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19964 if (!Inst)
19965 continue;
19966 CommonCandidates.push_back(RdxVal);
19967 TrackedToOrig.try_emplace(RdxVal, RV);
19968 }
19970 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19971 ++I;
19972 Candidates.swap(CommonCandidates);
19973 ShuffledExtracts = true;
19974 }
19975 }
19976
19977 // Emit code for constant values.
19978 if (Candidates.size() > 1 && allConstant(Candidates)) {
19979 Value *Res = Candidates.front();
19980 Value *OrigV = TrackedToOrig.at(Candidates.front());
19981 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19982 for (Value *VC : ArrayRef(Candidates).drop_front()) {
19983 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
19984 Value *OrigV = TrackedToOrig.at(VC);
19985 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19986 if (auto *ResI = dyn_cast<Instruction>(Res))
19987 V.analyzedReductionRoot(ResI);
19988 }
19989 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19990 continue;
19991 }
19992
19993 unsigned NumReducedVals = Candidates.size();
19994 if (NumReducedVals < ReductionLimit &&
19995 (NumReducedVals < 2 || !isSplat(Candidates)))
19996 continue;
19997
19998 // Check if we support processing of repeated scalar values (optimization
19999 // of original scalar identity operations on matched horizontal reductions).
20000 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20001 RdxKind != RecurKind::FMul &&
20002 RdxKind != RecurKind::FMulAdd;
20003 // Gather same values.
20004 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20005 if (IsSupportedHorRdxIdentityOp)
20006 for (Value *V : Candidates) {
20007 Value *OrigV = TrackedToOrig.at(V);
20008 ++SameValuesCounter.try_emplace(OrigV).first->second;
20009 }
20010 // Used to check if the reduced values are used the same number of times.
20011 // In this case the compiler may produce better code. E.g. if the reduced
20012 // values are aabbccdd (8 x values), then the first node of the tree will
20013 // have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
20014 // Plus, the final reduction will be performed on <8 x aabbccdd>.
20015 // Instead, the compiler may build the <4 x abcd> tree immediately and then
20016 // compute reduction(4 x abcd) * 2.
20017 // Currently it only handles add/fadd/xor. and/or/min/max do not require
20018 // this analysis, other operations may require an extra estimation of
20019 // the profitability.
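// A small worked example of the same-scale-factor case (values invented for
// illustration): for the add reduction a+a+b+b+c+c+d+d, every value appears
// exactly twice, so it is enough to vectorize the reduction of <a, b, c, d>
// once and multiply the scalar result by 2:
//   (a + b + c + d) * 2 == a + a + b + b + c + c + d + d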
20020 bool SameScaleFactor = false;
20021 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20022 SameValuesCounter.size() != Candidates.size();
20023 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20024 if (OptReusedScalars) {
20025 SameScaleFactor =
20026 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20027 RdxKind == RecurKind::Xor) &&
20028 all_of(drop_begin(SameValuesCounter),
20029 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20030 return P.second == SameValuesCounter.front().second;
20031 });
20032 Candidates.resize(SameValuesCounter.size());
20033 transform(SameValuesCounter, Candidates.begin(),
20034 [&](const auto &P) { return TrackedVals.at(P.first); });
20035 NumReducedVals = Candidates.size();
20036 // Have a reduction of the same element.
20037 if (NumReducedVals == 1) {
20038 Value *OrigV = TrackedToOrig.at(Candidates.front());
20039 unsigned Cnt = At(SameValuesCounter, OrigV);
20040 Value *RedVal =
20041 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20042 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20043 VectorizedVals.try_emplace(OrigV, Cnt);
20044 ExternallyUsedValues.insert(OrigV);
20045 continue;
20046 }
20047 }
20048
20049 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20050 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20051 const unsigned MaxElts = std::clamp<unsigned>(
20052 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20053 RegMaxNumber * RedValsMaxNumber);
20054
20055 unsigned ReduxWidth = NumReducedVals;
20056 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20057 unsigned NumParts, NumRegs;
20058 Type *ScalarTy = Candidates.front()->getType();
20059 ReduxWidth =
20060 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20061 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20062 NumParts = ::getNumberOfParts(TTI, Tp);
20063 NumRegs =
20065 while (NumParts > NumRegs) {
20066 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
20067 ReduxWidth = bit_floor(ReduxWidth - 1);
20068 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20069 NumParts = ::getNumberOfParts(TTI, Tp);
20070 NumRegs =
20072 }
20073 if (NumParts > NumRegs / 2)
20074 ReduxWidth = bit_floor(ReduxWidth);
20075 return ReduxWidth;
20076 };
20077 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20078 ReduxWidth = GetVectorFactor(ReduxWidth);
20079 ReduxWidth = std::min(ReduxWidth, MaxElts);
20080
20081 unsigned Start = 0;
20082 unsigned Pos = Start;
20083 // Restarts vectorization attempt with lower vector factor.
20084 unsigned PrevReduxWidth = ReduxWidth;
20085 bool CheckForReusedReductionOpsLocal = false;
20086 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20087 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20088 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20089 // Check if any of the reduction ops are gathered. If so, it is worth
20090 // trying again with a smaller number of reduction ops.
20091 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20092 }
20093 ++Pos;
20094 if (Pos < NumReducedVals - ReduxWidth + 1)
20095 return IsAnyRedOpGathered;
20096 Pos = Start;
20097 --ReduxWidth;
20098 if (ReduxWidth > 1)
20099 ReduxWidth = GetVectorFactor(ReduxWidth);
20100 return IsAnyRedOpGathered;
20101 };
20102 bool AnyVectorized = false;
20103 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20104 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20105 ReduxWidth >= ReductionLimit) {
20106 // Dependency in tree of the reduction ops - drop this attempt, try
20107 // later.
20108 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20109 Start == 0) {
20110 CheckForReusedReductionOps = true;
20111 break;
20112 }
20113 PrevReduxWidth = ReduxWidth;
20114 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20115 // Been analyzed already - skip.
20116 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20117 (!has_single_bit(ReduxWidth) &&
20118 (IgnoredCandidates.contains(
20119 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20120 IgnoredCandidates.contains(
20121 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20122 bit_floor(ReduxWidth))))) ||
20123 V.areAnalyzedReductionVals(VL)) {
20124 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20125 continue;
20126 }
20127 // Early exit if any of the reduction values were deleted during
20128 // previous vectorization attempts.
20129 if (any_of(VL, [&V](Value *RedVal) {
20130 auto *RedValI = dyn_cast<Instruction>(RedVal);
20131 if (!RedValI)
20132 return false;
20133 return V.isDeleted(RedValI);
20134 }))
20135 break;
20136 V.buildTree(VL, IgnoreList);
20137 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20138 if (!AdjustReducedVals())
20139 V.analyzedReductionVals(VL);
20140 continue;
20141 }
20142 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20143 if (!AdjustReducedVals())
20144 V.analyzedReductionVals(VL);
20145 continue;
20146 }
20147 V.reorderTopToBottom();
20148 // No need to reorder the root node at all.
20149 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20150 // Keep extracted other reduction values, if they are used in the
20151 // vectorization trees.
20152 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20153 ExternallyUsedValues);
20154 // The reduction root is used as the insertion point for new
20155 // instructions, so set it as externally used to prevent it from being
20156 // deleted.
20157 LocalExternallyUsedValues.insert(ReductionRoot);
20158 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20159 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20160 continue;
20161 for (Value *V : ReducedVals[Cnt])
20162 if (isa<Instruction>(V))
20163 LocalExternallyUsedValues.insert(TrackedVals[V]);
20164 }
20165 if (!IsSupportedHorRdxIdentityOp) {
20166 // Number of uses of the candidates in the vector of values.
20167 assert(SameValuesCounter.empty() &&
20168 "Reused values counter map is not empty");
20169 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20170 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20171 continue;
20172 Value *V = Candidates[Cnt];
20173 Value *OrigV = TrackedToOrig.at(V);
20174 ++SameValuesCounter.try_emplace(OrigV).first->second;
20175 }
20176 }
20177 V.transformNodes();
20178 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20179 // Gather externally used values.
20181 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20182 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20183 continue;
20184 Value *RdxVal = Candidates[Cnt];
20185 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20186 RdxVal = It->second;
20187 if (!Visited.insert(RdxVal).second)
20188 continue;
20189 // Check if the scalar was vectorized as part of the vectorization
20190 // tree but not the top node.
20191 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20192 LocalExternallyUsedValues.insert(RdxVal);
20193 continue;
20194 }
20195 Value *OrigV = TrackedToOrig.at(RdxVal);
20196 unsigned NumOps =
20197 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20198 if (NumOps != ReducedValsToOps.at(OrigV).size())
20199 LocalExternallyUsedValues.insert(RdxVal);
20200 }
20201 // Do not need the list of reused scalars in regular mode anymore.
20202 if (!IsSupportedHorRdxIdentityOp)
20203 SameValuesCounter.clear();
20204 for (Value *RdxVal : VL)
20205 if (RequiredExtract.contains(RdxVal))
20206 LocalExternallyUsedValues.insert(RdxVal);
20207 V.buildExternalUses(LocalExternallyUsedValues);
20208
20209 V.computeMinimumValueSizes();
20210
20211 // Estimate cost.
20212 InstructionCost TreeCost = V.getTreeCost(VL);
20213 InstructionCost ReductionCost =
20214 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20215 InstructionCost Cost = TreeCost + ReductionCost;
20216 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20217 << " for reduction\n");
20218 if (!Cost.isValid())
20219 break;
20220 if (Cost >= -SLPCostThreshold) {
20221 V.getORE()->emit([&]() {
20222 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20223 ReducedValsToOps.at(VL[0]).front())
20224 << "Vectorizing horizontal reduction is possible "
20225 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20226 << " and threshold "
20227 << ore::NV("Threshold", -SLPCostThreshold);
20228 });
20229 if (!AdjustReducedVals()) {
20230 V.analyzedReductionVals(VL);
20231 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20232 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20233 // Add subvectors of VL to the list of the analyzed values.
20234 for (unsigned VF = getFloorFullVectorNumberOfElements(
20235 *TTI, VL.front()->getType(), ReduxWidth - 1);
20236 VF >= ReductionLimit;
20238 *TTI, VL.front()->getType(), VF - 1)) {
20239 if (has_single_bit(VF) &&
20240 V.getCanonicalGraphSize() != V.getTreeSize())
20241 continue;
20242 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20243 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20244 }
20245 }
20246 }
20247 continue;
20248 }
20249
20250 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20251 << Cost << ". (HorRdx)\n");
20252 V.getORE()->emit([&]() {
20253 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20254 ReducedValsToOps.at(VL[0]).front())
20255 << "Vectorized horizontal reduction with cost "
20256 << ore::NV("Cost", Cost) << " and with tree size "
20257 << ore::NV("TreeSize", V.getTreeSize());
20258 });
20259
20260 Builder.setFastMathFlags(RdxFMF);
20261
20262 // Emit a reduction. If the root is a select (min/max idiom), the insert
20263 // point is the compare condition of that select.
20264 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20265 Instruction *InsertPt = RdxRootInst;
20266 if (IsCmpSelMinMax)
20267 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20268
20269 // Vectorize a tree.
20270 Value *VectorizedRoot =
20271 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20272 // Update TrackedToOrig mapping, since the tracked values might be
20273 // updated.
20274 for (Value *RdxVal : Candidates) {
20275 Value *OrigVal = TrackedToOrig.at(RdxVal);
20276 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20277 if (TransformedRdxVal != RdxVal)
20278 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20279 }
20280
20281 Builder.SetInsertPoint(InsertPt);
20282
20283 // To prevent poison from leaking across what used to be sequential,
20284 // safe, scalar boolean logic operations, the reduction operand must be
20285 // frozen.
20286 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20287 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20288
20289 // Emit code to correctly handle reused reduced values, if required.
20290 if (OptReusedScalars && !SameScaleFactor) {
20291 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20292 SameValuesCounter, TrackedToOrig);
20293 }
20294
20295 Value *ReducedSubTree;
20296 Type *ScalarTy = VL.front()->getType();
20297 if (isa<FixedVectorType>(ScalarTy)) {
20298 assert(SLPReVec && "FixedVectorType is not expected.");
20299 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20300 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20301 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20302 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20303 // Do reduction for each lane.
20304 // e.g., do reduce add for
20305 // VL[0] = <4 x Ty> <a, b, c, d>
20306 // VL[1] = <4 x Ty> <e, f, g, h>
20307 // Lane[0] = <2 x Ty> <a, e>
20308 // Lane[1] = <2 x Ty> <b, f>
20309 // Lane[2] = <2 x Ty> <c, g>
20310 // Lane[3] = <2 x Ty> <d, h>
20311 // result[0] = reduce add Lane[0]
20312 // result[1] = reduce add Lane[1]
20313 // result[2] = reduce add Lane[2]
20314 // result[3] = reduce add Lane[3]
20316 createStrideMask(I, ScalarTyNumElements, VL.size());
20317 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20318 ReducedSubTree = Builder.CreateInsertElement(
20319 ReducedSubTree,
20320 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20321 }
20322 } else {
20323 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20324 RdxRootInst->getType());
20325 }
20326 if (ReducedSubTree->getType() != VL.front()->getType()) {
20327 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20328 "Expected different reduction type.");
20329 ReducedSubTree =
20330 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20331 V.isSignedMinBitwidthRootNode());
20332 }
20333
20334 // Improved analysis for add/fadd/xor reductions with same scale factor
20335 // for all operands of reductions. We can emit scalar ops for them
20336 // instead.
20337 if (OptReusedScalars && SameScaleFactor)
20338 ReducedSubTree = emitScaleForReusedOps(
20339 ReducedSubTree, Builder, SameValuesCounter.front().second);
20340
20341 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20342 // Count vectorized reduced values to exclude them from final reduction.
20343 for (Value *RdxVal : VL) {
20344 Value *OrigV = TrackedToOrig.at(RdxVal);
20345 if (IsSupportedHorRdxIdentityOp) {
20346 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20347 continue;
20348 }
20349 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20350 if (!V.isVectorized(RdxVal))
20351 RequiredExtract.insert(RdxVal);
20352 }
20353 Pos += ReduxWidth;
20354 Start = Pos;
20355 ReduxWidth = NumReducedVals - Pos;
20356 if (ReduxWidth > 1)
20357 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20358 AnyVectorized = true;
20359 }
20360 if (OptReusedScalars && !AnyVectorized) {
20361 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20362 Value *RdxVal = TrackedVals.at(P.first);
20363 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20364 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20365 VectorizedVals.try_emplace(P.first, P.second);
20366 }
20367 continue;
20368 }
20369 }
20370 if (VectorizedTree) {
20371 // Reorder operands of bool logical op in the natural order to avoid
20372 // possible problem with poison propagation. If not possible to reorder
20373 // (both operands are originally RHS), emit an extra freeze instruction
20374 // for the LHS operand.
20375 // I.e., if we have original code like this:
20376 // RedOp1 = select i1 ?, i1 LHS, i1 false
20377 // RedOp2 = select i1 RHS, i1 ?, i1 false
20378
20379 // Then, we swap LHS/RHS to create a new op that matches the poison
20380 // semantics of the original code.
20381
20382 // If we have original code like this and both values could be poison:
20383 // RedOp1 = select i1 ?, i1 LHS, i1 false
20384 // RedOp2 = select i1 ?, i1 RHS, i1 false
20385
20386 // Then, we must freeze LHS in the new op.
20387 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20388 Instruction *RedOp1,
20389 Instruction *RedOp2,
20390 bool InitStep) {
20391 if (!AnyBoolLogicOp)
20392 return;
20393 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20394 getRdxOperand(RedOp1, 0) == LHS ||
20396 return;
20397 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20398 getRdxOperand(RedOp2, 0) == RHS ||
20400 std::swap(LHS, RHS);
20401 return;
20402 }
20403 if (LHS != VectorizedTree)
20404 LHS = Builder.CreateFreeze(LHS);
20405 };
20406 // Finish the reduction.
20407 // Need to add the extra arguments and the non-vectorized possible
20408 // reduction values.
20409 // Try to avoid dependencies between the scalar remainders after
20410 // reductions.
20411 auto FinalGen =
20413 bool InitStep) {
20414 unsigned Sz = InstVals.size();
20416 Sz % 2);
20417 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20418 Instruction *RedOp = InstVals[I + 1].first;
20419 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20420 Value *RdxVal1 = InstVals[I].second;
20421 Value *StableRdxVal1 = RdxVal1;
20422 auto It1 = TrackedVals.find(RdxVal1);
20423 if (It1 != TrackedVals.end())
20424 StableRdxVal1 = It1->second;
20425 Value *RdxVal2 = InstVals[I + 1].second;
20426 Value *StableRdxVal2 = RdxVal2;
20427 auto It2 = TrackedVals.find(RdxVal2);
20428 if (It2 != TrackedVals.end())
20429 StableRdxVal2 = It2->second;
20430 // To prevent poison from leaking across what used to be
20431 // sequential, safe, scalar boolean logic operations, the
20432 // reduction operand must be frozen.
20433 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20434 RedOp, InitStep);
20435 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20436 StableRdxVal2, "op.rdx", ReductionOps);
20437 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20438 }
20439 if (Sz % 2 == 1)
20440 ExtraReds[Sz / 2] = InstVals.back();
20441 return ExtraReds;
20442 };
20444 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20445 VectorizedTree);
20447 for (ArrayRef<Value *> Candidates : ReducedVals) {
20448 for (Value *RdxVal : Candidates) {
20449 if (!Visited.insert(RdxVal).second)
20450 continue;
20451 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20452 for (Instruction *RedOp :
20453 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20454 ExtraReductions.emplace_back(RedOp, RdxVal);
20455 }
20456 }
20457 // Iterate through all non-vectorized reduction values/extra arguments.
20458 bool InitStep = true;
20459 while (ExtraReductions.size() > 1) {
20461 FinalGen(ExtraReductions, InitStep);
20462 ExtraReductions.swap(NewReds);
20463 InitStep = false;
20464 }
20465 VectorizedTree = ExtraReductions.front().second;
20466
20467 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20468
20469 // The original scalar reduction is expected to have no remaining
20470 // uses outside the reduction tree itself. Assert that we got this
20471 // correct, replace internal uses with poison, and mark for eventual
20472 // deletion.
20473#ifndef NDEBUG
20474 SmallSet<Value *, 4> IgnoreSet;
20475 for (ArrayRef<Value *> RdxOps : ReductionOps)
20476 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20477#endif
20478 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20479 for (Value *Ignore : RdxOps) {
20480 if (!Ignore)
20481 continue;
20482#ifndef NDEBUG
20483 for (auto *U : Ignore->users()) {
20484 assert(IgnoreSet.count(U) &&
20485 "All users must be either in the reduction ops list.");
20486 }
20487#endif
20488 if (!Ignore->use_empty()) {
20489 Value *P = PoisonValue::get(Ignore->getType());
20490 Ignore->replaceAllUsesWith(P);
20491 }
20492 }
20493 V.removeInstructionsAndOperands(RdxOps);
20494 }
20495 } else if (!CheckForReusedReductionOps) {
20496 for (ReductionOpsType &RdxOps : ReductionOps)
20497 for (Value *RdxOp : RdxOps)
20498 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20499 }
20500 return VectorizedTree;
20501 }
20502
20503private:
20504 /// Calculate the cost of a reduction.
20505 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20506 ArrayRef<Value *> ReducedVals,
20507 bool IsCmpSelMinMax, FastMathFlags FMF,
20508 const BoUpSLP &R) {
20510 Type *ScalarTy = ReducedVals.front()->getType();
20511 unsigned ReduxWidth = ReducedVals.size();
20512 FixedVectorType *VectorTy = R.getReductionType();
20513 InstructionCost VectorCost = 0, ScalarCost;
20514 // If all of the reduced values are constant, the vector cost is 0, since
20515 // the reduction value can be calculated at compile time.
20516 bool AllConsts = allConstant(ReducedVals);
20517 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20519 // Scalar cost is repeated for N-1 elements.
20520 int Cnt = ReducedVals.size();
20521 for (Value *RdxVal : ReducedVals) {
20522 if (Cnt == 1)
20523 break;
20524 --Cnt;
20525 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20526 Cost += GenCostFn();
20527 continue;
20528 }
20529 InstructionCost ScalarCost = 0;
20530 for (User *U : RdxVal->users()) {
20531 auto *RdxOp = cast<Instruction>(U);
20532 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20533 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20534 continue;
20535 }
20536 ScalarCost = InstructionCost::getInvalid();
20537 break;
20538 }
20539 if (ScalarCost.isValid())
20540 Cost += ScalarCost;
20541 else
20542 Cost += GenCostFn();
20543 }
20544 return Cost;
20545 };
20546 switch (RdxKind) {
20547 case RecurKind::Add:
20548 case RecurKind::Mul:
20549 case RecurKind::Or:
20550 case RecurKind::And:
20551 case RecurKind::Xor:
20552 case RecurKind::FAdd:
20553 case RecurKind::FMul: {
20554 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20555 if (!AllConsts) {
20556 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20557 assert(SLPReVec && "FixedVectorType is not expected.");
20558 unsigned ScalarTyNumElements = VecTy->getNumElements();
20559 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20560 VectorCost += TTI->getShuffleCost(
20561 TTI::SK_PermuteSingleSrc, VectorTy,
20562 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20563 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20564 CostKind);
20565 }
20566 VectorCost += TTI->getScalarizationOverhead(
20567 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20568 /*Extract*/ false, TTI::TCK_RecipThroughput);
20569 } else {
20570 Type *RedTy = VectorTy->getElementType();
20571 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20572 std::make_pair(RedTy, true));
20573 if (RType == RedTy) {
20574 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20575 FMF, CostKind);
20576 } else {
20577 VectorCost = TTI->getExtendedReductionCost(
20578 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20579 FMF, CostKind);
20580 }
20581 }
20582 }
20583 ScalarCost = EvaluateScalarCost([&]() {
20584 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20585 });
20586 break;
20587 }
20588 case RecurKind::FMax:
20589 case RecurKind::FMin:
20590 case RecurKind::FMaximum:
20591 case RecurKind::FMinimum:
20592 case RecurKind::SMax:
20593 case RecurKind::SMin:
20594 case RecurKind::UMax:
20595 case RecurKind::UMin: {
20597 if (!AllConsts)
20598 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20599 ScalarCost = EvaluateScalarCost([&]() {
20600 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20601 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20602 });
20603 break;
20604 }
20605 default:
20606 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20607 }
20608
20609 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20610 << " for reduction of " << shortBundleName(ReducedVals)
20611 << " (It is a splitting reduction)\n");
20612 return VectorCost - ScalarCost;
20613 }
20614
20615 /// Emit a horizontal reduction of the vectorized value.
20616 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20617 const TargetTransformInfo *TTI, Type *DestTy) {
20618 assert(VectorizedValue && "Need to have a vectorized tree node");
20619 assert(RdxKind != RecurKind::FMulAdd &&
20620 "A call to the llvm.fmuladd intrinsic is not handled yet");
20621
20622 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20623 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20624 RdxKind == RecurKind::Add &&
20625 DestTy->getScalarType() != FTy->getScalarType()) {
20626 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20627 // ZExtOrTrunc(ctpop(bitcast <n x i1> to i<n>)).
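// A concrete sketch of that rewrite (illustrative types): for an i1 add
// reduction widened to i32,
//   %wide = zext <8 x i1> %m to <8 x i32>
//   %sum  = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %wide)
// is equivalent to counting the set bits of the mask:
//   %bits = bitcast <8 x i1> %m to i8
//   %pop  = call i8 @llvm.ctpop.i8(i8 %bits)
//   %sum  = zext i8 %pop to i32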
20628 Value *V = Builder.CreateBitCast(
20629 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20630 ++NumVectorInstructions;
20631 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20632 }
20633 ++NumVectorInstructions;
20634 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20635 }
20636
20637 /// Emits optimized code for a unique scalar value reused \p Cnt times.
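/// For example (illustrative): if an add reduction reuses the value %v four
/// times, the scaled result is emitted as 'mul %v, 4'; for an xor reduction
/// an even reuse count folds to 0, because x xor x == 0.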
20638 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20639 unsigned Cnt) {
20640 assert(IsSupportedHorRdxIdentityOp &&
20641 "The optimization of matched scalar identity horizontal reductions "
20642 "must be supported.");
20643 if (Cnt == 1)
20644 return VectorizedValue;
20645 switch (RdxKind) {
20646 case RecurKind::Add: {
20647 // res = mul vv, n
20648 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20649 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20650 << VectorizedValue << ". (HorRdx)\n");
20651 return Builder.CreateMul(VectorizedValue, Scale);
20652 }
20653 case RecurKind::Xor: {
20654 // res = n % 2 == 0 ? 0 : vv
20655 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20656 << ". (HorRdx)\n");
20657 if (Cnt % 2 == 0)
20658 return Constant::getNullValue(VectorizedValue->getType());
20659 return VectorizedValue;
20660 }
20661 case RecurKind::FAdd: {
20662 // res = fmul v, n
20663 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20664 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20665 << VectorizedValue << ". (HorRdx)\n");
20666 return Builder.CreateFMul(VectorizedValue, Scale);
20667 }
20668 case RecurKind::And:
20669 case RecurKind::Or:
20670 case RecurKind::SMax:
20671 case RecurKind::SMin:
20672 case RecurKind::UMax:
20673 case RecurKind::UMin:
20674 case RecurKind::FMax:
20675 case RecurKind::FMin:
20676 case RecurKind::FMaximum:
20677 case RecurKind::FMinimum:
20678 // res = vv
20679 return VectorizedValue;
20680 case RecurKind::Mul:
20681 case RecurKind::FMul:
20682 case RecurKind::FMulAdd:
20683 case RecurKind::IAnyOf:
20684 case RecurKind::FAnyOf:
20685 case RecurKind::IFindLastIV:
20686 case RecurKind::FFindLastIV:
20687 case RecurKind::None:
20688 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20689 }
20690 return nullptr;
20691 }
20692
20693 /// Emits the actual operation for the scalar identity values found during
20694 /// horizontal reduction analysis.
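/// For example (illustrative): for an add reduction whose root scalars are
/// <a, b> with reuse counts {a: 2, b: 3}, the vectorized value is rescaled as
///   %root = mul <2 x i32> %prev_root, <i32 2, i32 3>
/// before the final reduction is emitted.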
20695 Value *
20696 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20697 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20698 const DenseMap<Value *, Value *> &TrackedToOrig) {
20699 assert(IsSupportedHorRdxIdentityOp &&
20700 "The optimization of matched scalar identity horizontal reductions "
20701 "must be supported.");
20702 ArrayRef<Value *> VL = R.getRootNodeScalars();
20703 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20704 if (VTy->getElementType() != VL.front()->getType()) {
20705 VectorizedValue = Builder.CreateIntCast(
20706 VectorizedValue,
20707 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20708 R.isSignedMinBitwidthRootNode());
20709 }
20710 switch (RdxKind) {
20711 case RecurKind::Add: {
20712 // root = mul prev_root, <1, 1, n, 1>
20714 for (Value *V : VL) {
20715 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20716 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20717 }
20718 auto *Scale = ConstantVector::get(Vals);
20719 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20720 << VectorizedValue << ". (HorRdx)\n");
20721 return Builder.CreateMul(VectorizedValue, Scale);
20722 }
20723 case RecurKind::And:
20724 case RecurKind::Or:
20725 // No need for multiple or/and(s).
20726 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20727 << ". (HorRdx)\n");
20728 return VectorizedValue;
20729 case RecurKind::SMax:
20730 case RecurKind::SMin:
20731 case RecurKind::UMax:
20732 case RecurKind::UMin:
20733 case RecurKind::FMax:
20734 case RecurKind::FMin:
20735 case RecurKind::FMaximum:
20736 case RecurKind::FMinimum:
20737 // No need for multiple min/max(s) of the same value.
20738 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20739 << ". (HorRdx)\n");
20740 return VectorizedValue;
20741 case RecurKind::Xor: {
20742 // Replace values with an even number of repeats with 0, since
20743 // x xor x = 0.
20744 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20745 // 7>, if the 4th and 6th elements have an even number of repeats.
20747 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20749 std::iota(Mask.begin(), Mask.end(), 0);
20750 bool NeedShuffle = false;
20751 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20752 Value *V = VL[I];
20753 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20754 if (Cnt % 2 == 0) {
20755 Mask[I] = VF;
20756 NeedShuffle = true;
20757 }
20758 }
20759 LLVM_DEBUG(dbgs() << "SLP: Xor <";
20760 for (int I : Mask)
20761 dbgs() << I << " ";
20762 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20763 if (NeedShuffle)
20764 VectorizedValue = Builder.CreateShuffleVector(
20765 VectorizedValue,
20766 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20767 return VectorizedValue;
20768 }
20769 case RecurKind::FAdd: {
20770 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20772 for (Value *V : VL) {
20773 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20774 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20775 }
20776 auto *Scale = ConstantVector::get(Vals);
20777 return Builder.CreateFMul(VectorizedValue, Scale);
20778 }
20779 case RecurKind::Mul:
20780 case RecurKind::FMul:
20781 case RecurKind::FMulAdd:
20782 case RecurKind::IAnyOf:
20783 case RecurKind::FAnyOf:
20784 case RecurKind::IFindLastIV:
20785 case RecurKind::FFindLastIV:
20786 case RecurKind::None:
20787 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20788 }
20789 return nullptr;
20790 }
20791};
20792} // end anonymous namespace
20793
20794/// Gets recurrence kind from the specified value.
20796 return HorizontalReduction::getRdxKind(V);
20797}
20798static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20799 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20800 return cast<FixedVectorType>(IE->getType())->getNumElements();
20801
20802 unsigned AggregateSize = 1;
20803 auto *IV = cast<InsertValueInst>(InsertInst);
20804 Type *CurrentType = IV->getType();
20805 do {
20806 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20807 for (auto *Elt : ST->elements())
20808 if (Elt != ST->getElementType(0)) // check homogeneity
20809 return std::nullopt;
20810 AggregateSize *= ST->getNumElements();
20811 CurrentType = ST->getElementType(0);
20812 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20813 AggregateSize *= AT->getNumElements();
20814 CurrentType = AT->getElementType();
20815 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20816 AggregateSize *= VT->getNumElements();
20817 return AggregateSize;
20818 } else if (CurrentType->isSingleValueType()) {
20819 return AggregateSize;
20820 } else {
20821 return std::nullopt;
20822 }
20823 } while (true);
20824}
20825
20826static void findBuildAggregate_rec(Instruction *LastInsertInst,
20828 SmallVectorImpl<Value *> &BuildVectorOpds,
20829 SmallVectorImpl<Value *> &InsertElts,
20830 unsigned OperandOffset, const BoUpSLP &R) {
20831 do {
20832 Value *InsertedOperand = LastInsertInst->getOperand(1);
20833 std::optional<unsigned> OperandIndex =
20834 getElementIndex(LastInsertInst, OperandOffset);
20835 if (!OperandIndex || R.isDeleted(LastInsertInst))
20836 return;
20837 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20838 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20839 BuildVectorOpds, InsertElts, *OperandIndex, R);
20840
20841 } else {
20842 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20843 InsertElts[*OperandIndex] = LastInsertInst;
20844 }
20845 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20846 } while (LastInsertInst != nullptr &&
20847 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20848 LastInsertInst->hasOneUse());
20849}
20850
20851/// Recognize construction of vectors like
20852/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20853/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20854/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20855/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20856/// starting from the last insertelement or insertvalue instruction.
20857///
20858/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20859/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20860/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20861///
20862/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20863///
20864/// \return true if it matches.
20865static bool findBuildAggregate(Instruction *LastInsertInst,
20867 SmallVectorImpl<Value *> &BuildVectorOpds,
20868 SmallVectorImpl<Value *> &InsertElts,
20869 const BoUpSLP &R) {
20870
20871 assert((isa<InsertElementInst>(LastInsertInst) ||
20872 isa<InsertValueInst>(LastInsertInst)) &&
20873 "Expected insertelement or insertvalue instruction!");
20874
20875 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20876 "Expected empty result vectors!");
20877
20878 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20879 if (!AggregateSize)
20880 return false;
20881 BuildVectorOpds.resize(*AggregateSize);
20882 InsertElts.resize(*AggregateSize);
20883
20884 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20885 R);
20886 llvm::erase(BuildVectorOpds, nullptr);
20887 llvm::erase(InsertElts, nullptr);
20888 if (BuildVectorOpds.size() >= 2)
20889 return true;
20890
20891 return false;
20892}
20893
20894/// Try and get a reduction instruction from a phi node.
20895///
20896/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20897/// if they come from either \p ParentBB or a containing loop latch.
20898///
20899/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20900/// if not possible.
20901 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
20902 BasicBlock *ParentBB, LoopInfo *LI) {
20903 // There are situations where the reduction value is not dominated by the
20904 // reduction phi. Vectorizing such cases has been reported to cause
20905 // miscompiles. See PR25787.
20906 auto DominatedReduxValue = [&](Value *R) {
20907 return isa<Instruction>(R) &&
20908 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20909 };
20910
20911 Instruction *Rdx = nullptr;
20912
20913 // Return the incoming value if it comes from the same BB as the phi node.
20914 if (P->getIncomingBlock(0) == ParentBB) {
20915 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20916 } else if (P->getIncomingBlock(1) == ParentBB) {
20917 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20918 }
20919
20920 if (Rdx && DominatedReduxValue(Rdx))
20921 return Rdx;
20922
20923 // Otherwise, check whether we have a loop latch to look at.
20924 Loop *BBL = LI->getLoopFor(ParentBB);
20925 if (!BBL)
20926 return nullptr;
20927 BasicBlock *BBLatch = BBL->getLoopLatch();
20928 if (!BBLatch)
20929 return nullptr;
20930
20931 // There is a loop latch, return the incoming value if it comes from
20932 // that. This reduction pattern occasionally turns up.
20933 if (P->getIncomingBlock(0) == BBLatch) {
20934 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20935 } else if (P->getIncomingBlock(1) == BBLatch) {
20936 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20937 }
20938
20939 if (Rdx && DominatedReduxValue(Rdx))
20940 return Rdx;
20941
20942 return nullptr;
20943}
20944
20945static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20946 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20947 return true;
20948 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20949 return true;
20950 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20951 return true;
20952 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20953 return true;
20954 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20955 return true;
20956 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20957 return true;
20958 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20959 return true;
20960 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20961 return true;
20962 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20963 return true;
20964 return false;
20965}
20966
20967/// We could have an initial reduction that is not an add.
20968/// r *= v1 + v2 + v3 + v4
20969/// In such a case start looking for a tree rooted in the first '+'.
20970 /// \returns The new root if found, which may be nullptr if not an instruction.
20971 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
20972 Instruction *Root) {
20973 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20974 isa<IntrinsicInst>(Root)) &&
20975 "Expected binop, select, or intrinsic for reduction matching");
20976 Value *LHS =
20977 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20978 Value *RHS =
20979 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20980 if (LHS == Phi)
20981 return dyn_cast<Instruction>(RHS);
20982 if (RHS == Phi)
20983 return dyn_cast<Instruction>(LHS);
20984 return nullptr;
20985}
20986
20987 /// \returns The first operand of \p I that does not match \p Phi. If the
20988 /// operand is not an instruction, it returns nullptr.
20989 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
20990 Value *Op0 = nullptr;
20991 Value *Op1 = nullptr;
20992 if (!matchRdxBop(I, Op0, Op1))
20993 return nullptr;
20994 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
20995}
20996
20997 /// \returns true if \p I is a candidate instruction for reduction vectorization.
20998 static bool isReductionCandidate(Instruction *I) {
20999 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
21000 Value *B0 = nullptr, *B1 = nullptr;
21001 bool IsBinop = matchRdxBop(I, B0, B1);
21002 return IsBinop || IsSelect;
21003}
21004
21005bool SLPVectorizerPass::vectorizeHorReduction(
21006 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
21007 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21008 if (!ShouldVectorizeHor)
21009 return false;
21010 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21011
21012 if (Root->getParent() != BB || isa<PHINode>(Root))
21013 return false;
21014
21015 // If we can find a secondary reduction root, use that instead.
21016 auto SelectRoot = [&]() {
21017 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
21018 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
21019 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
21020 return NewRoot;
21021 return Root;
21022 };
21023
21024 // Start the analysis from the Root instruction. If a horizontal reduction is
21025 // found, try to vectorize it. If it is not a horizontal reduction, or
21026 // vectorization is not possible or not effective, and the currently analyzed
21027 // instruction is a binary operation, try to vectorize the operands, using
21028 // pre-order DFS traversal order. If the operands were not vectorized, repeat
21029 // the same procedure considering each operand as a possible root of the
21030 // horizontal reduction.
21031 // Interrupt the process if the Root instruction itself was vectorized or all
21032 // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
21033 // If a horizontal reduction was not matched or vectorized, we collect
21034 // instructions for possible later vectorization attempts.
21035 std::queue<std::pair<Instruction *, unsigned>> Stack;
21036 Stack.emplace(SelectRoot(), 0);
21037 SmallPtrSet<Value *, 8> VisitedInstrs;
21038 bool Res = false;
21039 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21040 if (R.isAnalyzedReductionRoot(Inst))
21041 return nullptr;
21042 if (!isReductionCandidate(Inst))
21043 return nullptr;
21044 HorizontalReduction HorRdx;
21045 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21046 return nullptr;
21047 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21048 };
21049 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21050 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21051 FutureSeed = getNonPhiOperand(Root, P);
21052 if (!FutureSeed)
21053 return false;
21054 }
21055 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21056 // analysis is done separately.
21057 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21058 PostponedInsts.push_back(FutureSeed);
21059 return true;
21060 };
21061
21062 while (!Stack.empty()) {
21063 Instruction *Inst;
21064 unsigned Level;
21065 std::tie(Inst, Level) = Stack.front();
21066 Stack.pop();
21067 // Do not try to analyze an instruction that has already been vectorized.
21068 // This may happen when we vectorize instruction operands on a previous
21069 // iteration while the stack was populated before that happened.
21070 if (R.isDeleted(Inst))
21071 continue;
21072 if (Value *VectorizedV = TryToReduce(Inst)) {
21073 Res = true;
21074 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21075 // Try to find another reduction.
21076 Stack.emplace(I, Level);
21077 continue;
21078 }
21079 if (R.isDeleted(Inst))
21080 continue;
21081 } else {
21082 // We could not vectorize `Inst` so try to use it as a future seed.
21083 if (!TryAppendToPostponedInsts(Inst)) {
21084 assert(Stack.empty() && "Expected empty stack");
21085 break;
21086 }
21087 }
21088
21089 // Try to vectorize operands.
21090 // Continue analysis for the instruction from the same basic block only to
21091 // save compile time.
21092 if (++Level < RecursionMaxDepth)
21093 for (auto *Op : Inst->operand_values())
21094 if (VisitedInstrs.insert(Op).second)
21095 if (auto *I = dyn_cast<Instruction>(Op))
21096 // Do not try to vectorize CmpInst operands, this is done
21097 // separately.
21098 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21099 !R.isDeleted(I) && I->getParent() == BB)
21100 Stack.emplace(I, Level);
21101 }
21102 return Res;
21103}
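// --- Illustrative sketch (not part of SLPVectorizer.cpp) -------------------
// A standalone C++ sketch of the worklist traversal used above: pop a node,
// try to "reduce" it, otherwise record it as a postponed seed, then enqueue
// its not-yet-visited operands with an incremented level, bounded by a
// maximum depth. The `sketch::Node`, `TryToReduce` and `MaxDepth` names are
// assumptions made only for illustration.
#include <functional>
#include <queue>
#include <unordered_set>
#include <utility>
#include <vector>
namespace sketch {
struct Node {
  std::vector<Node *> Operands;
};
bool traverseForReductions(Node *Root, unsigned MaxDepth,
                           const std::function<bool(Node *)> &TryToReduce,
                           std::vector<Node *> &Postponed) {
  std::queue<std::pair<Node *, unsigned>> Worklist;
  std::unordered_set<Node *> Visited;
  Worklist.emplace(Root, 0u);
  bool Changed = false;
  while (!Worklist.empty()) {
    auto [N, Level] = Worklist.front();
    Worklist.pop();
    if (TryToReduce(N))
      Changed = true;
    else
      Postponed.push_back(N); // keep it as a seed for a later attempt
    if (++Level >= MaxDepth)
      continue;
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second)
        Worklist.emplace(Op, Level);
  }
  return Changed;
}
} // namespace sketch
// ----------------------------------------------------------------------------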
21104
21105bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21106 BasicBlock *BB, BoUpSLP &R) {
21107 SmallVector<WeakTrackingVH> PostponedInsts;
21108 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21109 Res |= tryToVectorize(PostponedInsts, R);
21110 return Res;
21111}
21112
21113bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21114 BoUpSLP &R) {
21115 bool Res = false;
21116 for (Value *V : Insts)
21117 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21118 Res |= tryToVectorize(Inst, R);
21119 return Res;
21120}
21121
21122bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21123 BasicBlock *BB, BoUpSLP &R,
21124 bool MaxVFOnly) {
21125 if (!R.canMapToVector(IVI->getType()))
21126 return false;
21127
21128 SmallVector<Value *, 16> BuildVectorOpds;
21129 SmallVector<Value *, 16> BuildVectorInsts;
21130 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21131 return false;
21132
21133 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21134 R.getORE()->emit([&]() {
21135 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21136 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21137 "trying reduction first.";
21138 });
21139 return false;
21140 }
21141 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21142 // Aggregate value is unlikely to be processed in vector register.
21143 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21144}
21145
21146bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21147 BasicBlock *BB, BoUpSLP &R,
21148 bool MaxVFOnly) {
21149 SmallVector<Value *, 16> BuildVectorInsts;
21150 SmallVector<Value *, 16> BuildVectorOpds;
21151 SmallVector<int> Mask;
21152 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21153 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21154 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21155 return false;
21156
21157 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21158 R.getORE()->emit([&]() {
21159 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21160 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21161 "trying reduction first.";
21162 });
21163 return false;
21164 }
21165 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21166 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21167}
21168
21169template <typename T>
21170 static bool tryToVectorizeSequence(
21171 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21172 function_ref<bool(T *, T *)> AreCompatible,
21173 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21174 bool MaxVFOnly, BoUpSLP &R) {
21175 bool Changed = false;
21176 // Sort by type, parent, operands.
21177 stable_sort(Incoming, Comparator);
21178
21179 // Try to vectorize elements based on their type.
21180 SmallVector<T *> Candidates;
21181 SmallVector<T *> VL;
21182 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21183 VL.clear()) {
21184 // Look for the next elements with the same type, parent and operand
21185 // kinds.
21186 auto *I = dyn_cast<Instruction>(*IncIt);
21187 if (!I || R.isDeleted(I)) {
21188 ++IncIt;
21189 continue;
21190 }
21191 auto *SameTypeIt = IncIt;
21192 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21193 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21194 AreCompatible(*SameTypeIt, *IncIt))) {
21195 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21196 ++SameTypeIt;
21197 if (I && !R.isDeleted(I))
21198 VL.push_back(cast<T>(I));
21199 }
21200
21201 // Try to vectorize them.
21202 unsigned NumElts = VL.size();
21203 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21204 << NumElts << ")\n");
21205 // The vectorization is a 3-stage attempt:
21206 // 1. Try to vectorize instructions with the same/alternate opcodes with the
21207 // size of the maximal register at first.
21208 // 2. Try to vectorize the remaining instructions with the same type, if
21209 // possible. This may give better vectorization results than vectorizing
21210 // only instructions with the same/alternate opcodes.
21211 // 3. Make a final attempt to vectorize all instructions with the
21212 // same/alternate ops only; this may result in some extra final
21213 // vectorization.
21214 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21215 // Success: start over because instructions might have been changed.
21216 Changed = true;
21217 VL.swap(Candidates);
21218 Candidates.clear();
21219 for (T *V : VL) {
21220 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21221 Candidates.push_back(V);
21222 }
21223 } else {
21224 /// \Returns the minimum number of elements that we will attempt to
21225 /// vectorize.
21226 auto GetMinNumElements = [&R](Value *V) {
21227 unsigned EltSize = R.getVectorElementSize(V);
21228 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21229 };
21230 if (NumElts < GetMinNumElements(*IncIt) &&
21231 (Candidates.empty() ||
21232 Candidates.front()->getType() == (*IncIt)->getType())) {
21233 for (T *V : VL) {
21234 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21235 Candidates.push_back(V);
21236 }
21237 }
21238 }
21239 // Final attempt to vectorize instructions with the same types.
21240 if (Candidates.size() > 1 &&
21241 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21242 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21243 // Success: start over because instructions might have been changed.
21244 Changed = true;
21245 } else if (MaxVFOnly) {
21246 // Try to vectorize using small vectors.
21247 SmallVector<T *> VL;
21248 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21249 VL.clear()) {
21250 auto *I = dyn_cast<Instruction>(*It);
21251 if (!I || R.isDeleted(I)) {
21252 ++It;
21253 continue;
21254 }
21255 auto *SameTypeIt = It;
21256 while (SameTypeIt != End &&
21257 (!isa<Instruction>(*SameTypeIt) ||
21258 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21259 AreCompatible(*SameTypeIt, *It))) {
21260 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21261 ++SameTypeIt;
21262 if (I && !R.isDeleted(I))
21263 VL.push_back(cast<T>(I));
21264 }
21265 unsigned NumElts = VL.size();
21266 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21267 /*MaxVFOnly=*/false))
21268 Changed = true;
21269 It = SameTypeIt;
21270 }
21271 }
21272 Candidates.clear();
21273 }
21274
21275 // Start over at the next instruction of a different type (or the end).
21276 IncIt = SameTypeIt;
21277 }
21278 return Changed;
21279}
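// --- Illustrative sketch (not part of SLPVectorizer.cpp) -------------------
// A standalone C++ sketch of the driving pattern above: stable-sort the
// candidates with a strict weak ordering, then sweep runs of mutually
// compatible elements and hand every run of two or more elements to a
// vectorization callback. Plain ints and the helper names are assumptions
// made only for illustration; the staged MaxVF retry logic is omitted.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>
namespace sketch {
bool trySequence(std::vector<int> &Incoming,
                 const std::function<bool(int, int)> &Less,
                 const std::function<bool(int, int)> &AreCompatible,
                 const std::function<bool(const std::vector<int> &)> &TryBundle) {
  bool Changed = false;
  std::stable_sort(Incoming.begin(), Incoming.end(), Less);
  for (std::size_t I = 0; I < Incoming.size();) {
    std::size_t J = I + 1;
    // Grow the run while the next element is compatible with the run's head.
    while (J < Incoming.size() && AreCompatible(Incoming[J], Incoming[I]))
      ++J;
    if (J - I > 1)
      Changed |= TryBundle(
          std::vector<int>(Incoming.begin() + I, Incoming.begin() + J));
    I = J; // continue at the first element of the next run
  }
  return Changed;
}
} // namespace sketch
// ----------------------------------------------------------------------------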
21280
21281 /// Compare two cmp instructions. If IsCompatibility is true, the function
21282 /// returns true if the 2 cmps have the same/swapped predicates and compatible
21283 /// corresponding operands. If IsCompatibility is false, the function
21284 /// implements a strict weak ordering relation between two cmp instructions,
21285 /// returning true if the first instruction is "less" than the second, i.e. its
21286 /// predicate is less than the predicate of the second or the operand IDs are
21287 /// less than the operand IDs of the second cmp instruction.
21288template <bool IsCompatibility>
21289static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21290 const DominatorTree &DT) {
21291 assert(isValidElementType(V->getType()) &&
21292 isValidElementType(V2->getType()) &&
21293 "Expected valid element types only.");
21294 if (V == V2)
21295 return IsCompatibility;
21296 auto *CI1 = cast<CmpInst>(V);
21297 auto *CI2 = cast<CmpInst>(V2);
21298 if (CI1->getOperand(0)->getType()->getTypeID() <
21299 CI2->getOperand(0)->getType()->getTypeID())
21300 return !IsCompatibility;
21301 if (CI1->getOperand(0)->getType()->getTypeID() >
21302 CI2->getOperand(0)->getType()->getTypeID())
21303 return false;
21304 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21305 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21306 return !IsCompatibility;
21307 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21308 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21309 return false;
21310 CmpInst::Predicate Pred1 = CI1->getPredicate();
21311 CmpInst::Predicate Pred2 = CI2->getPredicate();
21312 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21313 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21314 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21315 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21316 if (BasePred1 < BasePred2)
21317 return !IsCompatibility;
21318 if (BasePred1 > BasePred2)
21319 return false;
21320 // Compare operands.
21321 bool CI1Preds = Pred1 == BasePred1;
21322 bool CI2Preds = Pred2 == BasePred1;
21323 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21324 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21325 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21326 if (Op1 == Op2)
21327 continue;
21328 if (Op1->getValueID() < Op2->getValueID())
21329 return !IsCompatibility;
21330 if (Op1->getValueID() > Op2->getValueID())
21331 return false;
21332 if (auto *I1 = dyn_cast<Instruction>(Op1))
21333 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21334 if (IsCompatibility) {
21335 if (I1->getParent() != I2->getParent())
21336 return false;
21337 } else {
21338 // Try to compare nodes with same parent.
21339 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21340 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21341 if (!NodeI1)
21342 return NodeI2 != nullptr;
21343 if (!NodeI2)
21344 return false;
21345 assert((NodeI1 == NodeI2) ==
21346 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21347 "Different nodes should have different DFS numbers");
21348 if (NodeI1 != NodeI2)
21349 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21350 }
21351 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21352 if (S && (IsCompatibility || !S.isAltShuffle()))
21353 continue;
21354 if (IsCompatibility)
21355 return false;
21356 if (I1->getOpcode() != I2->getOpcode())
21357 return I1->getOpcode() < I2->getOpcode();
21358 }
21359 }
21360 return IsCompatibility;
21361}
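// --- Illustrative sketch (not part of SLPVectorizer.cpp) -------------------
// A standalone C++ sketch of the template<bool IsCompatibility> idiom used by
// compareCmp above: a single function body serves both as a strict weak
// ordering for sorting (IsCompatibility = false) and as an equivalence check
// for grouping (IsCompatibility = true). The toy `Key` pair and helper names
// are assumptions made only for illustration.
#include <utility>
namespace sketch {
using Key = std::pair<int, int>; // e.g. (type id, predicate)
template <bool IsCompatibility>
bool compareKeys(const Key &A, const Key &B) {
  if (A.first < B.first)
    return !IsCompatibility; // "less" when ordering, "incompatible" when grouping
  if (A.first > B.first)
    return false;
  if (A.second < B.second)
    return !IsCompatibility;
  if (A.second > B.second)
    return false;
  return IsCompatibility; // equal keys: not "less", but compatible
}
inline bool lessKeys(const Key &A, const Key &B) {
  return compareKeys<false>(A, B);
}
inline bool compatibleKeys(const Key &A, const Key &B) {
  return compareKeys<true>(A, B);
}
} // namespace sketch
// ----------------------------------------------------------------------------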
21362
21363template <typename ItT>
21364bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21365 BasicBlock *BB, BoUpSLP &R) {
21366 bool Changed = false;
21367 // Try to find reductions first.
21368 for (CmpInst *I : CmpInsts) {
21369 if (R.isDeleted(I))
21370 continue;
21371 for (Value *Op : I->operands())
21372 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21373 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21374 if (R.isDeleted(I))
21375 break;
21376 }
21377 }
21378 // Try to vectorize operands as vector bundles.
21379 for (CmpInst *I : CmpInsts) {
21380 if (R.isDeleted(I))
21381 continue;
21382 Changed |= tryToVectorize(I, R);
21383 }
21384 // Try to vectorize list of compares.
21385 // Sort by type, compare predicate, etc.
21386 auto CompareSorter = [&](Value *V, Value *V2) {
21387 if (V == V2)
21388 return false;
21389 return compareCmp<false>(V, V2, *TLI, *DT);
21390 };
21391
21392 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21393 if (V1 == V2)
21394 return true;
21395 return compareCmp<true>(V1, V2, *TLI, *DT);
21396 };
21397
21398 SmallVector<Value *> Vals;
21399 for (Instruction *V : CmpInsts)
21400 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21401 Vals.push_back(V);
21402 if (Vals.size() <= 1)
21403 return Changed;
21404 Changed |= tryToVectorizeSequence<Value>(
21405 Vals, CompareSorter, AreCompatibleCompares,
21406 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21407 // Exclude possible reductions from other blocks.
21408 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21409 return any_of(V->users(), [V](User *U) {
21410 auto *Select = dyn_cast<SelectInst>(U);
21411 return Select &&
21412 Select->getParent() != cast<Instruction>(V)->getParent();
21413 });
21414 });
21415 if (ArePossiblyReducedInOtherBlock)
21416 return false;
21417 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21418 },
21419 /*MaxVFOnly=*/true, R);
21420 return Changed;
21421}
21422
21423bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21424 BasicBlock *BB, BoUpSLP &R) {
21425 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21426 "This function only accepts Insert instructions");
21427 bool OpsChanged = false;
21428 SmallVector<WeakTrackingVH> PostponedInsts;
21429 for (auto *I : reverse(Instructions)) {
21430 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21431 if (R.isDeleted(I) || isa<CmpInst>(I))
21432 continue;
21433 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21434 OpsChanged |=
21435 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21436 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21437 OpsChanged |=
21438 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21439 }
21440 // pass2 - try to vectorize reductions only
21441 if (R.isDeleted(I))
21442 continue;
21443 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21444 if (R.isDeleted(I) || isa<CmpInst>(I))
21445 continue;
21446 // pass3 - try to match and vectorize a buildvector sequence.
21447 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21448 OpsChanged |=
21449 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21450 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21451 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21452 /*MaxVFOnly=*/false);
21453 }
21454 }
21455 // Now try to vectorize postponed instructions.
21456 OpsChanged |= tryToVectorize(PostponedInsts, R);
21457
21458 Instructions.clear();
21459 return OpsChanged;
21460}
21461
21462bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21463 bool Changed = false;
21464 SmallVector<Value *, 4> Incoming;
21465 SmallPtrSet<Value *, 16> VisitedInstrs;
21466 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21467 // node. This helps to better identify the chains that can be vectorized in
21468 // the best way.
21469 DenseMap<Value *, SmallVector<Value *>> PHIToOpcodes;
21470 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21471 assert(isValidElementType(V1->getType()) &&
21472 isValidElementType(V2->getType()) &&
21473 "Expected vectorizable types only.");
21474 // It is fine to compare type IDs here, since we expect only vectorizable
21475 // types, like ints, floats and pointers; we don't care about other types.
21476 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21477 return true;
21478 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21479 return false;
21480 if (V1->getType()->getScalarSizeInBits() <
21481 V2->getType()->getScalarSizeInBits())
21482 return true;
21483 if (V1->getType()->getScalarSizeInBits() >
21484 V2->getType()->getScalarSizeInBits())
21485 return false;
21486 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21487 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21488 if (Opcodes1.size() < Opcodes2.size())
21489 return true;
21490 if (Opcodes1.size() > Opcodes2.size())
21491 return false;
21492 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21493 {
21494 // Instructions come first.
21495 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21496 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21497 if (I1 && I2) {
21498 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21499 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21500 if (!NodeI1)
21501 return NodeI2 != nullptr;
21502 if (!NodeI2)
21503 return false;
21504 assert((NodeI1 == NodeI2) ==
21505 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21506 "Different nodes should have different DFS numbers");
21507 if (NodeI1 != NodeI2)
21508 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21509 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21510 if (S && !S.isAltShuffle())
21511 continue;
21512 return I1->getOpcode() < I2->getOpcode();
21513 }
21514 if (I1)
21515 return true;
21516 if (I2)
21517 return false;
21518 }
21519 {
21520 // Non-undef constants come next.
21521 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21522 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21523 if (C1 && C2)
21524 continue;
21525 if (C1)
21526 return true;
21527 if (C2)
21528 return false;
21529 }
21530 bool U1 = isa<UndefValue>(Opcodes1[I]);
21531 bool U2 = isa<UndefValue>(Opcodes2[I]);
21532 {
21533 // Non-constant non-instructions come next.
21534 if (!U1 && !U2) {
21535 auto ValID1 = Opcodes1[I]->getValueID();
21536 auto ValID2 = Opcodes2[I]->getValueID();
21537 if (ValID1 == ValID2)
21538 continue;
21539 if (ValID1 < ValID2)
21540 return true;
21541 if (ValID1 > ValID2)
21542 return false;
21543 }
21544 if (!U1)
21545 return true;
21546 if (!U2)
21547 return false;
21548 }
21549 // Undefs come last.
21550 assert(U1 && U2 && "The only thing left should be undef & undef.");
21551 }
21552 return false;
21553 };
21554 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21555 if (V1 == V2)
21556 return true;
21557 if (V1->getType() != V2->getType())
21558 return false;
21559 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21560 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21561 if (Opcodes1.size() != Opcodes2.size())
21562 return false;
21563 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21564 // Undefs are compatible with any other value.
21565 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21566 continue;
21567 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21568 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21569 if (R.isDeleted(I1) || R.isDeleted(I2))
21570 return false;
21571 if (I1->getParent() != I2->getParent())
21572 return false;
21573 if (getSameOpcode({I1, I2}, *TLI))
21574 continue;
21575 return false;
21576 }
21577 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21578 continue;
21579 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21580 return false;
21581 }
21582 return true;
21583 };
21584
21585 bool HaveVectorizedPhiNodes = false;
21586 do {
21587 // Collect the incoming values from the PHIs.
21588 Incoming.clear();
21589 for (Instruction &I : *BB) {
21590 auto *P = dyn_cast<PHINode>(&I);
21591 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21592 break;
21593
21594 // No need to analyze deleted, vectorized and non-vectorizable
21595 // instructions.
21596 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21597 isValidElementType(P->getType()))
21598 Incoming.push_back(P);
21599 }
21600
21601 if (Incoming.size() <= 1)
21602 break;
21603
21604 // Find the corresponding non-phi nodes for better matching when trying to
21605 // build the tree.
21606 for (Value *V : Incoming) {
21607 SmallVectorImpl<Value *> &Opcodes =
21608 PHIToOpcodes.try_emplace(V).first->getSecond();
21609 if (!Opcodes.empty())
21610 continue;
21611 SmallVector<Value *, 4> Nodes(1, V);
21612 SmallPtrSet<Value *, 4> Visited;
21613 while (!Nodes.empty()) {
21614 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21615 if (!Visited.insert(PHI).second)
21616 continue;
21617 for (Value *V : PHI->incoming_values()) {
21618 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21619 Nodes.push_back(PHI1);
21620 continue;
21621 }
21622 Opcodes.emplace_back(V);
21623 }
21624 }
21625 }
21626
21627 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21628 Incoming, PHICompare, AreCompatiblePHIs,
21629 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21630 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21631 },
21632 /*MaxVFOnly=*/true, R);
21633 Changed |= HaveVectorizedPhiNodes;
21634 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21635 auto *PHI = dyn_cast<PHINode>(P.first);
21636 return !PHI || R.isDeleted(PHI);
21637 }))
21638 PHIToOpcodes.clear();
21639 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21640 } while (HaveVectorizedPhiNodes);
21641
21642 VisitedInstrs.clear();
21643
21644 InstSetVector PostProcessInserts;
21645 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21646 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
21647 // also vectorizes `PostProcessCmps`.
21648 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21649 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21650 if (VectorizeCmps) {
21651 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21652 PostProcessCmps.clear();
21653 }
21654 PostProcessInserts.clear();
21655 return Changed;
21656 };
21657 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21658 auto IsInPostProcessInstrs = [&](Instruction *I) {
21659 if (auto *Cmp = dyn_cast<CmpInst>(I))
21660 return PostProcessCmps.contains(Cmp);
21661 return isa<InsertElementInst, InsertValueInst>(I) &&
21662 PostProcessInserts.contains(I);
21663 };
21664 // Returns true if `I` is an instruction without users, like a terminator, a
21665 // store, or a function call whose return value is ignored. Unused non-void
21666 // instructions are only considered if they are CallInst or InvokeInst.
21667 auto HasNoUsers = [](Instruction *I) {
21668 return I->use_empty() &&
21669 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21670 };
21671 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21672 // Skip instructions with a scalable type. The number of elements is unknown
21673 // at compile time for scalable types.
21674 if (isa<ScalableVectorType>(It->getType()))
21675 continue;
21676
21677 // Skip instructions marked for deletion.
21678 if (R.isDeleted(&*It))
21679 continue;
21680 // We may go through BB multiple times, so skip the ones we have already checked.
21681 if (!VisitedInstrs.insert(&*It).second) {
21682 if (HasNoUsers(&*It) &&
21683 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21684 // We would like to start over since some instructions are deleted
21685 // and the iterator may have become invalid.
21686 Changed = true;
21687 It = BB->begin();
21688 E = BB->end();
21689 }
21690 continue;
21691 }
21692
21693 if (isa<DbgInfoIntrinsic>(It))
21694 continue;
21695
21696 // Try to vectorize reductions that use PHINodes.
21697 if (PHINode *P = dyn_cast<PHINode>(It)) {
21698 // Check that the PHI is a reduction PHI.
21699 if (P->getNumIncomingValues() == 2) {
21700 // Try to match and vectorize a horizontal reduction.
21701 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21702 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21703 Changed = true;
21704 It = BB->begin();
21705 E = BB->end();
21706 continue;
21707 }
21708 }
21709 // Try to vectorize the incoming values of the PHI, to catch reductions
21710 // that feed into PHIs.
21711 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21712 // Skip if the incoming block is the current BB for now. Also, bypass
21713 // unreachable IR for efficiency and to avoid crashing.
21714 // TODO: Collect the skipped incoming values and try to vectorize them
21715 // after processing BB.
21716 if (BB == P->getIncomingBlock(I) ||
21717 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21718 continue;
21719
21720 // Postponed instructions should not be vectorized here, delay their
21721 // vectorization.
21722 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21723 PI && !IsInPostProcessInstrs(PI)) {
21724 bool Res =
21725 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21726 Changed |= Res;
21727 if (Res && R.isDeleted(P)) {
21728 It = BB->begin();
21729 E = BB->end();
21730 break;
21731 }
21732 }
21733 }
21734 continue;
21735 }
21736
21737 if (HasNoUsers(&*It)) {
21738 bool OpsChanged = false;
21739 auto *SI = dyn_cast<StoreInst>(It);
21740 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21741 if (SI) {
21742 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21743 // Try to vectorize the chain in the store, if this is the only store to
21744 // the address in the block.
21745 // TODO: This is just a temporary solution to save compile time. Need to
21746 // investigate if we can safely turn on slp-vectorize-hor-store instead to
21747 // allow lookup of reduction chains in all non-vectorized stores (need to
21748 // check side effects and compile time).
21749 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21750 SI->getValueOperand()->hasOneUse();
21751 }
21752 if (TryToVectorizeRoot) {
21753 for (auto *V : It->operand_values()) {
21754 // Postponed instructions should not be vectorized here, delay their
21755 // vectorization.
21756 if (auto *VI = dyn_cast<Instruction>(V);
21757 VI && !IsInPostProcessInstrs(VI))
21758 // Try to match and vectorize a horizontal reduction.
21759 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21760 }
21761 }
21762 // Start vectorization of post-process list of instructions from the
21763 // top-tree instructions to try to vectorize as many instructions as
21764 // possible.
21765 OpsChanged |=
21766 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21767 if (OpsChanged) {
21768 // We would like to start over since some instructions are deleted
21769 // and the iterator may have become invalid.
21770 Changed = true;
21771 It = BB->begin();
21772 E = BB->end();
21773 continue;
21774 }
21775 }
21776
21777 if (isa<InsertElementInst, InsertValueInst>(It))
21778 PostProcessInserts.insert(&*It);
21779 else if (isa<CmpInst>(It))
21780 PostProcessCmps.insert(cast<CmpInst>(&*It));
21781 }
21782
21783 return Changed;
21784}
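// --- Illustrative sketch (not part of SLPVectorizer.cpp) -------------------
// A standalone C++ sketch of the restart-on-change scan used above: whenever
// a transformation may delete instructions (invalidating the iterator), the
// scan is restarted from the beginning of the container instead of resuming
// from a possibly stale position. The toy list and callback name are
// assumptions made only for illustration.
#include <functional>
#include <list>
namespace sketch {
bool scanWithRestart(std::list<int> &Items,
                     const std::function<bool(std::list<int> &, int)> &Transform) {
  bool Changed = false;
  for (auto It = Items.begin(), E = Items.end(); It != E; ++It) {
    if (Transform(Items, *It)) {
      Changed = true;
      // The container may have been modified; restart the scan. As in the
      // loop above, the for-increment then advances past the first element.
      It = Items.begin();
      E = Items.end();
      if (It == E)
        break;
    }
  }
  return Changed;
}
} // namespace sketch
// ----------------------------------------------------------------------------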
21785
21786bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21787 auto Changed = false;
21788 for (auto &Entry : GEPs) {
21789 // If the getelementptr list has fewer than two elements, there's nothing
21790 // to do.
21791 if (Entry.second.size() < 2)
21792 continue;
21793
21794 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21795 << Entry.second.size() << ".\n");
21796
21797 // Process the GEP list in chunks suitable for the target's supported
21798 // vector size. If a vector register can't hold 1 element, we are done. We
21799 // are trying to vectorize the index computations, so the maximum number of
21800 // elements is based on the size of the index expression, rather than the
21801 // size of the GEP itself (the target's pointer size).
21802 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21803 return !R.isDeleted(GEP);
21804 });
21805 if (It == Entry.second.end())
21806 continue;
21807 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21808 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21809 if (MaxVecRegSize < EltSize)
21810 continue;
21811
21812 unsigned MaxElts = MaxVecRegSize / EltSize;
21813 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21814 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21815 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21816
21817 // Initialize a set of candidate getelementptrs. Note that we use a
21818 // SetVector here to preserve program order. If the index computations
21819 // are vectorizable and begin with loads, we want to minimize the chance
21820 // of having to reorder them later.
21821 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21822
21823 // Some of the candidates may have already been vectorized after we
21824 // initially collected them, or their index was optimized to a constant value.
21825 // If so, they are marked as deleted, so remove them from the set of
21826 // candidates.
21827 Candidates.remove_if([&R](Value *I) {
21828 return R.isDeleted(cast<Instruction>(I)) ||
21829 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21830 });
21831
21832 // Remove from the set of candidates all pairs of getelementptrs with
21833 // constant differences. Such getelementptrs are likely not good
21834 // candidates for vectorization in a bottom-up phase since one can be
21835 // computed from the other. We also ensure all candidate getelementptr
21836 // indices are unique.
21837 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21838 auto *GEPI = GEPList[I];
21839 if (!Candidates.count(GEPI))
21840 continue;
21841 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21842 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21843 auto *GEPJ = GEPList[J];
21844 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21845 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21846 Candidates.remove(GEPI);
21847 Candidates.remove(GEPJ);
21848 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21849 Candidates.remove(GEPJ);
21850 }
21851 }
21852 }
21853
21854 // We break out of the above computation as soon as we know there are
21855 // fewer than two candidates remaining.
21856 if (Candidates.size() < 2)
21857 continue;
21858
21859 // Add the single, non-constant index of each candidate to the bundle. We
21860 // ensured the indices met these constraints when we originally collected
21861 // the getelementptrs.
21862 SmallVector<Value *, 16> Bundle(Candidates.size());
21863 auto BundleIndex = 0u;
21864 for (auto *V : Candidates) {
21865 auto *GEP = cast<GetElementPtrInst>(V);
21866 auto *GEPIdx = GEP->idx_begin()->get();
21867 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21868 Bundle[BundleIndex++] = GEPIdx;
21869 }
21870
21871 // Try and vectorize the indices. We are currently only interested in
21872 // gather-like cases of the form:
21873 //
21874 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21875 //
21876 // where the loads of "a", the loads of "b", and the subtractions can be
21877 // performed in parallel. It's likely that detecting this pattern in a
21878 // bottom-up phase will be simpler and less costly than building a
21879 // full-blown top-down phase beginning at the consecutive loads.
21880 Changed |= tryToVectorizeList(Bundle, R);
21881 }
21882 }
21883 return Changed;
21884}
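// --- Illustrative sketch (not part of SLPVectorizer.cpp) -------------------
// A standalone C++ sketch of the candidate filtering above: when the
// difference between two candidates' addresses folds to a compile-time
// constant, one index can be computed from the other, so both are dropped
// from the bundle. Integer offsets stand in for SCEV expressions here, and
// the `Candidate` type and function name are assumptions made only for
// illustration (the identical-index case is omitted for brevity).
#include <cstddef>
#include <optional>
#include <set>
#include <vector>
namespace sketch {
struct Candidate {
  int Id = 0;
  std::optional<long> ConstOffset; // set when the address folds to base + C
};
void filterConstantDifferencePairs(const std::vector<Candidate> &Cands,
                                   std::set<int> &Live) {
  for (const Candidate &C : Cands)
    Live.insert(C.Id);
  for (std::size_t I = 0; I < Cands.size() && Live.size() > 1; ++I) {
    if (!Live.count(Cands[I].Id))
      continue;
    for (std::size_t J = I + 1; J < Cands.size() && Live.size() > 1; ++J) {
      // With a common base, the difference is a constant exactly when both
      // offsets are known constants (a simplification of the SCEV check).
      if (Cands[I].ConstOffset && Cands[J].ConstOffset) {
        Live.erase(Cands[I].Id);
        Live.erase(Cands[J].Id);
      }
    }
  }
}
} // namespace sketch
// ----------------------------------------------------------------------------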
21885
21886bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21887 bool Changed = false;
21888 // Sort by type, base pointer and value operand. Value operands must be
21889 // compatible (have the same opcode, same parent); otherwise it is
21890 // definitely not profitable to try to vectorize them.
21891 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21892 if (V->getValueOperand()->getType()->getTypeID() <
21893 V2->getValueOperand()->getType()->getTypeID())
21894 return true;
21895 if (V->getValueOperand()->getType()->getTypeID() >
21896 V2->getValueOperand()->getType()->getTypeID())
21897 return false;
21898 if (V->getPointerOperandType()->getTypeID() <
21899 V2->getPointerOperandType()->getTypeID())
21900 return true;
21901 if (V->getPointerOperandType()->getTypeID() >
21902 V2->getPointerOperandType()->getTypeID())
21903 return false;
21904 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21905 V2->getValueOperand()->getType()->getScalarSizeInBits())
21906 return true;
21907 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21908 V2->getValueOperand()->getType()->getScalarSizeInBits())
21909 return false;
21910 // UndefValues are compatible with all other values.
21911 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21912 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21913 DomTreeNodeBase<BasicBlock> *NodeI1 =
21914 DT->getNode(I1->getParent());
21915 DomTreeNodeBase<BasicBlock> *NodeI2 =
21916 DT->getNode(I2->getParent());
21917 assert(NodeI1 && "Should only process reachable instructions");
21918 assert(NodeI2 && "Should only process reachable instructions");
21919 assert((NodeI1 == NodeI2) ==
21920 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21921 "Different nodes should have different DFS numbers");
21922 if (NodeI1 != NodeI2)
21923 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21924 return I1->getOpcode() < I2->getOpcode();
21925 }
21926 return V->getValueOperand()->getValueID() <
21927 V2->getValueOperand()->getValueID();
21928 };
21929
21930 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21931 if (V1 == V2)
21932 return true;
21933 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21934 return false;
21935 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21936 return false;
21937 // Undefs are compatible with any other value.
21938 if (isa<UndefValue>(V1->getValueOperand()) ||
21939 isa<UndefValue>(V2->getValueOperand()))
21940 return true;
21941 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21942 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21943 if (I1->getParent() != I2->getParent())
21944 return false;
21945 return getSameOpcode({I1, I2}, *TLI).valid();
21946 }
21947 if (isa<Constant>(V1->getValueOperand()) &&
21948 isa<Constant>(V2->getValueOperand()))
21949 return true;
21950 return V1->getValueOperand()->getValueID() ==
21951 V2->getValueOperand()->getValueID();
21952 };
21953
21954 // Attempt to sort and vectorize each of the store-groups.
21955 DenseSet<std::pair<Value *, Value *>> Attempted;
21956 for (auto &Pair : Stores) {
21957 if (Pair.second.size() < 2)
21958 continue;
21959
21960 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21961 << Pair.second.size() << ".\n");
21962
21963 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21964 continue;
21965
21966 // Reverse the stores to do bottom-to-top analysis. This is important if
21967 // values are stored to the same address several times; in this case we need
21968 // to follow the store order (reversed to meet the memory dependencies).
21969 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21970 Pair.second.rend());
21971 Changed |= tryToVectorizeSequence<StoreInst>(
21972 ReversedStores, StoreSorter, AreCompatibleStores,
21973 [&](ArrayRef<StoreInst *> Candidates, bool) {
21974 return vectorizeStores(Candidates, R, Attempted);
21975 },
21976 /*MaxVFOnly=*/false, R);
21977 }
21978 return Changed;
21979}
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1504
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is a multiple of the subvector's length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
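The APInt helpers above are what lane- and bit-tracking in the vectorizer typically relies on. A minimal, self-contained sketch of that pattern (the 8-lane width and the variable names are illustrative, not taken from the pass):

  #include "llvm/ADT/APInt.h"

  void demandedLanesSketch() {
    using llvm::APInt;
    // Start with no lanes demanded, then mark lanes 0..3 as live.
    APInt Demanded = APInt::getZero(8);
    Demanded.setBits(0, 4);               // lanes 0, 1, 2, 3
    Demanded.clearBit(2);                 // lane 2 turned out to be dead
    bool AnyLive = !Demanded.isZero();    // true
    bool AllLive = Demanded.isAllOnes();  // false
    (void)AnyLive; (void)AllLive;
  }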
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
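Many of the static helpers in this file take an ArrayRef bundle and peel pieces off it with the operations listed above. A small sketch with plain integers (the names are illustrative):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"

  void arrayRefSketch() {
    llvm::SmallVector<int, 8> Storage = {0, 1, 2, 3, 4, 5, 6, 7};
    llvm::ArrayRef<int> VL(Storage);
    llvm::ArrayRef<int> Head = VL.take_front(4);   // {0, 1, 2, 3}
    llvm::ArrayRef<int> Tail = VL.drop_front(6);   // {6, 7}
    llvm::ArrayRef<int> Mid  = VL.slice(2, 4);     // {2, 3, 4, 5}
    bool Same = Head.equals(VL.take_front(4));     // true; element-wise comparison
    (void)Tail; (void)Mid; (void)Same;
  }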
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:461
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
reverse_iterator rend()
Definition: BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:688
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:240
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1986
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1881
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2123
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1980
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1977
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
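Alternate-opcode handling for compares (see isAlternateInstruction and compareCmp above) leans on these predicate queries. A hedged sketch of how a predicate relates to its swapped and inverse forms, assuming Cmp is some CmpInst*:

  #include "llvm/IR/InstrTypes.h"

  void predicateSketch(llvm::CmpInst *Cmp) {
    using llvm::CmpInst;
    CmpInst::Predicate P = Cmp->getPredicate();               // e.g. ICMP_SLT
    CmpInst::Predicate Swapped = Cmp->getSwappedPredicate();  // ICMP_SLT -> ICMP_SGT
    CmpInst::Predicate Inverse = Cmp->getInversePredicate();  // ICMP_SLT -> ICMP_SGE
    (void)P; (void)Swapped; (void)Inverse;
  }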
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
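A typical use of the DenseMap API above is mapping a scalar value to some bookkeeping index. The map name and payload below are hypothetical, not the pass's actual data structures:

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/IR/Value.h"

  unsigned lookupOrAssignLane(llvm::DenseMap<const llvm::Value *, unsigned> &LaneOf,
                              const llvm::Value *V, unsigned NextLane) {
    // try_emplace inserts only if the key is absent and reports what happened.
    auto [It, Inserted] = LaneOf.try_emplace(V, NextLane);
    return It->second; // the existing lane if !Inserted, NextLane otherwise
  }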
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:871
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1072
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:530
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2574
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2186
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:867
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1761
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:889
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2404
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:881
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
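The gather/buildvector code paths ultimately boil down to IRBuilder calls like the ones listed above. A minimal sketch of packing scalars into a vector with insertelement (the function name is illustrative; the real emission goes through ShuffleInstructionBuilder):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"

  llvm::Value *packScalarsSketch(llvm::IRBuilderBase &Builder,
                                 llvm::ArrayRef<llvm::Value *> Scalars) {
    // Assumes all Scalars share the same scalar type.
    auto *VecTy = llvm::FixedVectorType::get(Scalars.front()->getType(),
                                             Scalars.size());
    llvm::Value *Vec = llvm::PoisonValue::get(VecTy);
    for (unsigned I = 0, E = Scalars.size(); I != E; ++I)
      Vec = Builder.CreateInsertElement(Vec, Scalars[I], Builder.getInt64(I));
    return Vec;
  }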
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:319
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:799
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:511
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:315
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:310
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:316
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
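The scheduler's dependence checks combine MemoryLocation with the BatchAAResults wrapper listed earlier. A hedged sketch of a single query, assuming AA, Load, and Store come from the surrounding analysis:

  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/MemoryLocation.h"
  #include "llvm/IR/Instructions.h"

  bool mayInterfereSketch(llvm::AAResults &AA, llvm::LoadInst *Load,
                          llvm::StoreInst *Store) {
    llvm::BatchAAResults BatchAA(AA);  // valid while no IR changes are made
    llvm::MemoryLocation Loc = llvm::MemoryLocation::get(Load);
    llvm::ModRefInfo MRI = BatchAA.getModRefInfo(Store, Loc);
    return llvm::isModOrRefSet(MRI);   // Store may read or write Load's location
  }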
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:168
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
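Consecutive-pointer checks in the load analysis use ScalarEvolution in roughly this way. The helper below is a sketch under the assumption that both pointers are in the same address space and EltSize is the element store size in bytes:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/Casting.h"

  bool looksConsecutiveSketch(llvm::ScalarEvolution &SE, llvm::Value *PtrA,
                              llvm::Value *PtrB, uint64_t EltSize) {
    const llvm::SCEV *A = SE.getSCEV(PtrA);
    const llvm::SCEV *B = SE.getSCEV(PtrB);
    const llvm::SCEV *Diff = SE.getMinusSCEV(B, A);
    if (const auto *C = llvm::dyn_cast<llvm::SCEVConstant>(Diff))
      return C->getAPInt() == EltSize;  // B is exactly one element past A
    return false;
  }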
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
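These static mask classifiers are what the cost model uses to recognize cheap shuffles. A small sketch over a hand-written mask (the values are illustrative):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instructions.h"

  void maskKindSketch() {
    llvm::SmallVector<int> Mask = {3, 2, 1, 0};
    bool IsReverse =
        llvm::ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/4);   // true
    bool IsIdentity =
        llvm::ShuffleVectorInst::isIdentityMask(Mask, /*NumSrcElts=*/4);  // false
    int Index = 0;
    bool IsExtract = llvm::ShuffleVectorInst::isExtractSubvectorMask(
        Mask, /*NumSrcElts=*/4, Index);
    (void)IsReverse; (void)IsIdentity; (void)IsExtract;
  }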
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
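isUndefVector above returns one of these bit vectors with a bit per lane. A short sketch of setting and iterating the set bits, assuming an 8-lane mask:

  #include "llvm/ADT/SmallBitVector.h"

  unsigned countUsedLanesSketch() {
    llvm::SmallBitVector UseMask(8);       // 8 lanes, all initially clear
    UseMask.set(1);
    UseMask.set(5);
    unsigned Visited = 0;
    for (int I = UseMask.find_first(); I != -1; I = UseMask.find_next(I))
      ++Visited;                           // visits lanes 1 and 5
    return Visited;                        // same as UseMask.count()
  }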
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
\Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
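The profitability decision is, at its core, a comparison of TTI cost queries like the ones listed above. A hedged sketch comparing a scalar add replicated VF times against one vector add (function and variable names are illustrative; the real comparison in getTreeCost is far more involved):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"

  llvm::InstructionCost addCostDeltaSketch(const llvm::TargetTransformInfo &TTI,
                                           llvm::Type *ScalarTy, unsigned VF) {
    auto *VecTy = llvm::FixedVectorType::get(ScalarTy, VF);
    llvm::InstructionCost ScalarCost =
        TTI.getArithmeticInstrCost(llvm::Instruction::Add, ScalarTy);
    ScalarCost *= VF;  // VF independent scalar adds
    llvm::InstructionCost VectorCost =
        TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy);
    return VectorCost - ScalarCost;  // negative delta means the vector form wins
  }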
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
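isValidElementType above gates vectorization on the scalar type; the sketch below is a simplified stand-in in that spirit, not the pass's exact predicate, and the helper name is hypothetical:

  #include "llvm/IR/Type.h"

  bool isPlausibleSLPElementTypeSketch(llvm::Type *Ty) {
    // Integers, floats and pointers are candidates; exotic FP types are not.
    return (Ty->isIntegerTy() || Ty->isFloatingPointTy() || Ty->isPointerTy()) &&
           !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty();
  }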
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
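When a scalar is finally replaced by an extract from the vectorized value, the rewiring uses the Value API listed above. A minimal sketch, assuming Scalar is the original instruction's value and Extract is its replacement:

  #include "llvm/IR/Value.h"

  void replaceScalarSketch(llvm::Value *Scalar, llvm::Value *Extract) {
    if (!Scalar->use_empty())
      Scalar->replaceAllUsesWith(Extract);  // redirect every remaining use
    Extract->takeName(Scalar);              // keep the old name for readability
  }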
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for a pair which has the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
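The BoUpSLP entry points listed above are normally driven in a fixed order: build the tree, bail out if it is too small, reorder, compute minimum bitwidths, cost the tree, and only then vectorize. The following is a condensed, hypothetical sketch of that flow; BoUpSLP is file-local to SLPVectorizer.cpp, so this mirrors the pass's own sequencing rather than an external API, and the helper name tryVectorizeChain, the single-argument buildTree call, and the SLPCostThreshold comparison are assumptions for illustration only.

// Hypothetical sketch: driving BoUpSLP over a candidate bundle VL
// (not the exact code of the pass).
static bool tryVectorizeChain(BoUpSLP &R, ArrayRef<Value *> VL,
                              int SLPCostThreshold) {
  R.buildTree(VL);                           // grow the use-def tree from VL
  if (R.isTreeTinyAndNotFullyVectorizable()) // too small to be worth it
    return false;
  R.reorderTopToBottom();                    // pick profitable lane orders
  R.reorderBottomToTop();
  R.buildExternalUses();                     // record scalars used outside the tree
  R.computeMinimumValueSizes();              // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= -SLPCostThreshold)             // not profitable enough
    return false;
  R.vectorizeTree();                         // emit the vector code
  return true;
}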
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:732
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
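The m_* combinators above compose into tree patterns that both test and capture IR. A minimal, self-contained sketch follows; the function name matchShiftedOr is an assumption for illustration.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Match V against (X << C) | Y, capturing X, Y and the constant shift amount.
static bool matchShiftedOr(Value *V, Value *&X, Value *&Y, const APInt *&ShAmt) {
  return match(V, m_Or(m_Shl(m_Value(X), m_APInt(ShAmt)), m_Value(Y)));
}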
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
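As a usage sketch for getPointersDiff: two loads are consecutive when their pointer difference, measured in elements of the loaded type, is exactly one. The helper name areConsecutiveLoads is an assumption for illustration.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>

using namespace llvm;

// Returns true if LdB accesses the element directly after LdA.
static bool areConsecutiveLoads(LoadInst *LdA, LoadInst *LdB,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(LdA->getType(), LdA->getPointerOperand(),
                      LdB->getType(), LdB->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}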
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
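A small usage sketch for createSimpleReduction, assuming an already-positioned IRBuilder; the wrapper name emitAddReduction is illustrative only.

#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Collapse all lanes of the vector value Vec into one scalar via integer add.
static Value *emitAddReduction(IRBuilderBase &Builder, Value *Vec) {
  return createSimpleReduction(Builder, Vec, RecurKind::Add);
}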
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
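The range helpers above (all_of, zip, enumerate, ...) replace explicit begin/end iteration over a bundle of scalars. A small sketch, with the helper name dumpLanes as an assumption:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

// Print every lane of a candidate bundle once we know no entry is null.
static bool dumpLanes(ArrayRef<Value *> VL) {
  if (!all_of(VL, [](Value *V) { return V != nullptr; }))
    return false;
  for (auto [Lane, V] : enumerate(VL))
    dbgs() << "Lane " << Lane << ": " << *V << "\n";
  return true;
}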
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7301
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1683
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:556
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
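PowerOf2Ceil, bit_ceil, alignDown and the related helpers in this listing are small, total rounding functions; a few worked values, using a hypothetical 6-lane bundle as the input:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

// Rounding a hypothetical 6-lane bundle to power-of-two sizes.
void roundingExamples() {
  assert(llvm::PowerOf2Ceil(6) == 8); // next power of two >= 6
  assert(llvm::bit_ceil(6u) == 8u);   // same idea, from llvm/ADT/bit.h
  assert(llvm::alignDown(6, 4) == 4); // largest multiple of 4 <= 6
}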
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
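A typical cleanup sketch combining the two deletion helpers above; the wrapper name eraseIfDead is an assumption for illustration.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Erase I if its result is unused and it has no observable side effects,
// then try to clean up any operands that became dead as a result.
static void eraseIfDead(Instruction *I, const TargetLibraryInfo *TLI) {
  if (!isInstructionTriviallyDead(I, TLI))
    return;
  SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
  I->eraseFromParent();
  for (Value *Op : Ops)
    RecursivelyDeleteTriviallyDeadInstructions(Op, TLI);
}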
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:256
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
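When a group of scalars is replaced by one vector instruction, only the metadata and IR flags that every scalar agrees on may be kept. A small sketch of pairing the two helpers above; the wrapper name transferScalarInfo is an assumption.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Intersect metadata (tbaa, fpmath, ...) and IR flags (nuw/nsw, fast-math)
// of the scalars in VL onto the new vector instruction VecI.
static void transferScalarInfo(Instruction *VecI, ArrayRef<Value *> VL) {
  propagateMetadata(VecI, VL);
  propagateIRFlags(VecI, VL);
}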
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1054
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
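The two queries above, MaskedValueIsZero and ComputeNumSignBits, are the basis of min-bitwidth style reasoning: a value can be narrowed when its high bits are known zero or are all copies of the sign bit. A sketch under that assumption; the helper name fitsIn16Bits and the fixed width of 16 bits are illustrative choices, not part of this file.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/SimplifyQuery.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Can the integer value V be truncated to 16 bits without losing information?
static bool fitsIn16Bits(Value *V, const DataLayout &DL) {
  if (!V->getType()->isIntegerTy())
    return false;
  unsigned BW = V->getType()->getIntegerBitWidth();
  if (BW <= 16)
    return true;
  // Zero-extended case: all bits above bit 15 are known to be zero.
  if (MaskedValueIsZero(V, APInt::getBitsSetFrom(BW, 16), SimplifyQuery(DL)))
    return true;
  // Sign-extended case: enough sign-bit copies to cover the dropped bits.
  return ComputeNumSignBits(V, DL) > BW - 16;
}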
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2144
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
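A usage sketch tying VFShape::get to the VFDatabase lookup shown earlier in this listing (getVectorizedFunction): build the shape for a fixed vectorization factor and ask whether a vector variant of the callee is mapped. The helper name findVectorVariant is an assumption for illustration.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Look up a vector variant of CI's callee for a fixed VF, if one is mapped.
static Function *findVectorVariant(CallInst &CI, unsigned VF) {
  VFShape Shape = VFShape::get(CI.getFunctionType(),
                               ElementCount::getFixed(VF),
                               /*HasGlobalPred=*/false);
  return VFDatabase(CI).getVectorizedFunction(Shape);
}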
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.